def __init__(self, entity_name, source_language_script=ENGLISH_LANG, translation_enabled=False):
    """Set up a ShoppingSizeDetector.

    Args:
        entity_name: name of the entity; stored on the instance, handed to the
            TextDetector and wrapped in '__' to build ``self.tag``
        source_language_script: language code forwarded to the superclass
            initializer (defaults to ENGLISH_LANG)
        translation_enabled: flag forwarded to the superclass initializer
    """
    # Supported languages are declared before the superclass initializer runs.
    self._supported_languages = [ENGLISH_LANG]
    super(ShoppingSizeDetector, self).__init__(source_language_script, translation_enabled)

    self.entity_name = entity_name

    # Text buffers start out as empty strings.
    self.text = self.tagged_text = self.processed_text = ''
    self.text_dict = {}

    # Result containers (two distinct lists).
    self.size, self.original_size_text = [], []

    self.text_detection_object = TextDetector(entity_name=self.entity_name)
    self.tag = '__' + self.entity_name + '__'
def get_location(message, entity_name, structured_value, fallback_value, bot_message):
    """Detect a location by running TextDetector on the structured value or the message.

    TODO: We can improve this by creating separate class for location detection
    instead of using TextDetector

    Args:
        message: text to run detection on when no structured value is given
        entity_name: entity name used to build the TextDetector
        structured_value: when truthy, detection runs on this value instead of
            the message; if nothing is detected the value itself is returned
            marked as not verified
        fallback_value: returned (when truthy) if nothing is detected in the
            message
        bot_message: accepted for interface compatibility; not used here

    Returns:
        Whatever output_entity_dict_list / output_entity_dict_value builds for
        the chosen branch, or None when the message yields nothing and there is
        no fallback value.
    """
    detector = TextDetector(entity_name=entity_name)

    if not structured_value:
        # Free-text path: try the message, then the fallback value.
        detected_values, matched_texts = detector.detect_entity(message)
        if detected_values:
            return output_entity_dict_list(detected_values, matched_texts, FROM_MESSAGE)
        if fallback_value:
            return output_entity_dict_value(fallback_value, fallback_value, FROM_FALLBACK_VALUE)
        return None

    # Structured path: the structured value is always returned, verified or not.
    detected_values, matched_texts = detector.detect_entity(structured_value)
    if detected_values:
        return output_entity_dict_list(detected_values, matched_texts, FROM_STRUCTURE_VALUE_VERIFIED)
    return output_entity_dict_value(structured_value, structured_value, FROM_STRUCTURE_VALUE_NOT_VERIFIED)
def _detect_text_budget(self, budget_list=None, original_list=None):
    """Detect budgets in ``self.text`` through a TextDetector lookup.

    Every match contributes a placeholder budget dictionary (min and max both
    0, type BUDGET_TYPE_TEXT) together with the matched substring.

    Args:
        budget_list: optional list to append budget dictionaries to; a new
            list is created when None
        original_list: optional list to append matched substrings to; a new
            list is created when None

    Returns:
        A tuple ``(budget_list, original_list)``.
    """
    budget_list = [] if budget_list is None else budget_list
    original_list = [] if original_list is None else original_list

    detector = TextDetector(entity_name=self.entity_name)
    budget_text_list, original_text_list = detector.detect_entity(self.text, return_str=True)
    # FIXME: Broken/Ineffective code.
    self.tagged_text = detector.tagged_text
    self.processed_text = detector.processed_text

    # The detected value itself is ignored; only the matched text is recorded.
    for _ignored_value, matched_text in zip(budget_text_list, original_text_list):
        budget_list.append({'min_budget': 0, 'max_budget': 0, 'type': BUDGET_TYPE_TEXT})
        original_list.append(matched_text)

    return budget_list, original_list
def __init__(self, entity_name):
    """Create the detector with empty state and a TextDetector for entity_name.

    Args:
        entity_name: entity name passed to the underlying TextDetector
    """
    # Text buffers start out as empty strings.
    self.text = self.tagged_text = self.processed_text = ''
    self.text_dict = {}

    # Result containers (two distinct lists).
    self.location, self.original_location_text = [], []

    self.text_detection_object = TextDetector(entity_name=entity_name)

    # User profile details are unknown at construction time.
    self.user_address = self.user_lat_long = self.user_location_updated_at = None
def __init__(self, entity_name):
    """Create a NameDetector for the given entity name.

    Args:
        entity_name: name of the entity; stored on the instance and passed to
            the underlying TextDetector
    """
    self.entity_name = entity_name

    # Text buffers start out as empty strings.
    self.text = self.tagged_text = self.processed_text = ''

    # Result containers (two distinct lists).
    self.names, self.original_name_text = [], []

    self.text_detection_object = TextDetector(entity_name=entity_name)
def __init__(self, entity_name):
    """Initializes a ShoppingSizeDetector object

    Args:
        entity_name: name of the entity; stored on the instance and wrapped in
            '__' to build ``self.tag``
    """
    self.entity_name = entity_name
    self.dictionary_name = 'shopping_size'
    self.text = ''
    self.text_dict = {}
    self.tagged_text = ''
    self.processed_text = ''
    self.size = []
    self.original_size_text = []
    # The lookup uses the fixed dictionary name, not the entity name. It has to
    # be read from the instance: a bare ``dictionary_name`` is an undefined
    # name and raised NameError as soon as the detector was constructed.
    self.text_detection_object = TextDetector(entity_name=self.dictionary_name)
    self.tag = '__' + self.entity_name + '__'
def __init__(self, entity_name):
    """Create a CityDetector for the given entity name.

    Args:
        entity_name: name of the entity; stored on the instance, passed to the
            underlying TextDetector and wrapped in '__' to build ``self.tag``
    """
    self.entity_name = entity_name

    # Text buffers (including the bot message) start out as empty strings.
    self.text = self.bot_message = ''
    self.tagged_text = self.processed_text = ''

    self.city = []
    self.text_detection_object = TextDetector(entity_name=entity_name)
    self.tag = '__' + self.entity_name + '__'
def text(request):
    """View that runs TextDetector on the request parameters.

    The fuzziness threshold and the minimum token length for fuzzy matching are
    applied to the detector only when the corresponding parameter is truthy.

    Args:
        request: incoming request; its parameters are extracted with
            get_parameters_dictionary()

    Returns:
        An HttpResponse with the JSON body ``{"data": <detector output>}``, or
        an HttpResponse with status 500 when a TypeError is raised while
        processing.
    """
    try:
        params = get_parameters_dictionary(request)
        ner_logger.debug('Start: %s ' % params[PARAMETER_ENTITY_NAME])

        fuzziness = params[PARAMETER_FUZZINESS]
        min_token_len_fuzziness = params[PARAMETER_MIN_TOKEN_LEN_FUZZINESS]
        detector = TextDetector(
            entity_name=params[PARAMETER_ENTITY_NAME],
            source_language_script=params[PARAMETER_LANGUAGE_SCRIPT],
        )
        ner_logger.debug('fuzziness: %s min_token_len_fuzziness %s' % (str(fuzziness), str(min_token_len_fuzziness)))

        # Optional tuning knobs.
        if fuzziness:
            detector.set_fuzziness_threshold(parse_fuzziness_parameter(fuzziness))
        if min_token_len_fuzziness:
            detector.set_min_token_size_for_levenshtein(min_size=int(min_token_len_fuzziness))

        entity_output = detector.detect(
            message=params[PARAMETER_MESSAGE],
            structured_value=params[PARAMETER_STRUCTURED_VALUE],
            fallback_value=params[PARAMETER_FALLBACK_VALUE],
            bot_message=params[PARAMETER_BOT_MESSAGE],
        )
        ner_logger.debug('Finished %s : %s ' % (params[PARAMETER_ENTITY_NAME], entity_output))
    except TypeError as e:
        ner_logger.exception('Exception for text_synonym: %s ' % e)
        return HttpResponse(status=500)
    else:
        return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json')
def __init__(self, entity_name, language=lang_constant.ENGLISH_LANG):
    """Create a CityDetector for the given entity name and language.

    Args:
        entity_name: name of the entity; stored on the instance, passed to the
            underlying TextDetector and wrapped in '__' to build ``self.tag``
        language: language code handed to the TextDetector as its
            source_language_script (defaults to lang_constant.ENGLISH_LANG)
    """
    self.entity_name = entity_name

    # Text buffers (including the bot message) start out as empty strings.
    self.text = self.bot_message = ''
    self.tagged_text = self.processed_text = ''

    self.city = []
    self.text_detection_object = TextDetector(
        entity_name=entity_name,
        source_language_script=language,
    )
    self.tag = '__' + self.entity_name + '__'
def __init__(self, entity_name):
    """Create a CityAdvanceDetector for the given entity name.

    Args:
        entity_name: name of the entity; stored on the instance, passed to the
            underlying TextDetector and wrapped in '__' to build ``self.tag``
    """
    # Text buffers start out as empty strings.
    self.text = self.tagged_text = self.processed_text = ''

    # Result containers (two distinct lists).
    self.city, self.original_city_text = [], []

    self.entity_name = entity_name
    self.text_detection_object = TextDetector(entity_name=entity_name)
    self.bot_message = None
    self.tag = '__' + entity_name + '__'
def text(request):
    """View that runs TextDetector on the request parameters.

    Args:
        request: incoming request; its parameters are extracted with
            get_parameters_dictionary()

    Returns:
        An HttpResponse with the JSON body ``{"data": <detector output>}`` on
        success, or an HttpResponse with status 400 when a TypeError is raised
        while processing.
    """
    # Imported locally so this view does not depend on a module-level import
    # that may not exist in this file.
    import json

    try:
        parameters_dict = get_parameters_dictionary(request)
        ner_logger.debug('Start: %s ' % parameters_dict[PARAMETER_ENTITY_NAME])
        text_detector = TextDetector(entity_name=parameters_dict[PARAMETER_ENTITY_NAME],
                                     source_language_script=parameters_dict[PARAMETER_LANGUAGE_SCRIPT])
        entity_output = text_detector.detect(message=parameters_dict[PARAMETER_MESSAGE],
                                             structured_value=parameters_dict[PARAMETER_STRUCTURED_VALUE],
                                             fallback_value=parameters_dict[PARAMETER_FALLBACK_VALUE],
                                             bot_message=parameters_dict[PARAMETER_BOT_MESSAGE])
        ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output))
    # ``except X as e`` is valid on Python 2.6+ and Python 3; the comma form is
    # a syntax error on Python 3.
    except TypeError as e:
        ner_logger.debug('Exception for text_synonym: %s ' % e)
        return HttpResponse(status=400)

    # Previously the success path fell off the end and returned None, which is
    # not a valid view response.
    return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json')
def __init__(self, entity_name, source_language_script=ENGLISH_LANG, translation_enabled=False):
    """Set up a BudgetDetector.

    Args:
        entity_name: name of the entity; stored on the instance, passed to the
            underlying TextDetector and wrapped in '__' to build ``self.tag``
        source_language_script: language code forwarded to the superclass
            initializer (defaults to ENGLISH_LANG)
        translation_enabled: flag forwarded to the superclass initializer
    """
    # Supported languages are declared before the superclass initializer runs.
    self._supported_languages = [ENGLISH_LANG]
    super(BudgetDetector, self).__init__(source_language_script, translation_enabled)

    self.min_digit, self.max_digit = 2, 5
    self.entity_name = entity_name

    # Text buffers start out as empty strings.
    self.text = self.tagged_text = self.processed_text = ''

    # Result containers (two distinct lists).
    self.budget, self.original_budget_text = [], []

    self.unit_present_list = ['k', 'l', 'm', 'c', 'h', 'th']

    # (pattern, multiplier) pairs handed to RegexReplace, grouped by multiplier.
    regx_for_units = [
        # x 1,000
        (r'([\d,.]+)\s*k', 1000),
        (r'([\d,.]+)\s*h', 1000),
        (r'([\d,.]+)\s*th', 1000),
        # x 100,000
        (r'([\d,.]+)\s*l', 100000),
        (r'([\d,.]+)\s*lacs?', 100000),
        (r'([\d,.]+)\s*lakh?', 100000),
        (r'([\d,.]+)\s*lakhs?', 100000),
        # x 1,000,000
        (r'([\d,.]+)\s*m', 1000000),
        (r'([\d,.]+)\s*million', 1000000),
        (r'([\d,.]+)\s*mill?', 1000000),
        # x 10,000,000
        (r'([\d,.]+)\s*c', 10000000),
        (r'([\d,.]+)\s*cro?', 10000000),
        (r'([\d,.]+)\s*crore?', 10000000),
        (r'([\d,.]+)\s*crores?', 10000000),
    ]
    self.regex_object = RegexReplace(regx_for_units)

    self.tag = '__' + self.entity_name + '__'
    self.text_detection_object = TextDetector(entity_name=entity_name)
def __init__(self, entity_name):
    """Set up a BudgetDetector.

    Args:
        entity_name: name of the entity; stored on the instance and wrapped in
            '__' to build ``self.tag``
    """
    self.min_digit, self.max_digit = 2, 5
    self.entity_name = entity_name

    # Text buffers start out as empty strings.
    self.text = self.tagged_text = self.processed_text = ''

    # Result containers (two distinct lists).
    self.budget, self.original_budget_text = [], []

    # Single (pattern, replacement) rule: digits followed by 'k' get '000' appended.
    self.regex_object = Regex([(r'(\d+)k', r'\g<1>000')])

    self.tag = '__' + self.entity_name + '__'
    # The text lookup uses ES_BUDGET_LIST rather than the entity name.
    self.text_detection_object = TextDetector(entity_name=ES_BUDGET_LIST)
def get_location(message, entity_name, structured_value, fallback_value, bot_message):
    """Detect a location by running TextDetector on the structured value or the message.

    TODO: We can improve this by creating separate class for location detection
    instead of using TextDetector

    Args:
        message (str): text to run detection on when no structured value is
            given
        entity_name (str): entity name used to build the TextDetector
        structured_value (str): when truthy, detection runs on this value
            instead of the message; if nothing is detected the value itself is
            returned marked as not verified
        fallback_value (str): returned (when truthy) if nothing is detected in
            the message
        bot_message (str): accepted for interface compatibility; not used here

    Returns:
        Whatever output_entity_dict_list builds for the chosen branch, or None
        when the message yields nothing and there is no fallback value.
    """
    detector = TextDetector(entity_name=entity_name)

    if not structured_value:
        # Free-text path: try the message, then the fallback value.
        detected_values, matched_texts = detector.detect_entity(message)
        if detected_values:
            return output_entity_dict_list(detected_values, matched_texts, FROM_MESSAGE)
        if fallback_value:
            return output_entity_dict_list([fallback_value], [fallback_value], FROM_FALLBACK_VALUE)
        return None

    # Structured path: the structured value is always returned, verified or not.
    detected_values, matched_texts = detector.detect_entity(structured_value)
    if detected_values:
        return output_entity_dict_list(detected_values, matched_texts, FROM_STRUCTURE_VALUE_VERIFIED)
    return output_entity_dict_list([structured_value], [structured_value], FROM_STRUCTURE_VALUE_NOT_VERIFIED)
class CityDetector(object):
    """
    Detects cities in a text by delegating the lookup to a TextDetector,
    optionally asking a CRF model first.

    Attributes:
        text: lower-cased, space-padded string to extract entities from
        entity_name: name of the entity; used to build the tag
        bot_message: previous bot message, passed to the model
        text_dict: dictionary initialised empty (not used by this class)
        tagged_text: text with detected cities replaced by the tag
        processed_text: text with detected cities removed
        city: list of detected city values
        original_city_text: list of substrings of the text detected as cities
        text_detection_object: TextDetector used for the lookups
        tag: entity_name prepended and appended with '__'
    """

    def __init__(self, entity_name):
        """
        Initializes a CityDetector object with given entity_name

        Args:
            entity_name: name of the entity; passed to the TextDetector and
                used to build the tag
        """
        self.entity_name = entity_name
        self.text = ''
        self.bot_message = ''
        self.text_dict = {}
        self.tagged_text = ''
        self.processed_text = ''
        self.city = []
        self.original_city_text = []
        self.text_detection_object = TextDetector(entity_name=entity_name)
        self.tag = '__' + self.entity_name + '__'

    def detect_city(self):
        """
        Detects cities in self.text and updates the tagged/processed text.

        Returns:
            tuple (list of cities, list of original substrings)
        """
        city_list, original_list = self.detect_city_format()
        self.update_processed_text(original_list)
        return city_list, original_list

    def detect_entity(self, text, run_model=True):
        """Detects city in the text string

        Args:
            text: string to extract entities from
            run_model: True if the model should be tried first, else False

        Returns:
            A tuple of three lists: the detected cities, their corresponding
            substrings in the given text, and the model detection type of each
            value. The third list is empty when the result comes from the
            plain text lookup instead of the model.

            For example:
                (['Mumbai'], ['bombay'], [])

            Additionally this function assigns the first two lists to
            self.city and self.original_city_text respectively.
        """
        lowered = (' ' + text + ' ').lower()
        self.text = lowered
        self.processed_text = lowered
        self.tagged_text = lowered

        city_data = []
        if run_model:
            city_data = self.city_model_detection()
        if not run_model or not city_data[0]:
            # No model output: fall back to the text lookup and pad the result
            # with an empty detection-type list to keep the 3-tuple shape.
            city_data = self.detect_city() + ([], )
        self.city = city_data[0]
        self.original_city_text = city_data[1]
        return city_data

    def detect_city_format(self, city_list=None, original_list=None):
        """
        Detects cities in self.text through the TextDetector.

        Args:
            city_list: Optional, list to append detected cities to. A new list
                is created when omitted (the previous mutable default was
                shared between calls, so results leaked from one call into the
                next).
            original_list: Optional, kept for interface compatibility; it is
                always replaced by the substrings returned by the TextDetector.

        Returns:
            A tuple of two lists with first list containing the detected cities
            and second list containing their corresponding substrings in the
            given text.

            For example:
                (['Mumbai'], ['bombay'])
        """
        if city_list is None:
            city_list = []

        city_list_from_text_entity, original_list = self.text_detection_object.detect_entity(
            self.text)
        self.tagged_text = self.text_detection_object.tagged_text
        self.processed_text = self.text_detection_object.processed_text
        city_list.extend(city_list_from_text_entity)

        return city_list, original_list

    def city_model_detection(self):
        """
        Calls get_model_output() of PredictCRF and verifies each returned city
        by looking it up through the TextDetector. A value found by the lookup
        is marked MODEL_VERIFIED; otherwise the model's own value is used and
        marked MODEL_NOT_VERIFIED.

        Note: call set_bot_message() beforehand, the bot message is passed to
        the model.

        For Example:
            self.bot_message = 'Please help me with your departure city?'
            self.text = 'mummbai'
            model_output = [{'city':'mummbai', 'from': 1, 'to': 0, 'via': 0}]
            returns ['Mumbai'], ['mummbai'], [MODEL_VERIFIED]

        For Example (value the lookup does not find):
            self.text = 'dehradun'
            model_output = [{'city':'dehradun', 'from': 1, 'to': 0, 'via': 0}]
            returns ['dehradun'], ['dehradun'], [MODEL_NOT_VERIFIED]

        Returns:
            tuple (list of cities, list of original substrings,
            list of detection types)
        """
        predict_crf = PredictCRF()
        model_output = predict_crf.get_model_output(
            entity_type=CITY_ENTITY_TYPE,
            bot_message=self.bot_message,
            user_message=self.text)

        city_list, original_list, model_detection_type = [], [], []
        for city_dict in model_output:
            city_list_from_text_entity, original_list_from_text_entity = \
                self.text_detection_object.detect_entity(city_dict[CITY_VALUE])
            if city_list_from_text_entity:
                city_list.extend(city_list_from_text_entity)
                original_list.extend(original_list_from_text_entity)
                model_detection_type.append(MODEL_VERIFIED)
            else:
                city_list.append(city_dict[CITY_VALUE])
                original_list.append(city_dict[CITY_VALUE])
                model_detection_type.append(MODEL_NOT_VERIFIED)

        self.update_processed_text(original_list)
        return city_list, original_list, model_detection_type

    def update_processed_text(self, original_list):
        """
        Replaces each detected substring with the tag in self.tagged_text and
        removes it from self.processed_text.

        Args:
            original_list: list of substrings of the original text that were
                detected as cities
        """
        for detected_text in original_list:
            self.tagged_text = self.tagged_text.replace(
                detected_text, self.tag)
            self.processed_text = self.processed_text.replace(
                detected_text, '')

    def set_bot_message(self, bot_message):
        """
        Sets the object's bot_message attribute

        Args:
            bot_message: string
        """
        self.bot_message = bot_message
class CityDetector(object): """ CityDetector detects cities from the text. It Detects city with the properties like "from", "to", "via" and "normal". These cities are returned in a dictionary form that contains relevant text, its actual value and its attribute in boolean field i.e. "from", "to", "via", "normal". This class uses TextDetector to detect the entity values. It also has model integrated to it that can be used to extract relevant text from the text Attributes: text: string to extract entities from entity_name: string by which the detected city entities would be replaced with on calling detect_entity() tagged_text: string with city entities replaced with tag defined by entity_name city: list to store detected entities from the text processed_text: string with detected time entities removed tag: entity_name prepended and appended with '__' """ def __init__(self, entity_name, language): """ Initializes a CityDetector object with given entity_name Args: entity_name: A string by which the detected substrings that correspond to text entities would be replaced with on calling detect_entity() language: language code of text """ self.entity_name = entity_name self.text = '' self.bot_message = '' self.tagged_text = '' self.processed_text = '' self.city = [] self.text_detection_object = TextDetector( entity_name=entity_name, source_language_script=language) self.tag = '__' + self.entity_name + '__' def detect_entity(self, text, run_model=False): """Detects city in the text string Args: text: string to extract entities from run_model: True if model needs to be run else False Returns: It returns the list of dictionary containing the fields like detection_method, from, normal, to, text, value, via For example: [ { 'detection_method': 'message', 'from': False, 'normal': True, 'text': 'mumbai', 'to': False, 'value': u'BOM', 'via': False } ] Additionally this function assigns this list to self.city """ self.text = ' ' + text + ' ' self.text = self.text.lower() 
self.processed_text = self.text self.tagged_text = self.text city_data = [] if run_model: city_data = self._city_model_detection() if not run_model or not city_data: city_data = self._detect_city() self.city = city_data return city_data def _detect_city(self): """ Detects a city and categorises it into "from", "to", "via" and "normal" attributes Returns: It returns the list of dictionary containing the fields like detection_method, from, normal, to, text, value, via """ # print 'detection for default task' final_city_dict_list = [] city_dict_list = self._detect_departure_arrival_city_prepositions() final_city_dict_list.extend(city_dict_list) self._update_processed_text(city_dict_list) city_dict_list = self._detect_departure_arrival_city() final_city_dict_list.extend(city_dict_list) self._update_processed_text(city_dict_list) city_dict_list = self._detect_arrival_departure_city() final_city_dict_list.extend(city_dict_list) self._update_processed_text(city_dict_list) city_dict_list = self._detect_departure_city() final_city_dict_list.extend(city_dict_list) self._update_processed_text(city_dict_list) city_dict_list = self._detect_arrival_city() final_city_dict_list.extend(city_dict_list) self._update_processed_text(city_dict_list) city_dict_list = self._detect_any_city() final_city_dict_list.extend(city_dict_list) self._update_processed_text(city_dict_list) return final_city_dict_list def _detect_city_format(self): """ """ return self._city_dict_from_text(text=self.processed_text, normal_property=True) def _detect_departure_arrival_city(self): """ Finds <any text><space(s)><'-' or 'to' or '2'><space(s)><any text> in the given text. It splits the text into two parts on '-' or 'to' or '2' and detects the departure city in the first (left) part and detects arrival city in the second (right) part Args: None Returns: The list of dictionary containing the dictionary for city which is detected as departure_city and city that got detected as arrival_city. 
For departure city the key "from" will be set to True. Whereas for arrival city the key "to" will be set to True. """ city_dict_list = [] patterns = re.findall( ur'\s(([A-Za-z\u0900-\u097F]+)\s+(\-|to|2|se|से|and)\s+([A-Za-z\u0900-\u097F\s]+))\.?', self.processed_text.lower(), re.UNICODE) for pattern in patterns: city_dict_list.extend( self._city_dict_from_text(text=pattern[1], from_property=True)) city_dict_list.extend( self._city_dict_from_text(text=pattern[3], to_property=True)) return city_dict_list def _detect_departure_arrival_city_prepositions(self): """ Finds <preposition><any text><space(s)><'-' or 'to' or '2' or preposition><space(s)><any text> in the given text. It splits the text into two parts on '-' or 'to' or '2' and detects the departure city in the first (left) part and detects arrival city in the second (right) part Args: None Returns: The list of dictionary containing the dictionary for city which is detected as departure_city and city that got detected as arrival_city. For departure city the key "from" will be set to True. Whereas for arrival city the key "to" will be set to True. """ city_dict_list = [] patterns = re.findall( ur'\s((?:from|frm|departing|depart|leaving|leave)\s+([A-Za-z\u0900-\u097F]+)' ur'\s+(?:and|to|se|से|2|for|fr|arriving|arrive|reaching|reach|rch)' ur'\s+([A-Za-z\u0900-\u097F]+))\.?', self.processed_text.lower(), re.UNICODE) for pattern in patterns: city_dict_list.extend( self._city_dict_from_text(text=pattern[1], from_property=True)) city_dict_list.extend( self._city_dict_from_text(text=pattern[2], to_property=True)) return city_dict_list def _detect_arrival_departure_city(self): """ Finds <preposition><any text><space(s)><'-' or 'to' or '2' or preposition><space(s)><any text> in the given text. 
It splits the text into two parts on '-' or 'to' or '2' and detects the arrival city in the first (left) part and detects departure city in the second (right) part Args: None Returns: The list of dictionary containing the dictionary for city which is detected as departure_city and city that got detected as arrival_city. For departure city the key "from" will be set to True. Whereas for arrival city the key "to" will be set to True. """ city_dict_list = [] patterns = re.findall( ur'\s((?:and|to|2|for|fr|arriving|arrive|reaching|reach|rch)' ur'\s+([A-Za-z\u0900-\u097F]+)\s+(?:from|frm|departing|depart|leaving|leave)' ur'\s+([A-Za-z\u0900-\u097F]+))\.?', self.processed_text.lower(), re.UNICODE) for pattern in patterns: city_dict_list.extend( self._city_dict_from_text(text=pattern[2], from_property=True)) city_dict_list.extend( self._city_dict_from_text(text=pattern[1], to_property=True)) return city_dict_list def _detect_departure_city(self): """ Finds departure type cities in the given text by matching few keywords like 'from', 'departing', 'leaving', 'departure city', 'departing', 'going to' . It detects dates in the part of text right to these keywords. Args: None Returns: The list of dictionary containing the dictionary for city which is detected as departure_city. For departure city the key "from" will be set to True. """ city_dict_list = [] patterns = re.findall( ur'\s((from|frm|departing|depart|leaving|leave|origin city\:|departure city\:|going to)' ur'\s+([A-Za-z\u0900-\u097F]+))\.?\s', self.processed_text.lower(), re.UNICODE) for pattern in patterns: city_dict_list.extend( self._city_dict_from_text(text=pattern[2], from_property=True)) return city_dict_list def _detect_arrival_city(self): """ Finds return type dates in the given text by matching few keywords like 'arriving', 'arrive', 'reaching', 'reach', 'destination city:' . It detects city in the part of text right to these keywords. 
Args: None Returns: The list of dictionary containing the dictionary for city which is detected as arrival_city. for arrival city the key "to" will be set to True. """ city_dict_list = [] patterns_1 = re.findall( ur'\s((to|2|for|fr|arriving|arrive|reaching|' ur'reach|rch|destination city\:|arrival city\:)' ur'\s+([A-Za-z\u0900-\u097F]+))\.?\s', self.processed_text.lower(), re.UNICODE) patterns_2 = re.findall( ur'([A-Za-z\u0900-\u097F]+)\s+(jana|jaana|jau|ghum|ghoom|जाना|जाऊं|जाऊँ|घूम)', self.processed_text.lower(), re.UNICODE) for pattern in patterns_1: city_dict_list.extend( self._city_dict_from_text(text=pattern[2], to_property=True)) for pattern in patterns_2: city_dict_list.extend( self._city_dict_from_text(text=pattern[0], to_property=True)) return city_dict_list def _detect_any_city(self): """ This function makes use of bot_message. In a chatbot user might just enter city name based on the previous question asked by the bot. So, if the previous question asked by the bot contains words like departure city, origin city, origin and if the current message contains city then we assign the detected city as departure_city. if the previous message contains words like arrival city, destination city, flying to in the bots message and the current message contains the city then we assign the detected city as arrival city Args: None Returns: The list of dictionary containing the dictionary for city which is detected as departure_city and city that got detected as arrival_city. For departure city the key "from" will be set to True. Whereas for arrival city the key "to" will be set to True. 
""" city_dict_list = [] departure_city_flag = False arrival_city_flag = False if self.bot_message: hinglish_departure = u'कहां से' departure_regexp = re.compile( ur'departure city|origin city|origin|' ur'traveling from|leaving from|flying from|travelling from|' + hinglish_departure) hinglish_arrival = u'कहां जाना|\u0916\u093c\u0924\u092e|\u0959\u0924\u092e' # unicode for ख़तम arrival_regexp = re.compile( ur'traveling to|travelling to|arrival city|' ur'arrival|destination city|destination|leaving to|flying to|' + hinglish_arrival) if departure_regexp.search(self.bot_message) is not None: departure_city_flag = True elif arrival_regexp.search(self.bot_message) is not None: arrival_city_flag = True patterns = re.findall(ur'\s((.+))\.?', self.processed_text.lower(), re.UNICODE) for pattern in patterns: pattern = list(pattern) city_dict_list = self._city_dict_from_text(text=pattern[1]) if city_dict_list: if len(city_dict_list) > 1: city_dict_list[0][ detector_constant.CITY_FROM_PROPERTY] = True city_dict_list[-1][ detector_constant.CITY_TO_PROPERTY] = True else: if departure_city_flag: city_dict_list[0][ detector_constant.CITY_FROM_PROPERTY] = True elif arrival_city_flag: city_dict_list[0][ detector_constant.CITY_TO_PROPERTY] = True else: city_dict_list[0][ detector_constant.CITY_NORMAL_PROPERTY] = True return city_dict_list def _city_dict_from_text(self, text, from_property=False, to_property=False, via_property=False, normal_property=False, detection_method=FROM_MESSAGE): """ Takes the text and the property values and creates a list of dictionaries based on number of cities detected Attributes: text: Text on which TextDetection needs to run on from_property: True if the text is belonging to "from" property". for example, From Mumbai to_property: True if the text is belonging to "to" property". for example, To Mumbai via_property: True if the text is belonging to "via" property". 
for example, via Mumbai normal_property: True if the text is belonging to "normal" property". for example, atms in Mumbai detection_method: method through which it got detected whether its through message or model Returns: It returns the list of dictionary containing the fields like detection_method, from, normal, to, text, value, via For example: [ { 'detection_method': 'message', 'from': False, 'normal': True, 'text': 'mumbai', 'to': False, 'value': u'BOM', 'via': False } ] """ city_dict_list = [] city_list, original_list = self._city_value(text=text) index = 0 for city in city_list: city_dict_list.append({ detector_constant.CITY_VALUE: city, detector_constant.ORIGINAL_CITY_TEXT: original_list[index], detector_constant.CITY_FROM_PROPERTY: from_property, detector_constant.CITY_TO_PROPERTY: to_property, detector_constant.CITY_VIA_PROPERTY: via_property, detector_constant.CITY_NORMAL_PROPERTY: normal_property, detector_constant.CITY_DETECTION_METHOD: detection_method }) index += 1 return city_dict_list def _city_value(self, text): """ Detects city from text by running TextDetection class. Args: text: message to process Returns: A tuple of two lists with first list containing the detected cities and second list containing their corresponding substrings in the given text. 
For example: For example: (['Mumbai'], ['bombay']) """ city_list, original_list = self.text_detection_object.detect_entity( text) return city_list, original_list def _update_processed_text(self, city_dict_list): """ Replaces detected cities with tag generated from entity_name used to initialize the object with A final string with all cities replaced will be stored in object's tagged_text attribute A string with all cities removed will be stored in object's processed_text attribute Args: original_city_strings: list of substrings of original text to be replaced with tag created from entity_name """ for city_dict in city_dict_list: self.tagged_text = self.tagged_text.replace( city_dict[detector_constant.ORIGINAL_CITY_TEXT], self.tag) self.processed_text = self.processed_text.replace( city_dict[detector_constant.ORIGINAL_CITY_TEXT], '') def set_bot_message(self, bot_message): """ Sets the object's bot_message attribute Args: bot_message: string """ self.bot_message = bot_message def convert_city_dict_in_tuple(self, entity_dict_list): """ This function takes the input as a list of dictionary and converts it into tuple which is for now the standard format of individual detector function Attributes: entity_dict_list: List of dictionary containing the detected cities from text. 
It contains all the necessary information like original_text, value, how its detected and properties like from, to, via and normal Returns: Returns the tuple containing list of entity_values, original_text and detection method For example: (['Mumbai'], ['bombay'], ['message']) """ entity_list, original_list, detection_list = [], [], [] for entity_dict in entity_dict_list: entity_list.append({ detector_constant.CITY_VALUE: entity_dict[detector_constant.CITY_VALUE], detector_constant.CITY_FROM_PROPERTY: entity_dict[detector_constant.CITY_FROM_PROPERTY], detector_constant.CITY_TO_PROPERTY: entity_dict[detector_constant.CITY_TO_PROPERTY], detector_constant.CITY_VIA_PROPERTY: entity_dict[detector_constant.CITY_VIA_PROPERTY], detector_constant.CITY_NORMAL_PROPERTY: entity_dict[detector_constant.CITY_NORMAL_PROPERTY], }) original_list.append( entity_dict[detector_constant.ORIGINAL_CITY_TEXT]) detection_list.append( entity_dict[detector_constant.CITY_DETECTION_METHOD]) return entity_list, original_list, detection_list def _city_model_detection(self): """ This function calls run_model functionality from class Models() and verifies the values returned by it through datastore. If the cities provided by the model are present in the datastore, it sets the value to FROM_MODEL_VERIFIED else FROM_MODEL_NOT_VERFIED is set. For Example: Note: before calling this method you need to call set_bot_message() to set a bot message. self.bot_message = 'Please help me with your departure city?' self.text = 'mummbai Output: [ { 'city':'mumbai', 'original_text': 'mummbai', 'from': true, 'to': false, 'via': false, 'normal': false 'detection_method': model_verified } ] For Example: self.bot_message = 'Please help me with your departure city?' 
self.text = 'dehradun' Output: [ { 'city':'dehradun', 'original_text': 'dehradun', 'from': true, 'to': false, 'via': false, 'normal': false 'detection_method': model_not_verified } ] Note: Dehradun is not present in out datastore so it will take original value as entity value. """ city_dict_list = [] model_object = Models() model_output = model_object.run_model( entity_type=model_constant.CITY_ENTITY_TYPE, bot_message=self.bot_message, user_message=self.text) for output in model_output: entity_value_list, original_text_list = self._city_value( text=output[model_constant.MODEL_CITY_VALUE]) if entity_value_list: city_value = entity_value_list[0] detection_method = FROM_MODEL_VERIFIED else: city_value = output[model_constant.MODEL_CITY_VALUE] detection_method = FROM_MODEL_NOT_VERIFIED city_dict_list.append({ detector_constant.CITY_VALUE: city_value, detector_constant.ORIGINAL_CITY_TEXT: output[model_constant.MODEL_CITY_VALUE], detector_constant.CITY_FROM_PROPERTY: output[model_constant.MODEL_CITY_FROM], detector_constant.CITY_TO_PROPERTY: output[model_constant.MODEL_CITY_TO], detector_constant.CITY_VIA_PROPERTY: output[model_constant.MODEL_CITY_VIA], detector_constant.CITY_NORMAL_PROPERTY: output[model_constant.MODEL_CITY_NORMAL], detector_constant.CITY_DETECTION_METHOD: detection_method }) return city_dict_list
class LocationDetector(object):
    """
    Detects locations in a text by delegating to a TextDetector instance.

    The class does not inherit from TextDetector; it owns one
    (``text_detection_object``) and forwards the text to it.

    Attributes:
        text: string to extract entities from (padded with one space on each side)
        text_dict: dictionary kept for parity with other detectors; not used by this class
        tagged_text: tagged text as reported by the TextDetector after detection
        processed_text: processed text as reported by the TextDetector after detection
        location: list of locations detected by the last detect_entity() call
        original_location_text: list of substrings of the text detected as locations
        text_detection_object: TextDetector used to perform the detection
        profile_check: value of the ``profile_check`` argument of the last detect_entity() call
        user_address: value of the ``user_address`` argument of the last detect_entity() call
        user_lat_long: value of the ``user_lat_long`` argument of the last detect_entity() call
        user_location_updated_at: parsed ``user_location_updated`` timestamp, or None
    """

    def __init__(self, entity_name):
        """
        Initializes a LocationDetector object

        Args:
            entity_name: entity name handed to the underlying TextDetector
        """
        self.text = ''
        self.text_dict = {}
        self.tagged_text = ''
        self.processed_text = ''
        self.location = []
        self.original_location_text = []
        self.text_detection_object = TextDetector(entity_name=entity_name)
        # Initialised here so the attribute exists even before detect_entity() is called.
        self.profile_check = True
        self.user_address = None
        self.user_lat_long = None
        self.user_location_updated_at = None

    def detect_location(self):
        """
        Detects the locations present in ``self.text``

        Returns:
            tuple (list of locations, list of original texts)
        """
        return self.detect_location_format([], [])

    def detect_entity(self, text=None, profile_check=True, user_address=None, user_lat_long=None,
                      user_location_updated=None):
        """
        Takes a text and returns the locations detected in it

        Args:
            text: text to detect locations in. When empty or None the text of the
                previous call is reused.
            profile_check: stored on the instance as-is
            user_address: stored on the instance as-is
            user_lat_long: stored on the instance as-is
            user_location_updated: timestamp string; parsed and stored in
                ``user_location_updated_at`` (None when not given)

        Returns:
            tuple (list of locations, list of original texts)
        """
        if text:
            self.text = ' ' + text + ' '
            self.processed_text = self.text
            self.tagged_text = self.text
        self.profile_check = profile_check
        self.user_address = user_address
        self.user_lat_long = user_lat_long

        if user_location_updated:
            if '+' not in user_location_updated:
                # A three-part value without '+' is re-joined as "<first> <second>+<third>"
                # before parsing. NOTE(review): presumably this restores the '+' of a
                # timezone offset that was decoded into a space - confirm with callers.
                parts = user_location_updated.split(' ')
                if len(parts) == 3:
                    user_location_updated = parts[0] + ' ' + '+'.join(parts[1:])
            # NOTE(review): `parser` comes from the module imports (presumably
            # dateutil.parser) - confirm.
            self.user_location_updated_at = parser.parse(user_location_updated)
        else:
            self.user_location_updated_at = None

        location_data = self.detect_location()
        self.location = location_data[0]
        self.original_location_text = location_data[1]
        return location_data

    def detect_location_format(self, location_list=None, original_list=None):
        """
        Detects locations in ``self.text`` using the TextDetector

        Also copies the TextDetector's ``tagged_text`` and ``processed_text`` onto
        this object.

        Args:
            location_list: Optional, list to which detected locations are appended
            original_list: Optional, list to which the corresponding original
                substrings are appended

        Returns:
            tuple (list of locations, list of original texts)
        """
        # Fresh lists per call: mutable defaults would leak results between calls.
        if location_list is None:
            location_list = []
        if original_list is None:
            original_list = []

        detected_locations, detected_originals = self.text_detection_object.detect_entity(self.text)
        self.tagged_text = self.text_detection_object.tagged_text
        self.processed_text = self.text_detection_object.processed_text

        # Both lists are extended together so they stay index-aligned.
        location_list.extend(detected_locations)
        original_list.extend(detected_originals)
        return location_list, original_list
class CityAdvanceDetector(object):
    """
    Detects cities together with their role: "departure_city" or "arrival_city".

    Every detected item is a dictionary with the keys 'departure_city' and
    'arrival_city'. City values are resolved through a TextDetector. The class is
    meant for text that talks about travelling from one city to another.

    Attributes:
        text: lower-cased string to extract entities from (padded with spaces)
        entity_name: string used to build the tag that replaces detected cities
        tagged_text: text with detected city substrings replaced by ``tag``
        processed_text: text with detected city substrings removed
        city: list of city dictionaries detected by the last detect_entity() call
        original_city_text: list of substrings of the text detected as city entities
        tag: entity_name prepended and appended with '__'
        text_detection_object: TextDetector object used to resolve city names
        bot_message: previous outgoing bot message (string) or None
    """

    # <city> <'-' | to | 2 | and> <city>
    _DEPARTURE_ARRIVAL_REGEX = r'\s(([A-Za-z]+)\s*(\-|to|2|and)\s*([A-Za-z]+))\.?\b'
    # <departure keyword> <city> <arrival keyword> <city>
    _PREPOSITION_DEPARTURE_ARRIVAL_REGEX = (
        r'\s((?:from|frm|departing|depart|leaving|leave)\s*([A-Za-z]+)\s*'
        r'(?:and|to|2|for|fr|arriving|arrive|reaching|reach|rch)\s*([A-Za-z]+))\.?\b'
    )
    # <arrival keyword> <city> <departure keyword> <city>
    _PREPOSITION_ARRIVAL_DEPARTURE_REGEX = (
        r'\s((?:and|to|2|for|fr|arriving|arrive|reaching|reach|rch)\s*([A-Za-z]+)\s*'
        r'(?:from|frm|departing|depart|leaving|leave)\s*([A-Za-z]+))\.?\b'
    )
    # <departure keyword> <city>
    # NOTE(review): 'going to' is listed as a departure keyword - confirm this is intended.
    _DEPARTURE_REGEX = (
        r'\s((from|frm|departing|depart|leaving|leave|origin city\:|departure city\:|going to)'
        r'\s*([A-Za-z]+))\.?\s'
    )
    # <arrival keyword> <city>
    _ARRIVAL_REGEX = (
        r'\s((to|2|for|fr|arriving|arrive|reaching|reach|rch|destination city\:|arrival city\:)'
        r'\s*([A-Za-z]+))\.?\s'
    )
    # Whole remaining text, used when no keyword pattern matched
    _ANY_TEXT_REGEX = r'\s((.+))\.?\b'
    # Phrases in the bot message that tell which kind of city was asked for
    _BOT_DEPARTURE_REGEX = re.compile(
        r'departure city|origin city|origin|traveling from|leaving from|flying from|travelling from')
    _BOT_ARRIVAL_REGEX = re.compile(
        r'traveling to|travelling to|arrival city|arrival|destination city|destination|leaving to|flying to')

    def __init__(self, entity_name):
        """
        Initializes the CityAdvanceDetector object with given entity_name

        Args:
            entity_name: A string by which the detected city substrings would be
                         replaced with on calling detect_entity()
        """
        self.text = ''
        self.tagged_text = ''
        self.processed_text = ''
        self.city = []
        self.original_city_text = []
        self.entity_name = entity_name
        self.text_detection_object = TextDetector(entity_name=entity_name)
        self.bot_message = None
        self.tag = '__' + entity_name + '__'

    def detect_entity(self, text, run_model=True):
        """
        Detects all cities in text together with their departure/arrival role.

        Args:
            text: string to extract city entities from
            run_model: True if the model based detection should be tried first.
                       The regex based detection is used when this is False or
                       when the model detects nothing.

        Returns:
            Tuple of three lists: the city dictionaries (keys 'departure_city'
            and 'arrival_city'), the corresponding original substrings, and the
            model detection types (empty when the regex detection was used).

            Additionally this function assigns the first two lists to the
            self.city and self.original_city_text attributes respectively.
        """
        self.text = (' ' + text + ' ').lower()
        self.processed_text = self.text
        self.tagged_text = self.text

        city_data = []
        if run_model:
            city_data = self._city_model_detection()
        if not run_model or not city_data[0]:
            # The regex path has no detection types; pad to the same 3-tuple shape.
            city_data = self._detect_city() + ([],)

        self.city = city_data[0]
        self.original_city_text = city_data[1]
        return city_data

    def _detect_city(self):
        """
        Runs all regex based detections, from the most to the least specific.

        After each step the substrings detected so far are removed from
        processed_text so that later steps do not match them again.

        Returns:
            Tuple containing two lists, first containing dictionaries with the
            keys 'departure_city' and 'arrival_city', and second containing the
            corresponding original substrings in text
        """
        city_list = []
        original_list = []
        detection_steps = (
            self._detect_departure_arrival_city_prepositions,
            self._detect_departure_arrival_city,
            self._detect_arrival_departure_city,
            self._detect_departure_city,
            self._detect_arrival_city,
            self._detect_any_city,
        )
        for detect in detection_steps:
            city_list, original_list = detect(city_list, original_list)
            self._update_processed_text(original_list)
        return city_list, original_list

    def _match_cities(self, regex, city_list, original_list, departure_group=None, arrival_group=None):
        """
        Shared implementation of the keyword based detections.

        Finds all matches of regex in processed_text and resolves the requested
        groups to city names. A match is recorded only when every requested
        group resolves to a city. Group 0 of each match is recorded as the
        original text.

        Args:
            regex: pattern whose findall() result is a tuple per match
            city_list: list to which city dictionaries are appended
            original_list: list to which the original substrings are appended
            departure_group: index of the group holding the departure city, or None
            arrival_group: index of the group holding the arrival city, or None

        Returns:
            Tuple (city_list, original_list)
        """
        for match in re.findall(regex, self.processed_text.lower()):
            departure_city = None
            arrival_city = None
            if departure_group is not None:
                departure_city = self._get_city_name(match[departure_group])
            if arrival_group is not None:
                arrival_city = self._get_city_name(match[arrival_group])

            if departure_group is not None and not departure_city:
                continue
            if arrival_group is not None and not arrival_city:
                continue

            city_list.append({
                'departure_city': departure_city,
                'arrival_city': arrival_city
            })
            original_list.append(match[0])
        return city_list, original_list

    def _detect_departure_arrival_city(self, city_list, original_list):
        """
        Finds <word><space(s)><'-' or 'to' or '2' or 'and'><space(s)><word> in the text.
        The word on the left is resolved as departure city and the word on the
        right as arrival city; both have to resolve to a city.

        Args:
            city_list: list to store dictionaries of detected cities
            original_list: list to store corresponding original substrings of text

        Returns:
            Tuple containing two lists, first containing dictionaries with the
            keys 'departure_city' and 'arrival_city', and second containing the
            corresponding original substrings in text
        """
        return self._match_cities(self._DEPARTURE_ARRIVAL_REGEX, city_list, original_list,
                                  departure_group=1, arrival_group=3)

    def _detect_departure_arrival_city_prepositions(self, city_list, original_list):
        """
        Finds <departure keyword><word><arrival keyword><word> in the text,
        for example "from <city> to <city>". The first word is resolved as
        departure city and the second as arrival city; both have to resolve.

        Args:
            city_list: list to store dictionaries of detected cities
            original_list: list to store corresponding original substrings of text

        Returns:
            Tuple containing two lists, first containing dictionaries with the
            keys 'departure_city' and 'arrival_city', and second containing the
            corresponding original substrings in text
        """
        return self._match_cities(self._PREPOSITION_DEPARTURE_ARRIVAL_REGEX, city_list, original_list,
                                  departure_group=1, arrival_group=2)

    def _detect_arrival_departure_city(self, city_list, original_list):
        """
        Finds <arrival keyword><word><departure keyword><word> in the text,
        for example "to <city> from <city>". The first word is resolved as
        arrival city and the second as departure city; both have to resolve.

        Args:
            city_list: list to store dictionaries of detected cities
            original_list: list to store corresponding original substrings of text

        Returns:
            Tuple containing two lists, first containing dictionaries with the
            keys 'departure_city' and 'arrival_city', and second containing the
            corresponding original substrings in text
        """
        return self._match_cities(self._PREPOSITION_ARRIVAL_DEPARTURE_REGEX, city_list, original_list,
                                  departure_group=2, arrival_group=1)

    def _detect_departure_city(self, city_list, original_list):
        """
        Finds departure cities by matching keywords like 'from', 'departing',
        'leaving', 'origin city:', 'departure city:' followed by a word, and
        resolving that word as a city.

        Args:
            city_list: list to store dictionaries of detected cities
            original_list: list to store corresponding original substrings of text

        Returns:
            Tuple containing two lists, first containing dictionaries with the
            keys 'departure_city' and 'arrival_city' (arrival is None), and
            second containing the corresponding original substrings in text
        """
        return self._match_cities(self._DEPARTURE_REGEX, city_list, original_list, departure_group=2)

    def _detect_arrival_city(self, city_list, original_list):
        """
        Finds arrival cities by matching keywords like 'to', 'arriving',
        'reaching', 'destination city:', 'arrival city:' followed by a word,
        and resolving that word as a city.

        Args:
            city_list: list to store dictionaries of detected cities
            original_list: list to store corresponding original substrings of text

        Returns:
            Tuple containing two lists, first containing dictionaries with the
            keys 'departure_city' and 'arrival_city' (departure is None), and
            second containing the corresponding original substrings in text
        """
        return self._match_cities(self._ARRIVAL_REGEX, city_list, original_list, arrival_group=2)

    def _detect_any_city(self, city_list, original_list):
        """
        Detects cities in whatever text is left, using bot_message for the role.

        In a chatbot the user might just enter a city name as the answer to the
        previous question. If more than one city is found, the first is taken
        as departure and the last as arrival. If exactly one city is found, it
        is an arrival city when the bot message asks for one (and does not
        match any departure phrase), otherwise it is a departure city.

        Args:
            city_list: list to store dictionaries of detected cities
            original_list: list to store corresponding original substrings of text

        Returns:
            Tuple containing two lists, first containing dictionaries with the
            keys 'departure_city' and 'arrival_city', and second containing the
            corresponding original substrings in text
        """
        # Departure phrases take precedence over arrival phrases.
        arrival_expected = bool(
            self.bot_message
            and self._BOT_DEPARTURE_REGEX.search(self.bot_message) is None
            and self._BOT_ARRIVAL_REGEX.search(self.bot_message) is not None
        )

        for match in re.findall(self._ANY_TEXT_REGEX, self.processed_text.lower()):
            city_selected, original_selected = self._get_city_name_list(match[1])
            if not city_selected:
                continue

            city = {
                'departure_city': None,
                'arrival_city': None
            }
            if len(city_selected) > 1:
                city['departure_city'] = city_selected[0]
                city['arrival_city'] = city_selected[-1]
            elif arrival_expected:
                city['arrival_city'] = city_selected[0]
            else:
                city['departure_city'] = city_selected[0]

            city_list.append(city)
            # Only the first original substring is recorded, even for two cities.
            original_list.append(original_selected[0])
        return city_list, original_list

    def _get_city_name(self, text):
        """Returns the first city detected in text by the TextDetector object

        Args:
            text: text on which detection needs to run

        Return:
            Name of the city, or None if no city was detected
        """
        city_list, _ = self.text_detection_object.detect_entity(text)
        return city_list[0] if city_list else None

    def _get_city_name_list(self, text):
        """Returns all cities detected in text by the TextDetector object

        Args:
            text: text on which detection needs to run

        Return:
            Tuple (list of cities, list of original texts), or (None, None) if
            no city was detected
        """
        city_list, original_list = self.text_detection_object.detect_entity(text)
        if city_list:
            return city_list, original_list
        return None, None

    def _city_model_detection(self):
        """
        Calls get_model_output() of PredictCRF and verifies each returned city
        through the TextDetector object.

        A city that the TextDetector recognises is replaced by the detected
        value and marked MODEL_VERIFIED; otherwise the model value is kept and
        marked MODEL_NOT_VERIFIED. A city is an arrival city only when its
        FROM flag is not 1 and its TO flag is 1, otherwise it is a departure
        city.

        Note: call set_bot_message() before this method to set a bot message.

        For Example:
            self.bot_message = 'Please help me with your departure city?'
            self.text = 'mummbai'
            model_output = [{'city': 'mummbai', 'from': 1, 'to': 0, 'via': 0}]
            returns ([{'departure_city': 'Mumbai', 'arrival_city': None}],
                     ['mummbai'], [MODEL_VERIFIED])

            self.text = 'dehradun' (not known to the TextDetector)
            model_output = [{'city': 'dehradun', 'from': 1, 'to': 0, 'via': 0}]
            returns ([{'departure_city': 'dehradun', 'arrival_city': None}],
                     ['dehradun'], [MODEL_NOT_VERIFIED])

        Returns:
            Tuple of three lists: city dictionaries, original substrings and
            model detection types
        """
        predict_crf = PredictCRF()
        model_output = predict_crf.get_model_output(entity_type=CITY_ENTITY_TYPE,
                                                    bot_message=self.bot_message,
                                                    user_message=self.text)
        city_list, original_list, model_detection_type = [], [], []
        for city_dict in model_output:
            verified_cities, verified_originals = \
                self.text_detection_object.detect_entity(city_dict[CITY_VALUE])
            if verified_cities:
                city_value = verified_cities[0]
                # NOTE(review): every original returned for this value is added while
                # only one city is appended; the lists diverge if more than one comes back.
                originals = verified_originals
                detection_type = MODEL_VERIFIED
            else:
                city_value = city_dict[CITY_VALUE]
                originals = [city_dict[CITY_VALUE]]
                detection_type = MODEL_NOT_VERIFIED

            city = {
                'departure_city': None,
                'arrival_city': None
            }
            if city_dict[FROM] != 1 and city_dict[TO] == 1:
                city['arrival_city'] = city_value
            else:
                city['departure_city'] = city_value

            city_list.append(city)
            original_list.extend(originals)
            model_detection_type.append(detection_type)

        self._update_processed_text(original_list)
        return city_list, original_list, model_detection_type

    def _update_processed_text(self, original_city_strings):
        """
        Replaces detected city substrings with the tag in tagged_text and
        removes them from processed_text.

        Args:
            original_city_strings: list of substrings of original text to be
                                   replaced with tag created from entity_name
        """
        for detected_text in original_city_strings:
            if detected_text:
                self.tagged_text = self.tagged_text.replace(detected_text, self.tag)
                self.processed_text = self.processed_text.replace(detected_text, '')

    def set_bot_message(self, bot_message):
        """
        Sets the object's bot_message attribute

        Args:
            bot_message: string
        """
        self.bot_message = bot_message
class NameDetector(object):
    """
    NameDetector class detects person names from text.

    It first asks a TextDetector for names. If that yields nothing it falls
    back to a few templates ("name is ...", "myself ...") and a POS tagger.

    Attributes:
        text: string to extract entities from
        entity_name: entity name handed to the underlying TextDetector
        names: list kept for detected names; not written by the methods of this class
        tagged_text: set to the input text by detect_entity()
        processed_text: initialised to ''; not written by the methods of this class
        original_name_text: list kept for original texts; not written by the methods of this class
        text_detection_object: the object which is used to call the TextDetector
    """

    # "my name is <name>" / "name <name>"; group 2 holds the name words
    _NAME_IS_PATTERN = re.compile(r"name\s*(is|)\s*([\w\s]+)")
    # "myself <name>"
    _MYSELF_PATTERN = re.compile(r"myself\s+([\w\s]+)")
    # POS tag prefixes that mark the text as a question or as containing a number
    _QUESTION_TAG_PREFIXES = ('WR', 'WP', 'CD')
    # POS tag prefixes (nouns, adjectives) accepted as name words
    _NAME_TAG_PREFIXES = ('NN', 'JJ')

    def __init__(self, entity_name):
        """
        Initializes a NameDetector object with given entity_name

        Args:
            entity_name: A string by which the detected substrings that correspond to text entities would
                         be replaced with on calling detect_entity()
        """
        self.entity_name = entity_name
        self.text = ''
        self.names = []
        self.tagged_text = ''
        self.processed_text = ''
        self.original_name_text = []
        self.text_detection_object = TextDetector(entity_name=entity_name)

    @staticmethod
    def get_format_name(name_list):
        """
        Splits the words of one name into first, middle and last name.

        Args:
            name_list (list): words of the detected name

        Example:
            get_format_name(['yash', 'doshi'])
            >> ([{first_name: "yash", middle_name: None, last_name: "doshi"}], ["yash doshi"])

        Returns:
            Tuple of two lists: a one-element list with the entity value and a
            one-element list with the original text (words joined by a space).
            Both lists are empty when name_list is empty.
        """
        # An empty list can reach here when a template captures only whitespace.
        if not name_list:
            return [], []

        original_text = " ".join(name_list)
        first_name = name_list[0]
        middle_name = None
        last_name = None
        if len(name_list) > 1:
            last_name = name_list[-1]
            # Everything between first and last word; None when there is nothing
            middle_name = " ".join(name_list[1:-1]) or None

        entity_value = {
            FIRST_NAME: first_name,
            MIDDLE_NAME: middle_name,
            LAST_NAME: last_name
        }
        return [entity_value], [original_text]

    def text_detection_name(self):
        """
        Runs the TextDetector on self.text.

        Returns:
            Whatever TextDetector.detect_entity() returns; used by this class
            as a tuple (detected variants, original texts)

        Example : my name is yash doshi
            ([u'dosh', u'yash'], ['doshi', 'yash'])
        """
        return self.text_detection_object.detect_entity(text=self.text)

    def get_name_using_pos_tagger(self, text):
        """
        Detects a name with templates and a POS tagger.

        Nothing is returned if any token is tagged as a wh-word or cardinal
        (tags starting with 'WR', 'WP' or 'CD'). Otherwise the "name is ..."
        template is tried, then the "myself ..." template, and finally, for
        texts of fewer than four tokens, all tokens tagged as noun or adjective
        are taken as the name.

        Args:
            text (string): The text obtained from the user.

        Example:
            text = my name is yash doshi
            >> [{first_name: "yash", middle_name: None, last_name: "doshi"}], ["yash doshi"]

        Returns:
            Tuple (entity_value list, original_text list); both empty when no
            name was found
        """
        pos_tagger_object = POS()
        name_tokens = text.split(' ')
        tagged_names = pos_tagger_object.tag(name_tokens)
        name_is_match = self._NAME_IS_PATTERN.findall(text)
        myself_match = self._MYSELF_PATTERN.findall(text)

        if any(tag.startswith(self._QUESTION_TAG_PREFIXES) for _, tag in tagged_names):
            return [], []

        if name_is_match:
            return self.get_format_name(name_is_match[0][1].split())
        if myself_match:
            return self.get_format_name(myself_match[0].split())
        if len(name_tokens) < 4:
            pos_words = [word for word, tag in tagged_names
                         if tag.startswith(self._NAME_TAG_PREFIXES)]
            if pos_words:
                return self.get_format_name(pos_words)
        return [], []

    def detect_entity(self, text, bot_message=None):
        """
        Takes text as input and returns two lists
        1.entity_value in the form of first, middle and last names
        2.original text.

        Args:
           text(string): the original text
           bot_message(string): previous bot message. When given and it does
               not mention a name, detection is skipped.

        Example:
            text = my name is yash doshi
            >> [{first_name: "yash", middle_name: None, last_name: "doshi"}], ["yash doshi"]

        Returns:
            Tuple (entity_value list, original_text list)
        """
        if bot_message and not self.context_check_botmessage(bot_message):
            return [], []

        self.text = text
        self.tagged_text = self.text
        text_detection_result = self.text_detection_name()
        replaced_text = self.replace_detected_text(text_detection_result)
        entity_value, original_text = self.detect_person_name_entity(replaced_text)

        if not entity_value:
            entity_value, original_text = self.get_name_using_pos_tagger(text)
        return entity_value, original_text

    def replace_detected_text(self, text_detection_result):
        """
        Tokenizes self.text (lower-cased) and wraps every occurrence of a
        detected original text inside a token in underscores: _<name>_

        Args:
            text_detection_result: tuple of two lists
                                   1.The variants detected
                                   2.The original text
        Example:
            self.text = 'my name is yash doshi'
            text_detection_result = ([u'dosh', u'yash'], ['doshi', 'yash'])
            >> ['my', 'name', 'is', '_yash_', '_doshi_']

        Returns:
            list of tokens with the detected names marked
        """
        tokens = Tokenizer().tokenize(self.text.lower())
        for detected_original_text in text_detection_result[1]:
            marked = "_" + detected_original_text + "_"
            tokens = [token.replace(detected_original_text, marked) for token in tokens]
        return tokens

    def detect_person_name_entity(self, replaced_text):
        """
        Groups consecutive marked tokens into names and formats each name into
        first, middle and last name.

        Args:
            replaced_text: list of tokens in which detected names are wrapped
                           as _<name>_

        Example:
            replaced_text = ['my', 'name', 'is', '_yash_', '_doshi_']
            >> [{first_name: "yash", middle_name: None, last_name: "doshi"}], ["yash doshi"]

        Returns:
            Tuple (entity_value list, original_text list)
        """
        name_groups = []
        current_name = []
        for token in replaced_text:
            name_word = token.replace('_', '')
            # A token made only of underscores carries no name and ends the current group.
            if name_word and token.startswith('_') and token.endswith('_'):
                current_name.append(name_word)
            elif current_name:
                name_groups.append(current_name)
                current_name = []
        if current_name:
            name_groups.append(current_name)

        original_text, entity_value = [], []
        for name_words in name_groups:
            name_entity_value, original_text_value = self.get_format_name(name_words)
            original_text.extend(original_text_value)
            entity_value.extend(name_entity_value)
        return entity_value, original_text

    @staticmethod
    def context_check_botmessage(botmessage):
        """
        Checks if previous botmessage contains "name" as a substring

        Args:
            botmessage: it consists of the previous botmessage

        Example: what is your name ?

        Returns:
            True if "name" occurs in botmessage, else False
        """
        return "name" in botmessage
class NameDetector(object): """ NameDetector class detects names from text. This class uses TextDetector to detect the entity values. This class also contains templates and pos_tagger to capture names which are missed by TextDetector. Attributes: text: string to extract entities from entity_name: string by which the detected person_name entities would be replaced with on calling detect_entity() tagged_text: string with city entities replaced with tag defined by entity_name processed_text: string with detected time entities removed text_detection_object: the object which is used to call the TextDetector """ def __init__(self, entity_name, language=ENGLISH_LANG): """ Initializes a NameDetector object with given entity_name Args: entity_name: A string by which the detected substrings that correspond to text entities would be replaced with on calling detect_entity() """ self.entity_name = entity_name self.language = language self.text = '' self.names = [] self.tagged_text = '' self.processed_text = '' self.original_name_text = [] self.tag = '_' + entity_name + '_' self.text_detection_object = TextDetector(entity_name=entity_name) @staticmethod def get_format_name(name_list): """ Takes input as name_list which contains the names detected. It separates the first, middle and last names. It returns two lists: 1.Containing the names separated into first, middle and last name. 2.The original text. 
Args: name_list (list): List of names detected Example: ['yash', 'doshi'] Returns: ({first_name: "yash", middle_name: None, last_name: "modi"}, "yash modi") """ original_text = " ".join(name_list) first_name = name_list[0] middle_name = None last_name = None if len(name_list) > 1: last_name = name_list[-1] middle_name = " ".join(name_list[1:-1]) or None entity_value = { FIRST_NAME: first_name, MIDDLE_NAME: middle_name, LAST_NAME: last_name } return [entity_value], [original_text] def text_detection_name(self): """ Makes a call to TextDetection and return the person_name detected from the elastic search. Returns: Tuple with list of names detected in TextDetection in the form of variants detected and original_text Example : my name is yash doshi ([u'dosh', u'yash'], ['doshi', 'yash']) """ return self.text_detection_object.detect_entity(text=self.text) def get_name_using_pos_tagger(self, text): """ First checks if the text contains cardinals or interrogation. Then passes the text through templates. Then returns words which are nouns or adjectives Args: text (string): The text obtained from the user. 
Example text= My name is yash modi Returns: [{first_name: "yash", middle_name: None, last_name: "modi"}], ["yash modi"] """ entity_value, original_text = [], [] pos_tagger_object = POS() pattern1 = re.compile(r"name\s*(is|)\s*([\w\s]+)") pattern2 = re.compile(r"myself\s+([\w\s]+)") pattern3 = re.compile(r"call\s+me\s+([\w\s]+)") name_tokens = text.split(' ') tagged_names = pos_tagger_object.tag(name_tokens) pattern1_match = pattern1.findall(text) pattern2_match = pattern2.findall(text) pattern3_match = pattern3.findall(text) is_question = [ word[0] for word in tagged_names if word[1].startswith('WR') or word[1].startswith('WP') or word[1].startswith('CD') ] if is_question: return entity_value, original_text if pattern1_match: entity_value, original_text = self.get_format_name( pattern1_match[0][1].split()) elif pattern2_match: entity_value, original_text = self.get_format_name( pattern2_match[0].split()) elif pattern3_match: entity_value, original_text = self.get_format_name( pattern3_match[0].split()) elif len(name_tokens) < 4: pos_words = [ word[0] for word in tagged_names if word[1].startswith('NN') or word[1].startswith('JJ') ] if pos_words: entity_value, original_text = self.get_format_name(pos_words) return entity_value, original_text def detect_entity(self, text, bot_message=None): """ Takes text as input and returns two lists 1.entity_value in the form of first, middle and last names 2.original text. 
Args: text(string): the original text bot_message(string): previous bot message Example: text=my name is yash doshi Returns: [{first_name: "yash", middle_name: None, last_name: "modi"}], [ yash modi"] """ if bot_message: if not self.context_check_botmessage(bot_message): return [], [] self.text = text self.tagged_text = self.text entity_value, original_text = ([], []) if self.language == ENGLISH_LANG: entity_value, original_text = self.detect_english_name() elif self.language == HINDI_LANG: entity_value, original_text = self.detect_hindi_name() self._update_processed_text(person_name_list=original_text) return entity_value, original_text def detect_english_name(self): """ Detects English person names in self.text. Names returned by self.text_detection_name() are marked in the tokenized text by replace_detected_text() and grouped by detect_person_name_entity(); when that yields nothing, get_name_using_pos_tagger(self.text) is used as a fallback. Returns: detect_text_lists (tuple): two dimensional tuple 1. entity_value (list): representing the entity values of names 2. original_text (list): representing the original text detected Example: self.text = "my name is yash doshi" detect_english_name() >>[{first_name: "yash", middle_name: None, last_name: "doshi"}], ["yash doshi"] """ text_detection_result = self.text_detection_name() replaced_text = self.replace_detected_text(text_detection_result, text=self.text) entity_value, original_text = self.detect_person_name_entity( replaced_text) if not entity_value: entity_value, original_text = self.get_name_using_pos_tagger( self.text) return entity_value, original_text def detect_hindi_name(self): """ Detects Hindi person names in self.text. Returns two empty lists right away when the text contains an abusive phrase or a question word. Otherwise emojis and every character that is neither in the range U+0900-U+097F nor whitespace are removed, names are looked up with get_hindi_names_from_regex() and, when that yields nothing, with get_hindi_names_without_regex(). Returns: detect_text_lists (tuple): two dimensional tuple 1. entity_value (list): representing the entity values of names 2. 
original_text (list): representing the original text detected Examples: self.text = u'प्रतिक श्रीदत्त जयराओ' detect_hindi_name() >> [{first_name: u"प्रतिक", middle_name: u"श्रीदत्त", last_name: u"जयराओ"}], [ u'प्रतिक श्रीदत्त जयराओ'] """ if self.detect_abusive_phrases_hindi( text=self.text) or self.detect_question_hindi(text=self.text): return [], [] text = self.remove_emojis(text=self.text) regex = re.compile(ur'[^\u0900-\u097F\s]+', re.U) text = regex.sub(string=text, repl='') regex_detection_result = self.get_hindi_names_from_regex(text=text) replaced_text = self.replace_detected_text(regex_detection_result, text=text) entity_value, original_text = self.detect_person_name_entity( replaced_text) if not entity_value: entity_value, original_text = self.get_hindi_names_without_regex( text=text) return entity_value, original_text def replace_detected_text(self, text_detection_result, text): """ Tokenizes the lower-cased text and wraps every detected name in underscores (_<name>_) Args: text_detection_result: tuple of detected names from TextDetection consisting of two lists 1.The variants detected 2.The original text ([u'dosh', u'yash'], ['doshi', 'yash']); only the second list is used here text: text to tokenize, with nltk_tokenizer for English and a whitespace split for Hindi Example: text = 'my name is yash doshi' text_detection_result= ([u'dosh', u'yash'], ['doshi', 'yash']) Returns: list of tokens in which each occurrence of a detected original text is replaced (plain substring replace inside every token) by the same text wrapped in underscores, e.g. ['my', 'name', 'is', '_yash_', '_doshi_'] """ replaced_text = [] if self.language == ENGLISH_LANG: replaced_text = nltk_tokenizer.tokenize(text.lower()) elif self.language == HINDI_LANG: replaced_text = text.lower().strip().split() for detected_original_text in (text_detection_result[1]): for j in range(len(replaced_text)): replaced_text[j] = replaced_text[j].replace( detected_original_text, "_" + detected_original_text + "_") return replaced_text def detect_person_name_entity(self, replaced_text): """ Separates the detected names into first, middle and last names. 
Consecutive tokens that start and end with an underscore are grouped into one name and formatted with get_format_name(). Returns in form of two lists entity_value and original_text Args: replaced_text: list of tokens in which names detected from TextDetector are replaced by _<name>_ Example: replaced_text = ['my', 'name', 'is', '_yash_', '_modi_'] Returns: [{first_name: "yash", middle_name: None, last_name: "modi"}], [ "yash modi"] """ original_text, entity_value = [], [] name_list = [] name_holder = [] for each in replaced_text: if each.startswith('_') and each.endswith('_'): name_holder.append(each.replace('_', '')) else: if name_holder: name_list.append(name_holder) name_holder = [] if name_holder: name_list.append(name_holder) for name in name_list: name_entity_value, original_text_value = self.get_format_name(name) original_text.extend(original_text_value) entity_value.extend(name_entity_value) return entity_value, original_text def context_check_botmessage(self, botmessage): """ Checks if previous botmessage contains name as a keyword or not. Punctuation is stripped and the message is lower-cased, then every entry of NAME_VARIATIONS is looked for as a space-delimited word or phrase. Args: botmessage: it consists of the previous botmessage Example: what is your name ? Returns: True (assuming 'name' is one of NAME_VARIATIONS); False when no variation is present """ regex_pattern = re.compile(r'[\|\,+\:\?\!\"\(\)!\'\.\%\[\]]+') botmessage = regex_pattern.sub(r'', botmessage) botmessage = " " + botmessage.lower().strip() + " " for variant in NAME_VARIATIONS: if " " + variant + " " in botmessage: return True return False def get_hindi_names_from_regex(self, text): """ This method is used to detect hindi names which obey the regexes. The phrases returned by get_hindi_text_from_regex() are split on whitespace into individual words. Args: text (str): text from which hindi names obeying the regex have to be extracted Returns: detect_text_lists (tuple): two dimensional tuple holding the same list twice 1. text_list (list): representing the detected text 2. 
text_list (list): representing the original text Examples: text = u'मेरा नाम प्रतिक श्रीदत्त जयराओ है' get_hindi_names_from_regex(text=text) >>([u'प्रतिक', u'श्रीदत्त', u'जयराओ'], [u'प्रतिक', u'श्रीदत्त', u'जयराओ']) """ text_list = self.get_hindi_text_from_regex(text=text) detected_names = [] if text_list: for each in text_list: if each: detected_names.extend(each.split()) text_list = detected_names return text_list, text_list def get_hindi_names_without_regex(self, text): """ This method is used to get detect hindi names without any regex pattern (This method is called only if detection from regex patterns fails). Hindi stop words are removed first; if more than 4 words remain nothing is treated as a name. Args: text (str): the text from which hindi text has to be detected Returns: person_name (tuple): two dimensional tuple 1. entity_value (list): representing the entity values of names 2. original_text (list): representing the original text detected Example: text = u'प्रतिक श्रीदत्त जयराओ' get_hindi_names_without_regex(text=text) >> [{first_name: u"प्रतिक", middle_name: u"श्रीदत्त", last_name: u"जयराओ"}], [ u'प्रतिक श्रीदत्त जयराओ'] """ text = self.replace_stopwords_hindi(text) original_text_list = text.strip().split() if len(original_text_list) > 4: original_text_list = [] replaced_text = self.replace_detected_text( (original_text_list, original_text_list), text=text) return self.detect_person_name_entity(replaced_text=replaced_text) def get_hindi_text_from_regex(self, text): """ This method is used to detect hindi names using regexes from the given text. The regexes are tried in order and the matches of the first regex that yields a non-empty result are returned with hindi stop words removed. Args: text (str): text from which hindi names which follow the regex pattern have to be extracted Returns: pattern_match (list): list consisting of detected phrases, or None when no regex matches Examples: text = u'मेरा नाम प्रतिक श्रीदत्त जयराओ है' get_hindi_text_from_regex(text=text) >>[u'प्रतिक श्रीदत्त जयराओ'] """ regex_list = [ ur"(?:मुझे|हमें|मुझको|हमको|हमे)\s+(?:लोग)\s+([\u0900-\u097F\s]+)" ur"\s+(?:नाम\sसे)\s+(?:कहते|बुलाते|बुलाओ)", ur"(?:नाम|मैं|हम|मै)\s+([\u0900-\u097F\s]+)", 
ur"(?:मुझे|हमें|मुझको|हमको|हमे)\s+([\u0900-\u097F\s]+)(?:कहते|बुलाते|बुलाओ)", ur"\s*([\u0900-\u097F\s]+)(?:मुझे|मैं|मै)(?:कहते|बुलाते|बुलाओ)?" ] for regex in regex_list: regex_ = re.compile(regex, re.U) pattern_match = regex_.findall(text) pattern_match = [ self.replace_stopwords_hindi(x) for x in pattern_match if x ] if pattern_match: if pattern_match[0]: return pattern_match return None def replace_stopwords_hindi(self, text): """ This method is used to remove hindi stop words from the text. The text is split on single spaces and every word found in HINDI_STOPWORDS is dropped. Args: text (str): The text from which hindi stop words have to be removed Returns: clean_text (str): text from which hindi stop words have been removed ("" when nothing is left) """ split_list = text.split(" ") split_list = [ word for word in split_list if word not in HINDI_STOPWORDS ] if split_list: return " ".join(split_list) return "" def detect_abusive_phrases_hindi(self, text): """ This method is used to check for hindi abuses in the sentence. Each entry of HINDI_BADWORDS is searched as a space-delimited phrase. Args: text (str): text in which abuses have to be checked Returns: status (bool): returns if the text consists of abuses """ text = ' ' + text + ' ' for abuse in HINDI_BADWORDS: if ' ' + abuse + ' ' in text: return True return False def remove_emojis(self, text): """ This method is used to remove emojis from the given text. The character class is built from the values of EMOJI_RANGES. Args: text (str): the text from which the emojis have to be removed Returns: text (str): text with emojis replaced with '' """ emoji_pattern = re.compile( ur'[{0}]+'.format(''.join(EMOJI_RANGES.values())), re.UNICODE) text = emoji_pattern.sub(repl='', string=text) return text def detect_question_hindi(self, text): """ This method is used to detect if the given text has a hindi question present in it, i.e. whether any whitespace separated word is in HINDI_QUESTIONWORDS Args: text (str): the text for which the question check has to be run Returns: status (bool): returns if the text has a question in it """ for word in text.split(): if word in HINDI_QUESTIONWORDS: return True return False def _update_processed_text(self, person_name_list): """ Replaces detected person names with tag generated from entity_name used to 
initialize the object with A final string with all person names replaced will be stored in object's tagged_text attribute A string with all person names removed will be stored in object's processed_text attribute Args: person_name_list (list): list of substrings of original text to be replaced with tag created from entity_name """ for detected_text in person_name_list: self.tagged_text = self.tagged_text.replace( detected_text, self.tag) self.processed_text = self.processed_text.replace( detected_text, '')
def get_text(message, entity_name, structured_value, fallback_value, bot_message):
    """Detect textual entities with TextDetector and wrap them in the standard output format.

    The lookup order is:

    1. If ``structured_value`` is given, detection runs on it only (the message is
       not consulted). A successful detection is reported with
       FROM_STRUCTURE_VALUE_VERIFIED; otherwise the structured value itself is
       returned as-is with FROM_STRUCTURE_VALUE_NOT_VERIFIED.
    2. Otherwise detection runs on ``message`` and hits are reported with
       FROM_MESSAGE.
    3. If nothing was found in the message and ``fallback_value`` is given, the
       fallback value is returned with FROM_FALLBACK_VALUE.

    Args:
        message: text of the user message to run detection on
        entity_name: name of the entity, passed to TextDetector
        structured_value: value obtained from a structured source (may be None)
        fallback_value: value to return when nothing is detected (may be None)
        bot_message: previous message of the bot; not used by this function

    Returns:
        The result of output_entity_dict_list() / output_entity_dict_value() for the
        first rule above that applies, or None when no rule applies.

    For Example:

        message = 'i want to order chinese from mainland china and pizza from domminos'
        entity_name = 'restaurant'
        structured_value = None
        fallback_value = None
        bot_message = None
        output = get_text(message=message, entity_name=entity_name, structured_value=structured_value,
                          fallback_value=fallback_value, bot_message=bot_message)
        print output

            >> [{'detection': 'message', 'original_text': 'mainland china',
                 'entity_value': {'value': u'Mainland China'}},
                {'detection': 'message', 'original_text': 'domminos',
                 'entity_value': {'value': u"Domino's Pizza"}}]

        message = 'i wanted to watch movie'
        entity_name = 'movie'
        structured_value = 'inferno'
        fallback_value = None
        bot_message = None
        output = get_text(message=message, entity_name=entity_name, structured_value=structured_value,
                          fallback_value=fallback_value, bot_message=bot_message)
        print output

            >> [{'detection': 'structure_value_verified', 'original_text': 'inferno',
                 'entity_value': {'value': u'Inferno'}}]

        With message = 'i wanted to watch inferno' and a structured_value that the
        detector cannot verify (e.g. 'delhi' for entity 'movie'), the structured value
        is returned unverified; 'inferno' in the message is NOT picked up.
    """
    detector = TextDetector(entity_name=entity_name)

    # A structured value always wins over the free-text message.
    if structured_value:
        detected_values, detected_originals = detector.detect_entity(structured_value)
        if not detected_values:
            return output_entity_dict_value(structured_value, structured_value,
                                            FROM_STRUCTURE_VALUE_NOT_VERIFIED)
        return output_entity_dict_list(detected_values, detected_originals,
                                       FROM_STRUCTURE_VALUE_VERIFIED)

    detected_values, detected_originals = detector.detect_entity(message)
    if detected_values:
        return output_entity_dict_list(detected_values, detected_originals, FROM_MESSAGE)
    if fallback_value:
        return output_entity_dict_value(fallback_value, fallback_value, FROM_FALLBACK_VALUE)
    return None
class BudgetDetector(BaseDetector):
    """Detects budget from the text and tags them.

    Detects the budget from the text and replaces them by entity_name.
    This detection logic first checks for budget using regular expressions (ranges, lower bounds,
    upper bounds and plain amounts, optionally carrying a unit such as k, lakh or crore). Only when
    none of them matches, the TextDetector class is used to extract data in textual format
    (i.e. Hundred, Thousand, etc).

    Every detected budget is a dict with the keys max_budget, min_budget and type, where type is
    BUDGET_TYPE_NORMAL (detected through regex) or BUDGET_TYPE_TEXT (detected through text detection)

    For Example:

        budget_detection = BudgetDetector('budget')
        message = "shirts between 2000 to 3000"
        budget_list, original_text = budget_detection.detect_entity(message)
        tagged_text = budget_detection.tagged_text
        print budget_list, ' -- ', original_text
        print 'Tagged text: ', tagged_text

         >> [{'max_budget': 3000, 'type': 'normal_budget', 'min_budget': 2000}] -- ['2000 to 3000']
            Tagged text: shirts between __budget__

        budget_detection = BudgetDetector('budget')
        message = "tshirts less than 2k"
        budget_list, original_text = budget_detection.detect_entity(message)
        tagged_text = budget_detection.tagged_text
        print budget_list, ' -- ', original_text
        print 'Tagged text: ', tagged_text

         >> [{'max_budget': 2000, 'type': 'normal_budget', 'min_budget': 0}] -- ['less than 2k']
            Tagged text: tshirts __budget__

        budget_detection = BudgetDetector('budget')
        message = "tshirts greater than 2k"
        budget_list, original_text = budget_detection.detect_entity(message)
        tagged_text = budget_detection.tagged_text
        print budget_list, ' -- ', original_text
        print 'Tagged text: ', tagged_text

         >> [{'max_budget': 0, 'type': 'normal_budget', 'min_budget': 2000}] -- ['greater than 2k']
            Tagged text: tshirts __budget__

        budget_detection = BudgetDetector('budget')
        message = "jeans of Rs. 1000"
        budget_list, original_text = budget_detection.detect_entity(message)
        tagged_text = budget_detection.tagged_text
        print budget_list, ' -- ', original_text
        print 'Tagged text: ', tagged_text

         >> [{'max_budget': 1000, 'type': 'normal_budget', 'min_budget': 0}] -- ['rs. 1000']
            Tagged text: ' jeans of __budget__ '

    Attributes:
        min_digit: minimum number of digits a budget can have, by default it is set to 2. So, the NER
                   will detect a number as budget if it is greater than 9
        max_digit: maximum number of digits a budget can have, by default it is set to 5. So, the NER
                   will detect a number as budget if it is at most 99999
        text: string to extract entities from
        entity_name: string by which the detected budget would be replaced with on calling detect_entity()
        tagged_text: string with budgets replaced with tag defined by entity name
        processed_text: string with detected budgets removed
        budget: list of budgets detected
        original_budget_text: list to store substrings of the text detected as budget
        tag: entity_name prepended and appended with '__'
        unit_present_list: substrings whose presence in an amount marks it as carrying a unit
        regex_object: RegexReplace object used to expand units,
                      i.e. if text contains 2k then it will be substituted as 2000
        text_detection_object: text detection object to detect text in Textual format

    Note:
        text and tagged_text will have a extra space prepended and appended after calling detect_entity(text)
    """

    def __init__(self, entity_name, source_language_script=ENGLISH_LANG, translation_enabled=False):
        """Initializes a BudgetDetector object

        Args:
            entity_name: A string by which the detected budget would be replaced with on calling
                         detect_entity()
            source_language_script: language code forwarded to BaseDetector.__init__
            translation_enabled: flag forwarded to BaseDetector.__init__
        """
        # assigning values to superclass attributes
        self._supported_languages = [ENGLISH_LANG]
        super(BudgetDetector, self).__init__(source_language_script, translation_enabled)
        self.min_digit = 2
        self.max_digit = 5
        self.entity_name = entity_name
        self.text = ''
        self.tagged_text = ''
        self.processed_text = ''
        self.budget = []
        self.original_budget_text = []
        self.unit_present_list = ['k', 'l', 'm', 'c', 'h', 'th']
        # (pattern, multiplier) pairs handed to RegexReplace for unit expansion
        regx_for_units = [
            (r'([\d,.]+)\s*k', 1000),
            (r'([\d,.]+)\s*h', 1000),
            (r'([\d,.]+)\s*th', 1000),
            (r'([\d,.]+)\s*l', 100000),
            (r'([\d,.]+)\s*lacs?', 100000),
            (r'([\d,.]+)\s*lakh?', 100000),
            (r'([\d,.]+)\s*lakhs?', 100000),
            (r'([\d,.]+)\s*m', 1000000),
            (r'([\d,.]+)\s*million', 1000000),
            (r'([\d,.]+)\s*mill?', 1000000),
            (r'([\d,.]+)\s*c', 10000000),
            (r'([\d,.]+)\s*cro?', 10000000),
            (r'([\d,.]+)\s*crore?', 10000000),
            (r'([\d,.]+)\s*crores?', 10000000),
        ]
        self.regex_object = RegexReplace(regx_for_units)
        self.tag = '__' + self.entity_name + '__'
        self.text_detection_object = TextDetector(entity_name=entity_name)

    def detect_entity(self, text):
        """Detects budget in the text string

        Args:
            text: string to extract entities from

        Returns:
            A tuple of two lists with first list containing the detected budgets and second list containing their
            corresponding substrings in the original message.

            For example:

                ([{'max_budget': 1000, 'type': 'normal_budget', 'min_budget': 0}], ['rs. 1000'])

            Additionally this function assigns these lists to self.budget and self.original_budget_text attributes
            respectively.
        """
        # the detection regexes expect whitespace on both sides of a budget
        self.text = ' ' + text + ' '
        self.processed_text = self.text.lower()
        self.tagged_text = self.text
        budget_data = self._detect_budget()
        self.budget = budget_data[0]
        self.original_budget_text = budget_data[1]
        return budget_data

    @property
    def supported_languages(self):
        """List of language codes this detector supports."""
        return self._supported_languages

    def _within_digit_limits(self, amount):
        """Tells whether the number of digits of amount lies in [min_digit, max_digit]."""
        return self.min_digit <= len(str(amount)) <= self.max_digit

    def _parse_amount(self, amount_text):
        """Converts an amount captured by the min/max regexes to an int.

        Commas are removed; when any entry of unit_present_list occurs in the text the unit is expanded with
        self.regex_object.unit_substitute() first.

        Raises:
            ValueError: if the resulting text is not an integer literal
        """
        has_unit = any(unit in amount_text for unit in self.unit_present_list)
        comma_removed = amount_text.replace(',', '')
        if has_unit:
            return int(self.regex_object.unit_substitute(comma_removed))
        return int(comma_removed)

    def _detect_budget(self):
        """Detects budget in the self.text

        The regex detectors run in the order range, minimum, maximum, any amount; after each of them the
        substrings found so far are tagged in tagged_text and removed from processed_text so that later
        detectors do not match them again. Text detection is only used when no regex matched.

        Returns:
            A tuple of two lists with first list containing the detected budgets and second list containing their
            corresponding substrings in the original message.

            For example:
                input: "shirts between 2000 to 3000"
                output: ([{'max_budget': 3000, 'type': 'normal_budget', 'min_budget': 2000}], ['2000 to 3000'])
        """
        budget_list = []
        original_list = []
        budget_list, original_list = self._detect_min_max_budget(budget_list, original_list)
        self._update_processed_text(original_list)
        budget_list, original_list = self._detect_min_budget(budget_list, original_list)
        self._update_processed_text(original_list)
        budget_list, original_list = self._detect_max_budget(budget_list, original_list)
        self._update_processed_text(original_list)
        budget_list, original_list = self._detect_any_budget(budget_list, original_list)
        self._update_processed_text(original_list)
        if not budget_list:
            budget_list, original_list = self._detect_text_budget(budget_list, original_list)
            self._update_processed_text(original_list)

        return budget_list, original_list

    def _detect_min_budget(self, budget_list=None, original_list=None):
        """Detects minimum budget from text using regex
        This is a function which will be called when we want to detect the budget using regex from the text

        Args:
            budget_list: list to append detected budgets to (a new list if None)
            original_list: list to append the matched substrings to (a new list if None)

        Returns:
            A tuple of two lists with first list containing the detected budget and second list containing their
            corresponding substrings in the original message.

            For example:
                input: "tshirts greater than 2k"
                output: ([{'max_budget': 0, 'type': 'normal_budget', 'min_budget': 2000}], ['greater than 2k'])
        """
        if budget_list is None:
            budget_list = []
        if original_list is None:
            original_list = []
        patterns = re.findall(
            r'(\s(above|more? than|more?|greater than|greater|abv|abov|more? den|\>\s*\=?)\s+'
            r'(rs.|rs|rupees|rupee)*\s*([\d.,]+\s*[klmct]?[a-z]*|[\d.,]+\s*[klmct]?[a-z]*)\s*'
            r'(rs.|rs|rupees|rupee|\.)?\s)',
            self.processed_text.lower())
        for pattern in patterns:
            original = pattern[0].strip()
            budget = {
                'min_budget': 0,
                'max_budget': 0,
                'type': BUDGET_TYPE_NORMAL
            }
            amount = self._parse_amount(pattern[3])
            if self._within_digit_limits(amount):
                budget['min_budget'] = amount
                budget_list.append(budget)
                original_list.append(original)

        return budget_list, original_list

    def _detect_max_budget(self, budget_list=None, original_list=None):
        """Detects maximum budget from text using regex
        This is a function which will be called when we want to detect the budget using regex from the text

        Args:
            budget_list: list to append detected budgets to (a new list if None)
            original_list: list to append the matched substrings to (a new list if None)

        Returns:
            A tuple of two lists with first list containing the detected budget and second list containing their
            corresponding substrings in the original message.

            For example:
                input: "tshirts less than 2k"
                output: ([{'max_budget': 2000, 'type': 'normal_budget', 'min_budget': 0}], ['less than 2k'])
        """
        if budget_list is None:
            budget_list = []
        if original_list is None:
            original_list = []

        patterns = re.findall(
            r'(\s(max|upto|o?nly|around|below|less than|less|less den|\<\s*\=?)\s+(rs.|rs|rupees|rupee)'
            r'?\s*([\d.,]+\s*[klmct]?[a-z]*|[\d.,]+\s*[klmct]?[a-z]*)\s*(rs.|rs|rupees|rupee|\.)?\s)',
            self.processed_text.lower())
        for pattern in patterns:
            original = pattern[0].strip()
            budget = {
                'min_budget': 0,
                'max_budget': 0,
                'type': BUDGET_TYPE_NORMAL
            }
            amount = self._parse_amount(pattern[3])
            if self._within_digit_limits(amount):
                budget['max_budget'] = amount
                budget_list.append(budget)
                original_list.append(original)

        return budget_list, original_list

    def _detect_min_max_budget(self, budget_list=None, original_list=None):
        """Detects both minimum and maximum budget from text using regex
        This is a function which will be called when we want to detect the budget using regex from the text

        When only the upper bound carries a unit (e.g. "2 to 3k") the unit is also applied to the lower
        bound. If that makes the lower bound larger than the upper bound, the lower bound as written in the
        text is used instead.

        Args:
            budget_list: list to append detected budgets to (a new list if None)
            original_list: list to append the matched substrings to (a new list if None)

        Returns:
            A tuple of two lists with first list containing the detected budget and second list containing their
            corresponding substrings in the original message.

            For example:
                input: shirts between 2000 to 3000
                output: ([{'max_budget': 3000, 'type': 'normal_budget', 'min_budget': 2000}], ['2000 to 3000'])
        """
        if budget_list is None:
            budget_list = []
        if original_list is None:
            original_list = []

        # findall tuple layout: 0 whole match, 2/3 lower bound alternatives, 6/7 upper bound alternatives
        patterns = re.findall(
            r'(\s(([\d,.]+\s*[klmct]?[a-z]*)|([\d,.]+\s*[klmct]?[a-z]*))\s*(\-|to|and)\s*'
            r'(([\d,.]+\s*[klmct]?[a-z]*)|([\d,.]+\s*[klmct]?[a-z]*))\.?\s)',
            self.processed_text.lower())
        for pattern in patterns:
            pattern = list(pattern)
            budget = {
                'min_budget': 0,
                'max_budget': 0,
                'type': BUDGET_TYPE_NORMAL
            }

            flag_contains_k = False
            max_budget = 0
            min_budget = 0
            _min_budget = 0  # lower bound exactly as written, before borrowing the upper bound's unit
            if pattern[6]:
                flag_contains_k = any(unit in pattern[6] for unit in self.unit_present_list)
                comma_removed_unit_text = pattern[6].replace(',', '')
                max_budget = int(self.regex_object.unit_substitute(comma_removed_unit_text))
            elif pattern[7]:
                max_budget = int(pattern[7].replace(',', ''))

            if pattern[2]:
                _comma_removed_unit_text = pattern[2].replace(',', '')
                _min_budget = int(self.regex_object.unit_substitute(_comma_removed_unit_text))
                if flag_contains_k:
                    # borrow the first unit found in the upper bound
                    for u in self.unit_present_list:
                        if u in pattern[6]:
                            pattern[2] = str(pattern[2]).strip() + u
                            break
                comma_removed_unit_text = pattern[2].replace(',', '')
                min_budget = int(self.regex_object.unit_substitute(comma_removed_unit_text))
            elif pattern[3]:
                min_budget = int(pattern[3].replace(',', ''))

            if min_budget > max_budget:
                min_budget = _min_budget
            if not self._within_digit_limits(min_budget):
                min_budget = 0
            if not self._within_digit_limits(max_budget):
                max_budget = 0
            if min_budget != 0 and max_budget != 0 and min_budget <= max_budget:
                budget['min_budget'] = min_budget
                budget['max_budget'] = max_budget
                budget_list.append(budget)
                original_list.append(pattern[0].strip())

        return budget_list, original_list

    def _detect_any_budget(self, budget_list=None, original_list=None):
        """Detects a budget from text using regex
        This is a function which will be called when we want to detect the budget using regex from the text.
        Every amount found is reported as max_budget.

        Args:
            budget_list: list to append detected budgets to (a new list if None)
            original_list: list to append the matched substrings to (a new list if None)

        Returns:
            A tuple of two lists with first list containing the detected budget and second list containing their
            corresponding substrings in the original message.

            For example:
                input: jeans of rs. 1000
                output: ([{'max_budget': 1000, 'type': 'normal_budget', 'min_budget': 0}], ['rs. 1000'])
        """
        if budget_list is None:
            budget_list = []
        if original_list is None:
            original_list = []

        text = self.processed_text.lower().strip()

        units_patterns = [
            r'k|hazaa?r|haja?ar|thousand',
            r'l|lacs?|lakh?|lakhs?',
            r'm|million|mill?',
            r'cro?|cror?|crore?|crores?'
        ]
        units_order = [1e3, 1e5, 1e6, 1e7]
        full = re.compile(
            r'((rs.|rs|rupees|rupee)?\s*((\d+((\,|\.)\d+)+)|(0|[1-9]\d*)?(\.\d+)?(?<=\d))'
            r'\s*(' + r'|'.join(units_patterns) + r')?\s*(rs.|rs|rupees|rupee)?)\b')
        # A list (not a lazy map object) because it is iterated once per match below.
        # NOTE: '^' binds to the first alternative of each pattern only.
        unit_matchers = [re.compile('^' + units_pattern) for units_pattern in units_patterns]
        matches = full.findall(text)
        for match in matches:
            original = match[0].strip()
            budget = {
                'min_budget': 0,
                'max_budget': 0,
                'type': BUDGET_TYPE_NORMAL
            }
            amount, unit = match[2], match[-2]
            if not amount:
                continue
            amount = amount.replace(',', '')
            # keep only the last '.' as decimal point, earlier ones are treated as separators
            _amount = amount.split('.')
            if len(_amount) > 1:
                amount = ''.join(_amount[:-1]) + '.' + _amount[-1]

            amount = float(amount)
            for i, unit_matcher in enumerate(unit_matchers):
                if unit_matcher.findall(unit):
                    amount = int(amount * units_order[i])
                    break
            amount = int(amount)

            if self._within_digit_limits(amount):
                budget['max_budget'] = amount
                budget_list.append(budget)
                original_list.append(original)

        return budget_list, original_list

    def _detect_text_budget(self, budget_list=None, original_list=None):
        """Detects budget from text using text detection logic i.e.TextDetector
        This is a function which will be called when we want to detect the budget using text

        tagged_text and processed_text are overwritten with the ones produced by the text detection object.

        Args:
            budget_list: list to append detected budgets to (a new list if None)
            original_list: list to append the matched substrings to (a new list if None)

        Returns:
            A tuple of two lists with first list containing one budget (min_budget and max_budget 0, type
            BUDGET_TYPE_TEXT) per detection and second list containing their corresponding substrings in
            the original message.
        """
        if budget_list is None:
            budget_list = []
        if original_list is None:
            original_list = []

        _, original_text_list = self.text_detection_object.detect_entity(self.text)
        self.tagged_text = self.text_detection_object.tagged_text
        self.processed_text = self.text_detection_object.processed_text
        for original_text in original_text_list:
            budget_list.append({
                'min_budget': 0,
                'max_budget': 0,
                'type': BUDGET_TYPE_TEXT
            })
            # keep original_list parallel to budget_list
            original_list.append(original_text)

        return budget_list, original_list

    def _update_processed_text(self, original_budget_strings):
        """
        Replaces detected budgets with self.tag generated from entity_name used to initialize the object with

        A final string with all budgets replaced will be stored in self.tagged_text attribute
        A string with all budgets removed will be stored in self.processed_text attribute

        Args:
            original_budget_strings: list of substrings of original text to be replaced with self.tag;
                                     empty or None entries are ignored
        """
        for detected_text in original_budget_strings:
            if detected_text:
                self.tagged_text = self.tagged_text.replace(detected_text, self.tag)
                self.processed_text = self.processed_text.replace(detected_text, '')

    def set_min_max_digits(self, min_digit, max_digit):
        """
        Update min max digit

        Args:
            min_digit (int): minimum number of digits a detected budget may have
            max_digit (int): maximum number of digits a detected budget may have
        """
        self.min_digit = min_digit
        self.max_digit = max_digit
class BudgetDetector(object):
    """Detects budget from the text and tags them.

    Detects the budget from the text and replaces them by entity_name.
    This detection logic first checks for budget using regular expressions and also uses TextDetector class to
    extract data in textual format (i.e. Hundred, Thousand, etc).

    Every detected budget is a dict with the keys max_budget, min_budget and type, where type is
    BUDGET_TYPE_NORMAL (detected through regex) or BUDGET_TYPE_TEXT (detected through text detection)

    For Example:

        budget_detection = BudgetDetector('budget')
        message = "shirts between 2000 to 3000"
        budget_list, original_text = budget_detection.detect_entity(message)
        tagged_text = budget_detection.tagged_text
        print budget_list, ' -- ', original_text
        print 'Tagged text: ', tagged_text

         >> [{'max_budget': 3000, 'type': 'normal_budget', 'min_budget': 2000}] -- ['2000 to 3000']
            Tagged text: shirts between __budget__

        budget_detection = BudgetDetector('budget')
        message = "tshirts less than 2k"
        budget_list, original_text = budget_detection.detect_entity(message)
        tagged_text = budget_detection.tagged_text
        print budget_list, ' -- ', original_text
        print 'Tagged text: ', tagged_text

         >> [{'max_budget': 2000, 'type': 'normal_budget', 'min_budget': 0}] -- ['less than 2k']
            Tagged text: tshirts __budget__

        budget_detection = BudgetDetector('budget')
        message = "tshirts greater than 2k"
        budget_list, original_text = budget_detection.detect_entity(message)
        tagged_text = budget_detection.tagged_text
        print budget_list, ' -- ', original_text
        print 'Tagged text: ', tagged_text

         >> [{'max_budget': 0, 'type': 'normal_budget', 'min_budget': 2000}] -- ['greater than 2k']
            Tagged text: tshirts __budget__

        budget_detection = BudgetDetector('budget')
        message = "jeans of Rs. 1000"
        budget_list, original_text = budget_detection.detect_entity(message)
        tagged_text = budget_detection.tagged_text
        print budget_list, ' -- ', original_text
        print 'Tagged text: ', tagged_text

         >> [{'max_budget': 1000, 'type': 'normal_budget', 'min_budget': 0}] -- ['rs. 1000']
            Tagged text: ' jeans of __budget__ '

    Attributes:
        min_digit: minimum number of digits a budget can have, by default it is set to 2. So, the NER
                   will detect a number as budget if it is greater than 9
        max_digit: maximum number of digits a budget can have, by default it is set to 5. So, the NER
                   will detect a number as budget if it is at most 99999
        text: string to extract entities from
        entity_name: string by which the detected budget would be replaced with on calling detect_entity()
        tagged_text: string with budgets replaced with tag defined by entity name
        processed_text: string with detected budgets removed
        budget: list of budgets detected
        original_budget_text: list to store substrings of the text detected as budget
        tag: entity_name prepended and appended with '__'
        regex_object: regex object that is used to substitute k with 000 i.e. if text contains 2k then
                      it will be substituted as 2000
        text_detection_object: text detection object to detect text in Textual format

    Note:
        text and tagged_text will have a extra space prepended and appended after calling detect_entity(text)
    """

    def __init__(self, entity_name):
        """Initializes a BudgetDetector object

        Args:
            entity_name: A string by which the detected budget would be replaced with on calling
                         detect_entity()
        """
        self.min_digit = 2
        self.max_digit = 5
        self.entity_name = entity_name
        self.text = ''
        self.tagged_text = ''
        self.processed_text = ''
        self.budget = []
        self.original_budget_text = []
        regex_for_thousand = [(r'(\d+)k', r'\g<1>000')]
        self.regex_object = Regex(regex_for_thousand)
        self.tag = '__' + self.entity_name + '__'
        self.text_detection_object = TextDetector(entity_name=ES_BUDGET_LIST)

    def detect_entity(self, text):
        """Detects budget in the text string

        Args:
            text: string to extract entities from

        Returns:
            A tuple of two lists with first list containing the detected budgets and second list containing their
            corresponding substrings in the original message.

            For example:

                ([{'max_budget': 1000, 'type': 'normal_budget', 'min_budget': 0}], ['rs. 1000'])

            Additionally this function assigns these lists to self.budget and self.original_budget_text attributes
            respectively.
        """
        # the detection regexes expect whitespace on both sides of a budget
        self.text = ' ' + text + ' '
        self.processed_text = self.text.lower()
        self.tagged_text = self.text
        budget_data = self._detect_budget()
        self.budget = budget_data[0]
        self.original_budget_text = budget_data[1]
        return budget_data

    @staticmethod
    def _strip_spaces(amount_text):
        """Removes all whitespace from a captured amount.

        The detection regexes allow whitespace between the digits and 'k' ("2 k") while the
        substitution pattern (\\d+)k does not, so the amount is compacted before conversion.
        """
        return ''.join(amount_text.split())

    def _detect_budget(self):
        """Detects budget in the self.text

        Runs the range, minimum, maximum, any-amount and text detectors in that order; after each of them
        the substrings found so far are tagged in tagged_text and removed from processed_text.

        Returns:
            A tuple of two lists with first list containing the detected budgets and second list containing their
            corresponding substrings in the original message.

            For example:
                input: "shirts between 2000 to 3000"
                output: ([{'max_budget': 3000, 'type': 'normal_budget', 'min_budget': 2000}], ['2000 to 3000'])
        """
        budget_list = []
        original_list = []
        budget_list, original_list = self._detect_min_max_budget(budget_list, original_list)
        self._update_processed_text(original_list)
        budget_list, original_list = self._detect_min_budget(budget_list, original_list)
        self._update_processed_text(original_list)
        budget_list, original_list = self._detect_max_budget(budget_list, original_list)
        self._update_processed_text(original_list)
        budget_list, original_list = self._detect_any_budget(budget_list, original_list)
        self._update_processed_text(original_list)
        # text detection replaces tagged_text/processed_text, the update below re-applies all tags
        budget_list, original_list = self._detect_text_budget(budget_list, original_list)
        self._update_processed_text(original_list)

        return budget_list, original_list

    def _detect_min_budget(self, budget_list=None, original_list=None):
        """Detects minimum budget from text using regex
        This is a function which will be called when we want to detect the budget using regex from the text

        Args:
            budget_list: list to append detected budgets to (a new list if None)
            original_list: list to append the matched substrings to (a new list if None)

        Returns:
            A tuple of two lists with first list containing the detected budget and second list containing their
            corresponding substrings in the original message.

            For example:
                input: "tshirts greater than 2k"
                output: ([{'max_budget': 0, 'type': 'normal_budget', 'min_budget': 2000}], ['greater than 2k'])
        """
        if budget_list is None:
            budget_list = []
        if original_list is None:
            original_list = []
        patterns = re.findall(
            r'(\s(above|more? than|more?|greater than|greater|abv|abov|more? den|\>\s*\=?)\s+'
            r'(rs.|rs|rupees|rupee)*\s*(\d{' + str(self.min_digit) + r',' + str(self.max_digit) +
            r'}|\d{1,' + str(self.max_digit - 3) + r'}\s*k)\s*(rs.|rs|rupees|rupee|\.)?\s)',
            self.processed_text.lower())
        for pattern in patterns:
            original = pattern[0].strip()
            budget = {
                'min_budget': 0,
                'max_budget': 0,
                'type': BUDGET_TYPE_NORMAL
            }
            amount_text = self._strip_spaces(pattern[3])
            if 'k' in amount_text:
                budget['min_budget'] = int(self.regex_object.text_substitute(amount_text))
            else:
                budget['min_budget'] = int(amount_text)

            budget_list.append(budget)
            original_list.append(original)

        return budget_list, original_list

    def _detect_max_budget(self, budget_list=None, original_list=None):
        """Detects maximum budget from text using regex
        This is a function which will be called when we want to detect the budget using regex from the text

        Args:
            budget_list: list to append detected budgets to (a new list if None)
            original_list: list to append the matched substrings to (a new list if None)

        Returns:
            A tuple of two lists with first list containing the detected budget and second list containing their
            corresponding substrings in the original message.

            For example:
                input: "tshirts less than 2k"
                output: ([{'max_budget': 2000, 'type': 'normal_budget', 'min_budget': 0}], ['less than 2k'])
        """
        if budget_list is None:
            budget_list = []
        if original_list is None:
            original_list = []

        patterns = re.findall(
            r'(\s(max|upto|o?nly|around|below|less than|less|less den|\<\s*\=?)\s+(rs.|rs|rupees|rupee)?\s*(\d{' +
            str(self.min_digit) + r',' + str(self.max_digit) + r'}|\d{1,' + str(self.max_digit - 3) +
            r'}\s*k)\s*(rs.|rs|rupees|rupee|\.)?\s)',
            self.processed_text.lower())
        for pattern in patterns:
            original = pattern[0].strip()
            budget = {
                'min_budget': 0,
                'max_budget': 0,
                'type': BUDGET_TYPE_NORMAL
            }
            amount_text = self._strip_spaces(pattern[3])
            if 'k' in amount_text:
                budget['max_budget'] = int(self.regex_object.text_substitute(amount_text))
            else:
                budget['max_budget'] = int(amount_text)

            budget_list.append(budget)
            original_list.append(original)

        return budget_list, original_list

    def _detect_min_max_budget(self, budget_list=None, original_list=None):
        """Detects both minimum and maximum budget from text using regex
        This is a function which will be called when we want to detect the budget using regex from the text

        When only the upper bound carries a 'k' (e.g. "2 to 3k") the 'k' is applied to the lower bound too.
        A range is reported only if both bounds respect min_digit/max_digit and min <= max.

        Args:
            budget_list: list to append detected budgets to (a new list if None)
            original_list: list to append the matched substrings to (a new list if None)

        Returns:
            A tuple of two lists with first list containing the detected budget and second list containing their
            corresponding substrings in the original message.

            For example:
                input: shirts between 2000 to 3000
                output: ([{'max_budget': 3000, 'type': 'normal_budget', 'min_budget': 2000}], ['2000 to 3000'])
        """
        if budget_list is None:
            budget_list = []
        if original_list is None:
            original_list = []

        # findall tuple layout: 0 whole match, 2/3 lower bound (with k / plain), 6/7 upper bound (with k / plain)
        patterns = re.findall(
            r'(\s((\d{1,' + str(self.max_digit - 3) + r'}\s*k?)|(\d{' + str(self.min_digit) + r',' +
            str(self.max_digit) + r'}))\s*(\-|to|and)\s*((\d{1,' + str(self.max_digit - 3) +
            r'}\s*k?)|(\d{' + str(self.min_digit) + r',' + str(self.max_digit) + r'}))\.?\s)',
            self.processed_text.lower())
        for pattern in patterns:
            budget = {
                'min_budget': 0,
                'max_budget': 0,
                'type': BUDGET_TYPE_NORMAL
            }

            flag_contains_k = False
            max_budget = 0
            if pattern[6]:
                upper_text = self._strip_spaces(pattern[6])
                flag_contains_k = 'k' in upper_text
                max_budget = int(self.regex_object.text_substitute(upper_text))
            elif pattern[7]:
                max_budget = int(pattern[7])

            min_budget = 0
            if pattern[2]:
                lower_text = self._strip_spaces(pattern[2])
                if flag_contains_k and 'k' not in lower_text:
                    lower_text += 'k'
                min_budget = int(self.regex_object.text_substitute(lower_text))
            elif pattern[3]:
                min_budget = int(pattern[3])

            if not self.min_digit <= len(str(min_budget)) <= self.max_digit:
                min_budget = 0
            if not self.min_digit <= len(str(max_budget)) <= self.max_digit:
                max_budget = 0
            if min_budget != 0 and max_budget != 0 and min_budget <= max_budget:
                budget['min_budget'] = min_budget
                budget['max_budget'] = max_budget
                budget_list.append(budget)
                original_list.append(pattern[0].strip())

        return budget_list, original_list

    def _detect_any_budget(self, budget_list=None, original_list=None):
        """Detects a budget from text using regex
        This is a function which will be called when we want to detect the budget using regex from the text.
        Every amount found is reported as max_budget.

        Args:
            budget_list: list to append detected budgets to (a new list if None)
            original_list: list to append the matched substrings to (a new list if None)

        Returns:
            A tuple of two lists with first list containing the detected budget and second list containing their
            corresponding substrings in the original message.

            For example:
                input: jeans of rs. 1000
                output: ([{'max_budget': 1000, 'type': 'normal_budget', 'min_budget': 0}], ['rs. 1000'])
        """
        if budget_list is None:
            budget_list = []
        if original_list is None:
            original_list = []

        patterns = re.findall(
            r'\s((rs.|rs|rupees|rupee)?\s?(\d{' + str(self.min_digit) + r',' + str(self.max_digit) +
            r'}|\d{1,' + str(self.max_digit - 3) + r'}\s*k)\s?(rs.|rs|rupees|rupee)?\.?)\s',
            self.processed_text.lower())
        for pattern in patterns:
            original = pattern[0].strip()
            budget = {
                'min_budget': 0,
                'max_budget': 0,
                'type': BUDGET_TYPE_NORMAL
            }
            amount_text = self._strip_spaces(pattern[2])
            if 'k' in amount_text:
                budget['max_budget'] = int(self.regex_object.text_substitute(amount_text))
            else:
                budget['max_budget'] = int(amount_text)

            budget_list.append(budget)
            original_list.append(original)

        return budget_list, original_list

    def _detect_text_budget(self, budget_list=None, original_list=None):
        """Detects budget from text using text detection logic i.e.TextDetector
        This is a function which will be called when we want to detect the budget using text

        tagged_text and processed_text are overwritten with the ones produced by the text detection object.

        Args:
            budget_list: list to append detected budgets to (a new list if None)
            original_list: list to append the matched substrings to (a new list if None)

        Returns:
            A tuple of two lists with first list containing one budget (min_budget and max_budget 0, type
            BUDGET_TYPE_TEXT) per detection and second list containing their corresponding substrings in
            the original message.
        """
        if budget_list is None:
            budget_list = []
        if original_list is None:
            original_list = []

        _, original_text_list = self.text_detection_object.detect_entity(self.text)
        self.tagged_text = self.text_detection_object.tagged_text
        self.processed_text = self.text_detection_object.processed_text
        for _ in range(len(original_text_list)):
            budget_list.append({
                'min_budget': 0,
                'max_budget': 0,
                'type': BUDGET_TYPE_TEXT
            })
        if original_text_list:
            original_list.extend(original_text_list)

        return budget_list, original_list

    def _update_processed_text(self, original_budget_strings):
        """
        Replaces detected budgets with self.tag generated from entity_name used to initialize the object with

        A final string with all budgets replaced will be stored in self.tagged_text attribute
        A string with all budgets removed will be stored in self.processed_text attribute

        Args:
            original_budget_strings: list of substrings of original text to be replaced with self.tag;
                                     empty or None entries are ignored
        """
        for detected_text in original_budget_strings:
            if detected_text:
                self.tagged_text = self.tagged_text.replace(detected_text, self.tag)
                self.processed_text = self.processed_text.replace(detected_text, '')

    def min_max_digit(self, min_digit, max_digit):
        """Updates the minimum and maximum number of digits a detected budget may have.

        Args:
            min_digit (int): new value for self.min_digit
            max_digit (int): new value for self.max_digit
        """
        self.min_digit = min_digit
        self.max_digit = max_digit
class ShoppingSizeDetector(object):
    """Detects sizes used for shopping in the text and tags them.

    Detection runs in two passes. First the text is handed to TextDetector to
    pick up sizes written in textual form (i.e. Large, XL, X-Large, etc);
    afterwards a regex looks for a one or two digit number in the text that is
    left.

    For Example (illustrative; the textual matches come from TextDetector):

        size_detector = ShoppingSizeDetector("shopping_size")
        message = "Suggest me Medium size tshirt and jeans of 34 waist"
        size, original_numbers = size_detector.detect_entity(message)

        size                      -> ['M', '34']
        original_numbers          -> ['Medium', '34']
        size_detector.tagged_text ->
            ' Suggest me __shopping_size__ size tshirt and jeans of __shopping_size__ waist '

    Attributes:
        text: string to extract entities from
        entity_name: string by which the detected size would be replaced with on calling detect_entity()
        dictionary_name: name of the dictionary passed to TextDetector as its entity name.
                         This is constant and its value is 'shopping_size'
        tagged_text: string with size replaced with tag defined by entity name
        processed_text: string with sizes detected removed
        size: list of sizes detected
        original_size_text: list to store substrings of the text detected as size
        tag: entity_name prepended and appended with '__'

    Note:
        text and tagged_text will have a extra space prepended and appended after calling detect_entity(text)
    """

    def __init__(self, entity_name):
        """Initializes a ShoppingSizeDetector object

        Args:
            entity_name: A string by which the detected numbers would be replaced with on calling detect_entity()
        """
        self.entity_name = entity_name
        self.dictionary_name = 'shopping_size'
        self.text = ''
        self.text_dict = {}
        self.tagged_text = ''
        self.processed_text = ''
        self.size = []
        self.original_size_text = []
        # Must go through self: a bare `dictionary_name` is not defined in
        # this scope and raises NameError on construction.
        self.text_detection_object = TextDetector(entity_name=self.dictionary_name)
        self.tag = '__' + self.entity_name + '__'

    def detect_entity(self, text):
        """Detects size in the text string

        Args:
            text: string to extract entities from

        Returns:
            A tuple of two lists with first list containing the detected sizes and second list containing their
            corresponding substrings in the original message.

            For example:

                (['XL', 'M', '30'], ['X-Large', 'Medium', '30'])

            Additionally this function assigns these lists to self.size and self.original_size_text attributes
            respectively.
        """
        # Pad with spaces so that the whitespace-delimited regex can also
        # match a number at the very start or end of the message.
        self.text = ' ' + text + ' '
        self.processed_text = self.text
        self.tagged_text = self.text

        size_data = self._detect_size()
        self.size, self.original_size_text = size_data[0], size_data[1]
        return size_data

    def _detect_size(self):
        """Detects size in the self.text

        Returns:
            A tuple of two lists with first list containing the detected sizes and second list containing their
            corresponding substrings in the original message.

            For example:
                input: Show me X-Large and Medium size tshirt and jeans of waist 34
                output: (['XL', 'M', '34'], ['X-Large', 'Medium', '34'])
        """
        size_list, original_list = self._detect_size_from_text([], [])
        self._update_processed_text(original_list)

        # Remember how many entries came from the text pass so that only the
        # regex additions are tagged below; replaying earlier replacements
        # could rewrite characters inside tags that were already inserted.
        already_tagged = len(original_list)
        size_list, original_list = self._detect_size_from_regex(size_list, original_list)
        self._update_processed_text(original_list[already_tagged:])

        return size_list, original_list

    def _detect_size_from_text(self, size_list=None, original_list=None):
        """Detects any size from text using text detection logic i.e.TextDetector

        Runs self.text_detection_object on self.text and copies the detector's
        tagged_text and processed_text onto this object.

        Args:
            size_list: Optional list to which detected sizes are appended;
                       a new list is used when None
            original_list: Optional list to which the matching substrings are
                           appended; a new list is used when None

        Returns:
            A tuple of two lists with first list containing the detected sizes and second list containing their
            corresponding substrings in the original message.

            For example:
                input: "Suggest me shirt of size X-Large"
                output: (['XL'], ['X-Large'])
        """
        if size_list is None:
            size_list = []
        if original_list is None:
            original_list = []

        detected_sizes, detected_originals = self.text_detection_object.detect_entity(self.text)
        self.tagged_text = self.text_detection_object.tagged_text
        self.processed_text = self.text_detection_object.processed_text

        # Extend the accumulators instead of rebinding them, so values passed
        # in by the caller are kept and the detector's own lists are never
        # mutated by later appends.
        size_list.extend(detected_sizes or [])
        original_list.extend(detected_originals or [])
        return size_list, original_list

    def _detect_size_from_regex(self, size_list=None, original_list=None):
        """Detects a numeric size from text using regex

        Looks for the first one or two digit number in self.processed_text
        that has whitespace on both sides.

        Args:
            size_list: Optional list to which the detected size is appended;
                       a new list is used when None
            original_list: Optional list to which the matching substring is
                           appended; a new list is used when None

        Returns:
            A tuple of two lists with first list containing the detected numbers and second list containing their
            corresponding substrings in the original message.

            For example:
                input: "Suggest me shirt of size 30"
                output: (['30'], ['30'])
        """
        if size_list is None:
            size_list = []
        if original_list is None:
            original_list = []

        match = re.search(r'\s(\d{1,2})\s', self.processed_text)
        if match:
            number = match.group(1)
            size_list.append(number)
            original_list.append(number)
        return size_list, original_list

    def _update_processed_text(self, original_size_strings):
        """
        Replaces detected sizes with self.tag generated from entity_name used to initialize the object with

        A final string with all sizes replaced will be stored in self.tagged_text attribute
        A string with all sizes removed will be stored in self.processed_text attribute

        Args:
            original_size_strings: list of substrings of original text to be replaced with self.tag
        """
        for detected_text in original_size_strings:
            # Skip empty/None entries: replacing '' would insert the tag
            # between every character of the text.
            if not detected_text:
                continue
            self.tagged_text = self.tagged_text.replace(detected_text, self.tag)
            self.processed_text = self.processed_text.replace(detected_text, '')
def get_text(message, entity_name, structured_value, fallback_value, bot_message):
    """Use TextDetector to detect textual entities

    Detection is run on exactly one input: structured_value when it is
    truthy, otherwise message.

    Args:
        message (str): natural text on which detection logic is to be run. It is only looked at when
                       structured_value is empty
        entity_name (str): name of the entity, passed to TextDetector as its entity_name
        structured_value (str): Value obtained from any structured elements (For example, UI elements like form,
                                payload, etc). When present, detection is run on it instead of message
        fallback_value (str): value returned when nothing is detected in message. It is not consulted when
                              structured_value is present
        bot_message (str): previous message from a bot/agent. Not used by this function

    Returns:
        The result of output_entity_dict_list for the first case that applies, or None:

        - structured_value present and an entity detected in it: the detected entities,
          tagged FROM_STRUCTURE_VALUE_VERIFIED
        - structured_value present but nothing detected: structured_value itself,
          tagged FROM_STRUCTURE_VALUE_NOT_VERIFIED
        - entity detected in message: the detected entities, tagged FROM_MESSAGE
        - nothing detected in message and fallback_value present: fallback_value itself,
          tagged FROM_FALLBACK_VALUE
        - otherwise None

    Example (illustrative outputs):

        message = 'i want to order chinese from mainland china and pizza from domminos'
        entity_name = 'restaurant'
        structured_value = None
        fallback_value = None
        bot_message = None
        output = get_text(message=message, entity_name=entity_name, structured_value=structured_value,
                          fallback_value=fallback_value, bot_message=bot_message)

            >> [{'detection': 'message', 'original_text': 'mainland china',
                 'entity_value': {'value': u'Mainland China'}},
                {'detection': 'message', 'original_text': 'domminos',
                 'entity_value': {'value': u"Domino's Pizza"}}]

        message = 'i wanted to watch movie'
        entity_name = 'movie'
        structured_value = 'inferno'
        fallback_value = None
        bot_message = None
        output = get_text(message=message, entity_name=entity_name, structured_value=structured_value,
                          fallback_value=fallback_value, bot_message=bot_message)

            >> [{'detection': 'structure_value_verified', 'original_text': 'inferno',
                 'entity_value': {'value': u'Inferno'}}]
    """
    detector = TextDetector(entity_name=entity_name)

    # A structured value takes precedence: the message is never inspected.
    if structured_value:
        entities, originals = detector.detect_entity(structured_value)
        if entities:
            return output_entity_dict_list(entities, originals, FROM_STRUCTURE_VALUE_VERIFIED)
        return output_entity_dict_list([structured_value], [structured_value],
                                       FROM_STRUCTURE_VALUE_NOT_VERIFIED)

    entities, originals = detector.detect_entity(message)
    if entities:
        return output_entity_dict_list(entities, originals, FROM_MESSAGE)
    if fallback_value:
        return output_entity_dict_list([fallback_value], [fallback_value], FROM_FALLBACK_VALUE)
    return None