示例#1
0
    def __init__(self,
                 entity_name,
                 source_language_script=ENGLISH_LANG,
                 translation_enabled=False):
        """Sets up a ShoppingSizeDetector with empty detection state

        Args:
            entity_name: A string that replaces the detected sizes on calling detect_entity()
            source_language_script: ISO 639 code of the language of the entities to be detected by this instance
            translation_enabled: True if messages need to be translated when the detector does not support a
                                 particular language, else False
        """
        # The supported languages are assigned before the superclass initializer runs
        self._supported_languages = [ENGLISH_LANG]
        super(ShoppingSizeDetector, self).__init__(source_language_script, translation_enabled)

        self.entity_name = entity_name

        # Per message state, all starting out empty
        self.text = self.tagged_text = self.processed_text = ''
        self.text_dict = {}
        self.size = []
        self.original_size_text = []

        self.text_detection_object = TextDetector(entity_name=self.entity_name)
        # Placeholder that is built from the entity name, e.g. '__<entity_name>__'
        self.tag = '__' + self.entity_name + '__'
示例#2
0
def get_location(message, entity_name, structured_value, fallback_value, bot_message):
    """Detects location by running TextDetector on the structured value or on the message

    TODO: We can improve this by creating separate class for location detection instead of using TextDetector

    Args:
        message: text to run detection on; only used when structured_value is empty
        entity_name: name of the entity the TextDetector is created for
        structured_value: when non-empty, detection is run on it instead of on message
        fallback_value: returned as the output when nothing is detected in message
        bot_message: not used by this function

    Returns:
        The output of output_entity_dict_list() / output_entity_dict_value() for the first source that
        yields a value, or None when nothing is detected and no fallback_value is given.
    """
    detector = TextDetector(entity_name=entity_name)

    if structured_value:
        detected_values, detected_substrings = detector.detect_entity(structured_value)
        if not detected_values:
            # Nothing matched, so the structured value is passed through as it is
            return output_entity_dict_value(structured_value, structured_value, FROM_STRUCTURE_VALUE_NOT_VERIFIED)
        return output_entity_dict_list(detected_values, detected_substrings, FROM_STRUCTURE_VALUE_VERIFIED)

    detected_values, detected_substrings = detector.detect_entity(message)
    if detected_values:
        return output_entity_dict_list(detected_values, detected_substrings, FROM_MESSAGE)
    if fallback_value:
        return output_entity_dict_value(fallback_value, fallback_value, FROM_FALLBACK_VALUE)
    return None
示例#3
0
    def _detect_text_budget(self, budget_list=None, original_list=None):
        """Detects budget from self.text using the text detection logic i.e. TextDetector

        For every detected text a placeholder budget (min_budget and max_budget set to 0, type set to
        BUDGET_TYPE_TEXT) is added; the value returned by TextDetector itself is not used.

        Args:
            budget_list: Optional, list to which the budget dictionaries are appended
            original_list: Optional, list to which the corresponding substrings of the text are appended

        Returns:
            A tuple of two lists with first list containing the budget dictionaries and second list containing
            their corresponding substrings in the original message.

        """
        budget_list = [] if budget_list is None else budget_list
        original_list = [] if original_list is None else original_list

        detector = TextDetector(entity_name=self.entity_name)
        detected_values, detected_substrings = detector.detect_entity(self.text, return_str=True)
        # FIXME: Broken/Ineffective code.
        self.tagged_text = detector.tagged_text
        self.processed_text = detector.processed_text

        # Only the substring of each pair is needed, the detected value is ignored
        for _, substring in zip(detected_values, detected_substrings):
            budget_list.append({'min_budget': 0, 'max_budget': 0, 'type': BUDGET_TYPE_TEXT})
            original_list.append(substring)

        return budget_list, original_list
 def __init__(self, entity_name):
     """Sets up the detector with empty detection state and no user location details

     Args:
         entity_name: name of the entity the underlying TextDetector is created for
     """
     # Text related state, all starting out empty
     self.text = self.tagged_text = self.processed_text = ''
     self.text_dict = {}

     # Detection results
     self.location = []
     self.original_location_text = []

     self.text_detection_object = TextDetector(entity_name=entity_name)

     # User location details are not known at construction time
     self.user_address = self.user_lat_long = self.user_location_updated_at = None
示例#5
0
    def __init__(self, entity_name):
        """
        Sets up a NameDetector for the given entity_name with empty detection state

        Args:
            entity_name: A string that replaces the detected substrings corresponding to text entities
                         on calling detect_entity()
        """
        self.entity_name = entity_name

        # Text related state, all starting out empty
        self.text = self.tagged_text = self.processed_text = ''

        # Detection results
        self.names = []
        self.original_name_text = []

        self.text_detection_object = TextDetector(entity_name=entity_name)
示例#6
0
    def __init__(self, entity_name):
        """Initializes a ShoppingSizeDetector object

        Args:
            entity_name: A string by which the detected numbers would be replaced with on calling detect_entity()
        """
        self.entity_name = entity_name
        # Name the underlying TextDetector is created with (instead of entity_name)
        self.dictionary_name = 'shopping_size'
        self.text = ''
        self.text_dict = {}
        self.tagged_text = ''
        self.processed_text = ''
        self.size = []
        self.original_size_text = []
        # Must go through self: a bare `dictionary_name` is not defined in this scope and raises NameError
        self.text_detection_object = TextDetector(entity_name=self.dictionary_name)
        self.tag = '__' + self.entity_name + '__'
示例#7
0
    def __init__(self, entity_name):
        """
        Sets up a CityDetector for the given entity_name with empty detection state

        Args:
            entity_name: A string that replaces the detected substrings corresponding to text entities
                         on calling detect_entity()
        """
        self.entity_name = entity_name

        # Text related state, all starting out empty
        self.text = self.bot_message = self.tagged_text = self.processed_text = ''

        # Detection results
        self.city = []

        self.text_detection_object = TextDetector(entity_name=entity_name)
        # Placeholder that is built from the entity name, e.g. '__<entity_name>__'
        self.tag = '__' + self.entity_name + '__'
示例#8
0
def text(request):
    """Runs text detection on the request parameters and responds with the detected entities as JSON.

    The optional fuzziness and minimum token length parameters are applied to the detector only when they
    are non-empty. A TypeError raised while detecting is logged and answered with HTTP status 500.

    Attributes:
        request: url parameters

    """
    try:
        params = get_parameters_dictionary(request)
        ner_logger.debug('Start: %s ' % params[PARAMETER_ENTITY_NAME])
        fuzziness = params[PARAMETER_FUZZINESS]
        min_token_len_fuzziness = params[PARAMETER_MIN_TOKEN_LEN_FUZZINESS]
        detector = TextDetector(entity_name=params[PARAMETER_ENTITY_NAME],
                                source_language_script=params[PARAMETER_LANGUAGE_SCRIPT])
        ner_logger.debug('fuzziness: %s min_token_len_fuzziness %s' % (str(fuzziness), str(min_token_len_fuzziness)))

        # Both tuning parameters are optional and only applied when given
        if fuzziness:
            detector.set_fuzziness_threshold(parse_fuzziness_parameter(fuzziness))
        if min_token_len_fuzziness:
            detector.set_min_token_size_for_levenshtein(min_size=int(min_token_len_fuzziness))

        entity_output = detector.detect(message=params[PARAMETER_MESSAGE],
                                        structured_value=params[PARAMETER_STRUCTURED_VALUE],
                                        fallback_value=params[PARAMETER_FALLBACK_VALUE],
                                        bot_message=params[PARAMETER_BOT_MESSAGE])
        ner_logger.debug('Finished %s : %s ' % (params[PARAMETER_ENTITY_NAME], entity_output))
    except TypeError as e:
        ner_logger.exception('Exception for text_synonym: %s ' % e)
        return HttpResponse(status=500)
    else:
        # Serialisation happens outside of the try body, so its errors are not handled above
        return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json')
    def __init__(self, entity_name, language=lang_constant.ENGLISH_LANG):
        """
        Sets up a CityDetector for the given entity_name and language with empty detection state

        Args:
            entity_name: A string that replaces the detected substrings corresponding to text entities
                         on calling detect_entity()
            language: language code of text, handed to TextDetector as source_language_script
        """
        self.entity_name = entity_name

        # Text related state, all starting out empty
        self.text = self.bot_message = self.tagged_text = self.processed_text = ''

        # Detection results
        self.city = []

        self.text_detection_object = TextDetector(entity_name=entity_name,
                                                  source_language_script=language)
        # Placeholder that is built from the entity name, e.g. '__<entity_name>__'
        self.tag = '__' + self.entity_name + '__'
    def __init__(self, entity_name):
        """
        Sets up the CityAdvanceDetector for the given entity_name with empty detection state

        Args:
            entity_name: A string that replaces the detected entity substrings on calling detect_entity()
        """
        # Text related state, all starting out empty
        self.text = self.tagged_text = self.processed_text = ''

        # Detection results
        self.city = []
        self.original_city_text = []

        self.entity_name = entity_name
        self.text_detection_object = TextDetector(entity_name=entity_name)

        # No bot message is known until one is set on the object
        self.bot_message = None
        # Placeholder that is built from the entity name, e.g. '__<entity_name>__'
        self.tag = '__' + entity_name + '__'
示例#11
0
def text(request):
    """This functionality initializes text detection functionality to detect textual entities.

    Attributes:
        request: url parameters

    Returns:
        HttpResponse with the detected entities as JSON under the 'data' key, or an empty response with
        status 400 when a TypeError is raised during detection.

    """
    # Imported locally so that the block does not depend on a module level import being present
    import json

    try:
        parameters_dict = get_parameters_dictionary(request)
        ner_logger.debug('Start: %s ' % parameters_dict[PARAMETER_ENTITY_NAME])
        text_detector = TextDetector(entity_name=parameters_dict[PARAMETER_ENTITY_NAME],
                                     source_language_script=parameters_dict[PARAMETER_LANGUAGE_SCRIPT])
        entity_output = text_detector.detect(message=parameters_dict[PARAMETER_MESSAGE],
                                             structured_value=parameters_dict[PARAMETER_STRUCTURED_VALUE],
                                             fallback_value=parameters_dict[PARAMETER_FALLBACK_VALUE],
                                             bot_message=parameters_dict[PARAMETER_BOT_MESSAGE])
        ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output))
    # `except X as e` is accepted by Python 2.6+ and Python 3, unlike the older `except X, e` form
    except TypeError as e:
        ner_logger.debug('Exception for text_synonym: %s ' % e)
        return HttpResponse(status=400)
    # Without this the success path returned None instead of a response
    return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json')
示例#12
0
    def __init__(self,
                 entity_name,
                 source_language_script=ENGLISH_LANG,
                 translation_enabled=False):
        """Sets up a BudgetDetector with empty detection state

        Args:
            entity_name: A string that replaces the detected budget on calling detect_entity()
            source_language_script: language code handed to the superclass initializer
            translation_enabled: flag handed to the superclass initializer
        """
        # The supported languages are assigned before the superclass initializer runs
        self._supported_languages = [ENGLISH_LANG]
        super(BudgetDetector, self).__init__(source_language_script, translation_enabled)

        self.min_digit, self.max_digit = 2, 5
        self.entity_name = entity_name

        # Text related state, all starting out empty
        self.text = self.tagged_text = self.processed_text = ''

        # Detection results
        self.budget = []
        self.original_budget_text = []

        self.unit_present_list = ['k', 'l', 'm', 'c', 'h', 'th']

        # Pairs of (pattern capturing the number written before a unit, number paired with that unit)
        regx_for_units = [
            (r'([\d,.]+)\s*k', 1000),
            (r'([\d,.]+)\s*h', 1000),
            (r'([\d,.]+)\s*th', 1000),
            (r'([\d,.]+)\s*l', 100000),
            (r'([\d,.]+)\s*lacs?', 100000),
            (r'([\d,.]+)\s*lakh?', 100000),
            (r'([\d,.]+)\s*lakhs?', 100000),
            (r'([\d,.]+)\s*m', 1000000),
            (r'([\d,.]+)\s*million', 1000000),
            (r'([\d,.]+)\s*mill?', 1000000),
            (r'([\d,.]+)\s*c', 10000000),
            (r'([\d,.]+)\s*cro?', 10000000),
            (r'([\d,.]+)\s*crore?', 10000000),
            (r'([\d,.]+)\s*crores?', 10000000),
        ]
        self.regex_object = RegexReplace(regx_for_units)

        # Placeholder that is built from the entity name, e.g. '__<entity_name>__'
        self.tag = '__' + self.entity_name + '__'
        self.text_detection_object = TextDetector(entity_name=entity_name)
示例#13
0
    def __init__(self, entity_name):
        """Sets up a BudgetDetector with empty detection state

        Args:
            entity_name: A string that replaces the detected budget on calling detect_entity()
        """
        self.min_digit, self.max_digit = 2, 5
        self.entity_name = entity_name

        # Text related state, all starting out empty
        self.text = self.tagged_text = self.processed_text = ''

        # Detection results
        self.budget = []
        self.original_budget_text = []

        # Rewrites digits followed by 'k' into the digits followed by '000'
        thousand_rules = [(r'(\d+)k', r'\g<1>000')]
        self.regex_object = Regex(thousand_rules)

        # Placeholder that is built from the entity name, e.g. '__<entity_name>__'
        self.tag = '__' + self.entity_name + '__'
        # The text detector is created for ES_BUDGET_LIST and not for entity_name
        self.text_detection_object = TextDetector(entity_name=ES_BUDGET_LIST)
def get_location(message, entity_name, structured_value, fallback_value, bot_message):
    """Use TextDetector to detect location

    TODO: We can improve this by creating separate class for location detection instead of using TextDetector

    Args:
        message (str): natural text on which detection logic is to be run. Note that if structured_value is
                       given, detection is run on structured_value instead of message
        entity_name (str): name of the entity; the TextDetector is created with it
        structured_value (str): value obtained from any structured elements (for example, UI elements like
                                form, payload, etc). When given, detection is run on it instead of message
        fallback_value (str): returned as the output when nothing is detected in message
        bot_message (str): previous message from a bot/agent. Not used by this function.

    Returns:
        The output of output_entity_dict_list() for the first source that yields a value, or None when
        nothing is detected and no fallback_value is given.
    """
    detector = TextDetector(entity_name=entity_name)

    if structured_value:
        detected_values, detected_substrings = detector.detect_entity(structured_value)
        if not detected_values:
            # Nothing matched, so the structured value is passed through as it is
            return output_entity_dict_list([structured_value], [structured_value],
                                           FROM_STRUCTURE_VALUE_NOT_VERIFIED)
        return output_entity_dict_list(detected_values, detected_substrings, FROM_STRUCTURE_VALUE_VERIFIED)

    detected_values, detected_substrings = detector.detect_entity(message)
    if detected_values:
        return output_entity_dict_list(detected_values, detected_substrings, FROM_MESSAGE)
    if fallback_value:
        return output_entity_dict_list([fallback_value], [fallback_value], FROM_FALLBACK_VALUE)
    return None
示例#15
0
class CityDetector(object):
    """
    CityDetector detects cities in text. It holds a TextDetector instance to look up cities and can
    additionally run a model (PredictCRF) whose output is verified through the same TextDetector.

    Attributes:
        text: string to extract entities from
        entity_name: string by which the detected city entities would be replaced with on calling detect_entity()
        bot_message: previous message of the bot, handed to the model
        text_dict: dictionary, empty on initialisation
        tagged_text: string with city entities replaced with tag defined by entity_name
        city: list to store detected cities from the text
        original_city_text: list of substrings of the text detected as cities
        processed_text: string with detected city entities removed
        text_detection_object: TextDetector instance used for the lookups
        tag: entity_name prepended and appended with '__'
    """

    def __init__(self, entity_name):
        """
        Initializes a CityDetector object with given entity_name

        Args:
            entity_name: A string by which the detected substrings that correspond to text entities would be replaced
                         with on calling detect_entity()
        """

        self.entity_name = entity_name
        self.text = ''
        self.bot_message = ''
        self.text_dict = {}
        self.tagged_text = ''
        self.processed_text = ''
        self.city = []
        self.original_city_text = []
        self.text_detection_object = TextDetector(entity_name=entity_name)
        self.tag = '__' + self.entity_name + '__'

    def detect_city(self):
        """
        Detects the cities present in self.text and updates the tagged and processed text accordingly

        Returns:
            A tuple of two lists with first list containing the detected cities and second list containing their
            corresponding substrings in the text.
        """
        city_list, original_list = self.detect_city_format([], [])
        self.update_processed_text(original_list)
        return city_list, original_list

    def detect_entity(self, text, run_model=True):
        """Detects city in the text string

        Args:
            text: string to extract entities from
            run_model: Boolean True if model needs to run else False
        Returns:
            A tuple of three lists with first list containing the detected cities, second list containing their
            corresponding substrings in the given text and third list containing the model detection types
            (empty when the cities do not come from the model).

            For example:

                (['Mumbai'], ['bombay'], [])

            Additionally this function assigns the first two lists to self.city and self.original_city_text
            attributes respectively.

        """
        # The text is padded with spaces and lower cased before detection
        self.text = (' ' + text + ' ').lower()
        self.processed_text = self.text
        self.tagged_text = self.text

        city_data = self.city_model_detection() if run_model else None
        if city_data is None or not city_data[0]:
            # Model not requested or it found nothing: fall back to lookup based detection and add an
            # empty detection type list so that the result has the same shape as the model output
            city_data = self.detect_city() + ([], )
        self.city = city_data[0]
        self.original_city_text = city_data[1]
        return city_data

    def detect_city_format(self, city_list=None, original_list=None):
        """
        Detects cities in self.text using the TextDetector and takes over its tagged and processed text.

        Args:
            city_list: Optional, list to which the detected cities are appended. A new list is created for
                       every call when it is not given.
            original_list: Optional, accepted for backward compatibility; its content is not used as it is
                           replaced by the substrings returned by the TextDetector

        Returns:
            A tuple of two lists with first list containing the detected cities and second list containing their
            corresponding substrings in the given text.

            For example:

                (['Mumbai'], ['bombay'])
        """
        # A list literal as default value would be shared between calls and keep growing
        if city_list is None:
            city_list = []

        city_list_from_text_entity, original_list = self.text_detection_object.detect_entity(
            self.text)
        self.tagged_text = self.text_detection_object.tagged_text
        self.processed_text = self.text_detection_object.processed_text
        city_list.extend(city_list_from_text_entity)

        return city_list, original_list

    def city_model_detection(self):
        """
        This function calls get_model_output() method of PredictCRF class and verifies the values returned by it.

        Every city value provided by the model is looked up with the TextDetector. If the lookup detects
        something, MODEL_VERIFIED is recorded and the looked up values are used, else MODEL_NOT_VERIFIED is
        recorded and the value of the model is used as it is.

        Note*: before calling this method you need to call set_bot_message() to set a bot message.

        For Example:

            self.bot_message = 'Please help me with your departure city?'
            self.text = 'mummbai'

            model_output = [{'city':'mummbai', 'from': 1, 'to': 0, 'via': 0}]

            'mummbai' is found by the lookup, so finally it returns
            ['Mumbai'], ['mummbai'], [MODEL_VERIFIED]

        For Example:

            self.bot_message = 'Please help me with your departure city?'
            self.text = 'dehradun'

            model_output = [{'city':'dehradun', 'from': 1, 'to': 0, 'via': 0}]

            'dehradun' is not found by the lookup, so the original value is taken as entity value and
            finally it returns ['dehradun'], ['dehradun'], [MODEL_NOT_VERIFIED]

        Returns:
            A tuple of three lists: the cities, their corresponding substrings and the model detection types

        """
        predict_crf = PredictCRF()
        model_output = predict_crf.get_model_output(
            entity_type=CITY_ENTITY_TYPE,
            bot_message=self.bot_message,
            user_message=self.text)
        city_list, original_list, model_detection_type = [], [], []
        for city_dict in model_output:
            city_list_from_text_entity, original_list_from_text_entity = \
                self.text_detection_object.detect_entity(city_dict[CITY_VALUE])
            if city_list_from_text_entity:
                city_list.extend(city_list_from_text_entity)
                original_list.extend(original_list_from_text_entity)
                # NOTE(review): one type is added per model item even if the lookup returns several
                # values - confirm that callers do not rely on the three lists having equal length
                model_detection_type.append(MODEL_VERIFIED)
            else:
                city_list.append(city_dict[CITY_VALUE])
                original_list.append(city_dict[CITY_VALUE])
                model_detection_type.append(MODEL_NOT_VERIFIED)
        self.update_processed_text(original_list)

        return city_list, original_list, model_detection_type

    def update_processed_text(self, original_list):
        """
        Replaces detected cities with tag generated from entity_name used to initialize the object with

        A final string with all cities replaced will be stored in object's tagged_text attribute
        A string with all cities removed will be stored in object's processed_text attribute

        Args:
            original_list: list of substrings of original text to be replaced with tag created from entity_name
        """
        for detected_text in original_list:
            self.tagged_text = self.tagged_text.replace(
                detected_text, self.tag)
            self.processed_text = self.processed_text.replace(
                detected_text, '')

    def set_bot_message(self, bot_message):
        """
        Sets the object's bot_message attribute

        Args:
            bot_message: string
        """

        self.bot_message = bot_message
示例#16
0
class CityDetector(object):
    """
    CityDetector detects cities from the text. It Detects city with the properties like "from", "to", "via" and
    "normal". These cities are returned in a dictionary form that contains relevant text, its actual value
    and its attribute in boolean field i.e. "from", "to", "via", "normal".
    This class uses TextDetector to detect the entity values. It also has model integrated to it that can be used to
    extract relevant text from the text

    Attributes:
        text: string to extract entities from
        entity_name: string by which the detected city entities would be replaced with on calling detect_entity()
        tagged_text: string with city entities replaced with tag defined by entity_name
        city: list to store detected entities from the text
        processed_text: string with detected time entities removed
        tag: entity_name prepended and appended with '__'
    """
    def __init__(self, entity_name, language):
        """
        Sets up a CityDetector for the given entity_name and language with empty detection state

        Args:
            entity_name: A string that replaces the detected substrings corresponding to text entities
                         on calling detect_entity()
            language: language code of text, handed to TextDetector as source_language_script
        """
        self.entity_name = entity_name

        # Text related state, all starting out empty
        self.text = self.bot_message = self.tagged_text = self.processed_text = ''

        # Detection results
        self.city = []

        self.text_detection_object = TextDetector(entity_name=entity_name,
                                                  source_language_script=language)
        # Placeholder that is built from the entity name, e.g. '__<entity_name>__'
        self.tag = '__' + self.entity_name + '__'

    def detect_entity(self, text, run_model=False):
        """Detects city in the text string

        The text is padded with a space on both sides and lower cased. When run_model is True the model
        based detection is tried first; the rule based detection is used when the model is not run or
        when it returns nothing.

        Args:
            text: string to extract entities from
            run_model: True if model needs to be run else False
        Returns:
            The list of detected cities as returned by the detection that was used. Per the example below,
            each item is a dictionary containing the fields detection_method, from, normal, to, text,
            value and via

            For example:

                [
                    {
                      'detection_method': 'message',
                      'from': False,
                      'normal': True,
                      'text': 'mumbai',
                      'to': False,
                      'value': u'BOM',
                      'via': False
                    }
                ]


            Additionally this function assigns this list to self.city

        """
        lowered_text = (' ' + text + ' ').lower()
        self.text = self.processed_text = self.tagged_text = lowered_text

        detected_cities = self._city_model_detection() if run_model else []
        if not detected_cities:
            detected_cities = self._detect_city()

        self.city = detected_cities
        return detected_cities

    def _detect_city(self):
        """
        Runs all the rule based city detections one after the other and collects their results

        After every detection the processed text is updated with its results before the next detection runs.

        Returns:
            The combined list of the city dictionaries returned by the individual detections, in the order
            in which the detections are run

        """
        detection_steps = (
            self._detect_departure_arrival_city_prepositions,
            self._detect_departure_arrival_city,
            self._detect_arrival_departure_city,
            self._detect_departure_city,
            self._detect_arrival_city,
            self._detect_any_city,
        )

        collected_city_dicts = []
        for detection_step in detection_steps:
            step_result = detection_step()
            collected_city_dicts.extend(step_result)
            self._update_processed_text(step_result)

        return collected_city_dicts

    def _detect_city_format(self):
        """
        Runs _city_dict_from_text() on the processed text with normal_property set to True

        Returns:
            Whatever _city_dict_from_text() returns for the processed text
        """
        remaining_text = self.processed_text
        return self._city_dict_from_text(text=remaining_text, normal_property=True)

    def _detect_departure_arrival_city(self):
        """
        Finds <any text><space(s)><'-' or 'to' or '2'><space(s)><any text> in the given text.
        It  splits the text into two parts on '-' or 'to' or '2'
        and detects the departure city in the first (left) part and detects arrival city in the second (right) part

        Args:
            None

        Returns:
            The list of dictionary containing the dictionary for city which is detected as departure_city and city
            that got detected as arrival_city. For departure city the key "from" will be set to True.
            Whereas for arrival city the key "to" will be set to True.
        """
        city_dict_list = []
        patterns = re.findall(
            ur'\s(([A-Za-z\u0900-\u097F]+)\s+(\-|to|2|se|से|and)\s+([A-Za-z\u0900-\u097F\s]+))\.?',
            self.processed_text.lower(), re.UNICODE)
        for pattern in patterns:
            city_dict_list.extend(
                self._city_dict_from_text(text=pattern[1], from_property=True))

            city_dict_list.extend(
                self._city_dict_from_text(text=pattern[3], to_property=True))

        return city_dict_list

    def _detect_departure_arrival_city_prepositions(self):
        """
        Finds <preposition><any text><space(s)><'-' or 'to' or '2' or preposition><space(s)><any text> in the given
        text.
        It  splits the text into two parts on '-' or 'to' or '2'
        and detects the departure city in the first (left) part and detects arrival city in the second (right) part

        Args:
            None

        Returns:
            The list of dictionary containing the dictionary for city which is detected as departure_city and city
            that got detected as arrival_city. For departure city the key "from" will be set to True.
            Whereas for arrival city the key "to" will be set to True.
        """
        city_dict_list = []
        patterns = re.findall(
            ur'\s((?:from|frm|departing|depart|leaving|leave)\s+([A-Za-z\u0900-\u097F]+)'
            ur'\s+(?:and|to|se|से|2|for|fr|arriving|arrive|reaching|reach|rch)'
            ur'\s+([A-Za-z\u0900-\u097F]+))\.?', self.processed_text.lower(),
            re.UNICODE)

        for pattern in patterns:
            city_dict_list.extend(
                self._city_dict_from_text(text=pattern[1], from_property=True))

            city_dict_list.extend(
                self._city_dict_from_text(text=pattern[2], to_property=True))

        return city_dict_list

    def _detect_arrival_departure_city(self):
        """
        Finds <preposition><any text><space(s)><'-' or 'to' or '2' or preposition><space(s)><any text> in the given
        text.
        It  splits the text into two parts on '-' or 'to' or '2'
        and detects the arrival city in the first (left) part and detects departure city in the second (right) part

        Args:
            None

        Returns:
            The list of dictionary containing the dictionary for city which is detected as departure_city and city
            that got detected as arrival_city. For departure city the key "from" will be set to True.
            Whereas for arrival city the key "to" will be set to True.

        """
        city_dict_list = []
        patterns = re.findall(
            ur'\s((?:and|to|2|for|fr|arriving|arrive|reaching|reach|rch)'
            ur'\s+([A-Za-z\u0900-\u097F]+)\s+(?:from|frm|departing|depart|leaving|leave)'
            ur'\s+([A-Za-z\u0900-\u097F]+))\.?', self.processed_text.lower(),
            re.UNICODE)

        for pattern in patterns:
            city_dict_list.extend(
                self._city_dict_from_text(text=pattern[2], from_property=True))

            city_dict_list.extend(
                self._city_dict_from_text(text=pattern[1], to_property=True))

        return city_dict_list

    def _detect_departure_city(self):
        """
        Finds departure type cities in the given text by matching few keywords like 'from', 'departing',
        'leaving', 'departure city', 'departing', 'going to' . It detects dates in the part of text right to these
        keywords.

        Args:
            None

        Returns:
            The list of dictionary containing the dictionary for city which is detected as departure_city.
            For departure city the key "from" will be set to True.

        """
        city_dict_list = []
        patterns = re.findall(
            ur'\s((from|frm|departing|depart|leaving|leave|origin city\:|departure city\:|going to)'
            ur'\s+([A-Za-z\u0900-\u097F]+))\.?\s', self.processed_text.lower(),
            re.UNICODE)

        for pattern in patterns:
            city_dict_list.extend(
                self._city_dict_from_text(text=pattern[2], from_property=True))

        return city_dict_list

    def _detect_arrival_city(self):
        """
        Finds return type dates in the given text by matching few keywords like 'arriving', 'arrive',
        'reaching', 'reach', 'destination city:' . It detects city in the part of text right
        to these keywords.

        Args:
            None

        Returns:
            The list of dictionary containing the dictionary for city which is detected as arrival_city.
            for arrival city the key "to" will be set to True.

        """
        city_dict_list = []
        patterns_1 = re.findall(
            ur'\s((to|2|for|fr|arriving|arrive|reaching|'
            ur'reach|rch|destination city\:|arrival city\:)'
            ur'\s+([A-Za-z\u0900-\u097F]+))\.?\s', self.processed_text.lower(),
            re.UNICODE)
        patterns_2 = re.findall(
            ur'([A-Za-z\u0900-\u097F]+)\s+(jana|jaana|jau|ghum|ghoom|जाना|जाऊं|जाऊँ|घूम)',
            self.processed_text.lower(), re.UNICODE)
        for pattern in patterns_1:
            city_dict_list.extend(
                self._city_dict_from_text(text=pattern[2], to_property=True))
        for pattern in patterns_2:
            city_dict_list.extend(
                self._city_dict_from_text(text=pattern[0], to_property=True))

        return city_dict_list

    def _detect_any_city(self):
        """
        This function makes use of bot_message. In a chatbot user might just enter city name based on the
        previous question asked by the bot. So, if the previous question asked by the bot contains words like
        departure city, origin city, origin and if the current message contains city then we assign the
        detected city as departure_city. if the previous message contains words like arrival city, destination city,
        flying to in the bots message and the current message contains the city then we assign the detected city as
        arrival city


        Args:
            None

        Returns:
            The list of dictionary containing the dictionary for city which is detected as departure_city and city
            that got detected as arrival_city. For departure city the key "from" will be set to True.
            Whereas for arrival city the key "to" will be set to True.

        """
        city_dict_list = []
        departure_city_flag = False
        arrival_city_flag = False
        if self.bot_message:
            hinglish_departure = u'कहां से'
            departure_regexp = re.compile(
                ur'departure city|origin city|origin|'
                ur'traveling from|leaving from|flying from|travelling from|' +
                hinglish_departure)
            hinglish_arrival = u'कहां जाना|\u0916\u093c\u0924\u092e|\u0959\u0924\u092e'  # unicode for ख़तम
            arrival_regexp = re.compile(
                ur'traveling to|travelling to|arrival city|'
                ur'arrival|destination city|destination|leaving to|flying to|'
                + hinglish_arrival)
            if departure_regexp.search(self.bot_message) is not None:
                departure_city_flag = True
            elif arrival_regexp.search(self.bot_message) is not None:
                arrival_city_flag = True

        patterns = re.findall(ur'\s((.+))\.?', self.processed_text.lower(),
                              re.UNICODE)

        for pattern in patterns:
            pattern = list(pattern)
            city_dict_list = self._city_dict_from_text(text=pattern[1])
            if city_dict_list:
                if len(city_dict_list) > 1:
                    city_dict_list[0][
                        detector_constant.CITY_FROM_PROPERTY] = True
                    city_dict_list[-1][
                        detector_constant.CITY_TO_PROPERTY] = True
                else:
                    if departure_city_flag:
                        city_dict_list[0][
                            detector_constant.CITY_FROM_PROPERTY] = True
                    elif arrival_city_flag:
                        city_dict_list[0][
                            detector_constant.CITY_TO_PROPERTY] = True
                    else:
                        city_dict_list[0][
                            detector_constant.CITY_NORMAL_PROPERTY] = True
        return city_dict_list

    def _city_dict_from_text(self,
                             text,
                             from_property=False,
                             to_property=False,
                             via_property=False,
                             normal_property=False,
                             detection_method=FROM_MESSAGE):
        """
        Runs city detection on text and wraps every detected city in a dictionary carrying the given properties

        Attributes:
            text: Text on which city detection needs to run
            from_property: value stored under the "from" property, for example True for "From Mumbai"
            to_property: value stored under the "to" property, for example True for "To Mumbai"
            via_property: value stored under the "via" property, for example True for "via Mumbai"
            normal_property: value stored under the "normal" property, for example True for "atms in Mumbai"
            detection_method: value stored as the detection method of every returned dictionary

        Returns:

            A list with one dictionary per detected city, containing the fields detection_method, from, normal,
            to, text, value, via

            For example:

                [
                    {
                      'detection_method': 'message',
                      'from': False,
                      'normal': True,
                      'text': 'mumbai',
                      'to': False,
                      'value': u'BOM',
                      'via': False
                    }
                ]

        """
        detected_values, detected_texts = self._city_value(text=text)
        city_dicts = []
        # Values and their original substrings are paired by position
        for position, city_value in enumerate(detected_values):
            city_dict = {
                detector_constant.CITY_VALUE: city_value,
                detector_constant.ORIGINAL_CITY_TEXT: detected_texts[position],
                detector_constant.CITY_FROM_PROPERTY: from_property,
                detector_constant.CITY_TO_PROPERTY: to_property,
                detector_constant.CITY_VIA_PROPERTY: via_property,
                detector_constant.CITY_NORMAL_PROPERTY: normal_property,
                detector_constant.CITY_DETECTION_METHOD: detection_method,
            }
            city_dicts.append(city_dict)
        return city_dicts

    def _city_value(self, text):
        """
        Detects city from text by running TextDetection class.

        Args:
            text: message to process
        Returns:
            A tuple of two lists with first list containing the detected cities and second list containing their
            corresponding substrings in the given text. For example:

            For example:

                (['Mumbai'], ['bombay'])
        """
        city_list, original_list = self.text_detection_object.detect_entity(
            text)
        return city_list, original_list

    def _update_processed_text(self, city_dict_list):
        """
        Removes the detected cities from processed_text and tags them in tagged_text

        For every dictionary in city_dict_list, each occurrence of its original city text is replaced with
        the object's tag in tagged_text and with an empty string in processed_text.

        Args:
            city_dict_list: list of city dictionaries, each holding the original substring of the text that was
                            detected as a city
        """
        for city_dict in city_dict_list:
            matched_text = city_dict[detector_constant.ORIGINAL_CITY_TEXT]
            self.tagged_text = self.tagged_text.replace(matched_text, self.tag)
            self.processed_text = self.processed_text.replace(matched_text, '')

    def set_bot_message(self, bot_message):
        """
        Stores the given message as the object's bot_message attribute

        Args:
            bot_message: string, the message to store
        """
        self.bot_message = bot_message

    def convert_city_dict_in_tuple(self, entity_dict_list):
        """
        Splits a list of city dictionaries into three parallel lists, which is for now the standard format
        of individual detector function

        Attributes:
            entity_dict_list: List of dictionary containing the detected cities from text. Every dictionary
            must hold the city value, the original text, the detection method and the from, to, via and normal
            properties

        Returns:
            Returns the tuple containing list of entity values (each a dictionary with the city value and the
            from, to, via and normal properties), list of original texts and list of detection methods

            For example:

                (['Mumbai'], ['bombay'], ['message'])

        """
        # Keys copied into the entity value; original text and detection method go to their own lists
        value_keys = (
            detector_constant.CITY_VALUE,
            detector_constant.CITY_FROM_PROPERTY,
            detector_constant.CITY_TO_PROPERTY,
            detector_constant.CITY_VIA_PROPERTY,
            detector_constant.CITY_NORMAL_PROPERTY,
        )
        entity_values = []
        original_texts = []
        detection_methods = []
        for city_dict in entity_dict_list:
            entity_values.append({key: city_dict[key] for key in value_keys})
            original_texts.append(city_dict[detector_constant.ORIGINAL_CITY_TEXT])
            detection_methods.append(city_dict[detector_constant.CITY_DETECTION_METHOD])
        return entity_values, original_texts, detection_methods

    def _city_model_detection(self):
        """
        Runs the city model from class Models() on the bot message and user text, then checks every city it
        returns with _city_value().
        If _city_value() detects the city, the first detected value is used and the detection method is set to
        FROM_MODEL_VERIFIED, else the model's own value is used and FROM_MODEL_NOT_VERIFIED is set.

        For Example:
            Note:  before calling this method you need to call set_bot_message() to set a bot message.

            self.bot_message = 'Please help me with your departure city?'
            self.text = 'mummbai'

            Output:
                [
                    {
                        'city':'mumbai',
                        'original_text': 'mummbai',
                        'from': true,
                        'to': false,
                        'via': false,
                        'normal': false
                        'detection_method': model_verified
                    }
                ]


        For Example:

            self.bot_message = 'Please help me with your departure city?'
            self.text = 'dehradun'

            Output:
                 [
                    {
                        'city':'dehradun',
                        'original_text': 'dehradun',
                        'from': true,
                        'to': false,
                        'via': false,
                        'normal': false
                        'detection_method': model_not_verified

                    }
                ]

                 Note: Dehradun could not be verified here so the original value is taken as the entity value.

        """
        model_output = Models().run_model(
            entity_type=model_constant.CITY_ENTITY_TYPE,
            bot_message=self.bot_message,
            user_message=self.text)

        verified_city_dicts = []
        for output in model_output:
            model_city = output[model_constant.MODEL_CITY_VALUE]
            # Only the detected values matter here; the matched substrings are not used
            verified_values, _ = self._city_value(text=model_city)
            if verified_values:
                city_value, detection_method = verified_values[0], FROM_MODEL_VERIFIED
            else:
                city_value, detection_method = model_city, FROM_MODEL_NOT_VERIFIED

            verified_city_dicts.append({
                detector_constant.CITY_VALUE: city_value,
                detector_constant.ORIGINAL_CITY_TEXT: model_city,
                detector_constant.CITY_FROM_PROPERTY: output[model_constant.MODEL_CITY_FROM],
                detector_constant.CITY_TO_PROPERTY: output[model_constant.MODEL_CITY_TO],
                detector_constant.CITY_VIA_PROPERTY: output[model_constant.MODEL_CITY_VIA],
                detector_constant.CITY_NORMAL_PROPERTY: output[model_constant.MODEL_CITY_NORMAL],
                detector_constant.CITY_DETECTION_METHOD: detection_method,
            })
        return verified_city_dicts
class LocationDetector(object):
    """
    LocationDetector detects locations in the text. It works similar to TextDetector and delegates the actual
    detection to a TextDetector object.
    (NOTE: We will be updating this detection type with better one)

    Attributes:
        text: string to extract entities from
        entity_name: string by which the detected location entities would be replaced with on calling
                     detect_entity()
        text_dict: dictionary to store lemmas, stems, ngrams used during detection process
        tagged_text: string with location entities replaced with tag defined by entity_name
        location: list to store detected entities from the text
        original_location_text: list of substrings of the text detected as entities
        processed_text: string with detected location entities removed
        tag: entity_name prepended and appended with '__'
        text_detection_object: TextDetector object used to detect locations in the text
        profile_check: value of the profile_check argument of the last detect_entity() call
        user_address: value of the user_address argument of the last detect_entity() call
        user_lat_long: value of the user_lat_long argument of the last detect_entity() call
        user_location_updated_at: parsed user_location_updated of the last detect_entity() call, None if it was
                                  not given
    """
    def __init__(self, entity_name):
        """
        Initializes the LocationDetector object with given entity_name

        Args:
            entity_name: A string by which the detected location substrings would be replaced with on calling
                         detect_entity()
        """
        self.text = ''
        self.text_dict = {}
        self.tagged_text = ''
        self.processed_text = ''
        self.location = []
        self.original_location_text = []
        self.entity_name = entity_name
        self.text_detection_object = TextDetector(entity_name=entity_name)
        self.tag = '__' + entity_name + '__'
        # Same default as detect_entity(), so the attribute exists before the first detection
        self.profile_check = True
        self.user_address = None
        self.user_lat_long = None
        self.user_location_updated_at = None

    def detect_location(self):
        """
        Detects the locations present in the object's text
        :return: tuple (list of location , original text)
        """
        return self.detect_location_format([], [])

    def detect_entity(self,
                      text=None,
                      profile_check=True,
                      user_address=None,
                      user_lat_long=None,
                      user_location_updated=None):
        """
        Take text and returns location details. Also stores the result in the location and
        original_location_text attributes.
        :param text: message to detect locations in; when empty, the previously set text is reused
        :param profile_check: stored as is on the object
        :param user_address: stored as is on the object
        :param user_lat_long: stored as is on the object
        :param user_location_updated: datetime string, parsed and stored as user_location_updated_at
        :return: tuple (list of location , original text)
        """
        if text:
            self.text = ' ' + text + ' '
            self.processed_text = self.text
            self.tagged_text = self.text

        self.profile_check = profile_check
        self.user_address = user_address
        self.user_lat_long = user_lat_long
        if user_location_updated:
            if '+' not in user_location_updated:
                # Three space separated parts and no '+': join the last two parts with '+' before parsing
                # NOTE(review): presumably restores a '+' of the UTC offset that got replaced by a space
                parts = user_location_updated.split(' ')
                if len(parts) == 3:
                    user_location_updated = parts[0] + ' ' + '+'.join(
                        parts[1:])
            self.user_location_updated_at = parser.parse(user_location_updated)
        else:
            self.user_location_updated_at = None
        location_data = self.detect_location()
        self.location = location_data[0]
        self.original_location_text = location_data[1]
        return location_data

    def detect_location_format(self, location_list=None, original_list=None):
        """
        Detects location if it is present in the chat. Also copies tagged_text and processed_text from the
        underlying TextDetector object.
        :param location_list: Optional, list to which detected locations are appended
        :param original_list: Optional, list to which the corresponding original substrings are appended
        :return: tuple (list of location , original text)
        """
        # Fresh lists per call; mutable defaults would be shared and keep growing across calls
        if location_list is None:
            location_list = []
        if original_list is None:
            original_list = []

        detected_locations, detected_originals = self.text_detection_object.detect_entity(
            self.text)
        self.tagged_text = self.text_detection_object.tagged_text
        self.processed_text = self.text_detection_object.processed_text
        # Extend both lists so that they stay parallel
        location_list.extend(detected_locations)
        original_list.extend(detected_originals)

        return location_list, original_list
class CityAdvanceDetector(object):
    """
    Detects city subject to conditions like "arrival_city" and "departure_city". These cities are returned in a
    dictionary with keys 'arrival_city' and 'departure_city'. This class uses TextDetector to detect the city values.

    This class can be used to detect cities specific to scenarios involving a departure and arrival city for example in
    travel related text

    Attributes:
        text: string to extract entities from
        entity_name: string by which the detected city entities would be replaced with on calling detect_entity()
        tagged_text: string with city entities replaced with tag defined by entity name
        processed_text: string with detected city entities removed
        city: list of city entities detected
        original_city_text: list to store substrings of the text detected as city entities
        tag: entity_name prepended and appended with '__'
        text_detection_object: TextDetector object used to detect cities in the given text
        bot_message: string, set as the outgoing bot text/message (None until it is set)
    """

    def __init__(self, entity_name):
        """
        Sets up an empty CityAdvanceDetector for the given entity_name

        Args:
            entity_name: A string by which the detected city entity substrings would be replaced with on calling
                        detect_entity()
        """
        # Text state, filled in by detect_entity()
        self.text = self.tagged_text = self.processed_text = ''
        # Detection results
        self.city = []
        self.original_city_text = []
        # Detection helpers
        self.entity_name = entity_name
        self.text_detection_object = TextDetector(entity_name=entity_name)
        self.tag = '__' + entity_name + '__'
        self.bot_message = None

    def detect_entity(self, text, run_model=True):
        """
        Detects all city strings in text and returns two lists of detected city entities and their corresponding
        original substrings in text respectively.

        Args:
            text: string to extract city entities from
            run_model: Boolean True if model needs to run else False
        Returns:
            Tuple containing two lists, first containing dictionaries, each containing 'arrival_city'
            and 'departure_city' keys and dictionaries returned form TextDetector as their values,
            for each detected city, and second list containing corresponding original substrings in text

        Examples:

        Additionally this function assigns these lists to self.city and self.original_city_text attributes
        respectively.

        """
        self.text = ' ' + text + ' '
        self.text = self.text.lower()
        self.processed_text = self.text
        self.tagged_text = self.text
        city_data = []
        if run_model:
            city_data = self._city_model_detection()
        if not run_model or not city_data[0]:
            city_data = self._detect_city()
            city_data = city_data + ([],)
        self.city = city_data[0]
        self.original_city_text = city_data[1]
        return city_data

    def _detect_city(self):
        """
        Detects "departure" and "arrival" from the object's text attribute

        Returns:
            Tuple containing two lists, first containing dictionaries, each containing 'departure_city'
            and 'arrival_city' keys and dictionaries returned form TextDetector as their values,
            for each detected date, and second list containing corresponding original substrings in text

        """
        # print 'detection for default task'
        city_list = []
        original_list = []
        city_list, original_list = self._detect_departure_arrival_city_prepositions(city_list, original_list)
        self._update_processed_text(original_list)
        city_list, original_list = self._detect_departure_arrival_city(city_list, original_list)
        self._update_processed_text(original_list)
        city_list, original_list = self._detect_arrival_departure_city(city_list, original_list)
        self._update_processed_text(original_list)
        city_list, original_list = self._detect_departure_city(city_list, original_list)
        self._update_processed_text(original_list)
        city_list, original_list = self._detect_arrival_city(city_list, original_list)
        self._update_processed_text(original_list)
        city_list, original_list = self._detect_any_city(city_list, original_list)
        self._update_processed_text(original_list)

        return city_list, original_list

    def _detect_departure_arrival_city(self, city_list, original_list):
        """
        Finds <any text><space(s)><'-' or 'to' or '2'><space(s)><any text> in the given text.
        It  splits the text into two parts on '-' or 'to' or '2'
        and detects the departure city in the first (left) part and detects arrival city in the second (right) part

        Args:
            city_list: Optional, list to store dictionaries of detected cities
            original_list: Optional, list to store corresponding original substrings of text which were detected as
                            departure and arrival type city entities

        Returns:
            Tuple containing two lists, first containing dictionaries, each containing 'departure_city'
            and 'arrival_city' keys and dictionaries returned form TextDetector as their values,
            for each detected city, and second list containing corresponding original substrings in text
        """

        patterns = re.findall(r'\s(([A-Za-z]+)\s*(\-|to|2|and)\s*([A-Za-z]+))\.?\b', self.processed_text.lower())

        for pattern in patterns:
            original = None
            city = {
                'departure_city': None,
                'arrival_city': None
            }

            original = pattern[0]
            departure_city = self._get_city_name(pattern[1])
            arrival_city = self._get_city_name(pattern[3])

            if departure_city and arrival_city:
                city['departure_city'] = departure_city
                city['arrival_city'] = arrival_city

                city_list.append(city)
                original_list.append(original)

        return city_list, original_list

    def _detect_departure_arrival_city_prepositions(self, city_list, original_list):
        """
        Finds <preposition><any text><space(s)><'-' or 'to' or '2' or preposition><space(s)><any text> in the given
        text.
        It  splits the text into two parts on '-' or 'to' or '2'
        and detects the departure city in the first (left) part and detects arrival city in the second (right) part

        Args:
            city_list: Optional, list to store dictionaries of detected cities
            original_list: Optional, list to store corresponding original substrings of text which were detected as
                            departure and arrival type city entities

        Returns:
            Tuple containing two lists, first containing dictionaries, each containing 'departure_city'
            and 'arrival_city' keys and dictionaries returned form TextDetector as their values,
            for each detected city, and second list containing corresponding original substrings in text
        """
        patterns = re.findall(
            r'\s((?:from|frm|departing|depart|leaving|leave)\s*([A-Za-z]+)\s*(?:and|to|2|for|fr|arriving|arrive|reaching|reach|rch)\s*([A-Za-z]+))\.?\b',
            self.processed_text.lower())

        for pattern in patterns:
            original = None
            city = {
                'departure_city': None,
                'arrival_city': None
            }

            original = pattern[0]
            departure_city = self._get_city_name(pattern[1])
            arrival_city = self._get_city_name(pattern[2])

            if departure_city and arrival_city:
                city['departure_city'] = departure_city
                city['arrival_city'] = arrival_city

                city_list.append(city)
                original_list.append(original)

        return city_list, original_list

    def _detect_arrival_departure_city(self, city_list, original_list):
        """
        Finds <preposition><any text><space(s)><'-' or 'to' or '2' or preposition><space(s)><any text> in the given
        text.
        It  splits the text into two parts on '-' or 'to' or '2'
        and detects the arrival city in the first (left) part and detects departure city in the second (right) part

        Args:
            city_list: Optional, list to store dictionaries of detected cities
            original_list: Optional, list to store corresponding original substrings of text which were detected as
                            departure and arrival type city entities

        Returns:
            Tuple containing two lists, first containing dictionaries, each containing 'departure_city'
            and 'arrival_city' keys and dictionaries returned form TextDetector as their values,
            for each detected city, and second list containing corresponding original substrings in text
        """
        patterns = re.findall(
            r'\s((?:and|to|2|for|fr|arriving|arrive|reaching|reach|rch)\s*([A-Za-z]+)\s*(?:from|frm|departing|depart|leaving|leave)\s*([A-Za-z]+))\.?\b',
            self.processed_text.lower())

        for pattern in patterns:
            original = None
            city = {
                'departure_city': None,
                'arrival_city': None
            }

            original = pattern[0]
            departure_city = self._get_city_name(pattern[2])
            arrival_city = self._get_city_name(pattern[1])

            if departure_city and arrival_city:
                city['departure_city'] = departure_city
                city['arrival_city'] = arrival_city

                city_list.append(city)
                original_list.append(original)

        return city_list, original_list

    def _detect_departure_city(self, city_list, original_list):
        """
        Finds departure type cities in the given text by matching few keywords like 'from', 'departing',
        'leaving', 'departure city', 'departing', 'going to' . It detects dates in the part of text right to these
        keywords.

        Args:
            city_list: Optional, list to store dictionaries of detected cities
            original_list: Optional, list to store corresponding original substrings of text which were detected as
                            departure and arrival type city entities

        Returns:
            Tuple containing two lists, first containing dictionaries, each containing 'departure_city'
            and 'arrival_city' keys and dictionaries returned form TextDetector as their values,
            for each detected city, and second list containing corresponding original substrings in text
        """
        patterns = re.findall(
            r'\s((from|frm|departing|depart|leaving|leave|origin city\:|departure city\:|going to)\s*([A-Za-z]+))\.?\s',
            self.processed_text.lower())

        for pattern in patterns:
            original = None
            city = {
                'departure_city': None,
                'arrival_city': None
            }

            departure_city = self._get_city_name(pattern[2])

            if departure_city:
                original = pattern[0]
                city['departure_city'] = departure_city

                city_list.append(city)
                original_list.append(original)

        return city_list, original_list

    def _detect_arrival_city(self, city_list, original_list):
        """
        Finds return type dates in the given text by matching few keywords like 'arriving', 'arrive',
        'reaching', 'reach', 'destination city:' . It detects city in the part of text right
        to these keywords.

        Args:
            city_list: Optional, list to store dictionaries of detected cities
            original_list: Optional, list to store corresponding original substrings of text which were detected as
                            departure and arrival type city entities

        Returns:
            Tuple containing two lists, first containing dictionaries, each containing 'departure_city'
            and 'arrival_city' keys and dictionaries returned form TextDetector as their values,
            for each detected city, and second list containing corresponding original substrings in text
        """
        patterns = re.findall(
            r'\s((to|2|for|fr|arriving|arrive|reaching|reach|rch|destination city\:|arrival city\:)\s*([A-Za-z]+))\.?\s',
            self.processed_text.lower())

        for pattern in patterns:
            original = None
            pattern = list(pattern)
            city = {
                'departure_city': None,
                'arrival_city': None
            }
            arrival_city = self._get_city_name(pattern[2])

            if arrival_city:
                original = pattern[0]
                city['arrival_city'] = arrival_city

                city_list.append(city)
                original_list.append(original)

        return city_list, original_list

    def _detect_any_city(self, city_list, original_list):
        """
        This function makes use of bot_message. In a chatbot user might just enter city name based on the
        previous question asked by the bot. So, if the previous question asked by the bot contains words like 
        departure city, origin city, origin and if the current message contains city then we assign the 
        detected city as departure_city. if the previous message contains words like arrival city, destination city,
        flying to in the bots message and the current message contains the city then we assign the detected city as 
        arrival city
    

        Args:
            city_list: Optional, list to store dictionaries of detected cities
            original_list: Optional, list to store corresponding original substrings of text which were detected as
                            departure and arrival type city entities

        Returns:
            Tuple containing two lists, first containing dictionaries, each containing 'departure_city'
            and 'arrival_city' keys and dictionaries returned form TextDetector as their values,
            for each detected city, and second list containing corresponding original substrings in text
        """
        departure_city_flag = False
        arrival_city_flag = False
        if self.bot_message:
            departure_regexp = re.compile(
                r'departure city|origin city|origin|traveling from|leaving from|flying from|travelling from')
            arrival_regexp = re.compile(
                r'traveling to|travelling to|arrival city|arrival|destination city|destination|leaving to|flying to')
            if departure_regexp.search(self.bot_message) is not None:
                departure_city_flag = True
            elif arrival_regexp.search(self.bot_message) is not None:
                arrival_city_flag = True

        patterns = re.findall(r'\s((.+))\.?\b', self.processed_text.lower())

        for pattern in patterns:
            original = None
            pattern = list(pattern)
            city = {
                'departure_city': None,
                'arrival_city': None
            }

            city_selected, original_selected = self._get_city_name_list(pattern[1])
            if city_selected:

                original = original_selected[0]
                if len(city_selected) > 1:
                    city['departure_city'] = city_selected[0]
                    city['arrival_city'] = city_selected[-1]
                else:
                    if departure_city_flag and not arrival_city_flag:
                        city['departure_city'] = city_selected[0]
                        city['arrival_city'] = None
                    elif not departure_city_flag and arrival_city_flag:
                        city['departure_city'] = None
                        city['arrival_city'] = city_selected[0]
                    else:
                        city['departure_city'] = city_selected[0]
                        city['arrival_city'] = None

                city_list.append(city)
                original_list.append(original)

        return city_list, original_list

    def _get_city_name(self, text):
        """Returns the city name by calling TextDetection object

        Args:
            text: text on which detection needs to run

        Return:
            Name of the city

        """

        city_list, original_list = self.text_detection_object.detect_entity(text)
        if city_list:
            return city_list[0]
        else:
            return None

    def _get_city_name_list(self, text):
        """Returns the list of cities by calling TextDetection object

        Args:
            text: text on which detection needs to run

        Return:
            list of cities along with the original text
        """

        city_list, original_list = self.text_detection_object.detect_entity(text)
        if city_list:
            return city_list, original_list
        else:
            return None, None

    def _city_model_detection(self):
        """
        Detects cities with the CRF model and verifies every predicted city with the text detection object.

        get_model_output() of PredictCRF is called with the bot message and the user text. Each city
        it returns is then passed to the text detection object:
            - if the city is found, the detected value is used and the detection is marked MODEL_VERIFIED
            - otherwise the value predicted by the model is used as is and the detection is marked
              MODEL_NOT_VERIFIED
        The city is stored as 'departure_city' when the model flags it as FROM, as 'arrival_city' when
        the model flags it as TO (and not FROM), and as 'departure_city' when neither flag is set.
        Finally the detected original texts are tagged/removed through _update_processed_text().

        Note*: call set_bot_message() before this method so that the bot message is available.

        For Example (assuming the text detection object maps 'mummbai' to 'Mumbai'):

            self.bot_message = 'Please help me with your departure city?'
            self.text = 'mummbai'

            model_output = [{'city':'mummbai', 'from': 1, 'to': 0, 'via': 0}]

            returns [{'departure_city': 'Mumbai', 'arrival_city': None}], ['mummbai'], [MODEL_VERIFIED]

        For Example (assuming the text detection object does not know 'dehradun'):

            self.bot_message = 'Please help me with your departure city?'
            self.text = 'dehradun'

            model_output = [{'city':'dehradun', 'from': 1, 'to': 0, 'via': 0}]

            returns [{'departure_city': 'dehradun', 'arrival_city': None}], ['dehradun'], [MODEL_NOT_VERIFIED]

        Returns:
            Tuple of three lists: city dictionaries, original texts and detection types

        """
        model_output = PredictCRF().get_model_output(entity_type=CITY_ENTITY_TYPE,
                                                     bot_message=self.bot_message,
                                                     user_message=self.text)
        city_list = []
        original_list = []
        model_detection_type = []

        for predicted in model_output:
            predicted_value = predicted[CITY_VALUE]
            verified_values, verified_originals = self.text_detection_object.detect_entity(predicted_value)

            # Arrival only when the model says TO and not FROM; everything else is a departure.
            if predicted[FROM] != 1 and predicted[TO] == 1:
                slot = 'arrival_city'
            else:
                slot = 'departure_city'

            detection = {'departure_city': None, 'arrival_city': None}
            if verified_values:
                detection[slot] = verified_values[0]
                city_list.append(detection)
                # NOTE(review): every original returned for this city is added, while the other two
                # lists grow by one entry - confirm the lists are meant to stay aligned.
                original_list.extend(verified_originals)
                model_detection_type.append(MODEL_VERIFIED)
            else:
                # Unknown to the text detection object: keep the value predicted by the model.
                detection[slot] = predicted_value
                city_list.append(detection)
                original_list.append(predicted_value)
                model_detection_type.append(MODEL_NOT_VERIFIED)

        self._update_processed_text(original_list)

        return city_list, original_list, model_detection_type

    def _update_processed_text(self, original_city_strings):
        """
        Replaces detected date entities with tag generated from entity_name used to initialize the object with

        A final string with all date entities replaced will be stored in object's tagged_text attribute
        A string with all date entities removed will be stored in object's processed_text attribute

        Args:
            original_city_strings: list of substrings of original text to be replaced with tag created from entity_name
        """

        for detected_text in original_city_strings:
            if detected_text:
                self.tagged_text = self.tagged_text.replace(detected_text, self.tag)
                self.processed_text = self.processed_text.replace(detected_text, '')

    def set_bot_message(self, bot_message):
        """
        Stores the previous message of the bot on the object (bot_message attribute)

        Args:
            bot_message: string, the message the bot sent before the user's text
        """
        self.bot_message = bot_message
示例#19
0
class NameDetector(object):
    """
    NameDetector class detects person names from text. It first asks TextDetector for known
    names; when that gives nothing it falls back to sentence templates ("name is ...",
    "myself ...") and, for short messages, to a POS tagger.

    Attributes:
        text: string to extract entities from (set by detect_entity())
        entity_name: name of the entity this detector was created for; it is passed to TextDetector
        names: list, initialised empty
        tagged_text: set to the input text by detect_entity()
        processed_text: string, initialised empty
        original_name_text: list, initialised empty
        text_detection_object: the object which is used to call the TextDetector
    """

    # Sentence templates tried by get_name_using_pos_tagger(); compiled once for all instances.
    _NAME_IS_TEMPLATE = re.compile(r"name\s*(is|)\s*([\w\s]+)")
    _MYSELF_TEMPLATE = re.compile(r"myself\s+([\w\s]+)")

    def __init__(self, entity_name):
        """
        Initializes a NameDetector object with given entity_name

        Args:
            entity_name: A string naming the entity to detect; it is used to create the TextDetector
        """
        self.entity_name = entity_name
        self.text = ''
        self.names = []
        self.tagged_text = ''
        self.processed_text = ''
        self.original_name_text = []
        self.text_detection_object = TextDetector(entity_name=entity_name)

    @staticmethod
    def get_format_name(name_list):
        """
        Splits the tokens of one detected name into first, middle and last name.

        The first token is the first name, the last token (if there is more than one) the last
        name and everything in between, joined by spaces, the middle name.

        Args:
            name_list (list): tokens of the detected name, in order
            Example:
                 ['yash', 'doshi']

        Returns:
            tuple of two single-element lists: the entity value dictionary (keyed by FIRST_NAME,
            MIDDLE_NAME and LAST_NAME) and the original text
            Example:
                 ([{first_name: "yash", middle_name: None, last_name: "doshi"}], ["yash doshi"])
            Two empty lists are returned when name_list is empty.
        """
        # The sentence templates can capture pure whitespace, which splits into no tokens at all;
        # indexing name_list[0] would raise IndexError in that case.
        if not name_list:
            return [], []

        middle_name = None
        last_name = None
        if len(name_list) > 1:
            last_name = name_list[-1]
            middle_name = " ".join(name_list[1:-1]) or None

        entity_value = {
            FIRST_NAME: name_list[0],
            MIDDLE_NAME: middle_name,
            LAST_NAME: last_name
        }

        return [entity_value], [" ".join(name_list)]

    def text_detection_name(self):
        """
        Makes a call to TextDetector on self.text and returns what it detected.

        Returns:
           Tuple with list of names detected by TextDetector in the form of variants detected and original_text

         Example : my name is yash doshi

         ([u'dosh', u'yash'], ['doshi', 'yash'])
        """

        return self.text_detection_object.detect_entity(text=self.text)

    def get_name_using_pos_tagger(self, text):
        """
        Fallback detection based on sentence templates and POS tags.

        Nothing is returned when the text contains a question word or a cardinal number.
        Otherwise the first matching rule wins:
            1. "name [is] <name>" template
            2. "myself <name>" template
            3. for texts of fewer than four tokens, the words tagged as nouns or adjectives

        Args:
            text (string): The text obtained from the user.

            Example text= My name is yash modi
        Returns:
            [{first_name: "yash", middle_name: None, last_name: "modi"}], ["yash modi"]
            Two empty lists when no rule applies.
        """
        name_tokens = text.split(' ')
        tagged_names = POS().tag(name_tokens)

        # WR*/WP* tags mark question words, CD marks cardinal numbers.
        if any(tagged[1].startswith(('WR', 'WP', 'CD')) for tagged in tagged_names):
            return [], []

        name_is_match = self._NAME_IS_TEMPLATE.search(text)
        if name_is_match:
            return self.get_format_name(name_is_match.group(2).split())

        myself_match = self._MYSELF_TEMPLATE.search(text)
        if myself_match:
            return self.get_format_name(myself_match.group(1).split())

        if len(name_tokens) < 4:
            pos_words = [tagged[0] for tagged in tagged_names
                         if tagged[1].startswith(('NN', 'JJ'))]
            if pos_words:
                return self.get_format_name(pos_words)

        return [], []

    def detect_entity(self, text, bot_message=None):
        """
        Takes text as input and returns two lists
        1.entity_value in the form of first, middle and last names
        2.original text.

        When a bot_message is given and it does not mention "name", nothing is detected.

        Args:
           text(string): the original text
           bot_message(string): previous bot message

           Example:
                    text=my name is yash doshi
        Returns:
                [{first_name: "yash", middle_name: None, last_name: "doshi"}], ["yash doshi"]
        """

        if bot_message and not self.context_check_botmessage(bot_message):
            return [], []

        self.text = text
        self.tagged_text = self.text

        replaced_text = self.replace_detected_text(self.text_detection_name())
        entity_value, original_text = self.detect_person_name_entity(replaced_text)

        # TextDetector found nothing: fall back to the templates and the POS tagger.
        if not entity_value:
            entity_value, original_text = self.get_name_using_pos_tagger(text)

        return entity_value, original_text

    def replace_detected_text(self, text_detection_result):
        """
        Tokenizes the lower-cased self.text and wraps the names detected by TextDetector in underscores
        Args:
            text_detection_result: tuple of detected names from TextDetector
            consisting of two lists
            1.The variants detected
            2.The original text
            ([u'dosh', u'yash'], ['doshi', 'yash'])

            Example:
                    text_detection_result= ([u'dosh', u'yash'], ['doshi', 'yash'])
            Returns:
                    ['my', 'name', 'is', '_yash_', '_doshi_']

        """

        replaced_text = Tokenizer().tokenize(self.text.lower())
        for detected_original_text in text_detection_result[1]:
            # Replacing the empty string would wrap every token in underscores and turn the
            # whole sentence into a name.
            if not detected_original_text:
                continue
            marked = "_" + detected_original_text + "_"
            replaced_text = [token.replace(detected_original_text, marked)
                             for token in replaced_text]

        return replaced_text

    def detect_person_name_entity(self, replaced_text):
        """
        Groups consecutive tokens marked as names and separates each group into first, middle
        and last names.
        Returns in form of two lists entity_value and original_text
        Args:
            replaced_text: list of tokens in which names detected from TextDetector are wrapped
                           as _<name>_
        Example:
                replaced_text = ['my', 'name', 'is', '_yash_', '_modi_']
        Returns:
                [{first_name: "yash", middle_name: None, last_name: "modi"}], ["yash modi"]
        """

        name_list = []
        name_holder = []

        for token in replaced_text:
            if token.startswith('_') and token.endswith('_'):
                name_holder.append(token.replace('_', ''))
            elif name_holder:
                # A token that is not a name closes the current run of name tokens.
                name_list.append(name_holder)
                name_holder = []

        if name_holder:
            name_list.append(name_holder)

        entity_value, original_text = [], []
        for name in name_list:
            name_entity_value, original_text_value = self.get_format_name(name)
            entity_value.extend(name_entity_value)
            original_text.extend(original_text_value)

        return entity_value, original_text

    @staticmethod
    def context_check_botmessage(botmessage):
        """
        Checks if previous botmessage contains "name" as a keyword or not. The check ignores
        case, so "What is your Name?" is accepted as well.
        Args:
            botmessage: it consists of the previous botmessage
            Example: what is your name ?
        Returns:
            True if the keyword is present, else False
        """

        return "name" in botmessage.lower()
示例#20
0
class NameDetector(object):
    """
    NameDetector class detects person names from English or Hindi text. For English it uses
    TextDetector and falls back to sentence templates and a POS tagger; for Hindi it uses
    regex templates and falls back to stop-word removal.

    Attributes:
        text: string to extract entities from (set by detect_entity())
        entity_name: name of the entity this detector was created for; it is passed to TextDetector
        language: language code that selects the detection strategy in detect_entity()
        names: list, initialised empty
        tagged_text: input text with the detected names replaced by the tag
        processed_text: input text with the detected names removed
        original_name_text: list, initialised empty
        tag: '_' + entity_name + '_', the marker written into tagged_text
        text_detection_object: the object which is used to call the TextDetector
    """

    # NOTE: the unicode patterns below are written as u'' literals with escaped backslashes
    # instead of ur'' literals. The pattern text is identical, but ur'' is a syntax error on
    # Python 3 while this form parses on both Python 2 and Python 3.

    # English sentence templates; the name is always the last capture group.
    _ENGLISH_NAME_TEMPLATES = (
        re.compile(r"name\s*(is|)\s*([\w\s]+)"),
        re.compile(r"myself\s+([\w\s]+)"),
        re.compile(r"call\s+me\s+([\w\s]+)"),
    )

    # Hindi sentence templates, tried in this order by get_hindi_text_from_regex().
    _HINDI_NAME_TEMPLATES = (
        re.compile(
            u"(?:मुझे|हमें|मुझको|हमको|हमे)\\s+(?:लोग)\\s+([\u0900-\u097F\\s]+)"
            u"\\s+(?:नाम\\sसे)\\s+(?:कहते|बुलाते|बुलाओ)", re.U),
        re.compile(u"(?:नाम|मैं|हम|मै)\\s+([\u0900-\u097F\\s]+)", re.U),
        re.compile(u"(?:मुझे|हमें|मुझको|हमको|हमे)\\s+([\u0900-\u097F\\s]+)(?:कहते|बुलाते|बुलाओ)", re.U),
        re.compile(u"\\s*([\u0900-\u097F\\s]+)(?:मुझे|मैं|मै)(?:कहते|बुलाते|बुलाओ)?", re.U),
    )

    # Everything that is neither in the Devanagari block (U+0900-U+097F) nor whitespace.
    _NON_DEVANAGARI = re.compile(u'[^\u0900-\u097F\\s]+', re.U)

    # Punctuation stripped from the bot message before looking for name keywords.
    _BOT_MESSAGE_PUNCTUATION = re.compile(r'[\|\,+\:\?\!\"\(\)!\'\.\%\[\]]+')

    def __init__(self, entity_name, language=ENGLISH_LANG):
        """
        Initializes a NameDetector object with given entity_name

        Args:
            entity_name: A string by which the detected substrings that correspond to text entities would be replaced
                         with on calling detect_entity()
            language: language code of the text; ENGLISH_LANG and HINDI_LANG are handled by detect_entity()
        """
        self.entity_name = entity_name
        self.language = language
        self.text = ''
        self.names = []
        self.tagged_text = ''
        self.processed_text = ''
        self.original_name_text = []
        self.tag = '_' + entity_name + '_'
        self.text_detection_object = TextDetector(entity_name=entity_name)

    @staticmethod
    def get_format_name(name_list):
        """
        Splits the tokens of one detected name into first, middle and last name.

        The first token is the first name, the last token (if there is more than one) the last
        name and everything in between, joined by spaces, the middle name.

        Args:
            name_list (list): tokens of the detected name, in order
            Example:
                 ['yash', 'doshi']

        Returns:
            tuple of two single-element lists: the entity value dictionary (keyed by FIRST_NAME,
            MIDDLE_NAME and LAST_NAME) and the original text
            Example:
                 ([{first_name: "yash", middle_name: None, last_name: "doshi"}], ["yash doshi"])
            Two empty lists are returned when name_list is empty.
        """
        # The sentence templates can capture pure whitespace, which splits into no tokens at all;
        # indexing name_list[0] would raise IndexError in that case.
        if not name_list:
            return [], []

        middle_name = None
        last_name = None
        if len(name_list) > 1:
            last_name = name_list[-1]
            middle_name = " ".join(name_list[1:-1]) or None

        entity_value = {
            FIRST_NAME: name_list[0],
            MIDDLE_NAME: middle_name,
            LAST_NAME: last_name
        }

        return [entity_value], [" ".join(name_list)]

    def text_detection_name(self):
        """
        Makes a call to TextDetector on self.text and returns what it detected.

        Returns:
           Tuple with list of names detected by TextDetector in the form of variants detected and original_text

         Example : my name is yash doshi

         ([u'dosh', u'yash'], ['doshi', 'yash'])
        """

        return self.text_detection_object.detect_entity(text=self.text)

    def get_name_using_pos_tagger(self, text):
        """
        Fallback detection for English based on sentence templates and POS tags.

        Nothing is returned when the text contains a question word or a cardinal number.
        Otherwise the first matching rule wins:
            1. "name [is] <name>" template
            2. "myself <name>" template
            3. "call me <name>" template
            4. for texts of fewer than four tokens, the words tagged as nouns or adjectives

        Args:
            text (string): The text obtained from the user.

            Example text= My name is yash modi
        Returns:
            [{first_name: "yash", middle_name: None, last_name: "modi"}], ["yash modi"]
            Two empty lists when no rule applies.
        """
        name_tokens = text.split(' ')
        tagged_names = POS().tag(name_tokens)

        # WR*/WP* tags mark question words, CD marks cardinal numbers.
        if any(tagged[1].startswith(('WR', 'WP', 'CD')) for tagged in tagged_names):
            return [], []

        for template in self._ENGLISH_NAME_TEMPLATES:
            template_match = template.search(text)
            if template_match:
                # The name is captured by the last group of every template.
                captured_name = template_match.group(template_match.lastindex)
                return self.get_format_name(captured_name.split())

        if len(name_tokens) < 4:
            pos_words = [tagged[0] for tagged in tagged_names
                         if tagged[1].startswith(('NN', 'JJ'))]
            if pos_words:
                return self.get_format_name(pos_words)

        return [], []

    def detect_entity(self, text, bot_message=None):
        """
        Takes text as input and returns two lists
        1.entity_value in the form of first, middle and last names
        2.original text.

        When a bot_message is given and it does not ask for a name, nothing is detected.
        After detection, tagged_text holds the text with the names replaced by the tag and
        processed_text the text with the names removed.

        Args:
           text(string): the original text
           bot_message(string): previous bot message

           Example:
                    text=my name is yash doshi
        Returns:
                [{first_name: "yash", middle_name: None, last_name: "doshi"}], ["yash doshi"]
        """
        if bot_message and not self.context_check_botmessage(bot_message):
            return [], []

        self.text = text
        self.tagged_text = self.text
        # processed_text has to start from the input as well; it used to stay '' so that
        # removing the detected names from it never produced anything.
        self.processed_text = self.text

        entity_value, original_text = [], []

        if self.language == ENGLISH_LANG:
            entity_value, original_text = self.detect_english_name()
        elif self.language == HINDI_LANG:
            entity_value, original_text = self.detect_hindi_name()

        self._update_processed_text(person_name_list=original_text)

        return entity_value, original_text

    def detect_english_name(self):
        """
        This method is used to detect English names from self.text. TextDetector is tried first,
        the templates and the POS tagger are used when it detects nothing.

        Returns:
            detect_text_lists (tuple): two dimensional tuple
            1. entity_value (list): representing the entity values of names
            2. original_text (list): representing the original text detected

        Example:

            self.text = 'my name is yash doshi'
            detect_english_name()
            >>[{first_name: "yash", middle_name: None, last_name: "doshi"}], ["yash doshi"]
        """

        replaced_text = self.replace_detected_text(self.text_detection_name(),
                                                   text=self.text)
        entity_value, original_text = self.detect_person_name_entity(replaced_text)

        if not entity_value:
            entity_value, original_text = self.get_name_using_pos_tagger(self.text)

        return entity_value, original_text

    def detect_hindi_name(self):
        """
        This method is used to detect Hindi names from self.text. Texts containing abusive
        phrases or question words are rejected. Emojis and all non-Devanagari characters are
        removed before the regex templates are tried; when these detect nothing, the text left
        after stop-word removal is used.

        Returns:
            detect_text_lists (tuple): two dimensional tuple
            1. entity_value (list): representing the entity values of names
            2. original_text (list): representing the original text detected

        Examples:
            self.text = u'प्रतिक श्रीदत्त जयराओ'
            detect_hindi_name()
            >> [{first_name: u"प्रतिक", middle_name: u"श्रीदत्त", last_name: u"जयराओ"}], [ u'प्रतिक श्रीदत्त जयराओ']

        """
        if self.detect_abusive_phrases_hindi(text=self.text) or self.detect_question_hindi(text=self.text):
            return [], []

        text = self.remove_emojis(text=self.text)
        text = self._NON_DEVANAGARI.sub(string=text, repl='')

        regex_detection_result = self.get_hindi_names_from_regex(text=text)
        replaced_text = self.replace_detected_text(regex_detection_result, text=text)
        entity_value, original_text = self.detect_person_name_entity(replaced_text)

        if not entity_value:
            entity_value, original_text = self.get_hindi_names_without_regex(text=text)

        return entity_value, original_text

    def replace_detected_text(self, text_detection_result, text):
        """
        Tokenizes the lower-cased text and wraps the detected names in underscores
        Args:
            text_detection_result: tuple of detected names consisting of two lists
            1.The variants detected
            2.The original text
            ([u'dosh', u'yash'], ['doshi', 'yash'])
            text: the text to tokenize; English text is tokenized with nltk_tokenizer, Hindi
                  text is split on whitespace, any other language yields no tokens

            Example:
                    text_detection_result= ([u'dosh', u'yash'], ['doshi', 'yash'])
            Returns:
                    ['my', 'name', 'is', '_yash_', '_doshi_']

        """
        replaced_text = []
        if self.language == ENGLISH_LANG:
            replaced_text = nltk_tokenizer.tokenize(text.lower())
        elif self.language == HINDI_LANG:
            replaced_text = text.lower().strip().split()

        for detected_original_text in text_detection_result[1]:
            # Replacing the empty string would wrap every token in underscores and turn the
            # whole sentence into a name.
            if not detected_original_text:
                continue
            marked = "_" + detected_original_text + "_"
            replaced_text = [token.replace(detected_original_text, marked)
                             for token in replaced_text]

        return replaced_text

    def detect_person_name_entity(self, replaced_text):
        """
        Groups consecutive tokens marked as names and separates each group into first, middle
        and last names.
        Returns in form of two lists entity_value and original_text
        Args:
            replaced_text: list of tokens in which the detected names are wrapped as _<name>_
        Example:
                replaced_text = ['my', 'name', 'is', '_yash_', '_modi_']
        Returns:
                [{first_name: "yash", middle_name: None, last_name: "modi"}], ["yash modi"]
        """

        name_list = []
        name_holder = []

        for token in replaced_text:
            if token.startswith('_') and token.endswith('_'):
                name_holder.append(token.replace('_', ''))
            elif name_holder:
                # A token that is not a name closes the current run of name tokens.
                name_list.append(name_holder)
                name_holder = []

        if name_holder:
            name_list.append(name_holder)

        entity_value, original_text = [], []
        for name in name_list:
            name_entity_value, original_text_value = self.get_format_name(name)
            entity_value.extend(name_entity_value)
            original_text.extend(original_text_value)

        return entity_value, original_text

    def context_check_botmessage(self, botmessage):
        """
        Checks if previous botmessage contains one of the NAME_VARIATIONS as a whole word or not.
        Punctuation is removed and the message is lower-cased before the check.
        Args:
            botmessage: it consists of the previous botmessage
            Example: what is your name ?
        Returns:
            True if a variation is present, else False
        """

        botmessage = self._BOT_MESSAGE_PUNCTUATION.sub(r'', botmessage)
        # Padding with spaces lets variations at the very start or end match as whole words too.
        botmessage = " " + botmessage.lower().strip() + " "
        return any(" " + variant + " " in botmessage for variant in NAME_VARIATIONS)

    def get_hindi_names_from_regex(self, text):
        """
        This method is used to detect hindi names which obey the regexes
        Args:
            text (str): text from which hindi names obeying the regex have to be extracted

        Returns:
            detect_text_lists (tuple): two dimensional tuple, both entries are the same list
            1. text_list (list): representing the detected text
            2. text_list (list): representing the original text

        Examples:
            text = u'मेरा नाम प्रतिक श्रीदत्त जयराओ है'
            get_hindi_names_from_regex(text=text)
            >>([u'प्रतिक', u'श्रीदत्त', u'जयराओ'], [u'प्रतिक', u'श्रीदत्त', u'जयराओ'])

        """
        detected_names = []
        # get_hindi_text_from_regex() returns None when no template matches.
        for phrase in self.get_hindi_text_from_regex(text=text) or []:
            if phrase:
                detected_names.extend(phrase.split())

        return detected_names, detected_names

    def get_hindi_names_without_regex(self, text):
        """
        This method is used to get detect hindi names without any regex pattern (This method is called only if
        detection from regex patterns fails). Every word left after stop-word removal is treated as part
        of the name, unless more than four words are left, in which case nothing is detected.
        Args:
            text (str): the text from which hindi text has to be detected
        Returns:
            person_name (tuple): two dimensional tuple
            1. entity_value (list): representing the entity values of names
            2. original_text (list): representing the original text detected


        Example:
            text = u'प्रतिक श्रीदत्त जयराओ'
            get_hindi_names_without_regex(text=text)

            >> [{first_name: u"प्रतिक", middle_name: u"श्रीदत्त", last_name: u"जयराओ"}], [ u'प्रतिक श्रीदत्त जयराओ']

        """

        text = self.replace_stopwords_hindi(text)
        original_text_list = text.strip().split()
        if len(original_text_list) > 4:
            original_text_list = []
        replaced_text = self.replace_detected_text(
            (original_text_list, original_text_list), text=text)
        return self.detect_person_name_entity(replaced_text=replaced_text)

    def get_hindi_text_from_regex(self, text):
        """
        This method is used to detect hindi names using regexes from the given text. The templates are
        tried in order and the matches of the first template that yields a non-empty first match
        (after stop-word removal) are returned.
        Args:
            text (str): text from which hindi names which follow the regex pattern have to be extracted

        Returns:
            pattern_match (list): list consisting of detected words, or None when no template matches

        Examples:
            text = u'मेरा नाम प्रतिक श्रीदत्त जयराओ है'
            get_hindi_text_from_regex(text=text)
            >>[u'प्रतिक श्रीदत्त जयराओ']

        """
        for template in self._HINDI_NAME_TEMPLATES:
            pattern_match = [
                self.replace_stopwords_hindi(x) for x in template.findall(text) if x
            ]
            if pattern_match and pattern_match[0]:
                return pattern_match

        return None

    def replace_stopwords_hindi(self, text):
        """
        This method is used to remove hindi stop words from the text
        Args:
            text (str): The text from which hindi stop words have to be removed

        Returns:
            clean_text (str): text from which hindi stop words have been removed
        """
        return " ".join(word for word in text.split(" ") if word not in HINDI_STOPWORDS)

    def detect_abusive_phrases_hindi(self, text):
        """
        This method is used to check for hindi abuses in the sentence
        Args:
            text (str): text in which abuses have to be checked

        Returns:
            status (bool): returns if the text consists of abuses
        """
        # Padding with spaces lets phrases at the very start or end match as whole words too.
        text = ' ' + text + ' '
        return any(' ' + abuse + ' ' in text for abuse in HINDI_BADWORDS)

    def remove_emojis(self, text):
        """
        This method is used to remove emojis from the given text
        Args:
            text (str): the text from which the emojis have to be removed
        Returns:
            text (str): text with emojis replaced with ''
        """
        emoji_pattern = re.compile(
            u'[{0}]+'.format(''.join(EMOJI_RANGES.values())), re.UNICODE)
        return emoji_pattern.sub(repl='', string=text)

    def detect_question_hindi(self, text):
        """
        This method is used to detect if the given text has a hindi question present in it
        Args:
            text (str): the text for which the question check has to be run

        Returns:
            status (bool): returns if the text has a question in it
        """
        return any(word in HINDI_QUESTIONWORDS for word in text.split())

    def _update_processed_text(self, person_name_list):
        """
        Replaces detected names with tag generated from entity_name used to initialize the object with

        A final string with all names replaced will be stored in object's tagged_text attribute
        A string with all names removed will be stored in object's processed_text attribute

        The replacement ignores case: English names are detected on the lower-cased text, so an exact
        match would miss names the user typed with capital letters.

        Args:
            person_name_list (list): list of substrings of original text to be replaced with tag
                                       created from entity_name; empty entries are ignored
        """
        for detected_text in person_name_list:
            # Replacing the empty string would insert the tag between all characters.
            if not detected_text:
                continue
            matcher = re.compile(re.escape(detected_text), re.IGNORECASE | re.UNICODE)
            # A function is used as replacement so that backslashes in the tag stay literal.
            self.tagged_text = matcher.sub(lambda _match: self.tag, self.tagged_text)
            self.processed_text = matcher.sub('', self.processed_text)
示例#21
0
def get_text(message, entity_name, structured_value, fallback_value, bot_message):
    """Detects textual entities with the TextDetector class and wraps the result for output

    Resolution order followed by this function:
        1. If structured_value is truthy, detection runs on structured_value only. A successful detection is
           returned with FROM_STRUCTURE_VALUE_VERIFIED; otherwise structured_value itself is returned with
           FROM_STRUCTURE_VALUE_NOT_VERIFIED. The message is not examined in this case.
        2. Otherwise detection runs on message and a successful detection is returned with FROM_MESSAGE.
        3. Otherwise, if fallback_value is truthy, it is returned with FROM_FALLBACK_VALUE.
        4. Otherwise None is returned.

    Args:
        message: text to run detection on when no structured_value is given
        entity_name: name of the entity, passed to TextDetector
        structured_value: value to verify through detection instead of the message
        fallback_value: value returned when nothing is detected in the message
        bot_message: not used by this function

    Returns:
        The value built by output_entity_dict_list() or output_entity_dict_value(), or None when nothing
        is detected and no fallback_value is given.

    For Example:

        message = 'i want to order chinese from  mainland china and pizza from domminos'
        entity_name = 'restaurant'
        structured_value = None
        fallback_value = None
        bot_message = None
        output = get_text(message=message, entity_name=entity_name, structured_value=structured_value,
                          fallback_value=fallback_value, bot_message=bot_message)
        print output

            >> [{'detection': 'message', 'original_text': 'mainland china', 'entity_value':
            {'value': u'Mainland China'}}, {'detection': 'message', 'original_text': 'domminos',
            'entity_value': {'value': u"Domino's Pizza"}}]


        message = 'i wanted to watch movie'
        entity_name = 'movie'
        structured_value = 'inferno'
        fallback_value = None
        bot_message = None
        output = get_text(message=message, entity_name=entity_name, structured_value=structured_value,
                          fallback_value=fallback_value, bot_message=bot_message)
        print output

            >> [{'detection': 'structure_value_verified', 'original_text': 'inferno', 'entity_value':
            {'value': u'Inferno'}}]

    """
    detector = TextDetector(entity_name=entity_name)

    if structured_value:
        entity_values, original_texts = detector.detect_entity(structured_value)
        if not entity_values:
            return output_entity_dict_value(structured_value, structured_value, FROM_STRUCTURE_VALUE_NOT_VERIFIED)
        return output_entity_dict_list(entity_values, original_texts, FROM_STRUCTURE_VALUE_VERIFIED)

    entity_values, original_texts = detector.detect_entity(message)
    if entity_values:
        return output_entity_dict_list(entity_values, original_texts, FROM_MESSAGE)
    if fallback_value:
        return output_entity_dict_value(fallback_value, fallback_value, FROM_FALLBACK_VALUE)
    return None
示例#22
0
class BudgetDetector(BaseDetector):
    """Detects budget from the text  and tags them.

    Detects the budget from the text and replaces them by entity_name.
    This detection logic first checks for budget using regular expressions and also uses TextDetector class to extract
    data in textual format (i.e. Hundred, Thousand, etc).

    This detector captures  additional attributes like max_budget, min_budget whether the budget is
    normal_budget (detected through regex) or text_budget (detected through text detection)

    For Example:

        budget_detection = BudgetDetector('budget')
        message = "shirts between 2000 to 3000"
        budget_list, original_text = budget_detection.detect_entity(message)
        tagged_text = budget_detection.tagged_text
        print budget_list, ' -- ', original_text
        print 'Tagged text: ', tagged_text

         >> [{'max_budget': 3000, 'type': 'normal_budget', 'min_budget': 2000}]  --  ['2000 to 3000']
            Tagged text:  shirts between __budget__

        budget_detection = BudgetDetector('budget')
        message = "tshirts less than 2k"
        budget_list, original_text = budget_detection.detect_entity(message)
        tagged_text = budget_detection.tagged_text
        print budget_list, ' -- ', original_text
        print 'Tagged text: ', tagged_text

         >> [{'max_budget': 2000, 'type': 'normal_budget', 'min_budget': 0}]  --  ['less than 2k']
            Tagged text:  tshirts __budget__

        budget_detection = BudgetDetector('budget')
        message = "tshirts greater than 2k"
        budget_list, original_text = budget_detection.detect_entity(message)
        tagged_text = budget_detection.tagged_text
        print budget_list, ' -- ', original_text
        print 'Tagged text: ', tagged_text

         >> [{'max_budget': 0, 'type': 'normal_budget', 'min_budget': 2000}]  --  ['greater than 2k']
            Tagged text:  tshirts __budget__

        budget_detection = BudgetDetector('budget')
        message = "jeans of Rs. 1000"
        budget_list, original_text = budget_detection.detect_entity(message)
        tagged_text = budget_detection.tagged_text
        print budget_list, ' -- ', original_text
        print 'Tagged text: ', tagged_text

         >> [{'max_budget': 1000, 'type': 'normal_budget', 'min_budget': 0}]  --  ['rs. 1000']
            Tagged text:  ' jeans of __budget__ '


    Attributes:
        min_digit: minimum number of digits a budget can have; by default it is set to 2, so the NER will detect a
        number as budget only if it is greater than 9
        max_digit: maximum number of digits a budget can have; by default it is set to 5, so the NER will detect a
        number as budget only if it is not greater than 99999
        text: string to extract entities from
        entity_name: string by which the detected budget would be replaced with on calling detect_entity()
        tagged_text: string with budgets replaced with tag defined by entity name
        processed_text: string with detected budgets removed
        budget: list of budgets detected
        original_budget_text: list to store substrings of the text detected as budget
        tag: entity_name prepended and appended with '__'
        regex_object: regex object that is used to substitute k with 000 i.e. if text contains 2k then
        it will be substituted as 2000
        text_detection_object: text detection object to detect text in Textual format
        
    Note:
        text and tagged_text will have a extra space prepended and appended after calling detect_entity(text)

    """
    def __init__(self,
                 entity_name,
                 source_language_script=ENGLISH_LANG,
                 translation_enabled=False):
        """Initializes a BudgetDetector object

        Args:
            entity_name: A string by which the detected budget would be replaced with on calling detect_entity()
            source_language_script: language code forwarded to the superclass initializer
            translation_enabled: flag forwarded to the superclass initializer
        """
        # The supported languages are assigned before the superclass initializer runs.
        self._supported_languages = [ENGLISH_LANG]
        super(BudgetDetector, self).__init__(source_language_script,
                                             translation_enabled)

        self.entity_name = entity_name
        self.tag = '__' + self.entity_name + '__'

        # Accepted number of digits for a detected amount (inclusive bounds).
        self.min_digit, self.max_digit = 2, 5

        self.text = ''
        self.tagged_text = ''
        self.processed_text = ''
        self.budget = []
        self.original_budget_text = []

        # Substrings whose presence in a matched amount triggers unit substitution.
        self.unit_present_list = ['k', 'l', 'm', 'c', 'h', 'th']

        # (unit suffix regex, multiplier) pairs; each suffix is appended to the shared number prefix.
        number_prefix = r'([\d,.]+)\s*'
        unit_multipliers = [
            (r'k', 1000),
            (r'h', 1000),
            (r'th', 1000),
            (r'l', 100000),
            (r'lacs?', 100000),
            (r'lakh?', 100000),
            (r'lakhs?', 100000),
            (r'm', 1000000),
            (r'million', 1000000),
            (r'mill?', 1000000),
            (r'c', 10000000),
            (r'cro?', 10000000),
            (r'crore?', 10000000),
            (r'crores?', 10000000),
        ]
        self.regex_object = RegexReplace(
            [(number_prefix + suffix, multiplier) for suffix, multiplier in unit_multipliers])
        self.text_detection_object = TextDetector(entity_name=entity_name)

    def detect_entity(self, text):
        """Detects budget in the text string

        The text is padded with one space on each side and stored in self.text and self.tagged_text;
        its lowercased form is stored in self.processed_text before detection runs.

        Args:
            text: string to extract entities from

        Returns:
            A tuple of two lists with first list containing the detected budgets and second list containing their
            corresponding substrings in the original message.

            For example:

                ([{'max_budget': 1000, 'type': 'normal_budget', 'min_budget': 0}], ['rs. 1000'])

            Additionally this function assigns these lists to self.budget and self.original_budget_text attributes
            respectively.

        """
        padded_text = ' ' + text + ' '
        self.text = padded_text
        self.processed_text = padded_text.lower()
        self.tagged_text = padded_text

        detection = self._detect_budget()
        self.budget, self.original_budget_text = detection[0], detection[1]
        return detection

    @property
    def supported_languages(self):
        """Return the list of supported languages stored in self._supported_languages."""
        return self._supported_languages

    def _detect_budget(self):
        """Detects budget in the self.text

        The regex based detectors run in a fixed order (min-max range, minimum, maximum, any amount) and
        after each of them the substrings found so far are tagged and removed through
        _update_processed_text(). The text based detector is used only when none of them found a budget.

        Returns:
            A tuple of two lists with first list containing the detected budgets and second list containing their
            corresponding substrings in the original message.

            For example:
                input: "shirts between 2000 to 3000"
                output: ([{'max_budget': 3000, 'type': 'normal_budget', 'min_budget': 2000}], ['2000 to 3000'])

        """
        budgets, originals = [], []
        regex_detectors = (
            self._detect_min_max_budget,
            self._detect_min_budget,
            self._detect_max_budget,
            self._detect_any_budget,
        )
        for detect in regex_detectors:
            budgets, originals = detect(budgets, originals)
            self._update_processed_text(originals)

        if not budgets:
            budgets, originals = self._detect_text_budget(budgets, originals)
            self._update_processed_text(originals)

        return budgets, originals

    def _detect_min_budget(self, budget_list=None, original_list=None):
        """Detects minimum budget from text using regex
        This is a function which will be called when we want to detect the budget using regex from the text

        A match is an amount preceded by a lower-bound keyword (above, more than, greater than, >, ...).
        Matches whose amount cannot be converted to an integer are skipped, and only amounts whose number
        of digits lies between self.min_digit and self.max_digit are kept.

        Args:
            budget_list: optional list to which the detected budgets are appended
            original_list: optional list to which the matched substrings are appended

        Returns:
            A tuple of two lists with first list containing the detected budget and second list containing their
            corresponding substrings in the original message.

            For example:
                input: "tshirts greater than 2k"
                output: ([{'max_budget': 0, 'type': 'normal_budget', 'min_budget': 2000}], ['greater than 2k'])

        """

        if budget_list is None:
            budget_list = []
        if original_list is None:
            original_list = []
        patterns = re.findall(
            r'(\s(above|more? than|more?|greater than|greater|abv|abov|more? den|\>\s*\=?)\s+'
            r'(rs.|rs|rupees|rupee)*\s*([\d.,]+\s*[klmct]?[a-z]*|[\d.,]+\s*[klmct]?[a-z]*)\s*'
            r'(rs.|rs|rupees|rupee|\.)?\s)', self.processed_text.lower())
        for pattern in patterns:
            amount_text = pattern[3].replace(',', '')
            if any(unit in amount_text for unit in self.unit_present_list):
                amount_text = self.regex_object.unit_substitute(amount_text)

            try:
                amount = int(amount_text)
            except (TypeError, ValueError):
                # The amount group also captures trailing letters (e.g. "2000 rupees", "5000 rs"),
                # which int() cannot parse. Skip such a match instead of aborting the detection.
                continue

            if self.min_digit <= len(str(amount)) <= self.max_digit:
                budget_list.append({
                    'min_budget': amount,
                    'max_budget': 0,
                    'type': BUDGET_TYPE_NORMAL
                })
                original_list.append(pattern[0].strip())

        return budget_list, original_list

    def _detect_max_budget(self, budget_list=None, original_list=None):
        """Detects maximum budget from text using regex
        This is a function which will be called when we want to detect the budget using regex from the text

        A match is an amount preceded by an upper-bound keyword (max, upto, only, around, below, less than,
        <, ...). Matches whose amount cannot be converted to an integer are skipped, and only amounts whose
        number of digits lies between self.min_digit and self.max_digit are kept.

        Args:
            budget_list: optional list to which the detected budgets are appended
            original_list: optional list to which the matched substrings are appended

        Returns:
            A tuple of two lists with first list containing the detected budget and second list containing their
            corresponding substrings in the original message.

            For example:
                input: "tshirts less than 2k"
                output: ([{'max_budget': 2000, 'type': 'normal_budget', 'min_budget': 0}], ['less than 2k'])

        """

        if budget_list is None:
            budget_list = []
        if original_list is None:
            original_list = []

        patterns = re.findall(
            r'(\s(max|upto|o?nly|around|below|less than|less|less den|\<\s*\=?)\s+(rs.|rs|rupees|rupee)'
            r'?\s*([\d.,]+\s*[klmct]?[a-z]*|[\d.,]+\s*[klmct]?[a-z]*)\s*(rs.|rs|rupees|rupee|\.)?\s)',
            self.processed_text.lower())
        for pattern in patterns:
            amount_text = pattern[3].replace(',', '')
            if any(unit in amount_text for unit in self.unit_present_list):
                amount_text = self.regex_object.unit_substitute(amount_text)

            try:
                amount = int(amount_text)
            except (TypeError, ValueError):
                # The amount group also captures trailing letters (e.g. "2000 rupees", "5000 rs"),
                # which int() cannot parse. Skip such a match instead of aborting the detection.
                continue

            if self.min_digit <= len(str(amount)) <= self.max_digit:
                budget_list.append({
                    'min_budget': 0,
                    'max_budget': amount,
                    'type': BUDGET_TYPE_NORMAL
                })
                original_list.append(pattern[0].strip())

        return budget_list, original_list

    def _detect_min_max_budget(self, budget_list=None, original_list=None):
        """Detects both minimum and maximum budget from text using regex
        This is a function which will be called when we want to detect the budget using regex from the text

        A match is two amounts joined by '-', 'to' or 'and'. When the second amount carries a unit and
        the first one is written without it (e.g. "2 to 3k"), the unit is appended to the first amount
        as well; if that makes the minimum exceed the maximum, the first amount is used as written.
        A range is recorded only when both amounts have between self.min_digit and self.max_digit digits
        and the minimum does not exceed the maximum. Matches whose amounts cannot be converted to
        integers are skipped.

        Args:
            budget_list: optional list to which the detected budgets are appended
            original_list: optional list to which the matched substrings are appended

        Returns:
            A tuple of two lists with first list containing the detected budget and second list containing their
            corresponding substrings in the original message.

            For example:
                input: shirts between 2000 to 3000
                output: ([{'max_budget': 3000, 'type': 'normal_budget', 'min_budget': 2000}], ['2000 to 3000'])

        """
        if budget_list is None:
            budget_list = []
        if original_list is None:
            original_list = []

        patterns = re.findall(
            r'(\s(([\d,.]+\s*[klmct]?[a-z]*)|([\d,.]+\s*[klmct]?[a-z]*))\s*(\-|to|and)\s*'
            r'(([\d,.]+\s*[klmct]?[a-z]*)|([\d,.]+\s*[klmct]?[a-z]*))\.?\s)',
            self.processed_text.lower())
        for pattern in patterns:
            pattern = list(pattern)

            flag_contains_k = False
            max_budget = 0
            min_budget = 0
            _min_budget = 0
            try:
                if pattern[6]:
                    flag_contains_k = any(unit in pattern[6] for unit in self.unit_present_list)
                    comma_removed_unit_text = pattern[6].replace(',', '')
                    max_budget = int(
                        self.regex_object.unit_substitute(comma_removed_unit_text))
                elif pattern[7]:
                    # Groups 3 and 7 come from alternatives that are textually identical to groups
                    # 2 and 6, so these fallback branches are not expected to fire.
                    max_budget = int(pattern[7].replace(',', ''))

                if pattern[2]:
                    # First amount exactly as written; used if unit propagation overshoots.
                    _min_budget = int(
                        self.regex_object.unit_substitute(pattern[2].replace(',', '')))
                    if flag_contains_k:
                        # Propagate the unit found in the second amount to the first one.
                        for u in self.unit_present_list:
                            if u in pattern[6]:
                                pattern[2] = str(pattern[2]).strip() + u
                                break
                    min_budget = int(
                        self.regex_object.unit_substitute(pattern[2].replace(',', '')))
                elif pattern[3]:
                    min_budget = int(pattern[3].replace(',', ''))
            except (TypeError, ValueError):
                # The amount groups also capture trailing letters (e.g. "5 tshirts and 3 jeans"),
                # which int() cannot parse. Skip such a match instead of aborting the detection.
                continue

            if min_budget > max_budget:
                min_budget = _min_budget
            if not self.min_digit <= len(str(min_budget)) <= self.max_digit:
                min_budget = 0
            if not self.min_digit <= len(str(max_budget)) <= self.max_digit:
                max_budget = 0

            if min_budget != 0 and max_budget != 0 and min_budget <= max_budget:
                budget_list.append({
                    'min_budget': min_budget,
                    'max_budget': max_budget,
                    'type': BUDGET_TYPE_NORMAL
                })
                original_list.append(pattern[0].strip())
        return budget_list, original_list

    def _detect_any_budget(self, budget_list=None, original_list=None):
        """Detects a budget from text using regex
        This is a function which will be called when we want to detect the budget using regex from the text

        Any amount, optionally surrounded by a currency word and optionally followed by a unit
        (thousand, lakh, million, crore and their short forms), is recorded as a maximum budget when
        its number of digits lies between self.min_digit and self.max_digit.

        Args:
            budget_list: optional list to which the detected budgets are appended
            original_list: optional list to which the matched substrings are appended

        Returns:
            A tuple of two lists with first list containing the detected budget and second list containing their
            corresponding substrings in the original message.

            For example:
                input: jeans of rs. 1000
                output: ([{'max_budget': 1000, 'type': 'normal_budget', 'min_budget': 0}], ['rs. 1000'])

        """

        if budget_list is None:
            budget_list = []
        if original_list is None:
            original_list = []

        text = self.processed_text.lower().strip()

        units_patterns = [
            r'k|hazaa?r|haja?ar|thousand', r'l|lacs?|lakh?|lakhs?',
            r'm|million|mill?', r'cro?|cror?|crore?|crores?'
        ]
        units_order = [1e3, 1e5, 1e6, 1e7]
        full = re.compile(
            r'((rs.|rs|rupees|rupee)?\s*((\d+((\,|\.)\d+)+)|(0|[1-9]\d*)?(\.\d+)?(?<=\d))'
            r'\s*(' + r'|'.join(units_patterns) +
            r')?\s*(rs.|rs|rupees|rupee)?)\b')
        # Built as a list on purpose: the matchers are scanned again for every match below, and the
        # one-shot iterator returned by map() on Python 3 would be exhausted after the first match.
        unit_matchers = [re.compile('^' + unit_pattern) for unit_pattern in units_patterns]

        for match in full.findall(text):
            amount, unit = match[2], match[-2]
            if not amount:
                continue
            amount = amount.replace(',', '')
            # Only the last '.' is kept as the decimal point; earlier ones are treated as separators.
            _amount = amount.split('.')
            if len(_amount) > 1:
                amount = ''.join(_amount[:-1]) + '.' + _amount[-1]
            amount = float(amount)
            for matcher, multiplier in zip(unit_matchers, units_order):
                if matcher.search(unit):
                    amount = int(amount * multiplier)
                    break
            amount = int(amount)
            if self.min_digit <= len(str(amount)) <= self.max_digit:
                budget_list.append({
                    'min_budget': 0,
                    'max_budget': amount,
                    'type': BUDGET_TYPE_NORMAL
                })
                original_list.append(match[0].strip())

        return budget_list, original_list

    def _detect_text_budget(self, budget_list=None, original_list=None):
        """Detects budget  from text using text detection logic i.e.TextDetector
        This is a function which will be called when we want to detect the budget using text

        self.tagged_text and self.processed_text are overwritten with the values produced by the text
        detection object. One budget of type BUDGET_TYPE_TEXT (with both bounds set to 0) is appended
        for every substring reported by the text detection object, and that substring is appended to
        original_list so that both lists stay aligned.

        Args:
            budget_list: optional list to which the detected budgets are appended
            original_list: optional list to which the matched substrings are appended

        Returns:
            A tuple of two lists with first list containing the detected budgets and second list containing their
            corresponding substrings in the original message.

        """
        if budget_list is None:
            budget_list = []
        if original_list is None:
            original_list = []

        _entity_values, original_text_list = self.text_detection_object.detect_entity(
            self.text)
        self.tagged_text = self.text_detection_object.tagged_text
        self.processed_text = self.text_detection_object.processed_text

        for original_text in original_text_list:
            budget_list.append({
                'min_budget': 0,
                'max_budget': 0,
                'type': BUDGET_TYPE_TEXT
            })
            # Record the matched substring too; without it the two returned lists are misaligned.
            original_list.append(original_text)

        return budget_list, original_list

    def _update_processed_text(self, original_budget_strings):
        """
        Replaces detected budgets with self.tag generated from entity_name used to initialize the object with

        A final string with all budgets replaced will be stored in self.tagged_text attribute
        A string with all budgets removed will be stored in self.processed_text attribute

        Args:
            original_budget_strings: list of substrings of original text to be replaced with self.tag
        """
        for detected_text in original_budget_strings:
            if detected_text:
                self.tagged_text = self.tagged_text.replace(
                    detected_text, self.tag)
                self.processed_text = self.processed_text.replace(
                    detected_text, '')

    def set_min_max_digits(self, min_digit, max_digit):
        """
        Overwrite the digit-count bounds stored in self.min_digit and self.max_digit

        Args:
            min_digit (int): min digit
            max_digit (int): max digit
        """
        self.min_digit, self.max_digit = min_digit, max_digit
示例#23
0
class BudgetDetector(object):
    """Detects budget from the text  and tags them.

    Detects the budget from the text and replaces them by entity_name.
    This detection logic first checks for budget using regular expressions and also uses TextDetector class to extract
    data in textual format (i.e. Hundred, Thousand, etc).

    This detector captures  additional attributes like max_budget, min_budget whether the budget is
    normal_budget (detected through regex) or text_budget (detected through text detection)

    For Example:

        budget_detection = BudgetDetector('budget')
        message = "shirts between 2000 to 3000"
        budget_list, original_text = budget_detection.detect_entity(message)
        tagged_text = budget_detection.tagged_text
        print budget_list, ' -- ', original_text
        print 'Tagged text: ', tagged_text

         >> [{'max_budget': 3000, 'type': 'normal_budget', 'min_budget': 2000}]  --  ['2000 to 3000']
            Tagged text:  shirts between __budget__

        budget_detection = BudgetDetector('budget')
        message = "tshirts less than 2k"
        budget_list, original_text = budget_detection.detect_entity(message)
        tagged_text = budget_detection.tagged_text
        print budget_list, ' -- ', original_text
        print 'Tagged text: ', tagged_text

         >> [{'max_budget': 2000, 'type': 'normal_budget', 'min_budget': 0}]  --  ['less than 2k']
            Tagged text:  tshirts __budget__

        budget_detection = BudgetDetector('budget')
        message = "tshirts greater than 2k"
        budget_list, original_text = budget_detection.detect_entity(message)
        tagged_text = budget_detection.tagged_text
        print budget_list, ' -- ', original_text
        print 'Tagged text: ', tagged_text

         >> [{'max_budget': 0, 'type': 'normal_budget', 'min_budget': 2000}]  --  ['greater than 2k']
            Tagged text:  tshirts __budget__

        budget_detection = BudgetDetector('budget')
        message = "jeans of Rs. 1000"
        budget_list, original_text = budget_detection.detect_entity(message)
        tagged_text = budget_detection.tagged_text
        print budget_list, ' -- ', original_text
        print 'Tagged text: ', tagged_text

         >> [{'max_budget': 1000, 'type': 'normal_budget', 'min_budget': 0}]  --  ['rs. 1000']
            Tagged text:  ' jeans of __budget__ '


    Attributes:
        min_digit: minimum number of digits a budget can have; by default it is set to 2, so the NER will detect a
        number as budget only if it is greater than 9
        max_digit: maximum number of digits a budget can have; by default it is set to 5, so the NER will detect a
        number as budget only if it is not greater than 99999
        text: string to extract entities from
        entity_name: string by which the detected budget would be replaced with on calling detect_entity()
        tagged_text: string with budgets replaced with tag defined by entity name
        processed_text: string with detected budgets removed
        budget: list of budgets detected
        original_budget_text: list to store substrings of the text detected as budget
        tag: entity_name prepended and appended with '__'
        regex_object: regex object that is used to substitute k with 000 i.e. if text contains 2k then
        it will be substituted as 2000
        text_detection_object: text detection object to detect text in Textual format
        
    Note:
        text and tagged_text will have a extra space prepended and appended after calling detect_entity(text)

    """
    def __init__(self, entity_name):
        """Initializes a BudgetDetector object

        Args:
            entity_name: A string by which the detected budget would be replaced with on calling detect_entity()
        """
        self.entity_name = entity_name
        self.tag = '__' + self.entity_name + '__'

        # Accepted number of digits for a detected amount (inclusive bounds).
        self.min_digit, self.max_digit = 2, 5

        self.text = ''
        self.tagged_text = ''
        self.processed_text = ''
        self.budget = []
        self.original_budget_text = []

        # Substitution rule that rewrites digits directly followed by 'k' as the digits followed by '000'.
        self.regex_object = Regex([(r'(\d+)k', r'\g<1>000')])
        self.text_detection_object = TextDetector(entity_name=ES_BUDGET_LIST)

    def detect_entity(self, text):
        """Detects budget in the text string

        The text is padded with one space on each side and stored in self.text and self.tagged_text;
        its lowercased form is stored in self.processed_text before detection runs.

        Args:
            text: string to extract entities from

        Returns:
            A tuple of two lists with first list containing the detected budgets and second list containing their
            corresponding substrings in the original message.

            For example:

                ([{'max_budget': 1000, 'type': 'normal_budget', 'min_budget': 0}], ['rs. 1000'])

            Additionally this function assigns these lists to self.budget and self.original_budget_text attributes
            respectively.

        """
        padded_text = ' ' + text + ' '
        self.text = padded_text
        self.processed_text = padded_text.lower()
        self.tagged_text = padded_text

        detection = self._detect_budget()
        self.budget, self.original_budget_text = detection[0], detection[1]
        return detection

    def _detect_budget(self):
        """Detects budget in the self.text

        The detectors run in a fixed order (min-max range, minimum, maximum, any amount, text based) and
        after each of them the substrings found so far are tagged and removed through
        _update_processed_text().

        Returns:
            A tuple of two lists with first list containing the detected budgets and second list containing their
            corresponding substrings in the original message.

            For example:
                input: "shirts between 2000 to 3000"
                output: ([{'max_budget': 3000, 'type': 'normal_budget', 'min_budget': 2000}], ['2000 to 3000'])

        """
        budgets, originals = [], []
        detectors = (
            self._detect_min_max_budget,
            self._detect_min_budget,
            self._detect_max_budget,
            self._detect_any_budget,
            self._detect_text_budget,
        )
        for detect in detectors:
            budgets, originals = detect(budgets, originals)
            self._update_processed_text(originals)

        return budgets, originals

    def _detect_min_budget(self, budget_list=None, original_list=None):
        """Detects minimum budget from text using regex
        This is a function which will be called when we want to detect the budget using regex from the text

        A match is an amount preceded by a lower-bound keyword (above, more than, greater than, >, ...).
        The amount is either a number with self.min_digit to self.max_digit digits or a number with up to
        (self.max_digit - 3) digits followed by 'k'.

        Args:
            budget_list: optional list to which the detected budgets are appended
            original_list: optional list to which the matched substrings are appended

        Returns:
            A tuple of two lists with first list containing the detected budget and second list containing their
            corresponding substrings in the original message.

            For example:
                input: "tshirts greater than 2k"
                output: ([{'max_budget': 0, 'type': 'normal_budget', 'min_budget': 2000}], ['greater than 2k'])

        """

        if budget_list is None:
            budget_list = []
        if original_list is None:
            original_list = []

        # Raw string throughout so that \d, \s and \. reach the regex engine without relying on
        # Python passing unknown escape sequences through unchanged.
        min_budget_regex = (
            r'(\s(above|more? than|more?|greater than|greater|abv|abov|more? den|\>\s*\=?)\s+'
            r'(rs.|rs|rupees|rupee)*\s*(\d{%s,%s}|\d{1,%s}\s*k)\s*(rs.|rs|rupees|rupee|\.)?\s)'
        ) % (self.min_digit, self.max_digit, self.max_digit - 3)

        for pattern in re.findall(min_budget_regex, self.processed_text.lower()):
            amount_text = pattern[3]
            if 'k' in amount_text:
                # The detection regex allows whitespace before 'k' ("2 k") while the substitution rule
                # set up in __init__ needs the 'k' right after the digits, so collapse whitespace first.
                amount_text = self.regex_object.text_substitute(
                    re.sub(r'\s+', '', amount_text))

            budget_list.append({
                'min_budget': int(amount_text),
                'max_budget': 0,
                'type': BUDGET_TYPE_NORMAL
            })
            original_list.append(pattern[0].strip())
        return budget_list, original_list

    def _detect_max_budget(self, budget_list=None, original_list=None):
        """Detects maximum budget from text using regex
        This is a function which will be called when we want to detect the budget using regex from the text

        A match is an amount preceded by an upper-bound keyword (max, upto, only, around, below, less than,
        <, ...). The amount is either a number with self.min_digit to self.max_digit digits or a number
        with up to (self.max_digit - 3) digits followed by 'k'.

        Args:
            budget_list: optional list to which the detected budgets are appended
            original_list: optional list to which the matched substrings are appended

        Returns:
            A tuple of two lists with first list containing the detected budget and second list containing their
            corresponding substrings in the original message.

            For example:
                input: "tshirts less than 2k"
                output: ([{'max_budget': 2000, 'type': 'normal_budget', 'min_budget': 0}], ['less than 2k'])

        """

        if budget_list is None:
            budget_list = []
        if original_list is None:
            original_list = []

        # Raw string throughout so that \d, \s and \. reach the regex engine without relying on
        # Python passing unknown escape sequences through unchanged.
        max_budget_regex = (
            r'(\s(max|upto|o?nly|around|below|less than|less|less den|\<\s*\=?)\s+(rs.|rs|rupees|rupee)?\s*(\d{'
            r'%s,%s}|\d{1,%s}\s*k)\s*(rs.|rs|rupees|rupee|\.)?\s)'
        ) % (self.min_digit, self.max_digit, self.max_digit - 3)

        for pattern in re.findall(max_budget_regex, self.processed_text.lower()):
            amount_text = pattern[3]
            if 'k' in amount_text:
                # The detection regex allows whitespace before 'k' ("2 k") while the substitution rule
                # set up in __init__ needs the 'k' right after the digits, so collapse whitespace first.
                amount_text = self.regex_object.text_substitute(
                    re.sub(r'\s+', '', amount_text))

            budget_list.append({
                'min_budget': 0,
                'max_budget': int(amount_text),
                'type': BUDGET_TYPE_NORMAL
            })
            original_list.append(pattern[0].strip())
        return budget_list, original_list

    def _detect_min_max_budget(self, budget_list=None, original_list=None):
        """Detects both minimum and maximum budget from text using regex

        Searches the lower-cased self.processed_text for two amounts joined by "-", "to" or "and". Each
        amount is either 1 to (self.max_digit - 3) digits with an optional "k" suffix or a plain number of
        self.min_digit to self.max_digit digits. A range is reported only when both bounds have between
        self.min_digit and self.max_digit digits after conversion, are non-zero, and the lower bound does
        not exceed the upper bound.

        Args:
            budget_list: Optional list to which the detected budget dicts are appended. A new list is
                         created when None.
            original_list: Optional list to which the matched substrings are appended. A new list is
                           created when None.

        Returns:
            A tuple of two lists with first list containing the detected budget and second list containing their
            corresponding substrings in the original message.

            For example:
                input: shirts between 2000 to 3000
                output: ([{'max_budget': 3000, 'type': 'normal_budget', 'min_budget': 2000}], ['2000 to 3000'])

        """
        if budget_list is None:
            budget_list = []
        if original_list is None:
            original_list = []

        # Every fragment is a raw string so that regex escapes such as \d and \s are never interpreted
        # (or warned about) as Python string escapes.
        amount = (r'((\d{1,' + str(self.max_digit - 3) + r'}\s*k?)|(\d{' +
                  str(self.min_digit) + ',' + str(self.max_digit) + r'}))')
        regex = r'(\s' + amount + r'\s*(\-|to|and)\s*' + amount + r'\.?\s)'

        for match in re.findall(regex, self.processed_text.lower()):
            # findall tuple layout: 0 = whole match, 2 / 3 = lower bound in short / plain form,
            # 4 = separator, 6 / 7 = upper bound in short / plain form
            lower_short, lower_plain = match[2], match[3]
            upper_short, upper_plain = match[6], match[7]

            upper_has_k = False
            max_budget = 0
            if upper_short:
                upper_has_k = 'k' in upper_short
                max_budget = int(self.regex_object.text_substitute(upper_short))
            elif upper_plain:
                max_budget = int(upper_plain)

            min_budget = 0
            if lower_short:
                if upper_has_k and 'k' not in lower_short:
                    # "2 to 3k": the lower bound inherits the "k" suffix of the upper bound
                    lower_short = str(lower_short).strip() + 'k'
                min_budget = int(self.regex_object.text_substitute(lower_short))
            elif lower_plain:
                min_budget = int(lower_plain)

            # Discard any bound whose digit count falls outside the configured range
            if not self.min_digit <= len(str(min_budget)) <= self.max_digit:
                min_budget = 0
            if not self.min_digit <= len(str(max_budget)) <= self.max_digit:
                max_budget = 0

            if min_budget != 0 and max_budget != 0 and min_budget <= max_budget:
                budget_list.append({
                    'min_budget': min_budget,
                    'max_budget': max_budget,
                    'type': BUDGET_TYPE_NORMAL
                })
                original_list.append(match[0].strip())
        return budget_list, original_list

    def _detect_any_budget(self, budget_list=None, original_list=None):
        """Detects a budget from text using regex

        Searches the lower-cased self.processed_text for a bare amount, optionally preceded or followed by
        a currency word (rs., rs, rupees, rupee). The amount is either a plain number of self.min_digit to
        self.max_digit digits or a shorthand of 1 to (self.max_digit - 3) digits followed by "k". The
        detected amount is stored as 'max_budget'; 'min_budget' is left at 0.

        Args:
            budget_list: Optional list to which the detected budget dicts are appended. A new list is
                         created when None.
            original_list: Optional list to which the matched substrings are appended. A new list is
                           created when None.

        Returns:
            A tuple of two lists with first list containing the detected budget and second list containing their
            corresponding substrings in the original message.

            For example:
                input: shirts rs 2000
                output: ([{'max_budget': 2000, 'type': 'normal_budget', 'min_budget': 0}], ['rs 2000'])

        """
        if budget_list is None:
            budget_list = []
        if original_list is None:
            original_list = []

        # Every fragment is a raw string so that regex escapes such as \d and \s are never interpreted
        # (or warned about) as Python string escapes.
        amount = (r'\d{' + str(self.min_digit) + ',' + str(self.max_digit) +
                  r'}|\d{1,' + str(self.max_digit - 3) + r'}\s*k')
        # "rs\." has its dot escaped: an unescaped dot matches any character, so for input like
        # "rs2000" it would swallow the first digit and leave "000" as the amount.
        regex = (r'\s((rs\.|rs|rupees|rupee)?\s?(' + amount +
                 r')\s?(rs\.|rs|rupees|rupee)?\.?)\s')

        for match in re.findall(regex, self.processed_text.lower()):
            # findall tuple layout: 0 = match without the surrounding whitespace, 1 = leading currency,
            # 2 = amount, 3 = trailing currency
            amount_text = match[2]
            if 'k' in amount_text:
                # presumably text_substitute expands the "k" shorthand into digits (e.g. "2k" -> "2000");
                # verify against the regex_object implementation
                max_budget = int(self.regex_object.text_substitute(amount_text))
            else:
                max_budget = int(amount_text)

            budget_list.append({
                'min_budget': 0,
                'max_budget': max_budget,
                'type': BUDGET_TYPE_NORMAL
            })
            original_list.append(match[0].strip())
        return budget_list, original_list

    def _detect_text_budget(self, budget_list=None, original_list=None):
        """Detects budget expressed as text by delegating to self.text_detection_object

        Runs detect_entity() of the text detector on self.text, copies the detector's tagged_text and
        processed_text onto this object and records one placeholder budget (both bounds 0, type
        BUDGET_TYPE_TEXT) for every substring the detector reported.

        Args:
            budget_list: Optional list to which the placeholder budget dicts are appended. A new list is
                         created when None.
            original_list: Optional list which is extended with the substrings reported by the detector.
                           A new list is created when None.

        Returns:
            A tuple of two lists with first list containing the budget dicts and second list containing their
            corresponding substrings in the original message.

        """
        budget_list = [] if budget_list is None else budget_list
        original_list = [] if original_list is None else original_list

        detector = self.text_detection_object
        # Only the matched substrings are used; the detected values themselves are ignored here
        _detected_values, matched_texts = detector.detect_entity(self.text)
        self.tagged_text = detector.tagged_text
        self.processed_text = detector.processed_text

        for _index in range(len(matched_texts)):
            budget_list.append({
                'min_budget': 0,
                'max_budget': 0,
                'type': BUDGET_TYPE_TEXT
            })
        if matched_texts:
            original_list.extend(matched_texts)
        return budget_list, original_list

    def _update_processed_text(self, original_budget_strings):
        """
        Replaces detected budgets with self.tag generated from entity_name used to initialize the object with

        A final string with all budgets replaced will be stored in self.tagged_text attribute
        A string with all budgets removed will be stored in self.processed_text attribute

        Args:
            original_budget_strings: list of substrings of original text to be replaced with self.tag
        """
        for detected_text in original_budget_strings:
            if detected_text:
                self.tagged_text = self.tagged_text.replace(
                    detected_text, self.tag)
                self.processed_text = self.processed_text.replace(
                    detected_text, '')

    def min_max_digit(self, min_digit, max_digit):
        """Sets the digit-count limits stored on this object

        Args:
            min_digit: value stored in self.min_digit
            max_digit: value stored in self.max_digit
        """
        self.min_digit, self.max_digit = min_digit, max_digit
示例#24
0
class ShoppingSizeDetector(object):
    """Detects size which are used for shopping from the text and tags them.

    Detects the sizes from the text and replaces them by entity_name.
    This detection logic will first check if text contains size in textual format (i.e. Large, XL, X-Large, etc)
    for this we call TextDetector class and then we have regex that will identify integer from the text

    For Example:

        size_detector = ShoppingSizeDetector("shopping_size")
        message = "Suggest me Medium size tshirt and jeans of 34 waist"
        size, original_numbers = size_detector.detect_entity(message)
        tagged_text = size_detector.tagged_text
        print size, ' -- ', original_numbers
        print 'Tagged text: ', tagged_text

         >> ['M','34'] -- ['Medium','34']
            Tagged text: Suggest me __shopping_size__ size tshirt and jeans of __shopping_size__ waist

    Attributes:
        text: string to extract entities from
        entity_name: string by which the detected size would be replaced with on calling detect_entity()
        dictionary_name: name of a dictionary that stores the string data. For example: XL, L, Large, etc.
                         This is constant and its value is 'shopping_size'
        text_dict: dict initialised empty; not used by the methods of this class
        tagged_text: string with size replaced with tag defined by entity name
        processed_text: string with sizes detected removed
        size: list of sizes detected
        original_size_text: list to store substrings of the text detected as size
        text_detection_object: TextDetector instance created for dictionary_name
        tag: entity_name prepended and appended with '__'

    Note:
        text and tagged_text will have a extra space prepended and appended after calling detect_entity(text)
    """

    def __init__(self, entity_name):
        """Initializes a ShoppingSizeDetector object

        Args:
            entity_name: A string by which the detected numbers would be replaced with on calling detect_entity()
        """
        self.entity_name = entity_name
        self.dictionary_name = 'shopping_size'
        self.text = ''
        self.text_dict = {}
        self.tagged_text = ''
        self.processed_text = ''
        self.size = []
        self.original_size_text = []
        # The dictionary name lives on the instance; referring to a bare `dictionary_name` raised
        # NameError as soon as the class was instantiated.
        self.text_detection_object = TextDetector(entity_name=self.dictionary_name)
        self.tag = '__' + self.entity_name + '__'

    def detect_entity(self, text):
        """Detects size in the text string

        Args:
            text: string to extract entities from

        Returns:
            A tuple of two lists with first list containing the detected sizes and second list containing their
            corresponding substrings in the original message.

            For example:

                (['XL','M','30'], ['X-Large','Medium','30'])

            Additionally this function assigns these lists to self.size and self.original_size_text attributes
            respectively.

        """
        # Pad with spaces so that the whitespace-delimited regex can match at either end of the text
        self.text = ' ' + text + ' '
        self.processed_text = self.text
        self.tagged_text = self.text

        size_data = self._detect_size()
        self.size = size_data[0]
        self.original_size_text = size_data[1]
        return size_data

    def _detect_size(self):
        """Detects size in the self.text

        Runs the text based detection first and the regex based detection afterwards, updating
        self.tagged_text and self.processed_text after each step.

        Returns:
            A tuple of two lists with first list containing the detected sizes and second list containing their
            corresponding substrings in the original message.

            For example:
                input: Show me X-Large and Medium size tshirt and jeans of waist 34
                output: (['XL','M', '34'], ['X-Large', 'Medium', '34'])

        """
        size_list = []
        original_list = []
        size_list, original_list = self._detect_size_from_text(
            size_list, original_list)
        self._update_processed_text(original_list)

        size_list, original_list = self._detect_size_from_regex(
            size_list, original_list)
        self._update_processed_text(original_list)
        return size_list, original_list

    def _detect_size_from_text(self, size_list=None, original_list=None):
        """Detects any size from text using text detection logic i.e.TextDetector
        This is a function which will be called when we want to detect the size using text

        Note that the lists passed in are not extended: the lists returned by the text detector
        replace them. self.tagged_text and self.processed_text are overwritten with the values held
        by the text detector.

        Args:
            size_list: Optional list of sizes; replaced by the detector output
            original_list: Optional list of matched substrings; replaced by the detector output

        Returns:
            A tuple of two lists with first list containing the detected numbers and second list containing their
            corresponding substrings in the original message.

            For example:
                input: "Suggest me shirt of size X-Large"
                output: (['XL'], ['X-Large'])

        """
        if size_list is None:
            size_list = []
        if original_list is None:
            original_list = []

        size_list, original_list = self.text_detection_object.detect_entity(
            self.text)
        self.tagged_text = self.text_detection_object.tagged_text
        self.processed_text = self.text_detection_object.processed_text
        return size_list, original_list

    def _detect_size_from_regex(self, size_list=None, original_list=None):
        """Detects any size from text using regex
        This is a function which will be called when we want to detect the size using regex from the text

        Only the first whitespace-delimited number of one or two digits found in self.processed_text
        is reported.

        Args:
            size_list: Optional list to which the detected size is appended. A new list is created when None.
            original_list: Optional list to which the matched substring is appended. A new list is created
                           when None.

        Returns:
            A tuple of two lists with first list containing the detected numbers and second list containing their
            corresponding substrings in the original message.

            For example:
                input: "Suggest me shirt of size 30"
                output: (['30'], ['30'])

        """
        if size_list is None:
            size_list = []
        if original_list is None:
            original_list = []

        pattern = re.search(r'(\s\d{1,2}\s)', self.processed_text.lower())
        if pattern:
            detected = pattern.group(0).strip()
            size_list.append(detected)
            original_list.append(detected)
        return size_list, original_list

    def _update_processed_text(self, original_size_strings):
        """
        Replaces detected sizes with self.tag generated from entity_name used to initialize the object with

        A final string with all sizes replaced will be stored in self.tagged_text attribute
        A string with all sizes removed will be stored in self.processed_text attribute

        Args:
            original_size_strings: list of substrings of original text to be replaced with self.tag
        """
        for detected_text in original_size_strings:
            # str.replace('', tag) would insert the tag between every character, so empty (or None)
            # entries are skipped
            if detected_text:
                self.tagged_text = self.tagged_text.replace(
                    detected_text, self.tag)
                self.processed_text = self.processed_text.replace(
                    detected_text, '')
def get_text(message, entity_name, structured_value, fallback_value, bot_message):
    """Detect textual entities with TextDetector and wrap them via output_entity_dict_list

    Detection runs on exactly one input: structured_value when it is truthy, otherwise message.

    Args:
        message (str): natural text on which detection logic is to be run. It is only used when
                       structured_value is empty
        entity_name (str): name of the entity, passed to TextDetector as entity_name
        structured_value (str): Value obtained from any structured elements (for example, UI elements like
                                form, payload, etc). When given, detection is run on it instead of message
        fallback_value (str): value returned as the output when message is used and nothing is detected in it
        bot_message (str): previous message from a bot/agent. Not used by this function

    Returns:
        The result of output_entity_dict_list for the first case that applies, or None:

        1. structured_value given and entities detected in it  -> detected entities, FROM_STRUCTURE_VALUE_VERIFIED
        2. structured_value given and nothing detected in it   -> structured_value itself,
                                                                  FROM_STRUCTURE_VALUE_NOT_VERIFIED
        3. entities detected in message                        -> detected entities, FROM_MESSAGE
        4. fallback_value given                                -> fallback_value itself, FROM_FALLBACK_VALUE
        5. otherwise                                           -> None

    Example:

        message = 'i want to order chinese from  mainland china and pizza from domminos'
        entity_name = 'restaurant'
        output = get_text(message=message, entity_name=entity_name, structured_value=None,
                          fallback_value=None, bot_message=None)
        print output

            >> [{'detection': 'message', 'original_text': 'mainland china', 'entity_value':
            {'value': u'Mainland China'}}, {'detection': 'message', 'original_text': 'domminos',
            'entity_value': {'value': u"Domino's Pizza"}}]

        message = 'i wanted to watch movie'
        entity_name = 'movie'
        structured_value = 'inferno'
        output = get_text(message=message, entity_name=entity_name, structured_value=structured_value,
                          fallback_value=None, bot_message=None)
        print output

            >> [{'detection': 'structure_value_verified', 'original_text': 'inferno', 'entity_value':
            {'value': u'Inferno'}}]

    """
    detector = TextDetector(entity_name=entity_name)

    if structured_value:
        detected_values, matched_texts = detector.detect_entity(structured_value)
        if detected_values:
            return output_entity_dict_list(detected_values, matched_texts, FROM_STRUCTURE_VALUE_VERIFIED)
        # Nothing recognised: hand the structured value back as-is, flagged as unverified
        return output_entity_dict_list([structured_value], [structured_value], FROM_STRUCTURE_VALUE_NOT_VERIFIED)

    detected_values, matched_texts = detector.detect_entity(message)
    if detected_values:
        return output_entity_dict_list(detected_values, matched_texts, FROM_MESSAGE)
    if fallback_value:
        return output_entity_dict_list([fallback_value], [fallback_value], FROM_FALLBACK_VALUE)
    return None