def _get_entity_substring_from_text(self, text, variant, entity_name):
        """
            Check ngrams of the text for similarity against the variant (can be a ngram) using Levenshtein distance
            and return the closest substring in the text that matches the variant.
            For each entity fuziness and min_token_size_for_fuzziness is used from the entity details.
            Args:
              variant(str or unicode): string, ngram of variant to fuzzy detect in the text using
                                       Levenshtein distance
              text(str or unicode): sentence from self.processed on which detection is being done
              entity_name (str): name of the entity to get fuzziness and min_token_lenght value
            Returns:
              str or unicode or None: part of the given text that was detected as entity given the variant,
                                      None otherwise
            Example:
              >>> text_detector = TextDetector(entity_dict={'city':{})
              >>> text = 'Come to Chennai, Tamil Nadu,  I will visit Delehi next year'.lower()
              >>> text_detector._get_entity_substring_from_text(variant='chennai')
              'chennai'
              >>> text_detector._get_entity_substring_from_text(variant='delhi')
              'delehi'
        """
        variant_tokens = TOKENIZER.tokenize(variant)
        text_tokens = TOKENIZER.tokenize(text)
        original_text_tokens = []
        variant_token_i = 0

        # get fuzziness and min_token_size_for_fuzziness from the entity details, falling back to the
        # detector level defaults; these do not change per token, so resolve them once before the loop
        entity_dict = self.entities_dict.get(entity_name, {})
        fuzziness = entity_dict.get('fuzziness') or self._fuzziness
        self.set_fuzziness_low_high_threshold(fuzziness)
        min_token_size_for_fuzziness = entity_dict.get('min_token_len_fuzziness')
        if not min_token_size_for_fuzziness:
            min_token_size_for_fuzziness = self._min_token_size_for_fuzziness

        for text_token in text_tokens:
            variant_token = variant_tokens[variant_token_i]
            same = variant_token == text_token
            ft = self._get_fuzziness_threshold_for_token(token=text_token)

            # set substitution cost to one
            if same or (len(text_token) > min_token_size_for_fuzziness
                        and edit_distance(string1=variant_token,
                                          string2=text_token,
                                          substitution_cost=1,
                                          max_distance=ft + 1) <= ft):
                original_text_tokens.append(text_token)
                variant_token_i += 1
                if variant_token_i == len(variant_tokens):
                    return self._get_substring_from_processed_text(
                        text, original_text_tokens)
            else:
                original_text_tokens = []
                variant_token_i = 0
        return None
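
The version above leans on two TextDetector helpers, set_fuzziness_low_high_threshold and _get_fuzziness_threshold_for_token, whose bodies are not shown. A minimal standalone sketch of how such helpers could behave follows; the 'low,high' string format, the 8-character cut-off and the default values are assumptions for illustration, not the library's actual implementation.

# Hypothetical sketch: short tokens tolerate fewer edits than long ones.
def set_fuzziness_low_high_threshold(fuzziness):
    """Split a fuzziness setting such as '1,2' (or a plain int) into (low, high)."""
    if isinstance(fuzziness, str) and ',' in fuzziness:
        low, high = (int(part) for part in fuzziness.split(','))
    else:
        low = high = int(fuzziness)
    return low, high


def get_fuzziness_threshold_for_token(token, low=1, high=2, long_token_len=8):
    """Allow `low` edits for short tokens and `high` edits for tokens of at least `long_token_len` characters."""
    return high if len(token) >= long_token_len else low


# e.g. get_fuzziness_threshold_for_token('delehi') -> 1, so 'delhi' (one edit away) still matches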
Example #2
    def _get_entity_from_text(self, variant, text):
        """
        Checks ngrams of the text for similarity against the variant (can be a ngram) using Levenshtein distance

        Args:
            variant: string, ngram of variant to fuzzy detect in the text using Levenshtein distance
            text: text to detect entities from

        Returns:
            part of the given text that was detected as entity given the variant, None otherwise

        Example:
            text_detection = TextDetector('city')
            ...
            text_detection._get_entity_from_text(self, variant, text)
            text = 'Come to Chennai, Tamil Nadu,  I will visit Delehi next year'.lower()
            text_detection.get_entity_from_text('chennai', text)

            Output:
                'chennai'

            text_detection.get_entity_from_text('Delhi', text)

            Output:
                'delehi'
        """
        variant_tokens = tokenizer.tokenize(variant.lower())
        text_tokens = tokenizer.tokenize(text.lower())
        original_text = []
        variant_count = 0
        for text_token in text_tokens:
            variant_token = variant_tokens[variant_count]

            utext_token = text_token
            # decode byte strings to unicode before measuring length and computing edit distance
            if isinstance(utext_token, bytes):
                utext_token = utext_token.decode('utf-8')

            same = variant_token == text_token
            ft = self._get_fuzziness_threshold_for_token(utext_token)
            if same or (len(utext_token) > self._min_token_size_for_fuzziness
                        and edit_distance(string1=variant_token,
                                          string2=text_token,
                                          max_distance=ft + 1) <= ft):
                original_text.append(text_token)
                variant_count += 1
                if variant_count == len(variant_tokens):
                    return ' '.join(original_text)
            else:
                original_text = []
                variant_count = 0
        return None
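
All of these versions delegate the actual fuzzy comparison to an edit_distance helper with a max_distance cut-off. Its implementation is not shown here, so the following is only a plausible sketch of a bounded Levenshtein distance with a configurable substitution cost; the early-exit behaviour is an assumption.

def edit_distance(string1, string2, substitution_cost=1, max_distance=None):
    """Levenshtein distance between two strings, with an optional early cut-off.

    Returns a value greater than max_distance as soon as the strings are known to be
    further apart than max_distance, which is all the callers above need.
    """
    previous_row = list(range(len(string2) + 1))
    for i, char1 in enumerate(string1, start=1):
        current_row = [i]
        for j, char2 in enumerate(string2, start=1):
            cost = 0 if char1 == char2 else substitution_cost
            current_row.append(min(previous_row[j] + 1,          # deletion
                                   current_row[j - 1] + 1,       # insertion
                                   previous_row[j - 1] + cost))  # substitution / match
        if max_distance is not None and min(current_row) > max_distance:
            return max_distance + 1
        previous_row = current_row
    return previous_row[-1]


# edit_distance('delhi', 'delehi', max_distance=2) -> 1, so 'delehi' in the text matches the variant 'delhi'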
Example #3
    def _get_entity_substring_from_text(self, variant, text):
        """
        Checks ngrams of the text for similarity against the variant (can be a ngram) using Levenshtein distance

        Args:
            variant: string, ngram of variant to fuzzy detect in the text using Levenshtein distance
            text: text to detect entities from

        Returns:
            str or unicode: part of the given text that was detected as entity given the variant, None otherwise

        Example:
            text_detection = TextDetector('city')
            ...
            text_detection._get_entity_from_text(self, variant, text)
            text = 'Come to Chennai, Tamil Nadu,  I will visit Delehi next year'.lower()
            text_detection.get_entity_from_text('chennai', text)

            Output:
                'chennai'

            text_detection.get_entity_from_text('Delhi', text)

            Output:
                'delehi'
        """
        variant_tokens = TOKENIZER.tokenize(variant)
        text_tokens = TOKENIZER.tokenize(text)
        original_text_tokens = []
        variant_token_i = 0
        for text_token in text_tokens:
            variant_token = variant_tokens[variant_token_i]

            same = variant_token == text_token
            ft = self._get_fuzziness_threshold_for_token(text_token)
            if same or (len(text_token) > self._min_token_size_for_fuzziness
                        and edit_distance(string1=variant_token,
                                          string2=text_token,
                                          max_distance=ft + 1) <= ft):
                original_text_tokens.append(text_token)
                variant_token_i += 1
                if variant_token_i == len(variant_tokens):
                    return self._get_substring_from_processed_text(
                        original_text_tokens)
            else:
                original_text_tokens = []
                variant_token_i = 0
        return None
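
The versions that return self._get_substring_from_processed_text(...) rely on a helper that is not shown here (and is called with slightly different arguments across versions) to map the matched tokens back to a span of the original text. A rough standalone sketch of that idea, using a regular expression to recover the original spelling and spacing, is below; the function name and behaviour are assumptions, not the library's code.

import re


def get_substring_from_text(text, matched_tokens):
    """Return the span of `text` covering matched_tokens, keeping the original spacing and punctuation."""
    # allow any run of non-word characters (spaces, commas, ...) between consecutive tokens
    pattern = r'\W+'.join(re.escape(token) for token in matched_tokens)
    match = re.search(pattern, text, flags=re.IGNORECASE)
    return match.group(0) if match else ' '.join(matched_tokens)


# get_substring_from_text('come to chennai, tamil nadu', ['tamil', 'nadu']) -> 'tamil nadu'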
Example #4
    def _get_entity_substring_from_text(self, text, variant):
        """
        Check ngrams of the text for similarity against the variant (can be a ngram) using Levenshtein distance
        and return the closest substring in the text that matches the variant

        Args:
            variant(str or unicode): string, ngram of variant to fuzzy detect in the text using Levenshtein distance
            text(str or unicode): sentence from self.processed on which detection is being done

        Returns:
            str or unicode or None: part of the given text that was detected as entity given the variant,
                                    None otherwise

        Example:
            >>> text_detector = TextDetector('city')
            >>> text = 'Come to Chennai, Tamil Nadu,  I will visit Delehi next year'.lower()
            >>> text_detector.detect_entity(text)
            >>> text_detector._get_entity_substring_from_text(text, variant='chennai')
            'chennai'
            >>> text_detector._get_entity_substring_from_text(text, variant='delhi')
            'delehi'

        """
        variant_tokens = TOKENIZER.tokenize(variant)
        text_tokens = TOKENIZER.tokenize(text)
        original_text_tokens = []
        variant_token_i = 0
        for text_token in text_tokens:
            variant_token = variant_tokens[variant_token_i]
            same = variant_token == text_token
            ft = self._get_fuzziness_threshold_for_token(text_token)
            if same or (len(text_token) > self._min_token_size_for_fuzziness
                        and edit_distance(string1=variant_token,
                                          string2=text_token,
                                          max_distance=ft + 1) <= ft):
                original_text_tokens.append(text_token)
                variant_token_i += 1
                if variant_token_i == len(variant_tokens):
                    return self._get_substring_from_processed_text(
                        text, original_text_tokens)
            else:
                original_text_tokens = []
                variant_token_i = 0
        return None
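
For reference, the matching loop shared by all four versions can be exercised end to end with a compressed standalone variant. It reuses the edit_distance sketch from earlier in this section; the regex tokenizer and the hard-coded thresholds are simplifying assumptions rather than TextDetector's actual configuration.

import re


def _tokenize(text):
    """Crude word tokenizer used only for this demo (the library uses its own TOKENIZER)."""
    return re.findall(r'\w+', text.lower())


def find_entity_substring(text, variant, min_token_size_for_fuzziness=4):
    """Walk the text tokens, consuming variant tokens on exact or fuzzy matches."""
    variant_tokens = _tokenize(variant)
    text_tokens = _tokenize(text)
    matched, variant_i = [], 0
    for text_token in text_tokens:
        variant_token = variant_tokens[variant_i]
        threshold = 2 if len(text_token) >= 8 else 1  # assumed low/high fuzziness split
        fuzzy_match = (len(text_token) > min_token_size_for_fuzziness
                       and edit_distance(string1=variant_token,
                                         string2=text_token,
                                         max_distance=threshold + 1) <= threshold)
        if variant_token == text_token or fuzzy_match:
            matched.append(text_token)
            variant_i += 1
            if variant_i == len(variant_tokens):
                return ' '.join(matched)
        else:
            matched, variant_i = [], 0
    return None


text = 'Come to Chennai, Tamil Nadu,  I will visit Delehi next year'
print(find_entity_substring(text, 'chennai'))  # -> 'chennai'
print(find_entity_substring(text, 'delhi'))    # -> 'delehi'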