def _get_entity_substring_from_text(self, text, variant, entity_name):
    """
    Check ngrams of the text for similarity against the variant (can be a ngram) using Levenshtein distance
    and return the closest substring in the text that matches the variant.
    For each entity, fuzziness and min_token_size_for_fuzziness are read from the entity details.

    Args:
        variant(str or unicode): string, ngram of variant to fuzzy detect in the text using Levenshtein distance
        text(str or unicode): sentence from self.processed on which detection is being done
        entity_name (str): name of the entity to get fuzziness and min_token_length value

    Returns:
        str or unicode or None: part of the given text that was detected as entity given the variant,
                                None otherwise

    Example:
        >>> text_detector = TextDetector(entity_dict={'city': {}})
        >>> text = 'Come to Chennai, Tamil Nadu, I will visit Delehi next year'.lower()
        >>> text_detector._get_entity_substring_from_text(text, variant='chennai', entity_name='city')
        'chennai'
        >>> text_detector._get_entity_substring_from_text(text, variant='delhi', entity_name='city')
        'delehi'
    """
    variant_tokens = TOKENIZER.tokenize(variant)
    text_tokens = TOKENIZER.tokenize(text)
    original_text_tokens = []
    variant_token_i = 0
    for text_token in text_tokens:
        variant_token = variant_tokens[variant_token_i]
        same = variant_token == text_token

        # get fuzziness and min_token_size_for_fuzziness from the entity details
        entity_dict = self.entities_dict.get(entity_name, {})

        # use the entity level fuzziness if set, otherwise fall back to the detector default
        fuzziness = entity_dict.get('fuzziness') or self._fuzziness
        self.set_fuzziness_low_high_threshold(fuzziness)

        min_token_size_for_fuzziness = entity_dict.get('min_token_len_fuzziness')
        if not min_token_size_for_fuzziness:
            min_token_size_for_fuzziness = self._min_token_size_for_fuzziness

        ft = self._get_fuzziness_threshold_for_token(token=text_token)

        # substitution cost is set to one so a substituted character counts as a single edit
        if same or (len(text_token) > min_token_size_for_fuzziness
                    and edit_distance(string1=variant_token,
                                      string2=text_token,
                                      substitution_cost=1,
                                      max_distance=ft + 1) <= ft):
            original_text_tokens.append(text_token)
            variant_token_i += 1
            if variant_token_i == len(variant_tokens):
                return self._get_substring_from_processed_text(text, original_text_tokens)
        else:
            original_text_tokens = []
            variant_token_i = 0
    return None
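# A hedged usage sketch for the entity-dict based version above. The keys the method
# reads ('fuzziness' and 'min_token_len_fuzziness') come from the code itself; the exact
# TextDetector constructor behaviour and the values chosen here are assumptions made
# purely for illustration, not a confirmed API.
def _usage_sketch():
    detector = TextDetector(entity_dict={
        'city': {
            'fuzziness': 1,                # tolerate up to one edit per token (assumed value)
            'min_token_len_fuzziness': 4,  # only fuzzy-match tokens longer than 4 chars (assumed value)
        }
    })
    sentence = 'come to chennai, tamil nadu, i will visit delehi next year'
    # 'delehi' is within one edit of 'delhi', so the misspelt substring is returned
    return detector._get_entity_substring_from_text(sentence, 'delhi', 'city')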
def _get_entity_from_text(self, variant, text):
    """
    Checks ngrams of the text for similarity against the variant (can be a ngram) using Levenshtein distance

    Args:
        variant: string, ngram of variant to fuzzy detect in the text using Levenshtein distance
        text: text to detect entities from

    Returns:
        part of the given text that was detected as entity given the variant, None otherwise

    Example:
        text_detection = TextDetector('city')
        ...
        text = 'Come to Chennai, Tamil Nadu, I will visit Delehi next year'.lower()
        text_detection._get_entity_from_text('chennai', text)
        Output: 'chennai'
        text_detection._get_entity_from_text('Delhi', text)
        Output: 'delehi'
    """
    variant_tokens = tokenizer.tokenize(variant.lower())
    text_tokens = tokenizer.tokenize(text.lower())
    original_text = []
    variant_count = 0
    for text_token in text_tokens:
        variant_token = variant_tokens[variant_count]

        # decode byte strings so length and edit distance are computed on unicode text
        utext_token = text_token
        if isinstance(utext_token, bytes):
            utext_token = utext_token.decode('utf-8')

        same = variant_token == text_token
        ft = self._get_fuzziness_threshold_for_token(utext_token)
        if same or (len(utext_token) > self._min_token_size_for_fuzziness
                    and edit_distance(string1=variant_token,
                                      string2=text_token,
                                      max_distance=ft + 1) <= ft):
            original_text.append(text_token)
            variant_count += 1
            if variant_count == len(variant_tokens):
                return ' '.join(original_text)
        else:
            original_text = []
            variant_count = 0
    return None
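# A minimal sketch of a bounded Levenshtein distance, showing how the max_distance
# argument used by the callers above can short-circuit the computation once every
# alignment already exceeds the threshold. This is an assumption-labelled illustration
# of the technique, not the actual edit_distance helper or its defaults.
def edit_distance_sketch(string1, string2, substitution_cost=1, max_distance=None):
    """Weighted Levenshtein distance; returns max_distance + 1 early if exceeded."""
    if string1 == string2:
        return 0
    previous_row = list(range(len(string2) + 1))
    for i, char1 in enumerate(string1, start=1):
        current_row = [i]
        for j, char2 in enumerate(string2, start=1):
            cost = 0 if char1 == char2 else substitution_cost
            current_row.append(min(previous_row[j] + 1,          # deletion
                                   current_row[j - 1] + 1,       # insertion
                                   previous_row[j - 1] + cost))  # substitution
        # the final distance can never be smaller than the minimum of any row,
        # so once that minimum passes max_distance we can stop early
        if max_distance is not None and min(current_row) > max_distance:
            return max_distance + 1
        previous_row = current_row
    return previous_row[-1]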
def _get_entity_substring_from_text(self, variant, text):
    """
    Checks ngrams of the text for similarity against the variant (can be a ngram) using Levenshtein distance

    Args:
        variant: string, ngram of variant to fuzzy detect in the text using Levenshtein distance
        text: text to detect entities from

    Returns:
        str or unicode: part of the given text that was detected as entity given the variant, None otherwise

    Example:
        text_detection = TextDetector('city')
        ...
        text = 'Come to Chennai, Tamil Nadu, I will visit Delehi next year'.lower()
        text_detection._get_entity_substring_from_text('chennai', text)
        Output: 'chennai'
        text_detection._get_entity_substring_from_text('Delhi', text)
        Output: 'delehi'
    """
    variant_tokens = TOKENIZER.tokenize(variant)
    text_tokens = TOKENIZER.tokenize(text)
    original_text_tokens = []
    variant_token_i = 0
    for text_token in text_tokens:
        variant_token = variant_tokens[variant_token_i]

        same = variant_token == text_token
        ft = self._get_fuzziness_threshold_for_token(text_token)
        if same or (len(text_token) > self._min_token_size_for_fuzziness
                    and edit_distance(string1=variant_token,
                                      string2=text_token,
                                      max_distance=ft + 1) <= ft):
            original_text_tokens.append(text_token)
            variant_token_i += 1
            if variant_token_i == len(variant_tokens):
                return self._get_substring_from_processed_text(original_text_tokens)
        else:
            original_text_tokens = []
            variant_token_i = 0
    return None
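# A minimal sketch of how a token-length based fuzziness threshold could be chosen,
# in the spirit of Elasticsearch-style "auto" fuzziness. The cut-off lengths (4 and 7)
# and the handling of an integer fuzziness are assumptions for illustration, not the
# confirmed logic of _get_fuzziness_threshold_for_token.
def fuzziness_threshold_for_token_sketch(token, fuzziness='auto', lo=4, hi=7):
    """Return the number of edits tolerated when fuzzy matching `token`."""
    if isinstance(fuzziness, int):
        # a fixed integer fuzziness applies to every token
        return fuzziness
    if len(token) < lo:
        return 0   # very short tokens must match exactly
    if len(token) < hi:
        return 1   # medium length tokens may differ by one edit
    return 2       # long tokens may differ by up to two edits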
def _get_entity_substring_from_text(self, text, variant):
    """
    Check ngrams of the text for similarity against the variant (can be a ngram) using Levenshtein distance
    and return the closest substring in the text that matches the variant

    Args:
        variant(str or unicode): string, ngram of variant to fuzzy detect in the text using Levenshtein distance
        text(str or unicode): sentence from self.processed on which detection is being done

    Returns:
        str or unicode or None: part of the given text that was detected as entity given the variant,
                                None otherwise

    Example:
        >>> text_detector = TextDetector('city')
        >>> text = 'Come to Chennai, Tamil Nadu, I will visit Delehi next year'.lower()
        >>> text_detector.detect_entity(text)
        >>> text_detector._get_entity_substring_from_text(text, variant='chennai')
        'chennai'
        >>> text_detector._get_entity_substring_from_text(text, variant='delhi')
        'delehi'
    """
    variant_tokens = TOKENIZER.tokenize(variant)
    text_tokens = TOKENIZER.tokenize(text)
    original_text_tokens = []
    variant_token_i = 0
    for text_token in text_tokens:
        variant_token = variant_tokens[variant_token_i]

        same = variant_token == text_token
        ft = self._get_fuzziness_threshold_for_token(text_token)
        if same or (len(text_token) > self._min_token_size_for_fuzziness
                    and edit_distance(string1=variant_token,
                                      string2=text_token,
                                      max_distance=ft + 1) <= ft):
            original_text_tokens.append(text_token)
            variant_token_i += 1
            if variant_token_i == len(variant_tokens):
                return self._get_substring_from_processed_text(text, original_text_tokens)
        else:
            original_text_tokens = []
            variant_token_i = 0
    return None
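# A hedged sketch of what _get_substring_from_processed_text is assumed to do: given
# the text and the tokens that matched, return the span of the original text from the
# start of the first matched token to the end of the last one, preserving whatever
# punctuation or whitespace sits between them. This is an assumption based on how the
# callers above use the return value, not the confirmed helper implementation.
def substring_from_matched_tokens_sketch(text, matched_tokens):
    import re
    if not matched_tokens:
        return None
    # allow any run of non-word characters between consecutive matched tokens
    pattern = r'\W*'.join(re.escape(token) for token in matched_tokens)
    match = re.search(pattern, text)
    # fall back to a plain space join if the tokens cannot be located contiguously
    return match.group(0) if match else ' '.join(matched_tokens)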