import unittest

from lexpy.trie import Trie

# `small_dataset` is the path to a small word-list file provided by the test
# fixtures; it is defined elsewhere in the test suite.


class TestTrieWordInsert(unittest.TestCase):

    def test_word_add(self):
        self.trie = Trie()
        self.trie.add('axe')
        self.assertIsInstance(self.trie, Trie, "Object should be of type `lexpy.trie.Trie`")
        self.assertTrue('axe' in self.trie, "Word should be in trie")

    def test_word_add_all_list(self):
        self.trie = Trie()
        self.trie.add_all(['axe', 'kick'])  # list
        self.assertIsInstance(self.trie, Trie, "Object should be of type `lexpy.trie.Trie`")
        self.assertTrue('axe' in self.trie, "Word should be in trie")
        self.assertTrue('kick' in self.trie, "Word should be in trie")
        self.assertEqual(2, self.trie.get_word_count(), "Word count not equal")

    def test_word_add_all_set(self):
        self.trie = Trie()
        self.trie.add_all({'axe', 'kick'})  # set
        self.assertIsInstance(self.trie, Trie, "Object should be of type `lexpy.trie.Trie`")
        self.assertTrue('axe' in self.trie, "Word should be in trie")
        self.assertTrue('kick' in self.trie, "Word should be in trie")
        self.assertEqual(2, self.trie.get_word_count(), "Word count not equal")

    def test_word_add_all_tuple(self):
        self.trie = Trie()
        self.trie.add_all(('axe', 'kick'))  # tuple
        self.assertIsInstance(self.trie, Trie, "Object should be of type `lexpy.trie.Trie`")
        self.assertTrue('axe' in self.trie, "Word should be in trie")
        self.assertTrue('kick' in self.trie, "Word should be in trie")
        self.assertEqual(2, self.trie.get_word_count(), "Word count not equal")

    def test_word_add_all_with_number(self):
        self.trie = Trie()
        # Tuple containing one integer; the non-string entry should not be
        # counted as a word.
        self.trie.add_all(('axe', 'kick', 3))
        self.assertIsInstance(self.trie, Trie, "Object should be of type `lexpy.trie.Trie`")
        self.assertTrue('axe' in self.trie, "Word should be in trie")
        self.assertTrue('kick' in self.trie, "Word should be in trie")
        self.assertEqual(2, self.trie.get_word_count(), "Word count not equal")

    def test_word_add_all_gen(self):
        def gen_words():
            for word in ['ash', 'ashley', 'simpson']:
                yield word

        self.trie = Trie()
        self.trie.add_all(gen_words())  # generator
        self.assertIsInstance(self.trie, Trie, "Object should be of type `lexpy.trie.Trie`")
        self.assertTrue('ash' in self.trie, "Word should be in trie")
        self.assertTrue('ashley' in self.trie, "Word should be in trie")
        self.assertTrue('simpson' in self.trie, "Word should be in trie")
        self.assertEqual(3, self.trie.get_word_count(), "Word count not equal")

    def test_word_add_all_file_path(self):
        self.trie = Trie()
        self.trie.add_all(small_dataset)  # from a file path
        self.assertIsInstance(self.trie, Trie, "Object should be of type `lexpy.trie.Trie`")
        self.assertTrue('ash' in self.trie, "Word should be in trie")
        self.assertTrue('ashley' in self.trie, "Word should be in trie")
        self.assertTrue('simpson' in self.trie, "Word should be in trie")
        self.assertEqual(8, self.trie.get_word_count(), "Word count not equal")
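# A minimal, self-contained sketch of the lexpy Trie API the tests above
# exercise, plus the fuzzy lookup the recognizer below relies on. The calls
# follow lexpy's documented signatures; printed values are illustrative.
from lexpy.trie import Trie

trie = Trie()
trie.add('axe')                          # insert a single word
trie.add_all(['ash', 'ashley', 'kick'])  # insert any iterable of words

print('axe' in trie)          # True: membership via __contains__
print(trie.get_word_count())  # 4

# Every stored word within Levenshtein distance 1 of 'ask' -> ['ash']
print(trie.search_within_distance('ask', dist=1))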
import regex

from typing import List, Set, Tuple

from lexpy.trie import Trie

# ConceptRecognizer, DictionaryLoader, AnnotationFilter, Annotation and
# span_tokenize are provided by the surrounding project.


class TrieApproxRecognizer(ConceptRecognizer):

    def __init__(self, dictionary_loader: DictionaryLoader, language="en",
                 filters: List[AnnotationFilter] = None):
        """
        Parameters
        ----------
        dictionary_loader: DictionaryLoader
            The dictionary loader that will provide the dictionary contents.
        language: str
            The language of the text that will be processed (affects the
            choice of tokenizer and stemmer).
        filters: List[AnnotationFilter]
            A list of filters to apply post recognition.
        """
        super().__init__(dictionary_loader, language=language, filters=filters)
        # Collapse runs of control, mark, punctuation, symbol and separator
        # characters into a single space. (Inside a character class `|`
        # matches a literal pipe, so the alternation bars in the original
        # pattern were dropped.)
        self.punctuation_remove = regex.compile(
            r'[\p{C}\p{M}\p{P}\p{S}\p{Z}]+', regex.UNICODE)
        self.label_concept_index = {}
        self.label_token_counts = {}
        self.label_lengths = {}
        self.trie = Trie()
        if language == 'en':
            import en_core_web_md
            self.spacy = en_core_web_md.load()
        elif language == 'fr':
            # spaCy distributes its French model as fr_core_news_md.
            import fr_core_news_md
            self.spacy = fr_core_news_md.load()
        else:
            raise ValueError(f"Unsupported language: {language}")

    def _load_concept_labels(self, concept_id, labels):
        for label_index, label in enumerate(labels):
            normalized = self.punctuation_remove.sub(" ", label).replace("-", " ").lower()
            tokens, _ = span_tokenize(self.spacy, normalized)
            # Index each normalized label under "<concept_id>:::<label_index>".
            key = str(concept_id) + ":::" + str(label_index)
            self.label_concept_index[normalized] = key
            self.label_token_counts[normalized] = len(tokens)
            self.label_lengths[normalized] = len(normalized)
            self.trie.add(normalized, count=1)

    def match_mentions(
            self, input_text
    ) -> Tuple[List[Tuple[int, int]], List[str], Set[Annotation]]:
        normalized_text = self.punctuation_remove.sub(" ", input_text).replace("-", " ").lower()
        matches = []
        tokens, spans = span_tokenize(self.spacy, normalized_text)
        current_token_index = 0
        while current_token_index < len(tokens):
            current_match_cursor = 0
            while current_token_index + current_match_cursor < len(tokens):
                # Grow the candidate window one token at a time. The end
                # offset must come from the token at the cursor position
                # relative to the window start, not from the start of the
                # text.
                end_token = current_token_index + current_match_cursor
                sub_string = normalized_text[spans[current_token_index][0]:
                                             spans[end_token][1]]
                found = self.trie.search_within_distance(sub_string, dist=2)
                if len(found) > 0:
                    # Register the match and try to extend the window.
                    matches.append((spans[current_token_index][0],
                                    spans[end_token][1]))
                    current_match_cursor += 1
                else:
                    break
            current_token_index += 1
        match_strings = [normalized_text[start:end] for start, end in matches]
        # Building Annotation objects from these matches is not implemented
        # here; the empty set leaves that to later processing.
        return matches, match_strings, set()
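# The matching loop above anchors a window at each token and grows it while
# the trie still holds a label within edit distance 2 of the window's text.
# Below is a minimal self-contained sketch of that idea, using lexpy's Trie
# directly and a plain regex tokenizer standing in for the project's
# span_tokenize; labels and text are made up for illustration.
import re

from lexpy.trie import Trie

label_trie = Trie()
label_trie.add_all(['headache', 'migraine'])

text = 'patient reports a mild headach today'
spans = [(m.start(), m.end()) for m in re.finditer(r'\S+', text)]

matches = []
for i in range(len(spans)):
    j = i
    while j < len(spans):
        window = text[spans[i][0]:spans[j][1]]
        # Keep extending while some label lies within edit distance 2.
        if label_trie.search_within_distance(window, dist=2):
            matches.append((spans[i][0], spans[j][1], window))
            j += 1
        else:
            break

print(matches)  # [(23, 30, 'headach')] -- 'headach' is 1 edit from 'headache'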