def __init__(self, dictionary_loader: DictionaryLoader, language="en",
             filters: List[AnnotationFilter] = None):
    """
    Set up the normalization regex, the label indexes, the trie and the
    spaCy pipeline for the requested language.

    Parameters
    ----------
    dictionary_loader: DictionaryLoader
        The dictionary loader that will provide the dictionary contents
    language: str
        The language of the text that will be processed (affects the choice
        of tokenizer and stemmer). Only ``'en'`` and ``'fr'`` are handled.
    filters: List[AnnotationFilter]
        A list of filters to apply post recognition

    Raises
    ------
    ValueError
        If ``language`` is neither ``'en'`` nor ``'fr'``.
    """
    super().__init__(dictionary_loader, language=language, filters=filters)

    # Runs of control, mark, punctuation, symbol and separator characters
    # are what gets substituted when labels and text are normalized.
    self.punctuation_remove = regex.compile(
        r'[\p{C}|\p{M}|\p{P}|\p{S}|\p{Z}]+', regex.UNICODE)

    # Per-label lookup tables, keyed by the normalized label text.
    self.label_concept_index = {}
    self.label_token_counts = {}
    self.label_lengths = {}
    self.trie = Trie()

    # The model packages are imported lazily so that only the pipeline for
    # the selected language has to be installed.
    if language == 'en':
        import en_core_web_md
        self.spacy = en_core_web_md.load()
    elif language == 'fr':
        # spaCy publishes its French pipelines as ``fr_core_news_*``;
        # there is no ``fr_core_web_md`` package.
        import fr_core_news_md
        self.spacy = fr_core_news_md.load()
    else:
        raise ValueError(f"Unsupported language: {language}")
def test_word_add_all_set(self):
    """Add words from a set; both must be members and the word count must be 2."""
    trie = self.trie = Trie()
    trie.add_all({'axe', 'kick'})  # set input
    self.assertIsInstance(trie, Trie, "Object should be of type `lexpy.trie.Trie`")
    for word in ('axe', 'kick'):
        self.assertTrue(word in trie, "Word should be in trie")
    self.assertEqual(2, trie.get_word_count(), "Word count not equal")
def test_word_add_all_with_number(self):
    """Add a tuple holding two words and an integer; the expected word count is 2."""
    trie = self.trie = Trie()
    mixed_items = ('axe', 'kick', 3)  # tuple with one integer
    trie.add_all(mixed_items)
    self.assertIsInstance(trie, Trie, "Object should be of type `lexpy.trie.Trie`")
    for word in ('axe', 'kick'):
        self.assertTrue(word in trie, "Word should be in trie")
    self.assertEqual(2, trie.get_word_count(), "Word count not equal")
def test_trie_node_count(self):
    """Insert 'ash' and 'ashley'; expect 2 words and ``len(trie)`` equal to 7."""
    trie = self.trie = Trie()
    trie.add_all(['ash', 'ashley'])
    self.assertIsInstance(trie, Trie, "Object should be of type `lexpy.trie.Trie`")
    for word in ('ash', 'ashley'):
        self.assertTrue(word in trie, "Word should be in trie")
    self.assertEqual(2, trie.get_word_count(), "Word count not equal")
    node_total = len(trie)
    self.assertEqual(7, node_total, "Number of nodes")
def test_trie_wildcard_exception(self):
    """Searching with the pattern '#$%^a' must raise InvalidWildCardExpressionError."""
    trie = self.trie = Trie()
    trie.add_all(['ab', 'as', 'ash', 'ashley'])
    self.assertIsInstance(trie, Trie, "Object should be of type `lexpy.trie.Trie`")
    for word in ('ash', 'ashley'):
        self.assertTrue(word in trie, "Word should be in trie")
    with self.assertRaises(InvalidWildCardExpressionError):
        trie.search('#$%^a')
def test_trie_question_search(self):
    """Searching 'a?' must return exactly 'ab' and 'as' (order ignored)."""
    trie = self.trie = Trie()
    trie.add_all(['ab', 'as', 'ash', 'ashley'])
    self.assertIsInstance(trie, Trie, "Object should be of type `lexpy.trie.Trie`")
    for word in ('ash', 'ashley'):
        self.assertTrue(word in trie, "Word should be in trie")
    actual = sorted(trie.search('a?'))
    wanted = sorted(['ab', 'as'])
    self.assertEqual(actual, wanted, 'The lists should be equal')
class TestTrieExactWordSearch(unittest.TestCase):
    """Membership checks (``word in trie``) for a trie holding 'ash' and 'ashley'."""

    def test_word_in_trie(self):
        """An inserted word is reported as a member."""
        trie = self.trie = Trie()
        trie.add_all(['ash', 'ashley'])
        is_member = 'ash' in trie
        self.assertTrue(is_member, "Word should be in trie")

    def test_word_not_int_trie(self):
        """A word that was never inserted is not reported as a member."""
        trie = self.trie = Trie()
        trie.add_all(['ash', 'ashley'])
        is_member = 'salary' in trie
        self.assertFalse(is_member, "Word should not be in trie")
def test_trie_prefix_search(self):
    """'ash' is a prefix but not a word; prefix search must list the three 'ash…' words."""
    trie = self.trie = Trie()
    trie.add_all(['ashlame', 'ashley', 'askoiu', 'ashlo'])
    self.assertIsInstance(trie, Trie, "Object should be of type `lexpy.trie.Trie`")
    self.assertFalse('ash' in trie, "Word should not be in trie")
    self.assertTrue('ashley' in trie, "Word should be in trie")
    self.assertEqual(4, trie.get_word_count(), "Word count not equal")
    self.assertTrue(trie.contains_prefix('ash'), "Prefix should be present in Trie")
    actual = sorted(trie.search_with_prefix('ash'))
    wanted = sorted(['ashlame', 'ashley', 'ashlo'])
    self.assertEqual(actual, wanted, 'The lists should be equal')
def test_trie_node_prefix_not_exists(self):
    """Check that ``contains_prefix`` is false for prefixes no stored word starts with.

    The trie holds 'ash' and 'ashley'; 'xmas', 'xor' and 'sh' must all be
    reported as absent prefixes.
    """
    self.trie = Trie()
    self.trie.add_all(['ash', 'ashley'])
    self.assertIsInstance(self.trie, Trie, "Object should be of type `lexpy.trie.Trie`")
    self.assertIn('ash', self.trie, "Word should be in trie")
    self.assertIn('ashley', self.trie, "Word should be in trie")
    self.assertEqual(2, self.trie.get_word_count(), "Word count not equal")
    # The failure message must describe the expectation being asserted:
    # these prefixes are expected to be absent.
    for missing_prefix in ('xmas', 'xor', 'sh'):
        self.assertFalse(self.trie.contains_prefix(missing_prefix),
                         "Prefix should not be present in Trie")
def test_word_add_all_gen(self):
    """Add words from a generator; all three must be members and the count must be 3."""

    def gen_words():
        # Generator function yielding the fixture words in order.
        yield from ['ash', 'ashley', 'simpson']

    trie = self.trie = Trie()
    trie.add_all(gen_words())  # generator input
    self.assertIsInstance(trie, Trie, "Object should be of type `lexpy.trie.Trie`")
    for word in ('ash', 'ashley', 'simpson'):
        self.assertTrue(word in trie, "Word should be in trie")
    self.assertEqual(3, trie.get_word_count(), "Word count not equal")
def __build_tries(dataset_id: int, user_id: int) -> DatasetDictionary:
    """Build the column-name and level lookup tries for one dataset.

    The unique values per column are fetched with
    ``get_column_unique_values`` and passed through
    ``__condense_and_lemmatize`` to fill the level trie; the mapping's keys
    fill the column-name trie. The returned ``DatasetDictionary`` carries
    both tries and the sum of their word counts.
    """
    unique_by_column = get_column_unique_values(dataset_id, user_id)
    level_words = __condense_and_lemmatize(unique_by_column)

    levels = Trie()
    levels.add_all(level_words)

    column_names = Trie()
    column_names.add_all(list(unique_by_column.keys()))

    total_words = column_names.get_word_count() + levels.get_word_count()
    return DatasetDictionary(dataset_id, column_names, levels, total_words)
class TestWordCount(unittest.TestCase):
    """Checks on ``Trie.get_word_count`` for populated and empty tries."""

    def test_word_count_greater_than_zero(self):
        """Three distinct words give a positive word count equal to 3."""
        trie = self.trie = Trie()
        trie.add_all(['ash', 'ashley', 'ashes'])
        self.assertGreater(trie.get_word_count(), 0,
                           "The number of words should be greater than 0")
        self.assertEqual(3, trie.get_word_count(), "Word count not equal")

    def test_word_count_zero(self):
        """Adding an empty list leaves the word count at 0."""
        trie = self.trie = Trie()
        no_words = []
        trie.add_all(no_words)
        self.assertEqual(0, trie.get_word_count(), "Word count not equal")
def test_without_count(self):
    """A plain 'a*' search lists each stored word once, even if added twice."""
    words = ['ash', 'ashley', 'ashes', 'ashes']
    lookup = Trie()
    lookup.add_all(words)
    wanted = ['ash', 'ashley', 'ashes']
    found = lookup.search('a*')
    self.assertListEqual(wanted, found)
class TrieApproxRecognizer(ConceptRecognizer):
    """Concept recognizer that looks up text windows in a trie of labels.

    Dictionary labels are normalized (characters matched by
    ``punctuation_remove`` replaced by spaces, then lowercased) and stored in
    a ``Trie``. Input text is normalized the same way and growing token
    windows are looked up with ``Trie.search_within_distance``.
    """

    def __init__(self, dictionary_loader: DictionaryLoader, language="en",
                 filters: List[AnnotationFilter] = None):
        """
        Parameters
        ----------
        dictionary_loader: DictionaryLoader
            The dictionary loader that will provide the dictionary contents
        language: str
            The language of the text that will be processed (affects the
            choice of tokenizer and stemmer). Only ``'en'`` and ``'fr'`` are
            handled.
        filters: List[AnnotationFilter]
            A list of filters to apply post recognition

        Raises
        ------
        ValueError
            If ``language`` is neither ``'en'`` nor ``'fr'``.
        """
        super().__init__(dictionary_loader, language=language, filters=filters)
        self.punctuation_remove = regex.compile(
            r'[\p{C}|\p{M}|\p{P}|\p{S}|\p{Z}]+', regex.UNICODE)

        # Per-label lookup tables, keyed by the normalized label text.
        self.label_concept_index = {}
        self.label_token_counts = {}
        self.label_lengths = {}
        self.trie = Trie()

        # Model packages are imported lazily so only the selected language's
        # pipeline has to be installed.
        if language == 'en':
            import en_core_web_md
            self.spacy = en_core_web_md.load()
        elif language == 'fr':
            # spaCy publishes its French pipelines as ``fr_core_news_*``;
            # there is no ``fr_core_web_md`` package.
            import fr_core_news_md
            self.spacy = fr_core_news_md.load()
        else:
            raise ValueError(f"Unsupported language: {language}")

    def _load_concept_labels(self, concept_id, labels):
        """Index every label of one concept and add it to the trie.

        Each label is normalized, tokenized with ``span_tokenize`` and
        recorded under its normalized text: the ``"<concept_id>:::<index>"``
        key, the token count and the character length. A later label with
        the same normalized text overwrites the earlier entries.
        """
        for label_index, label in enumerate(labels):
            normalized = self.punctuation_remove.sub(" ", label).replace(
                "-", " ").lower()
            tokens, _ = span_tokenize(self.spacy, normalized)
            key = f"{concept_id}:::{label_index}"
            self.label_concept_index[normalized] = key
            self.label_token_counts[normalized] = len(tokens)
            self.label_lengths[normalized] = len(normalized)
            self.trie.add(normalized, count=1)

    def match_mentions(
            self, input_text
    ) -> Tuple[List[Tuple[int, int]], List[str], Set[Annotation]]:
        """Scan ``input_text`` for token windows that have trie matches.

        For every start token the window is grown one token at a time for as
        long as ``search_within_distance(window, dist=2)`` returns at least
        one result.

        NOTE(review): matches are not registered yet. The number of results
        is printed and the method always returns three empty containers.
        """
        normalized_text = self.punctuation_remove.sub(" ", input_text).replace(
            "-", " ").lower()
        # Spans are used below as (start, end) offsets into normalized_text.
        tokens, spans = span_tokenize(self.spacy, normalized_text)
        token_count = len(tokens)

        current_token_index = 0
        while current_token_index < token_count:
            current_match_cursor = 0
            while current_token_index + current_match_cursor < token_count:
                # The window ends at the token `current_match_cursor`
                # positions after the start token, so the end span must be
                # indexed by the absolute position, not the bare cursor.
                window_end = current_token_index + current_match_cursor
                sub_string = normalized_text[
                    spans[current_token_index][0]:spans[window_end][1]]
                found = self.trie.search_within_distance(sub_string, dist=2)
                if not found:
                    break
                # Register match
                print(len(found))
                current_match_cursor += 1
            current_token_index += 1
        return [], [], set()
def test_word_add(self):
    """Add a single word with ``add`` and check that it is a member."""
    trie = self.trie = Trie()
    trie.add('axe')
    self.assertIsInstance(trie, Trie, "Object should be of type `lexpy.trie.Trie`")
    is_member = 'axe' in trie
    self.assertTrue(is_member, "Word should be in trie")
class TesTrieWordInsert(unittest.TestCase):
    """Insertion tests: ``add`` and ``add_all`` with each kind of input the tests cover."""

    def test_word_add(self):
        """Add a single word with ``add`` and check that it is a member."""
        trie = self.trie = Trie()
        trie.add('axe')
        self.assertIsInstance(trie, Trie, "Object should be of type `lexpy.trie.Trie`")
        self.assertTrue('axe' in trie, "Word should be in trie")

    def test_word_add_all_list(self):
        """Add words from a list; expect both present and a count of 2."""
        trie = self.trie = Trie()
        trie.add_all(['axe', 'kick'])  # list input
        self.assertIsInstance(trie, Trie, "Object should be of type `lexpy.trie.Trie`")
        for word in ('axe', 'kick'):
            self.assertTrue(word in trie, "Word should be in trie")
        self.assertEqual(2, trie.get_word_count(), "Word count not equal")

    def test_word_add_all_set(self):
        """Add words from a set; expect both present and a count of 2."""
        trie = self.trie = Trie()
        trie.add_all({'axe', 'kick'})  # set input
        self.assertIsInstance(trie, Trie, "Object should be of type `lexpy.trie.Trie`")
        for word in ('axe', 'kick'):
            self.assertTrue(word in trie, "Word should be in trie")
        self.assertEqual(2, trie.get_word_count(), "Word count not equal")

    def test_word_add_all_tuple(self):
        """Add words from a tuple; expect both present and a count of 2."""
        trie = self.trie = Trie()
        trie.add_all(('axe', 'kick'))  # tuple input
        self.assertIsInstance(trie, Trie, "Object should be of type `lexpy.trie.Trie`")
        for word in ('axe', 'kick'):
            self.assertTrue(word in trie, "Word should be in trie")
        self.assertEqual(2, trie.get_word_count(), "Word count not equal")

    def test_word_add_all_with_number(self):
        """Add a tuple with two words and an integer; the expected count is 2."""
        trie = self.trie = Trie()
        trie.add_all(('axe', 'kick', 3))  # tuple with one integer
        self.assertIsInstance(trie, Trie, "Object should be of type `lexpy.trie.Trie`")
        for word in ('axe', 'kick'):
            self.assertTrue(word in trie, "Word should be in trie")
        self.assertEqual(2, trie.get_word_count(), "Word count not equal")

    def test_word_add_all_gen(self):
        """Add words from a generator; expect all three present and a count of 3."""

        def gen_words():
            # Generator function yielding the fixture words in order.
            yield from ['ash', 'ashley', 'simpson']

        trie = self.trie = Trie()
        trie.add_all(gen_words())  # generator input
        self.assertIsInstance(trie, Trie, "Object should be of type `lexpy.trie.Trie`")
        for word in ('ash', 'ashley', 'simpson'):
            self.assertTrue(word in trie, "Word should be in trie")
        self.assertEqual(3, trie.get_word_count(), "Word count not equal")

    def test_word_add_all_file_path(self):
        """Add words via ``small_dataset``; expect three known words and a count of 8."""
        trie = self.trie = Trie()
        trie.add_all(small_dataset)  # from a file
        self.assertIsInstance(trie, Trie, "Object should be of type `lexpy.trie.Trie`")
        for word in ('ash', 'ashley', 'simpson'):
            self.assertTrue(word in trie, "Word should be in trie")
        self.assertEqual(8, trie.get_word_count(), "Word count not equal")
def test_word_not_int_trie(self):
    """A word that was never inserted is not reported as a member."""
    trie = self.trie = Trie()
    trie.add_all(['ash', 'ashley'])
    is_member = 'salary' in trie
    self.assertFalse(is_member, "Word should not be in trie")
def test_word_in_trie(self):
    """An inserted word is reported as a member."""
    trie = self.trie = Trie()
    trie.add_all(['ash', 'ashley'])
    is_member = 'ash' in trie
    self.assertTrue(is_member, "Word should be in trie")
def test_word_count_zero(self):
    """Adding an empty list leaves the word count at 0."""
    trie = self.trie = Trie()
    no_words = []
    trie.add_all(no_words)
    self.assertEqual(0, trie.get_word_count(), "Word count not equal")
class TestWildCardSearch(unittest.TestCase):
    """Wildcard tests for ``Trie.search`` using ``*`` and ``?`` patterns."""

    def test_trie_asterisk_search(self):
        """Pattern 'a*' must return both stored words (order ignored)."""
        trie = self.trie = Trie()
        trie.add_all(['ash', 'ashley'])
        self.assertIsInstance(trie, Trie, "Object should be of type `lexpy.trie.Trie`")
        for word in ('ash', 'ashley'):
            self.assertTrue(word in trie, "Word should be in trie")
        actual = sorted(trie.search('a*'))
        wanted = sorted(['ash', 'ashley'])
        self.assertEqual(actual, wanted, 'The lists should be equal')

    def test_trie_question_search(self):
        """Pattern 'a?' must return exactly 'ab' and 'as' (order ignored)."""
        trie = self.trie = Trie()
        trie.add_all(['ab', 'as', 'ash', 'ashley'])
        self.assertIsInstance(trie, Trie, "Object should be of type `lexpy.trie.Trie`")
        for word in ('ash', 'ashley'):
            self.assertTrue(word in trie, "Word should be in trie")
        actual = sorted(trie.search('a?'))
        wanted = sorted(['ab', 'as'])
        self.assertEqual(actual, wanted, 'The lists should be equal')

    def test_trie_wildcard_search(self):
        """Pattern '*a******?' must return all four stored words (order ignored)."""
        trie = self.trie = Trie()
        trie.add_all(['ab', 'as', 'ash', 'ashley'])
        self.assertIsInstance(trie, Trie, "Object should be of type `lexpy.trie.Trie`")
        for word in ('ash', 'ashley'):
            self.assertTrue(word in trie, "Word should be in trie")
        actual = sorted(trie.search('*a******?'))
        wanted = sorted(['ab', 'as', 'ash', 'ashley'])
        self.assertEqual(actual, wanted, 'The lists should be equal')

    def test_trie_wildcard_exception(self):
        """Pattern '#$%^a' must raise InvalidWildCardExpressionError."""
        trie = self.trie = Trie()
        trie.add_all(['ab', 'as', 'ash', 'ashley'])
        self.assertIsInstance(trie, Trie, "Object should be of type `lexpy.trie.Trie`")
        for word in ('ash', 'ashley'):
            self.assertTrue(word in trie, "Word should be in trie")
        with self.assertRaises(InvalidWildCardExpressionError):
            trie.search('#$%^a')
def test_with_count(self):
    """With ``with_count=True`` an 'a*' search pairs each word with its count; 'ashes' added twice gives 2."""
    words = ['ash', 'ashley', 'ashes', 'ashes']
    lookup = Trie()
    lookup.add_all(words)
    wanted = [('ash', 1), ('ashley', 1), ('ashes', 2)]
    found = lookup.search('a*', with_count=True)
    self.assertListEqual(wanted, found)