def explore_reccursively(
    self,
    word: str,
    max_depth: int = 2,
    current_depth=1,
    _previous_graph=None,
) -> Graph:
    """Recursively look up related terms from the website.

    Args:
        word (str): the word to start the search from
        max_depth (int): how deep the recursion may go
        current_depth (int): depth of the current call (internal)
        _previous_graph (Graph): graph accumulated so far (internal)

    Returns:
        a Graph object with the words that were looked up

    Raises:
        TypeError: if ``max_depth`` is not an int
    """
    logging.debug(
        f"Exploring with word '{word}' current depth is '{current_depth}'")
    if not isinstance(max_depth, int):
        raise TypeError(
            f"max_depth type should be int not '{type(max_depth)}'")

    if _previous_graph:
        graph = _previous_graph
    else:
        # first call: build a fresh graph seeded with the root word
        graph = Graph()
        graph.add_root_word(word)

    if current_depth - 1 == max_depth:
        # requested depth reached, stop recursing
        return graph

    new_words = [w for w in self._get_results_from_website(word) if w]
    logging.info(f"{len(new_words)} found")
    for n_word in new_words:
        # normalise before the membership test so duplicates are caught
        n_word = unidecode(n_word.lower()) if self.unidecode_word else n_word.lower()
        if n_word in graph:
            logging.debug(
                f"n_word is already in the graph -> skipping it")
            continue
        graph.add_word(n_word, current_depth, "synonym", word,
                       comesFrom=self.website)
        graph = self.explore_reccursively(
            n_word,
            current_depth=current_depth + 1,
            max_depth=max_depth,
            _previous_graph=graph,
        )
    return graph
def explore_wordnet(word: str,
                    lang: str,
                    max_depth: int = 5,
                    current_depth=1,
                    _previous_graph=None) -> rdflib.Graph:
    """Explore WordNet recursively and return an rdf graph
    containing the synonyms, hyponyms, hypernyms and holonyms.

    Starting from a word, it looks up its synonyms, hyponyms, hypernyms
    and member holonyms, and recurses on each new word until the
    requested depth of recursion is reached.

    Args:
        word (str): the word to start from
        lang (str): language in ISO 639-1 code (eg: fr for French)
        max_depth (int): maximum depth of the recursion
        current_depth (int): depth of the current call (internal)
        _previous_graph (Graph): graph accumulated so far (internal)

    Returns:
        a :obj:`Graph` object containing the terms

    .. code:: python

        >>> from wordnet_explorer.explorer import explore_wordnet
        >>> g = explore_wordnet('book', 'en', 1)
        >>> print(g)
        @prefix ns1: <http://www.w3.org/2004/02/skos/core#> .
        @prefix ns2: <http://www.w3.org/2006/03/wn/wn20/schema/> .
        @prefix ns3: <urn:default:baseUri:#> .
        @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

        ns3:brochure ns1:prefLabel "brochure" ;
            ns2:hyponymOf ns3:root_word_uri ;
            ns3:depth 1 ;
            ns3:synsetLink <http://wordnet-rdf.princeton.edu/pwn30/06410904-n> .
    """
    logging.debug(
        f"Exploring WORDNET with word '{word}' at depth '{current_depth}'")
    assert_lang_supported_by_wordnet(lang)
    lang = iso_639_alpha3(lang)

    if not _previous_graph:
        # initializing the Graph for the 1st time
        graph = Graph()
        # adding the root source
        graph.add_root_word(word)
    else:
        graph = _previous_graph

    if current_depth - 1 == max_depth:
        # recursion ends
        return graph

    if word not in wn.words():
        logging.error(
            f"The word '{word}' is not contained in Wordnet. Returning an empty graph"
        )
        return graph

    for synset in wn.synsets(word, lang=lang):
        ss_uri = (
            "http://wordnet-rdf.princeton.edu/pwn30/"
            + str(synset.offset()).zfill(8)
            + "-"
            + synset.pos()
        )
        # Collect every related synset together with the relation that links
        # it to `word`; the loop body below was previously duplicated four
        # times (once per relation) with identical logic.
        related = [(synset, "synonym")]
        # colour is a hypernym of red
        related += [(s, "hypernym") for s in synset.hypernyms()]
        # spoon is a hyponym of cutlery
        related += [(s, "hyponym") for s in synset.hyponyms()]
        # word "face" is a holonym of the word "eye"
        related += [(s, "holonym") for s in synset.member_holonyms()]

        for rel_synset, relation in related:
            for new_word in rel_synset.lemma_names(lang):
                if graph.word_in_graph(new_word):
                    continue
                assert new_word != word
                graph.add_word(
                    new_word,
                    current_depth,
                    relation,
                    word,
                    ss_uri,
                    comesFrom="http://wordnet-rdf.princeton.edu/",
                )
                graph = explore_wordnet(
                    new_word,
                    lang,
                    current_depth=current_depth + 1,
                    max_depth=max_depth,
                    _previous_graph=graph,
                )
    return graph
def explore_wolf(
    french_word: str,
    path_to_wolf: str,
    max_depth: int = 5,
    seeds=None,
    current_depth=1,
    _previous_graph=None,
    _wolf_object=None,
):
    """Explore the French wordnet WOLF like the `explore_wordnet()` function.

    Only words whose synsets share a literal with one of the ``seeds`` are
    kept (with an empty seed list nothing beyond the root word is added).

    Args:
        french_word (str): the word to start from
        path_to_wolf (str): "/path/to/wolf.xml"
        max_depth (int): maximum depth of the recursion
        seeds (list): seed words used to filter the candidate terms
        current_depth (int): depth of the current call (internal)
        _previous_graph (Graph): graph accumulated so far (internal)
        _wolf_object: already-parsed WOLF file, to avoid re-reading it

    Returns:
        a :obj:`Graph` object containing the terms

    .. code:: python

        >>> from wordnet_explorer.explorer import explore_wolf
        >>> g = explore_wolf('fromage', '/path/to/wolf', 1)
        >>> print(g)
        @prefix ns1: <http://www.w3.org/2004/02/skos/core#> .
        @prefix ns2: <urn:default:baseUri:#> .
        @prefix ns3: <http://www.w3.org/2006/03/wn/wn20/schema/> .
        @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

        ns2:aliment ns1:prefLabel "aliment" ;
            ns3:hypernymOf ns2:root_word_uri ;
            ns2:depth 1 .
    """
    logging.debug(
        f"Exploring WOLF with word '{french_word}' at depth '{current_depth}'")
    # `seeds` used to default to a mutable `[]`; a None sentinel avoids the
    # shared-default pitfall while keeping the call signature compatible.
    seeds = [] if seeds is None else seeds
    logging.debug(f"seeds are '{seeds}'")
    new_seeds = seeds[:]

    if not _previous_graph:
        # initializing the Graph for the 1st time
        graph = Graph()
        # adding the root source
        graph.add_root_word(french_word)
    else:
        graph = _previous_graph
    logging.debug(f"graph root words: {graph.root_words}")

    if current_depth - 1 == max_depth:
        # recursion ends
        return graph

    if not _wolf_object:
        # to avoid reading the file each time the function is called
        _wolf_object = FreNetic(path_to_wolf)

    if not _wolf_object.synsets(french_word):
        logging.error(
            f"The word '{french_word}' is not contained in '{path_to_wolf}'. Returning an empty graph"
        )
        return graph

    if graph.root_words[0] in new_seeds and len(graph.root_words) > 1:
        new_seeds.remove(graph.root_words[0])

    def _seed_related(candidate: str) -> bool:
        # True when any synset of `candidate` contains a seed word
        for synset_to_check in _wolf_object.synsets(candidate):
            for word_synset in synset_to_check.literals():
                if str(word_synset) in new_seeds:
                    return True
        return False

    for synset in _wolf_object.synsets(french_word):
        # Candidates with the relation that links them to `french_word`;
        # the synonym and hypernym branches previously duplicated the same
        # relevance-check/add/recurse logic.
        candidates = [(str(lit), "synonym") for lit in synset.literals()]
        candidates += [
            (str(lit), "hypernym")
            for hyp in synset.hypernyms()
            for lit in hyp.literals()
        ]
        for new_word, relation in candidates:
            if graph.word_in_graph(new_word):
                continue
            assert new_word != french_word, \
                f"word: '{new_word}'\tcurrent_depth {current_depth}"
            # keep only words whose synsets overlap the seeds
            if not _seed_related(new_word):
                continue
            graph.add_word(
                new_word,
                current_depth,
                relation,
                french_word,
                comesFrom=path_to_wolf,
            )
            graph = explore_wolf(
                new_word,
                path_to_wolf,
                seeds=new_seeds,
                _wolf_object=_wolf_object,
                current_depth=current_depth + 1,
                max_depth=max_depth,
                _previous_graph=graph,
            )
    return graph
class TestGraph(unittest.TestCase):
    """Unit tests for the project's Graph class.

    NOTE(review): several tests read ``self.graph_test_path`` but it is not
    defined anywhere in this class — presumably it is set on a base class or
    injected at module level; confirm it exists before running the suite.
    """

    # scratch output files, (re)created by setUp() and removed by tearDown()
    txt_out_file = "_.txt"
    xlsx_out_file = "_.xlsx"

    def setUp(self):
        # fresh empty graph and empty output files for every test
        self.g = Graph()
        touch(self.txt_out_file)
        touch(self.xlsx_out_file)

    def tearDown(self):
        # return
        os.remove(self.txt_out_file)
        os.remove(self.xlsx_out_file)

    def test_add_word(self):
        """add_word() accepts (word, depth, relation, target_word)."""
        self.g.add_word("test", 5, "synonym", "target_word")

    def test___contains(self):
        """``in`` matches added words only."""
        self.g.add_word("test", 5, "synonym", "target_word")
        self.assertTrue("test" in self.g)
        self.assertFalse("notest" in self.g)

    def test_str(self):
        """str(graph) serializes to turtle that rdflib can parse back."""
        self.g.add_word("test", 5, "synonym", "target_word")
        self.assertIsInstance(self.g.to_str(), str)
        g2 = rdflib.Graph()
        g2.parse(data=str(self.g), format="ttl")

    def test_word_in_graph(self):
        """word_in_graph() is true only for words present in the graph."""
        self.g.add_word("test", 5, "synonym", "target_word")
        self.assertTrue(self.g.word_in_graph("test"))
        self.assertFalse(self.g.word_in_graph("tfdfdfest"))

    def test_to_text_file(self):
        """to_text_file() output lines match to_list() (sorted, stripped)."""
        self.g.parse(self.graph_test_path, format="ttl")
        self.g.to_text_file(self.txt_out_file)
        with open(self.txt_out_file) as f:
            words = sorted([line.strip() for line in f if line.strip()])
        self.assertEqual(words, self.g.to_list())

    def test_good_words(self):
        """Every listed word is non-empty and already normalised."""
        self.g.parse(self.graph_test_path, format="ttl")
        for word in self.g.to_list():
            self.assertTrue(word)  # no empty words
            self.assertEqual(unidecode(word.lower().strip()), word)

    def test_is_empty(self):
        """is_empty() semantics and root-word bookkeeping."""
        self.assertTrue(self.g.is_empty())
        self.assertRaises(AssertionError, self.g._set_root_word_attribute)
        rw_strings = [
            "root_word_string_1", "root_word_string_2", "root_word_string_3"
        ]
        for w in rw_strings:
            self.g.add_root_word(w)
        # NOTE(review): still asserted empty after adding root words —
        # presumably root words alone do not count as graph content;
        # confirm against Graph.is_empty()
        self.assertTrue(self.g.is_empty())
        self.assertEqual(rw_strings, self.g.to_list())
        self.g._set_root_word_attribute()
        for w in self.g.root_words:
            self.assertTrue(isinstance(w, str))

    def test_add_several_root_words(self):
        """Multiple root words round-trip through the text-file export."""
        self.g.add_root_word("root_word_string_1")
        self.g.add_root_word("root_word_string_2")
        self.g.add_root_word("root_word_string_3")
        self.g.to_text_file(self.txt_out_file)
        with open(self.txt_out_file) as f:
            words = sorted([line.strip() for line in f if line.strip()])
        self.assertEqual(words, self.g.to_list())

    def test_add_several_root_words_with_previous_graph(self):
        """Root words added on top of a parsed graph still export correctly."""
        self.g.parse(self.graph_test_path, format="ttl")
        self.g.add_root_word("root_word_string_1")
        self.g.add_root_word("root_word_string_2")
        self.g.add_root_word("root_word_string_3")
        self.g.to_text_file(self.txt_out_file)
        with open(self.txt_out_file) as f:
            words = sorted([line.strip() for line in f if line.strip()])
        self.assertEqual(words, self.g.to_list())
        self.test_list_is_sorted()

    def test_list_is_sorted(self):
        """to_list() returns its words in sorted order."""
        self.assertEqual(sorted(self.g.to_list()), self.g.to_list())

    def test___len__(self):
        """len(graph) counts root words and added words alike."""
        self.assertFalse(len(self.g))
        for i in range(1, 10):
            self.g.add_root_word(f"test_{i}")
            self.assertEqual(len(self.g), i)
        self.g.add_word("test", 5, "synonym", "target_word")
        self.assertEqual(len(self.g), i + 1)

    def test_add_relation(self):
        """Only *nym relations known to Graph are accepted."""
        self.g.add_root_word("root_word_string_1")
        self.g.add_word("test", 1, "synonym", "root_word_string_1")
        self.g.add_word("test", 1, "hyponym", "root_word_string_1")
        self.g.add_word("test", 1, "hypernym", "root_word_string_1")
        self.g.add_word("test", 1, "holonym", "root_word_string_1")
        self.assertRaises(
            ValueError,
            self.g.add_word,
            "test",
            1,
            "thing_that_ends_with_nym",
            "root_word_string_1",
        )

    def test_to_xlsx(self):
        """to_xlsx_file() runs without error on a minimal graph."""
        self.g.add_root_word("dog")
        self.g.to_xlsx_file(self.xlsx_out_file)

    def test__get_maximum_origin(self):
        """_get_maximum_origin() counts distinct comesFrom origins."""
        self.assertFalse(self.g._get_maximum_origin())
        for i in range(1, 5):
            self.g.add((
                rdflib.Literal("node_uri"),
                rdflib.URIRef("urn:default:baseUri:#comesFrom"),
                rdflib.Literal(f"test-{i}"),
            ))
        self.assertEqual(self.g._get_maximum_origin(), i)

    def test_pop_non_relevant_words(self):
        """pop_non_relevant_words() drops words with few distinct origins."""
        self.assertFalse(len(self.g))
        for i in range(10):
            self.g.add_word("test", 1, "synonym", "target_word",
                            comesFrom=f"test-{i}")
        self.g.add_word("test2", 1, "synonym", "target_word_2",
                        comesFrom=f"test-x")
        self.assertEqual(len(self.g), 2)
        self.g.pop_non_relevant_words()
        self.assertEqual(len(self.g), 1)
def explore_nlp_model(
    word: str,
    model_path: str,
    max_depth: int = 5,
    current_depth=1,
    _previous_graph=None,
    _previous_model=None,
):
    """Recursively query an NLP embedding model for neighbour words.

    Every neighbour of *word* reported by the model's ``most_similar`` is
    added to the graph as a synonym, then explored in turn, until the
    recursion depth is reached.

    Args:
        word (str): the word to start from
        model_path (str): the path of the nlp model
        max_depth (int): maximum depth of the recursion
        current_depth (int): depth of the current call (internal)
        _previous_graph (Graph): graph accumulated so far (internal)
        _previous_model: already-loaded model, to avoid re-reading the file

    Returns:
        a :obj:`Graph` object containing the terms

    .. code:: python

        >>> from lexicons_builder.nlp_model_explorer.explorer import explore_nlp_model
        >>> g = explore_nlp_model('test', '<path/to/model>', 2)
    """
    logging.debug(
        f"Exploring the model with word '{word}' current depth is '{current_depth}'"
    )
    if _previous_graph:
        graph = _previous_graph
    else:
        # first call: start a new graph seeded with the root word
        graph = Graph()
        graph.add_root_word(word)

    if current_depth - 1 == max_depth:
        # requested recursion depth reached
        return graph

    if not _previous_model:
        logging.info(f"loading nlp model from '{model_path}'")
        # load once, then hand the object down through the recursive calls
        _previous_model = _load_model(model_path)

    if word not in _previous_model:
        # the model does not contain the original word
        return graph

    # most_similar yields (word, score) pairs; only the words are needed
    neighbours = (pair[0] for pair in _previous_model.most_similar(word))
    for new_word in neighbours:
        if graph.word_in_graph(new_word):
            continue
        assert new_word != word
        graph.add_word(new_word, current_depth, "synonym", word,
                       comesFrom=model_path)
        graph = explore_nlp_model(
            new_word,
            model_path,
            current_depth=current_depth + 1,
            max_depth=max_depth,
            _previous_graph=graph,
            _previous_model=_previous_model,
        )
    return graph