Example #1
    def explore_reccursively(
        self,
        word: str,
        max_depth: int = 2,
        current_depth=1,
        _previous_graph=None,
    ) -> Graph:
        """Search for terms reccursively from the website

        Args:
            word (str): the word
            max_depth (int): the deepth of the reccursion
        Returns:
            a Graph object with the words that were looked up

        """

        logging.debug(
            f"Exploring with word '{word}' current depth is '{current_depth}'")

        if not isinstance(max_depth, int):
            raise TypeError(
                f"max_depth type should be int not '{type(max_depth)}'")

        if not _previous_graph:
            # initializing the Graph for the 1st time
            graph = Graph()
            # adding the root source
            graph.add_root_word(word)
        else:
            graph = _previous_graph

        if current_depth - 1 == max_depth:
            # recursion ends
            return graph

        else:
            new_words = [w for w in self._get_results_from_website(word) if w]
            logging.info(f"{len(new_words)} found")
            for n_word in new_words:
                if self.unidecode_word:
                    n_word = unidecode(n_word.lower())
                else:
                    n_word = n_word.lower()
                if n_word in graph:
                    logging.debug(
                        f"n_word is already in the graph -> skipping it")
                    continue
                graph.add_word(n_word,
                               current_depth,
                               "synonym",
                               word,
                               comesFrom=self.website)
                graph = self.explore_reccursively(
                    n_word,
                    current_depth=current_depth + 1,
                    max_depth=max_depth,
                    _previous_graph=graph,
                )
        return graph
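This explorer, like the ones in the following examples, uses the same depth-bounded recursion pattern: the accumulated graph is threaded through a private `_previous_graph` keyword argument, and the recursion stops when `current_depth - 1 == max_depth`. The toy sketch below (dict-based, no external dependencies, all names illustrative) isolates that pattern; with this stop condition, `max_depth=1` yields only the direct neighbours of the root word.

def explore(word, neighbours, max_depth=2, current_depth=1, _previous=None):
    """Toy explorer: `neighbours` maps a word to a list of related words."""
    # the accumulator is created on the first call only
    graph = _previous if _previous is not None else {word: 0}  # word -> depth
    if current_depth - 1 == max_depth:
        # recursion ends
        return graph
    for new_word in neighbours.get(word, []):
        if new_word in graph:
            continue
        graph[new_word] = current_depth
        graph = explore(new_word, neighbours, max_depth=max_depth,
                        current_depth=current_depth + 1, _previous=graph)
    return graph

if __name__ == "__main__":
    data = {"book": ["volume", "album"], "volume": ["loudness"]}
    print(explore("book", data, max_depth=1))  # {'book': 0, 'volume': 1, 'album': 1}
    print(explore("book", data, max_depth=2))  # same, plus {'loudness': 2}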
Example #2
def explore_wordnet(word: str,
                    lang: str,
                    max_depth: int = 5,
                    current_depth=1,
                    _previous_graph=None) -> rdflib.Graph:
    """Explore WordNet reccursively and return a rdf graph
    containing the synonyms, hyponyms, hypernyms.

    Starting from a word, it will look for its synonyms,
    hyponyms and hypernyms. And for each of these word, it
    will look again until the depth of recursion is reatched.

    Args:
        word (str): the word
        lang (str): language in ISO 639-1 code (eg: fr for French)
        current_depth (int): the depth of the reccursion

    Returns:
        a :obj:`Graph` object containing the terms


    .. code:: python

        >>> from wordnet_explorer.explorer import explore_wordnet
        RDFLib Version: 5.0.0
        >>> g = explore_wordnet('book', 'en', 1)
        >>> print(g)
        @prefix ns1: <http://www.w3.org/2004/02/skos/core#> .
        @prefix ns2: <http://www.w3.org/2006/03/wn/wn20/schema/> .
        @prefix ns3: <urn:default:baseUri:#> .
        @prefix ns4: <http://taxref.mnhn.fr/lod/property/> .
        @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

        ns3:brochure ns1:prefLabel "brochure" ;
            ns2:hyponymOf ns3:root_word_uri ;
            ns3:depth 1 ;
            ns3:synsetLink <http://wordnet-rdf.princeton.edu/pwn30/06410904-n> .


        ns3:album ns1:prefLabel "album" ;
            ns2:hyponymOf ns3:root_word_uri ;
            ns3:depth 1 ;
            ns3:synsetLink <http://wordnet-rdf.princeton.edu/pwn30/02870092-n> .

    """
    logging.debug(
        f"Exploring WORDNET with word '{word}' at depth '{current_depth}'")

    assert_lang_supported_by_wordnet(lang)
    lang = iso_639_alpha3(lang)

    if not _previous_graph:
        # initializing the Graph for the 1st time
        graph = Graph()
        # adding the root source
        graph.add_root_word(word)
    else:
        graph = _previous_graph

    if current_depth - 1 == max_depth:
        # recursion ends
        return graph

    if word not in wn.words():
        logging.error(
            f"The word '{word}' is not contained in Wordnet. Returning an empty graph"
        )
        return graph

    for synset in wn.synsets(word, lang=lang):
        ss_uri = ("http://wordnet-rdf.princeton.edu/pwn30/" +
                  str(synset.offset()).zfill(8) + "-" + synset.pos())
        for new_word in synset.lemma_names(lang):
            if graph.word_in_graph(new_word):
                continue
            assert new_word != word
            graph.add_word(
                new_word,
                current_depth,
                "synonym",
                word,
                ss_uri,
                comesFrom="http://wordnet-rdf.princeton.edu/",
            )
            graph = explore_wordnet(
                new_word,
                lang,
                current_depth=current_depth + 1,
                max_depth=max_depth,
                _previous_graph=graph,
            )
        for hypernyms in synset.hypernyms():
            # colour is a hypernym of red
            for new_word in hypernyms.lemma_names(lang):
                if graph.word_in_graph(new_word):
                    continue
                assert new_word != word
                graph.add_word(
                    new_word,
                    current_depth,
                    "hypernym",
                    word,
                    ss_uri,
                    comesFrom="http://wordnet-rdf.princeton.edu/",
                )
                graph = explore_wordnet(
                    new_word,
                    lang,
                    current_depth=current_depth + 1,
                    max_depth=max_depth,
                    _previous_graph=graph,
                )
        for hyponyms in synset.hyponyms():
            # spoon is a hyponym of cutlery
            for new_word in hyponyms.lemma_names(lang):
                if graph.word_in_graph(new_word):
                    continue
                assert new_word != word
                graph.add_word(
                    new_word,
                    current_depth,
                    "hyponym",
                    word,
                    ss_uri,
                    comesFrom="http://wordnet-rdf.princeton.edu/",
                )
                graph = explore_wordnet(
                    new_word,
                    lang,
                    current_depth=current_depth + 1,
                    max_depth=max_depth,
                    _previous_graph=graph,
                )
        for holonyms in synset.member_holonyms():
            # word "face" is a holonym of the word "eye".
            for new_word in holonyms.lemma_names(lang):
                if graph.word_in_graph(new_word):
                    continue
                assert new_word != word
                graph.add_word(
                    new_word,
                    current_depth,
                    "holonym",
                    word,
                    ss_uri,
                    comesFrom="http://wordnet-rdf.princeton.edu/",
                )
                graph = explore_wordnet(
                    new_word,
                    lang,
                    current_depth=current_depth + 1,
                    max_depth=max_depth,
                    _previous_graph=graph,
                )

    return graph
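The function above is a thin recursive wrapper around a handful of NLTK WordNet calls. The sketch below shows those raw calls in isolation (it assumes `nltk` is installed and the `wordnet`/`omw-1.4` corpora have been downloaded; the values printed are indicative only and may differ across WordNet versions):

from nltk.corpus import wordnet as wn

# the same calls explore_wordnet() relies on, for the English word "book"
synsets = wn.synsets("book", lang="eng")     # "eng" is the ISO 639-3 code used internally
first = synsets[0]
print(first.offset(), first.pos())           # offset + POS, used to build the synset URI
print(first.lemma_names("eng"))              # synonyms (other lemmas of the synset)
print([h.lemma_names("eng") for h in first.hypernyms()])        # more general terms
print([h.lemma_names("eng") for h in first.hyponyms()])         # more specific terms
print([h.lemma_names("eng") for h in first.member_holonyms()])  # wholes this is a member of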
Example #3
def explore_wolf(
    french_word: str,
    path_to_wolf: str,
    max_depth: int = 5,
    seeds=[],
    current_depth=1,
    _previous_graph=None,
    _wolf_object=None,
):
    """Explore the French wordnet WOLF like the `explore_wordnet()` function

    Args:
        word (str): the word
        path_to_wolf (str): "/path/to/wolf.xml"
        depth (int): the depth of the reccursion

    Returns:
        a :obj:`Graph` object containing the terms

    .. code:: python

        >>> from wordnet_explorer.explorer import explore_wolf
        >>> g = explore_wolf('fromage', '/path/to/wolf', 1)
        >>> print(g)
        @prefix ns1: <http://www.w3.org/2004/02/skos/core#> .
        @prefix ns2: <urn:default:baseUri:#> .
        @prefix ns3: <http://www.w3.org/2006/03/wn/wn20/schema/> .
        @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

        ns2:aliment ns1:prefLabel "aliment" ;
            ns3:hypernymOf ns2:root_word_uri ;
            ns2:depth 1 .

        ns2:alimentation ns1:prefLabel "alimentation" ;
            ns3:hypernymOf ns2:root_word_uri ;
            ns2:depth 1 .

        ns2:familier ns1:prefLabel "familier" ;
            ns3:hypernymOf ns2:root_word_uri ;
            ns2:depth 1 .

        ns2:laitage ns1:prefLabel "laitage" ;
            ns3:hypernymOf ns2:root_word_uri ;
            ns2:depth 1 .

    """
    logging.debug(
        f"Exploring WOLF with word '{french_word}' at depth '{current_depth}'")
    logging.debug(f"seeds are '{seeds}'")
    new_seeds = seeds[:]

    if not _previous_graph:
        # initializing the Graph for the 1st time
        graph = Graph()
        # adding the root source
        graph.add_root_word(french_word)
    else:
        graph = _previous_graph

    logging.debug(f"graph root words: {graph.root_words}")

    if current_depth - 1 == max_depth:
        # recursion ends
        return graph

    if not _wolf_object:
        # to avoid reading the file each time the function is called
        _wolf_object = FreNetic(path_to_wolf)

    if not _wolf_object.synsets(french_word):
        logging.error(
            f"The word '{french_word}' is not contained in '{path_to_wolf}'. Returning an empty graph"
        )
        return graph

    if graph.root_words[0] in new_seeds and len(graph.root_words) > 1:
        new_seeds.remove(graph.root_words[0])

    for synset in _wolf_object.synsets(french_word):
        # ss_uri = (
        #     "http://wordnet-rdf.princeton.edu/pwn30/"
        #     + str(synset.offset()).zfill(8)
        #     + "-"
        #     + synset.pos()
        # )
        for word in synset.literals():
            word = str(word)
            if graph.word_in_graph(word):
                continue
            assert word != french_word, f"word: '{word}'\tcurrent_depth {current_depth}"

            # testing if any synset of the new word contains a seed word
            is_relevant = False
            for synset_to_check in _wolf_object.synsets(word):
                for word_synset in synset_to_check.literals():
                    if str(word_synset) in new_seeds:
                        is_relevant = True
                        break
                if is_relevant:
                    break
            if is_relevant:
                graph.add_word(word,
                               current_depth,
                               "synonym",
                               french_word,
                               comesFrom=path_to_wolf)
                graph = explore_wolf(
                    word,
                    path_to_wolf,
                    seeds=new_seeds,
                    _wolf_object=_wolf_object,
                    current_depth=current_depth + 1,
                    max_depth=max_depth,
                    _previous_graph=graph,
                )
        for hyp in synset.hypernyms():
            for new_word in hyp.literals():
                new_word = str(new_word)
                if graph.word_in_graph(new_word):
                    continue
                assert new_word != french_word
                # testing if any synset of the new word contains a seed word
                is_relevant = False
                for synset_to_check in _wolf_object.synsets(new_word):
                    for word_synset in synset_to_check.literals():
                        if str(word_synset) in new_seeds:
                            is_relevant = True
                            break
                    if is_relevant:
                        break
                if is_relevant:
                    graph.add_word(
                        new_word,
                        current_depth,
                        "hypernym",
                        french_word,
                        comesFrom=path_to_wolf,
                    )
                    graph = explore_wolf(
                        new_word,
                        path_to_wolf,
                        seeds=new_seeds,
                        _wolf_object=_wolf_object,
                        current_depth=current_depth + 1,
                        max_depth=max_depth,
                        _previous_graph=graph,
                    )
    return graph
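The seed-relevance test (checking whether any synset of a candidate word contains one of the seed words) appears twice in `explore_wolf()`. A small helper, sketched below using only the `synsets()` and `literals()` calls already present above, could factor it out:

def _word_matches_seeds(wolf_object, word, seeds):
    """Return True if any literal of any synset of `word` is in `seeds`."""
    for synset in wolf_object.synsets(word):
        for literal in synset.literals():
            if str(literal) in seeds:
                return True
    return False

The two `is_relevant` loops would then reduce to `if _word_matches_seeds(_wolf_object, new_word, new_seeds): ...`.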
Example #4
class TestGraph(unittest.TestCase):

    txt_out_file = "_.txt"
    xlsx_out_file = "_.xlsx"

    def setUp(self):
        self.g = Graph()
        touch(self.txt_out_file)
        touch(self.xlsx_out_file)

    def tearDown(self):
        # return
        os.remove(self.txt_out_file)
        os.remove(self.xlsx_out_file)

    def test_add_word(self):
        self.g.add_word("test", 5, "synonym", "target_word")

    def test___contains(self):
        self.g.add_word("test", 5, "synonym", "target_word")
        self.assertTrue("test" in self.g)
        self.assertFalse("notest" in self.g)

    def test_str(self):
        self.g.add_word("test", 5, "synonym", "target_word")
        self.assertIsInstance(self.g.to_str(), str)
        g2 = rdflib.Graph()
        g2.parse(data=str(self.g), format="ttl")

    def test_word_in_graph(self):
        self.g.add_word("test", 5, "synonym", "target_word")
        self.assertTrue(self.g.word_in_graph("test"))
        self.assertFalse(self.g.word_in_graph("tfdfdfest"))

    def test_to_text_file(self):
        self.g.parse(self.graph_test_path, format="ttl")
        self.g.to_text_file(self.txt_out_file)
        with open(self.txt_out_file) as f:
            words = sorted([line.strip() for line in f if line.strip()])
        self.assertEqual(words, self.g.to_list())

    def test_good_words(self):
        self.g.parse(self.graph_test_path, format="ttl")
        for word in self.g.to_list():
            self.assertTrue(word)  # no empty words
            self.assertEqual(unidecode(word.lower().strip()), word)

    def test_is_empty(self):
        self.assertTrue(self.g.is_empty())
        self.assertRaises(AssertionError, self.g._set_root_word_attribute)
        rw_strings = [
            "root_word_string_1", "root_word_string_2", "root_word_string_3"
        ]
        for w in rw_strings:
            self.g.add_root_word(w)
            self.assertTrue(self.g.is_empty())
        self.assertEqual(rw_strings, self.g.to_list())
        self.g._set_root_word_attribute()
        for w in self.g.root_words:
            self.assertTrue(isinstance(w, str))

    def test_add_several_root_words(self):
        self.g.add_root_word("root_word_string_1")
        self.g.add_root_word("root_word_string_2")
        self.g.add_root_word("root_word_string_3")
        self.g.to_text_file(self.txt_out_file)
        with open(self.txt_out_file) as f:
            words = sorted([line.strip() for line in f if line.strip()])
        self.assertEqual(words, self.g.to_list())

    def test_add_several_root_words_with_previous_graph(self):
        self.g.parse(self.graph_test_path, format="ttl")
        self.g.add_root_word("root_word_string_1")
        self.g.add_root_word("root_word_string_2")
        self.g.add_root_word("root_word_string_3")

        self.g.to_text_file(self.txt_out_file)
        with open(self.txt_out_file) as f:
            words = sorted([line.strip() for line in f if line.strip()])
        self.assertEqual(words, self.g.to_list())
        self.test_list_is_sorted()

    def test_list_is_sorted(self):
        self.assertEqual(sorted(self.g.to_list()), self.g.to_list())

    def test___len__(self):
        self.assertFalse(len(self.g))
        for i in range(1, 10):
            self.g.add_root_word(f"test_{i}")
            self.assertEqual(len(self.g), i)
        self.g.add_word("test", 5, "synonym", "target_word")
        self.assertEqual(len(self.g), i + 1)

    def test_add_relation(self):
        self.g.add_root_word("root_word_string_1")
        self.g.add_word("test", 1, "synonym", "root_word_string_1")
        self.g.add_word("test", 1, "hyponym", "root_word_string_1")
        self.g.add_word("test", 1, "hypernym", "root_word_string_1")
        self.g.add_word("test", 1, "holonym", "root_word_string_1")
        self.assertRaises(
            ValueError,
            self.g.add_word,
            "test",
            1,
            "thing_that_ends_with_nym",
            "root_word_string_1",
        )

    def test_to_xlsx(self):
        self.g.add_root_word("dog")
        self.g.to_xlsx_file(self.xlsx_out_file)

    def test__get_maximum_origin(self):
        self.assertFalse(self.g._get_maximum_origin())
        for i in range(1, 5):
            self.g.add((
                rdflib.Literal("node_uri"),
                rdflib.URIRef("urn:default:baseUri:#comesFrom"),
                rdflib.Literal(f"test-{i}"),
            ))
            self.assertEqual(self.g._get_maximum_origin(), i)

    def test_pop_non_relevant_words(self):
        self.assertFalse(len(self.g))
        for i in range(10):
            self.g.add_word("test",
                            1,
                            "synonym",
                            "target_word",
                            comesFrom=f"test-{i}")
        self.g.add_word("test2",
                        1,
                        "synonym",
                        "target_word_2",
                        comesFrom=f"test-x")
        self.assertEqual(len(self.g), 2)
        self.g.pop_non_relevant_words()
        self.assertEqual(len(self.g), 1)
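The `touch()` helper called in `setUp()` is not part of this excerpt; a minimal stand-in (an assumption, not necessarily the project's own implementation) and the usual entry point for running the tests could look like this:

import os
import unittest

def touch(path):
    """Create the file at `path` if it does not exist; leave it untouched otherwise."""
    with open(path, "a"):
        pass

if __name__ == "__main__":
    # run with: python -m unittest -v
    unittest.main()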
Example #5
def explore_nlp_model(
    word: str,
    model_path: str,
    max_depth: int = 5,
    current_depth=1,
    _previous_graph=None,
    _previous_model=None,
):
    """Explore the model reccursively and return a rdf graph
    containing the neighbour words.

    Args:
        word (str): the word
        model_path (str): the path of the nlp model
        current_depth (int): the depth of the reccursion

    Returns:
        a :obj:`Graph` object containing the terms

    .. code:: python

        >>> from lexicons_builder.nlp_model_explorer.explorer import explore_nlp_model
        >>> g = explore_nlp_model('test', '<path/to/model>', 2)
        >>> print(g)
        @prefix ns1: <http://taxref.mnhn.fr/lod/property/> .
        @prefix ns2: <http://www.w3.org/2004/02/skos/core#> .
        @prefix ns3: <urn:default:baseUri:#> .
        @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

        ns3:annale_corriger ns1:isSynonymOf ns3:qcm ;
            ns2:prefLabel "annale_corriger" ;
            ns3:comesFrom </home/k/models/frWac_no_postag_phrase_500_cbow_cut100.bin> ;
            ns3:depth 2 .

        ns3:applicatif ns1:isSynonymOf ns3:test_unitaire ;
            ns2:prefLabel "applicatif" ;
            ns3:comesFrom </home/k/models/frWac_no_postag_phrase_500_cbow_cut100.bin> ;
            ns3:depth 2 .

        ns3:applications ns1:isSynonymOf ns3:tests ;
            ns2:prefLabel "applications" ;
            ns3:comesFrom </home/k/models/frWac_no_postag_phrase_500_cbow_cut100.bin> ;
            ns3:depth 2 .

    """
    logging.debug(
        f"Exploring the model with word '{word}' current depth is '{current_depth}'"
    )

    if not _previous_graph:
        # initializing the Graph for the 1st time
        graph = Graph()
        # adding the root source
        graph.add_root_word(word)
    else:
        graph = _previous_graph

    if current_depth - 1 == max_depth:
        # recursion ends
        return graph

    if not _previous_model:
        logging.info(f"loading nlp model from '{model_path}'")
        # to avoid reading the file each time the function is invoked
        _previous_model = _load_model(model_path)

    if word not in _previous_model:
        # the model does not contain the original word
        return graph

    for new_word in [w[0] for w in _previous_model.most_similar(word)]:
        # add_word(self, word, depth, relation, target_word, synset_uri=None):
        if graph.word_in_graph(new_word):
            continue
        assert new_word != word
        graph.add_word(new_word,
                       current_depth,
                       "synonym",
                       word,
                       comesFrom=model_path)
        graph = explore_nlp_model(
            new_word,
            model_path,
            current_depth=current_depth + 1,
            max_depth=max_depth,
            _previous_graph=graph,
            _previous_model=_previous_model,
        )

    return graph
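`_load_model()` is not shown here, but the explorer only relies on two operations: a membership test (`word in model`) and `model.most_similar(word)` returning `(neighbour, similarity)` pairs. That matches gensim's `KeyedVectors` interface, so a plausible implementation (an assumption; the project may load its models differently) is:

from gensim.models import KeyedVectors

def _load_model(model_path):
    """Load word vectors stored in word2vec binary format (e.g. the frWac .bin models)."""
    return KeyedVectors.load_word2vec_format(model_path, binary=True)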