Example #1
from languagecodes import iso_639_alpha3
from nltk.corpus import wordnet as wn


def assert_lang_supported_by_wordnet(lang):
    lang = iso_639_alpha3(lang)  # WordNet expects an ISO 639-3 code
    if lang in wn.langs():
        return True
    raise ValueError(
        f"Language '{lang}' not implemented in WordNet. Implemented languages are : {sorted(wn.langs())}"
    )
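
A minimal usage sketch (assuming NLTK's WordNet and Open Multilingual WordNet corpora are installed, so that wn.langs() includes 'fra'):

assert_lang_supported_by_wordnet("fr")   # True ('fr' is normalised to 'fra')
assert_lang_supported_by_wordnet("xx")   # raises ValueError listing wn.langs()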
Example #2
def test_alpha3(self):
    self.assertIsNone(iso_639_alpha3(''))
    self.assertIsNone(iso_639_alpha3(None))
    self.assertIsNone(iso_639_alpha3(6))
    self.assertIsNone(iso_639_alpha3('banana'))
    self.assertEqual(iso_639_alpha3('gub'), 'gub')
    self.assertEqual(iso_639_alpha3('en'), 'eng')
    self.assertEqual(iso_639_alpha3('eng'), 'eng')
    self.assertIsNone(iso_639_alpha3('yu'))
Example #3
def _locale_names(self, locale):
    names = {}
    for lang in self.LANGUAGES:
        names[lang] = lang
    for code, label in locale.languages.items():
        code = iso_639_alpha3(code)
        if code in self.LANGUAGES:
            names[code] = label
    return names
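
The same pattern works standalone; a sketch assuming locale is a babel.Locale (whose .languages attribute maps language codes to localized display names) and a hypothetical LANGUAGES whitelist:

from babel import Locale
from languagecodes import iso_639_alpha3

LANGUAGES = {"eng", "fra", "deu"}  # hypothetical whitelist of supported codes

def locale_names(locale):
    names = {lang: lang for lang in LANGUAGES}    # default label: the code itself
    for code, label in locale.languages.items():  # e.g. 'en' -> 'inglês' for Locale('pt')
        code = iso_639_alpha3(code)
        if code in LANGUAGES:
            names[code] = label
    return names

print(locale_names(Locale("pt"))["eng"])  # 'inglês'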
Example #4
def clean_text(
    self,
    text: str,
    fuzzy: bool = False,
    format: Optional[str] = None,
    proxy: Optional["EntityProxy"] = None,
) -> Optional[str]:
    code = iso_639_alpha3(text)
    if code not in self.LANGUAGES:
        return None
    return code
Example #5
def names(self):
    locale = get_locale()
    if locale not in self._names:
        self._names[locale] = {}
        for lang in self.LANGUAGES:
            self._names[locale][lang] = lang
        for code, label in locale.languages.items():
            code = iso_639_alpha3(code)
            if code in self.LANGUAGES:
                self._names[locale][code] = label
    return self._names[locale]
Example #6
def get_iso_code(self, lang_attribute: str) -> Optional[str]:
    """Convert an HTML lang attribute to an ISO 639-3 code.

    Args:
        lang_attribute (str): value of the lang attribute, e.g. "en-US"

    Returns:
        Optional[str]: ISO 639-3 code, or None if the subtag is not recognised
    """
    # https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes/lang
    language_subtag = lang_attribute.split("-")[0]
    return languagecodes.iso_639_alpha3(language_subtag)
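
For instance, a BCP 47 value such as "en-US" is reduced to its primary subtag before conversion (a quick check, assuming the languagecodes package is installed):

import languagecodes

# "en-US" -> primary subtag "en" -> ISO 639-3 code "eng"
print(languagecodes.iso_639_alpha3("en-US".split("-")[0]))  # eng
print(languagecodes.iso_639_alpha3("pt-BR".split("-")[0]))  # por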
Example #7
def explore_wordnet(word: str,
                    lang: str,
                    max_depth: int = 5,
                    current_depth=1,
                    _previous_graph=None) -> rdflib.Graph:
    """Explore WordNet recursively and return an RDF graph
    containing the synonyms, hyponyms, hypernyms and holonyms.

    Starting from a word, it looks up the word's synonyms, hyponyms,
    hypernyms and member holonyms, and repeats the lookup for each of
    these words until the maximum recursion depth is reached.

    Args:
        word (str): the word to start from
        lang (str): language in ISO 639-1 code (e.g. fr for French)
        max_depth (int): maximum depth of the recursion
        current_depth (int): current depth of the recursion (used internally)

    Returns:
        a :obj:`Graph` object containing the terms


    .. code:: python

        >>> from wordnet_explorer.explorer import explore_wordnet
        RDFLib Version: 5.0.0
        >>> g = explore_wordnet('book', 'en', 1)
        >>> print(g)
        @prefix ns1: <http://www.w3.org/2004/02/skos/core#> .
        @prefix ns2: <http://www.w3.org/2006/03/wn/wn20/schema/> .
        @prefix ns3: <urn:default:baseUri:#> .
        @prefix ns4: <http://taxref.mnhn.fr/lod/property/> .
        @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

        ns3:brochure ns1:prefLabel "brochure" ;
            ns2:hyponymOf ns3:root_word_uri ;
            ns3:depth 1 ;
            ns3:synsetLink <http://wordnet-rdf.princeton.edu/pwn30/06410904-n> .


        ns3:album ns1:prefLabel "album" ;
            ns2:hyponymOf ns3:root_word_uri ;
            ns3:depth 1 ;
            ns3:synsetLink <http://wordnet-rdf.princeton.edu/pwn30/02870092-n> .

    """
    logging.debug(
        f"Exploring WORDNET with word '{word}' at depth '{current_depth}'")

    assert_lang_supported_by_wordnet(lang)
    lang = iso_639_alpha3(lang)

    if not _previous_graph:
        # initializing the Graph for the 1st time
        graph = Graph()
        # adding the root word
        graph.add_root_word(word)
    else:
        graph = _previous_graph

    if current_depth - 1 == max_depth:
        # recursion ends: the requested depth has been fully explored
        return graph

    if word not in wn.words():
        logging.error(
            f"The word '{word}' is not in WordNet. Returning the graph unchanged"
        )
        return graph

    for synset in wn.synsets(word, lang=lang):
        ss_uri = ("http://wordnet-rdf.princeton.edu/pwn30/" +
                  str(synset.offset()).zfill(8) + "-" + synset.pos())
        for new_word in synset.lemma_names(lang):
            if graph.word_in_graph(new_word):
                continue
            assert new_word != word
            graph.add_word(
                new_word,
                current_depth,
                "synonym",
                word,
                ss_uri,
                comesFrom="http://wordnet-rdf.princeton.edu/",
            )
            graph = explore_wordnet(
                new_word,
                lang,
                current_depth=current_depth + 1,
                max_depth=max_depth,
                _previous_graph=graph,
            )
        for hypernym in synset.hypernyms():
            # "colour" is a hypernym of "red"
            for new_word in hypernym.lemma_names(lang):
                if graph.word_in_graph(new_word):
                    continue
                assert new_word != word
                graph.add_word(
                    new_word,
                    current_depth,
                    "hypernym",
                    word,
                    ss_uri,
                    comesFrom="http://wordnet-rdf.princeton.edu/",
                )
                graph = explore_wordnet(
                    new_word,
                    lang,
                    current_depth=current_depth + 1,
                    max_depth=max_depth,
                    _previous_graph=graph,
                )
        for hyponym in synset.hyponyms():
            # "spoon" is a hyponym of "cutlery"
            for new_word in hyponym.lemma_names(lang):
                if graph.word_in_graph(new_word):
                    continue
                assert new_word != word
                graph.add_word(
                    new_word,
                    current_depth,
                    "hyponym",
                    word,
                    ss_uri,
                    comesFrom="http://wordnet-rdf.princeton.edu/",
                )
                graph = explore_wordnet(
                    new_word,
                    lang,
                    current_depth=current_depth + 1,
                    max_depth=max_depth,
                    _previous_graph=graph,
                )
        for holonym in synset.member_holonyms():
            # e.g. "forest" is a member holonym of "tree"
            for new_word in holonym.lemma_names(lang):
                if graph.word_in_graph(new_word):
                    continue
                assert new_word != word
                graph.add_word(
                    new_word,
                    current_depth,
                    "holonym",
                    word,
                    ss_uri,
                    comesFrom="http://wordnet-rdf.princeton.edu/",
                )
                graph = explore_wordnet(
                    new_word,
                    lang,
                    current_depth=current_depth + 1,
                    max_depth=max_depth,
                    _previous_graph=graph,
                )

    return graph
Example #8
def clean_text(self, text, **kwargs):
    code = iso_639_alpha3(text)
    if code in self.LANGUAGES:
        return code
Example #9
def answer(query: str, page: int) -> dict:
    queryBeforePreprocessing = query
    queryLanguage = inferLanguage(query)
    if not getResourceFromDBPedia(queryBeforePreprocessing)["verification"]:
        # second argument is the maximum edit distance for the lookup
        spellCheckerSuggestions = spellChecker.lookup_compound(query, 2)
        query = " ".join(
            suggestion.term for suggestion in spellCheckerSuggestions)
    else:
        query = queryBeforePreprocessing
    query = query.lower().strip()
    if queryLanguage != "en":
        try:
            query = requests.get(
                "https://www.apertium.org/apy/translate",
                params={
                    'q':
                    query,
                    'markUnknown':
                    'no',
                    'langpair':
                    languagecodes.iso_639_alpha3(queryLanguage) + '|eng'
                }).json()['responseData']['translatedText']
        except:
            pass
            # return {
            # "answer": "Could not translate input.",
            # "small_summary": "Could not translate input.",
            # "corrected": query,
            # "urls": []
            # }
    try:
        q_vec = getSentenceMeanVector(query)
    except Exception:
        return {
            "answer": "No relevant information available.",
            "small_summary": "No relevant information available.",
            "corrected": query,
            "urls": []
        }

    urls = loadMoreUrls(q_vec, queryLanguage, numberOfURLs, page)
    try:
        minimumScore = max(urls["scores"])
    except Exception:
        minimumScore = 1

    listOfDataFromPeers = asyncio.run(
        getDataFromPeers(query, q_vec, queryLanguage, numberOfURLs, page,
                         minimumScore))
    if len(listOfDataFromPeers) > 0:
        listOfUrlsFromHost = list(zip(urls["urls"], urls["scores"]))
        listOfUrlsFromPeers = list(
            itertools.chain(*[pack["urls"] for pack in listOfDataFromPeers]))
        bigListOfUrls = listOfUrlsFromHost + listOfUrlsFromPeers
        bigListOfUrls.sort(key=lambda x: x[1])  # sort merged results by score
        bigListOfUrls = [url[0] for url in bigListOfUrls]
    else:
        bigListOfUrls = urls["urls"]

    try:
        DBPediaDef = getDefinitionFromDBPedia(query)
    except Exception:
        # fall back to evaluating the query as a numeric expression
        try:
            DBPediaDef = query + " = " + str(ne.evaluate(query)[()])
        except Exception:
            DBPediaDef = "Brief description not found."

    # deduplicate the result dicts while preserving order
    bigListOfUrls = list(
        {frozenset(item.items()): item
         for item in bigListOfUrls}.values())

    if page == 1:
        registerQueryInAnalytics(query)
        return {
            "answer": escapeHTMLString(getAbstractFromDBPedia(query)),
            "small_summary": escapeHTMLString(DBPediaDef),
            "corrected": escapeHTMLString(query),
            "urls": bigListOfUrls[:15]
        }
    else:
        return {"urls": bigListOfUrls[:15]}