def assert_lang_supported_by_wordnet(lang):
    lang = iso_639_alpha3(lang)  # WordNet needs an alpha-3 (ISO 639-3) code
    if lang in wn.langs():
        return True
    raise ValueError(
        f"Language '{lang}' not implemented in WordNet. "
        f"Implemented languages are: {sorted(wn.langs())}"
    )
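# A minimal usage sketch for the check above; this assumes `wn` is
# nltk.corpus.wordnet (with the omw-1.4 data downloaded) and that
# iso_639_alpha3 comes from the languagecodes package:
from nltk.corpus import wordnet as wn
from languagecodes import iso_639_alpha3

assert assert_lang_supported_by_wordnet("fr")  # True: 'fra' is in wn.langs()
try:
    assert_lang_supported_by_wordnet("tlh")    # Klingon is not in WordNet
except ValueError as err:
    print(err)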
def test_alpha3(self):
    self.assertIsNone(iso_639_alpha3(""))
    self.assertIsNone(iso_639_alpha3(None))
    self.assertIsNone(iso_639_alpha3(6))
    self.assertIsNone(iso_639_alpha3("banana"))
    self.assertEqual(iso_639_alpha3("gub"), "gub")
    self.assertEqual(iso_639_alpha3("en"), "eng")
    self.assertEqual(iso_639_alpha3("eng"), "eng")
    self.assertIsNone(iso_639_alpha3("yu"))
def test_alpha3(self): self.assertIsNone(iso_639_alpha3("")) self.assertIsNone(iso_639_alpha3(None)) self.assertIsNone(iso_639_alpha3(6)) self.assertIsNone(iso_639_alpha3("banana")) self.assertEquals(iso_639_alpha3("gub"), "gub") self.assertEquals(iso_639_alpha3("en"), "eng") self.assertEquals(iso_639_alpha3("eng"), "eng") self.assertIsNone(iso_639_alpha3("yu"))
def _locale_names(self, locale):
    names = {}
    for lang in self.LANGUAGES:
        names[lang] = lang
    for code, label in locale.languages.items():
        code = iso_639_alpha3(code)
        if code in self.LANGUAGES:
            names[code] = label
    return names
def clean_text(
    self,
    text: str,
    fuzzy: bool = False,
    format: Optional[str] = None,
    proxy: Optional["EntityProxy"] = None,
) -> Optional[str]:
    code = iso_639_alpha3(text)
    if code not in self.LANGUAGES:
        return None
    return code
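# What the lookup above relies on: iso_639_alpha3 normalizes alpha-2 and
# alpha-3 spellings to one canonical alpha-3 code and returns None for
# anything it cannot map, so clean_text accepts "en" or "eng" alike and
# rejects unknown input. A quick check with the languagecodes package:
from languagecodes import iso_639_alpha3

assert iso_639_alpha3("en") == iso_639_alpha3("eng") == "eng"
assert iso_639_alpha3("banana") is None  # falls through to the `return None` branch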
def names(self):
    locale = get_locale()
    if locale not in self._names:
        self._names[locale] = {}
        for lang in self.LANGUAGES:
            self._names[locale][lang] = lang
        for code, label in locale.languages.items():
            code = iso_639_alpha3(code)
            if code in self.LANGUAGES:
                self._names[locale][code] = label
    return self._names[locale]
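# An illustration of the name-mapping pattern in the two methods above,
# building the dict with Babel directly; the real code presumably gets the
# locale from flask_babel's get_locale(), here one is constructed explicitly
# and LANGUAGES is a stand-in set:
from babel import Locale
from languagecodes import iso_639_alpha3

LANGUAGES = {"eng", "fra"}
locale = Locale("de")
names = {lang: lang for lang in LANGUAGES}
for code, label in locale.languages.items():
    code = iso_639_alpha3(code)
    if code in LANGUAGES:
        names[code] = label
print(names)  # {'eng': 'Englisch', 'fra': 'Französisch'}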
def get_iso_code(self, lang_attribute: str) -> str:
    """Convert an HTML lang attribute to an ISO 639-3 code.

    Args:
        lang_attribute (str): the value of the lang attribute (e.g. "en-US")

    Returns:
        str: the ISO 639-3 code
    """
    # https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes/lang
    language_subtag = lang_attribute.split("-")[0]
    return languagecodes.iso_639_alpha3(language_subtag)
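# A quick check of the subtag handling, assuming the languagecodes package:
# only the primary language subtag before the first "-" is used, so region
# subtags like "US" or "CA" are ignored.
import languagecodes

assert languagecodes.iso_639_alpha3("en-US".split("-")[0]) == "eng"
assert languagecodes.iso_639_alpha3("fr-CA".split("-")[0]) == "fra"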
def explore_wordnet(word: str, lang: str, max_depth: int = 5,
                    current_depth=1, _previous_graph=None) -> rdflib.Graph:
    """Explore WordNet recursively and return an RDF graph containing the
    synonyms, hyponyms, hypernyms and holonyms.

    Starting from a word, it looks for its synonyms, hyponyms, hypernyms and
    holonyms, and for each of these words it looks again, until the maximum
    depth of recursion is reached.

    Args:
        word (str): the word
        lang (str): language in ISO 639-1 code (eg: fr for French)
        max_depth (int): the maximum depth of the recursion
        current_depth (int): the current depth of the recursion

    Returns:
        a :obj:`Graph` object containing the terms

    .. code:: python

        >>> from wordnet_explorer.explorer import explore_wordnet
        RDFLib Version: 5.0.0
        >>> g = explore_wordnet('book', 'en', 1)
        >>> print(g)
        @prefix ns1: <http://www.w3.org/2004/02/skos/core#> .
        @prefix ns2: <http://www.w3.org/2006/03/wn/wn20/schema/> .
        @prefix ns3: <urn:default:baseUri:#> .
        @prefix ns4: <http://taxref.mnhn.fr/lod/property/> .
        @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

        ns3:brochure ns1:prefLabel "brochure" ;
            ns2:hyponymOf ns3:root_word_uri ;
            ns3:depth 1 ;
            ns3:synsetLink <http://wordnet-rdf.princeton.edu/pwn30/06410904-n> .

        ns3:album ns1:prefLabel "album" ;
            ns2:hyponymOf ns3:root_word_uri ;
            ns3:depth 1 ;
            ns3:synsetLink <http://wordnet-rdf.princeton.edu/pwn30/02870092-n> .
    """
    logging.debug(
        f"Exploring WORDNET with word '{word}' at depth '{current_depth}'")
    assert_lang_supported_by_wordnet(lang)
    lang = iso_639_alpha3(lang)

    if not _previous_graph:
        # initializing the Graph for the first time
        graph = Graph()
        # adding the root word
        graph.add_root_word(word)
    else:
        graph = _previous_graph

    if current_depth - 1 == max_depth:
        # the recursion ends
        return graph

    if word not in wn.words():
        logging.error(
            f"The word '{word}' is not contained in WordNet. Returning an empty graph"
        )
        return graph

    for synset in wn.synsets(word, lang=lang):
        ss_uri = ("http://wordnet-rdf.princeton.edu/pwn30/"
                  + str(synset.offset()).zfill(8)
                  + "-"
                  + synset.pos())
        for new_word in synset.lemma_names(lang):
            if graph.word_in_graph(new_word):
                continue
            assert new_word != word
            graph.add_word(
                new_word,
                current_depth,
                "synonym",
                word,
                ss_uri,
                comesFrom="http://wordnet-rdf.princeton.edu/",
            )
            graph = explore_wordnet(
                new_word,
                lang,
                current_depth=current_depth + 1,
                max_depth=max_depth,
                _previous_graph=graph,
            )
        for hypernyms in synset.hypernyms():  # colour is a hypernym of red
            for new_word in hypernyms.lemma_names(lang):
                if graph.word_in_graph(new_word):
                    continue
                assert new_word != word
                graph.add_word(
                    new_word,
                    current_depth,
                    "hypernym",
                    word,
                    ss_uri,
                    comesFrom="http://wordnet-rdf.princeton.edu/",
                )
                graph = explore_wordnet(
                    new_word,
                    lang,
                    current_depth=current_depth + 1,
                    max_depth=max_depth,
                    _previous_graph=graph,
                )
        for hyponyms in synset.hyponyms():  # spoon is a hyponym of cutlery
            for new_word in hyponyms.lemma_names(lang):
                if graph.word_in_graph(new_word):
                    continue
                assert new_word != word
                graph.add_word(
                    new_word,
                    current_depth,
                    "hyponym",
                    word,
                    ss_uri,
                    comesFrom="http://wordnet-rdf.princeton.edu/",
                )
                graph = explore_wordnet(
                    new_word,
                    lang,
                    current_depth=current_depth + 1,
                    max_depth=max_depth,
                    _previous_graph=graph,
                )
        for holonyms in synset.member_holonyms():  # "face" is a holonym of "eye"
            for new_word in holonyms.lemma_names(lang):
                if graph.word_in_graph(new_word):
                    continue
                assert new_word != word
                graph.add_word(
                    new_word,
                    current_depth,
                    "holonym",
                    word,
                    ss_uri,
                    comesFrom="http://wordnet-rdf.princeton.edu/",
                )
                graph = explore_wordnet(
                    new_word,
                    lang,
                    current_depth=current_depth + 1,
                    max_depth=max_depth,
                    _previous_graph=graph,
                )

    return graph
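# A follow-on sketch: since the returned object is an rdflib.Graph subclass,
# the standard rdflib API should apply; the file name below is illustrative.
g = explore_wordnet("book", "en", max_depth=1)
g.serialize(destination="book_wordnet.ttl", format="turtle")  # save as Turtle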
def clean_text(self, text, **kwargs):
    code = iso_639_alpha3(text)
    if code in self.LANGUAGES:
        return code
def answer(query: str, page: int) -> jsonify:
    queryBeforePreprocessing = query
    queryLanguage = inferLanguage(query)
    if getResourceFromDBPedia(
            queryBeforePreprocessing)["verification"] == False:
        # the last parameter indicates the max edit distance for the lookup
        spellCheckerSuggestions = spellChecker.lookup_compound(query, 2)
        query = " ".join(
            [suggestion.term for suggestion in spellCheckerSuggestions])
    else:
        query = queryBeforePreprocessing
    query = query.lower().strip()
    if queryLanguage != "en":
        try:
            # translate the query to English via Apertium, using the
            # alpha-3 language pair (e.g. "fra|eng")
            query = requests.get(
                "https://www.apertium.org/apy/translate",
                params={
                    'q': query,
                    'markUnknown': 'no',
                    'langpair':
                    languagecodes.iso_639_alpha3(queryLanguage) + '|eng'
                }).json()['responseData']['translatedText']
        except:
            pass
            # return {
            #     "answer": "Could not translate input.",
            #     "small_summary": "Could not translate input.",
            #     "corrected": query,
            #     "urls": []
            # }
    try:
        q_vec = getSentenceMeanVector(query)
    except:
        return {
            "answer": "No relevant information available.",
            "small_summary": "No relevant information available.",
            "corrected": query,
            "urls": []
        }
    urls = loadMoreUrls(q_vec, queryLanguage, numberOfURLs, page)
    try:
        minimumScore = max(urls["scores"])
    except:
        minimumScore = 1
    listOfDataFromPeers = asyncio.run(
        getDataFromPeers(query, q_vec, queryLanguage, numberOfURLs, page,
                         minimumScore))
    if len(listOfDataFromPeers) > 0:
        # merge the host's results with the peers' results, sorted by score
        listOfUrlsFromHost = list(zip(urls["urls"], urls["scores"]))
        listOfUrlsFromPeers = list(
            itertools.chain(*[pack["urls"] for pack in listOfDataFromPeers]))
        bigListOfUrls = listOfUrlsFromHost + listOfUrlsFromPeers
        bigListOfUrls.sort(key=lambda x: x[1])
        bigListOfUrls = [url[0] for url in bigListOfUrls]
    else:
        bigListOfUrls = urls["urls"]
    try:
        DBPediaDef = getDefinitionFromDBPedia(query)
    except:
        try:
            # fall back to evaluating the query as an arithmetic expression
            DBPediaDef = query + " = " + str(ne.evaluate(query)[()])
        except:
            DBPediaDef = "Brief description not found."
    # deduplicate the result dicts while preserving order
    bigListOfUrls = list(
        {frozenset(item.items()): item for item in bigListOfUrls}.values())
    if page == 1:
        registerQueryInAnalytics(query)
        return {
            "answer": escapeHTMLString(getAbstractFromDBPedia(query)),
            "small_summary": escapeHTMLString(DBPediaDef),
            "corrected": escapeHTMLString(query),
            "urls": bigListOfUrls[:15]
        }
    else:
        return {"urls": bigListOfUrls[:15]}
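# The translation step above, isolated into a hypothetical helper; the
# endpoint and parameters are exactly those used in answer(), and Apertium's
# APy expects an alpha-3 language pair such as "fra|eng":
import requests
import languagecodes

def translate_to_english(query: str, source_lang: str) -> str:
    pair = languagecodes.iso_639_alpha3(source_lang) + "|eng"
    resp = requests.get(
        "https://www.apertium.org/apy/translate",
        params={"q": query, "markUnknown": "no", "langpair": pair},
    )
    return resp.json()["responseData"]["translatedText"]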