def get_frequency_list(lang, wordlist='best', match_cutoff=30):
    """
    Read the raw data from a wordlist file, returning it as a list of
    lists. (See `read_cBpack` for what this represents.)

    Because we use the `langcodes` module, we can handle slight
    variations in language codes. For example, looking for 'pt-BR',
    'pt_br', or even 'PT_BR' will get you the 'pt' (Portuguese) list.
    Looking up the alternate code 'por' will also get the same list.
    """
    available = available_languages(wordlist)
    best, score = langcodes.best_match(lang, list(available),
                                       min_score=match_cutoff)
    if score == 0:
        raise LookupError("No wordlist %r available for language %r"
                          % (wordlist, lang))

    if best != lang:
        logger.warning(
            "You asked for word frequencies in language %r. Using the "
            "nearest match, which is %r (%s)."
            % (lang, best, langcodes.get(best).language_name('en'))
        )

    return read_cBpack(available[best])
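# A minimal usage sketch (assumption: this runs inside the module that
# defines available_languages/read_cBpack above, with its wordlist data
# files installed).
if __name__ == '__main__':
    # 'pt-BR' and the alternate code 'por' both resolve to the 'pt' list.
    freqs = get_frequency_list('pt-BR')
    # Each inner list is one bucket of words with the same rounded frequency
    # (see read_cBpack for the format).
    print(len(freqs), freqs[0][:5])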
def _language_in_list(language, targets, min_score=80):
    """
    A helper function to determine whether this language matches one of the
    target languages, with a match score above a certain threshold.

    The languages can be given as strings (language tags) or as Language
    objects. `targets` can be any iterable of such languages.
    """
    matched = best_match(language, targets, min_score=min_score)
    return matched[1] > 0
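# Illustrative check of the helper's semantics: langcodes.best_match returns
# a (tag, score) pair, with score 0 (and tag 'und') when nothing clears
# min_score, so `matched[1] > 0` is True only for an acceptably close match.
if __name__ == '__main__':
    print(_language_in_list('pt-BR', ['pt', 'es']))  # True: close match to 'pt'
    print(_language_in_list('ja', ['pt', 'es']))     # False: no match, score 0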
def process():
    # Install Open Multilingual Wordnet if not already installed.
    nltkd = nltk.downloader.Downloader()
    if not nltkd.is_installed('omw'):
        nltk.download('omw')

    # Figure out ISO 639-2 code for specified locale. Exit if unavailable.
    print(args.language)
    iso639_2 = langcodes.best_match(args.language, wn.langs())[0]
    print(iso639_2)
    print(wn.langs())
    if iso639_2 == 'und':
        # Nearest ISO 639-2 code is undefined.
        exit("Requested language is not available on this NLTK Wordnet "
             "installation.")

    # Obtain set of lowercased lemmas that belong to only one part of speech.
    posdb = dict()
    single_pos_lemmas = set()
    for pos in ['a', 'r', 'n', 'v']:
        posdb[pos] = set()
        # Note: wn.all_lemma_names() returns the lemma names in all lowercase.
        # To remove lemmas that are sometimes or always capitalised in normal
        # writing (e.g. "China" or "Arulo"), we will need to obtain capitalised
        # lemmas from Wordnet later on, and remove members of our output set
        # that are identical to the lowercased transformation of those
        # capitalised lemmas.
        for lemma in wn.all_lemma_names(pos=pos, lang=iso639_2):
            posdb[pos].add(lemma)
        single_pos_lemmas.symmetric_difference_update(posdb[pos])

    # Remove lemmas containing characters other than a-z.
    output_set = set()
    for term in single_pos_lemmas:
        if non_word.search(term) is not None:
            continue
        output_set.add(term)

    # Obtain a set of lemmas that are typically capitalised in normal writing,
    # then drop output words identical to the lowercased transformation of one
    # of them (the step described in the note above).
    unlowered_lemmas = set()
    for synset in wn.all_synsets():
        for lemma in synset.lemma_names():
            unlowered_lemmas.add(lemma)
    capitalised_lowered = set(lemma.lower() for lemma in unlowered_lemmas
                              if lemma != lemma.lower())
    output_set = set(word for word in output_set
                     if word not in capitalised_lowered)

    # Keep only words of 5 or 6 letters.
    # Filter inspiration: http://stackoverflow.com/a/16562558
    output_set = set(x for x in output_set if 4 < len(x) < 7)

    # Remove words that are also (lowercased) personal names.
    names_lowered = set()
    for name in nltk.corpus.names.words():
        names_lowered.add(name.lower())
    output_set = set(x for x in output_set if x not in names_lowered)

    print(output_set)
    # print(single_pos_lemmas)
    print(len(single_pos_lemmas))
    print(len(output_set))
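# Assumed module-level context for process(), inferred from the names it
# uses (args, non_word, wn, nltk, langcodes); the originals are not shown,
# so treat this as a sketch.
import argparse
import re

import langcodes
import nltk
import nltk.downloader
from nltk.corpus import wordnet as wn

# Assumption: non_word flags any character outside a-z, matching the
# "characters other than a-z" comment in process().
non_word = re.compile(r'[^a-z]')

parser = argparse.ArgumentParser(description='Build a word list from Wordnet.')
parser.add_argument('language', help='language tag, e.g. pt-BR')
args = parser.parse_args()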
def best_langtag_list(accept):
    enabledlangs = []
    for bcp47instance in BCP47.objects.filter(enabled=True):
        enabledlangs.append(bcp47instance.langtag)

    data = []
    for accept_lang, _ in parse_accept_lang_header(accept):
        match = best_match(accept_lang, enabledlangs)
        data.append({
            'langtag': match[0],
            'score': match[1],
            'accept_lang': accept_lang,
        })
    return data
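# Sketch of the function in action (hypothetical data; actual tags and
# scores depend on the enabled BCP47 rows and the langcodes version).
# parse_accept_lang_header is Django's, from
# django.utils.translation.trans_real, and yields (language, q-value)
# pairs in preference order. The view method below applies the same logic
# to the request's Accept-Language header.
if __name__ == '__main__':
    for entry in best_langtag_list('pt-BR,pt;q=0.9,en;q=0.5'):
        print(entry['accept_lang'], '->', entry['langtag'], entry['score'])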
def get(self, request, **kwargs):
    enabledlangs = []
    for bcp47instance in BCP47.objects.filter(enabled=True):
        enabledlangs.append(bcp47instance.langtag)

    accept = request.META.get('HTTP_ACCEPT_LANGUAGE', '')
    data = []
    for accept_lang, _ in parse_accept_lang_header(accept):
        match = best_match(accept_lang, enabledlangs)
        data.append({
            'langtag': match[0],
            'score': match[1],
            'accept_lang': accept_lang,
        })
    # results = BestlangtagSerializer(data, many=True).data
    return Response(data)
def get_best_match_score(self, obj):
    # best_match(accept_lang, enabledlangs, min_score=50)
    accept_header = self.context['request'].META.get('HTTP_ACCEPT_LANGUAGE', '')
    accept_lang = []
    # Read https://docs.djangoproject.com/en/2.1/topics/i18n/translation/#internationalization-in-python-code
    for accepted, _q in parse_accept_lang_header(accept_header):
        accept_lang.append(accepted)

    # Get the best accepted match for this tag.
    best = best_match(obj.langtag, accept_lang, min_score=50)

    # Get the index of the matched language.
    try:
        idx = accept_lang.index(best[0])
    except ValueError:
        return 0

    # Return the score minus the match's index, so a match at index 0 with
    # score 100 returns 100, and a match at index 1 with score 100 returns 99.
    return best[1] - idx
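# Standalone sketch of the ranking idea above: best_match's score is reduced
# by the match's position in the Accept-Language list, so earlier (more
# preferred) entries outrank later ones at equal match quality. Values here
# are hypothetical.
if __name__ == '__main__':
    accept_lang = ['pt-br', 'pt', 'en']          # parsed, in preference order
    best = ('pt', 100)                           # hypothetical best_match result
    print(best[1] - accept_lang.index(best[0]))  # 100 - 1 = 99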