Example #1
def get_frequency_list(lang, wordlist='best', match_cutoff=30):
    """
    Read the raw data from a wordlist file, returning it as a list of
    lists. (See `read_cBpack` for what this represents.)

    Because we use the `langcodes` module, we can handle slight
    variations in language codes. For example, looking for 'pt-BR',
    'pt_br', or even 'PT_BR' will get you the 'pt' (Portuguese) list.
    Looking up the alternate code 'por' will also get the same list.
    """
    available = available_languages(wordlist)
    best, score = langcodes.best_match(lang, list(available),
                                       min_score=match_cutoff)
    if score == 0:
        raise LookupError("No wordlist %r available for language %r"
                          % (wordlist, lang))

    if best != lang:
        logger.warning(
            "You asked for word frequencies in language %r. Using the "
            "nearest match, which is %r (%s)."
            % (lang, best, langcodes.get(best).language_name('en'))
        )

    return read_cBpack(available[best])
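The behaviour described in the docstring is easy to check directly. A minimal sketch, assuming the pre-3.0 `langcodes` API these snippets use (`best_match` returns a `(tag, score)` pair; later releases renamed this to `closest_match`):

import langcodes

supported = ['en', 'pt', 'es']
for query in ['pt-BR', 'pt_br', 'PT_BR', 'por']:
    tag, score = langcodes.best_match(query, supported, min_score=30)
    print(query, '->', tag, score)  # each variant is expected to resolve to 'pt'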
Example #2
def _language_in_list(language, targets, min_score=80):
    """
    A helper function to determine whether this language matches one of the
    target languages, with a match score above a certain threshold.

    The languages can be given as strings (language tags) or as Language
    objects. `targets` can be any iterable of such languages.
    """
    matched = best_match(language, targets, min_score=min_score)
    return matched[1] > 0
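Two hypothetical calls illustrate the threshold behaviour, assuming `best_match` here is `langcodes.best_match` (which returns `('und', 0)` when nothing clears `min_score`):

_language_in_list('pt-BR', ['pt', 'en'])  # expected True: the regional variant scores above 80
_language_in_list('ja', ['pt', 'en'])     # expected False: no match, so the score is 0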
Example #3
def process():
    # Install Open Multilingual Wordnet if not already installed.
    nltkd = nltk.downloader.Downloader()
    if not nltkd.is_installed('omw'):
        nltk.download('omw')

    # Figure out ISO 639-2 code for specified locale. Exit if unavailable.
    print(args.language)
    iso639_2 = langcodes.best_match(args.language, wn.langs())[0]
    print(iso639_2)
    print(wn.langs())
    if iso639_2 == 'und': # Nearest ISO 639-2 code is undefined.
        exit("Requested language is not available on this NLTK Wordnet installation.")

    # Obtain set of lowercased lemmas that belong to only one part of speech.
    posdb = dict()
    single_pos_lemmas = set()
    for pos in ['a', 'r', 'n', 'v']:
        posdb[pos] = set()
        # Note: wn.all_lemma_names() returns the lemma names in all lowercase.
        # To remove lemmas that are sometimes or always capitalised in normal
        # writing (e.g. "China" or "Arulo"), we will need to obtain capitalised
        # lemmas from Wordnet later on, and remove members of our output set
        # that are identical to the lowercased transformation of those
        # capitalised lemmas.
        for lemma in wn.all_lemma_names(pos=pos, lang=iso639_2):
            posdb[pos].add(lemma)
        single_pos_lemmas.symmetric_difference_update(posdb[pos])

    # Remove lemmas containing characters other than a-z.
    output_set = set()
    for term in single_pos_lemmas:
        if non_word.search(term) is not None:
            continue
        output_set.add(term)

    # Obtain a set of lemmas that are typically capitalised in normal writing.
    unlowered_lemmas = set()
    for synset in wn.all_synsets():
        for lemma in synset.lemma_names():
            unlowered_lemmas.add(lemma)
    # Drop output words that also appear capitalised in Wordnet: this completes
    # the truncated loop from the original snippet, following the intent stated
    # in the comment above (lowercased forms of capitalised lemmas are removed).
    lowered_capitalised = {lemma.lower() for lemma in unlowered_lemmas
                           if lemma != lemma.lower()}
    output_set -= lowered_capitalised
    # Filter inspiration: http://stackoverflow.com/a/16562558
    output_set = {x for x in output_set if 4 < len(x) < 7}
    names_lowered = set()
    for name in nltk.corpus.names.words():
        names_lowered.add(name.lower())
    output_set = {x for x in output_set if x not in names_lowered}
    print(output_set)
    # print(single_pos_lemmas)
    print(len(single_pos_lemmas))
    print(len(output_set))
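One caveat about the loop above: chaining `symmetric_difference_update` over the four POS sets keeps lemmas that occur in an odd number of them (one or three parts of speech), not strictly one. A minimal count-based alternative, reusing the `posdb` dict from the snippet, selects lemmas in exactly one part of speech:

from collections import Counter

pos_counts = Counter(lemma for pos_set in posdb.values() for lemma in pos_set)
single_pos_lemmas = {lemma for lemma, n in pos_counts.items() if n == 1}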
Example #4
def best_langtag_list(accept):
    enabledlangs = []
    for bcp47instance in BCP47.objects.filter(enabled=True):
        enabledlangs.append(bcp47instance.langtag)

    data = []
    for accept_lang, _ in parse_accept_lang_header(accept):
        match = best_match(accept_lang, enabledlangs)
        data.append({
            'langtag': match[0],
            'score': match[1],
            'accept_lang': accept_lang,
        })
    return data
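For reference, Django's `parse_accept_lang_header` yields `(tag, quality)` pairs sorted by quality, with tags lowercased, which is why the loop above can ignore the second element. A rough sketch of the expected behaviour (the output in comments reflects my reading of the API, not a guaranteed contract):

from django.utils.translation.trans_real import parse_accept_lang_header

for tag, q in parse_accept_lang_header('da, en-GB;q=0.8, en;q=0.7'):
    print(tag, q)
# da 1.0
# en-gb 0.8
# en 0.7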
Example #5
    def get(self, request, **kwargs):
        enabledlangs = []
        for bcp47instance in BCP47.objects.filter(enabled=True):
            enabledlangs.append(bcp47instance.langtag)

        accept = request.META.get('HTTP_ACCEPT_LANGUAGE', '')
        data = []
        for accept_lang, _ in parse_accept_lang_header(accept):
            match = best_match(accept_lang, enabledlangs)
            data.append({
                'langtag': match[0],
                'score': match[1],
                'accept_lang': accept_lang,
            })
        # results = BestlangtagSerializer(data, many=True).data
        return Response(data)
Example #6
    def get_best_match_score(self, obj):
        # best_match(accept_lang, enabledlangs, min_score=50)
        accept_header = self.context['request'].META.get('HTTP_ACCEPT_LANGUAGE', '')
        accept_lang = []
        # Read https://docs.djangoproject.com/en/2.1/topics/i18n/translation/#internationalization-in-python-code
        for accepted, _q in parse_accept_lang_header(accept_header):
            accept_lang.append(accepted)

        # get best accepted match for this tag
        best = best_match(obj.langtag, accept_lang, min_score=50)
        # get the index of matched lang
        try:
            idx = accept_lang.index(best[0])
        except ValueError:
            return 0

        # Return the score minus the index, so a match at position zero with
        # score 100 returns 100, while a match at position 1 with score 100
        # returns 99: earlier entries in the Accept-Language list win ties.
        return best[1] - idx
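Worked through on hypothetical values: with Accept-Language 'de, fr;q=0.9' the parsed list is ['de', 'fr'], so a langtag matching 'de' perfectly returns 100 - 0 = 100, while one matching 'fr' perfectly returns 100 - 1 = 99.

accept_lang = ['de', 'fr']        # parsed from 'de, fr;q=0.9'
best = ('fr', 100)                # hypothetical best_match result
idx = accept_lang.index(best[0])  # 1
print(best[1] - idx)              # 99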