Python BigramCollocationFinder.nbest примеры использования

Язык программирования: Python

Пространство имен/Пакет: nltk.collocations

Метод/Функция: nbest

Примеров на hotexamples.com: 4

Python BigramCollocationFinder.nbest — найдено 4 примера. Это лучшие примеры Python-кода для nltk.collocations.BigramCollocationFinder.nbest, полученные из проектов с открытым исходным кодом. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

from_words(30)

from_documents(25)

BigramCollocationFinder(9)

score_ngrams(5)

nbest(3)

apply_freq_filter(2)

Пример #1

Показать файл

Файл: preproc_fea_extraction.py Проект: yngwiet/Twitter-Sentiment-Analysis

    def get_unibigram_features(all_words, uni_feanum, bi_feanum):
        word_fd = nltk.FreqDist(all_words)
        bigram_fd = nltk.FreqDist(nltk.bigrams(all_words))

        if uni_feanum == 'max':
            uni_feanum = len(list(word_fd.keys()))
        elif uni_feanum > len(list(word_fd.keys())):
            uni_feanum = len(list(word_fd.keys()))

        if bi_feanum == 'max':
            bi_feanum = len(list(bigram_fd.keys()))
        elif bi_feanum > len(list(bigram_fd.keys())):
            bi_feanum = len(list(bigram_fd.keys()))

        finder = BigramCollocationFinder(word_fd, bigram_fd)
        bigrams = finder.nbest(BigramAssocMeasures.chi_sq, bi_feanum)

        print "the number of unigram features is", uni_feanum
        print "the number of bigram features is", bi_feanum

        featuples = word_fd.most_common(uni_feanum)

        selected_words = []

        for i in range(uni_feanum):
            selected_words.append(featuples[i][0])

        features = []
        for ngram in itertools.chain(selected_words, bigrams):
            features.append(ngram)

        return features

Пример #2

Показать файл

def findtopbigrams(bigrams, word_fd, settings):
    """Pick the best bigrams under the configured association measure.

    Args:
        bigrams: iterable of bigrams; it is counted into a ``FreqDist``.
        word_fd: unigram frequency distribution handed to
            ``BigramCollocationFinder`` together with the bigram counts.
        settings: mapping with the keys ``'nkey'`` (how many bigrams to ask
            ``nbest`` for) and ``'measure'`` (one of "LR", "PMI", "CHISQ",
            "STUDT"; any other value selects raw frequency).

    Returns:
        A ``(top_bigrams, bigram_fd, warning)`` tuple.  ``top_bigrams`` is a
        list of ``(bigram, count)`` pairs sorted by descending count, limited
        to bigrams seen more than once whose two words differ.  ``warning``
        is an empty string unless the requested measure raised and raw
        frequency was used instead.
    """
    nkey = settings['nkey']
    measure = settings['measure']

    bigram_measures = BigramAssocMeasures()
    bigram_fd = FreqDist(bigrams)
    finder = BigramCollocationFinder(word_fd, bigram_fd)

    # Setting value -> name of the scoring method on BigramAssocMeasures.
    # Compared with == (not hashed) so any ``measure`` value is tolerated.
    scorers = (
        ("LR", "likelihood_ratio"),
        ("PMI", "pmi"),
        ("CHISQ", "chi_sq"),
        ("STUDT", "student_t"),
    )

    warning = ""
    best = None
    for setting_name, scorer_attr in scorers:
        if measure != setting_name:
            continue
        try:
            best = finder.nbest(getattr(bigram_measures, scorer_attr), nkey)
        except Exception:
            # Only ordinary errors trigger the fallback; KeyboardInterrupt
            # and SystemExit are allowed to propagate.
            warning = ("Problem with %s measure. Default to simple frequency "
                       "(RAW setting)" % setting_name)
            print(warning)
        break

    if best is None:
        # Unknown measure, or the requested one failed: use raw frequency.
        best = finder.nbest(bigram_measures.raw_freq, nkey)

    #score bigrams using LR or similar measure but more helpful to end user to see raw counts + explain measure used in tool tip
    best_lookup = set(best)  # O(1) membership instead of scanning a list
    top_bigrams = sorted(
        [(bg, count) for (bg, count) in finder.ngram_fd.items()
         if bg in best_lookup and count > 1 and bg[0] != bg[1]],
        key=lambda bgcount: -bgcount[1])
    return top_bigrams, bigram_fd, warning

Пример #3

Показать файл

def print_samples(bigram_finder: BigramCollocationFinder) -> None:
    """Log the 20 best collocations under each of three association measures.

    For PMI, ``t_score`` and Dice in turn, a header line is logged followed
    by one numbered line per collocation showing the collocation and its
    count from ``bigram_finder.ngram_fd``.
    """
    # Association measures for bigrams
    measures = BigramAssocMeasures()

    def _log_ranked(measure):
        # One log line per collocation: 1-based rank, the bigram, its count.
        ranked = bigram_finder.nbest(measure, 20)
        for rank, pair in enumerate(ranked, start=1):
            _logger.info('%02d. %s (%d)', rank, pair,
                         bigram_finder.ngram_fd[pair])

    # Find the best collocations according to the different measures
    _logger.info('Лучшие с/с по PMI:')
    _log_ranked(measures.pmi)

    _logger.info('Лучшие с/с по t-score:')
    _log_ranked(t_score)

    _logger.info('Лучшие с/с по Dice:')
    _log_ranked(measures.dice)

Пример #4

Показать файл

Файл: t5_concordance.py Проект: makhidkarun/t5concordance

# Script fragment (Python 2 print statements).  `overall_text` and
# `master_tokens` are defined outside the visible excerpt.
print "---------- 100 collocations -----------"
overall_text.collocations(num=100)
print "---------- ---------------- -----------"

print overall_text.concordance('Imperium')
# NOTE(review): `index` is not used anywhere in the visible code.
index = nltk.text.ConcordanceIndex(master_tokens, key=lambda s:s.lower())
# NOTE(review): assuming `sys` is the standard module, the script exits here
# and nothing below this line (including the imports) ever runs.
sys.exit(0)

from nltk import bigrams
from nltk import collocations
from nltk import FreqDist
from nltk.collocations import BigramCollocationFinder

# http://nltk.googlecode.com/svn/trunk/doc/howto/collocations.html
# http://stackoverflow.com/questions/9151326/python-nltk-find-collocations-without-dot-separated-words
# Build the finder from explicit unigram and bigram frequency distributions.
bigram_measures = collocations.BigramAssocMeasures()
word_fd = FreqDist(master_tokens)
bigram_fd = FreqDist(bigrams(master_tokens))
finder = BigramCollocationFinder(word_fd, bigram_fd)

#finder.apply_word_filter(lambda w: w in ('.', ','))
# only when collocation occurs 3+ times
finder.apply_freq_filter(3)

# NOTE(review): `scored` is only referenced by the commented-out print below.
scored = finder.score_ngrams(bigram_measures.raw_freq)
#print sorted(bigram for bigram, score in scored)
print "========================================="
# Take up to 200 bigrams by raw frequency, then print them sorted in
# descending order of the bigram tuples themselves (not of their scores).
print sorted(finder.nbest(bigram_measures.raw_freq,200),reverse=True)