def get_unibigram_features(all_words, uni_feanum, bi_feanum):
    """Select the most frequent unigrams and the best-scoring bigrams as features.

    Args:
        all_words: Sequence of tokens. It is traversed twice (once for the
            word counts, once for the bigram counts).
        uni_feanum: Number of unigram features to keep, or the string
            ``'max'`` to keep every distinct word. Values larger than the
            number of distinct words are clamped to that number.
        bi_feanum: Number of bigram features to keep, or the string ``'max'``
            to keep every distinct bigram. Values larger than the number of
            distinct bigrams are clamped to that number.

    Returns:
        A list containing the selected words (in ``most_common`` order)
        followed by the selected bigrams (in the order returned by
        ``finder.nbest`` with the chi-square measure).
    """
    word_fd = nltk.FreqDist(all_words)
    bigram_fd = nltk.FreqDist(nltk.bigrams(all_words))

    # Number of distinct keys in each distribution; computed once instead of
    # rebuilding a key list for every comparison.
    distinct_words = len(word_fd)
    distinct_bigrams = len(bigram_fd)

    if uni_feanum == 'max' or uni_feanum > distinct_words:
        uni_feanum = distinct_words
    if bi_feanum == 'max' or bi_feanum > distinct_bigrams:
        bi_feanum = distinct_bigrams

    finder = BigramCollocationFinder(word_fd, bigram_fd)
    bigrams = finder.nbest(BigramAssocMeasures.chi_sq, bi_feanum)

    # Single-argument print(...) produces the same text on Python 2 and 3,
    # unlike the Python 2-only ``print "...", value`` statement.
    print("the number of unigram features is %s" % (uni_feanum,))
    print("the number of bigram features is %s" % (bi_feanum,))

    # most_common() yields (word, count) pairs; only the word is a feature.
    selected_words = [word for word, _count in word_fd.most_common(uni_feanum)]

    return list(itertools.chain(selected_words, bigrams))
def findtopbigrams(bigrams, word_fd, settings):
    """Pick the top bigrams by an association measure and report their raw counts.

    Args:
        bigrams: Iterable of bigrams; counted into a ``FreqDist``.
        word_fd: Word frequency distribution handed to the collocation finder.
        settings: Mapping with two keys:
            ``'nkey'``    - how many bigrams to request from the finder;
            ``'measure'`` - ``"LR"``, ``"PMI"``, ``"CHISQ"`` or ``"STUDT"``;
                            any other value selects raw frequency.

    Returns:
        A 3-tuple ``(top_bigrams, bigram_fd, warning)`` where ``top_bigrams``
        is a list of ``(bigram, count)`` pairs sorted by descending count and
        restricted to bigrams that occur more than once and whose two
        elements differ, ``bigram_fd`` is the bigram frequency distribution
        built here, and ``warning`` is ``""`` unless the requested measure
        failed and raw frequency was used instead.
    """
    nkey = settings['nkey']
    measure = settings['measure']
    bigram_measures = BigramAssocMeasures()
    bigram_fd = FreqDist(bigrams)
    finder = BigramCollocationFinder(word_fd, bigram_fd)
    warning = ""

    # Setting name -> attribute on BigramAssocMeasures. The attribute is
    # resolved inside the try below so that a lookup failure also falls back.
    measure_attrs = {
        "LR": "likelihood_ratio",
        "PMI": "pmi",
        "CHISQ": "chi_sq",
        "STUDT": "student_t",
    }

    top_bigrams = None
    attr_name = measure_attrs.get(measure)
    if attr_name is not None:
        try:
            top_bigrams = finder.nbest(getattr(bigram_measures, attr_name), nkey)
        except Exception:
            # Best-effort fallback; ``Exception`` (not a bare except) so that
            # KeyboardInterrupt / SystemExit still propagate.
            warning = ("Problem with %s measure. "
                       "Default to simple frequency (RAW setting)" % measure)
            print(warning)

    if top_bigrams is None:
        # Unknown measure, or the requested measure failed.
        top_bigrams = finder.nbest(bigram_measures.raw_freq, nkey)

    # Score bigrams using LR or a similar measure, but show raw counts: they
    # are more helpful to the end user (the measure is explained in a tool tip).
    selected = set(top_bigrams)  # O(1) membership instead of scanning a list
    top_bigrams = sorted(
        (
            (bg, count)
            for bg, count in finder.ngram_fd.items()
            if bg in selected and count > 1 and bg[0] != bg[1]
        ),
        key=lambda bgcount: -bgcount[1],
    )
    return top_bigrams, bigram_fd, warning
def print_samples(bigram_finder: BigramCollocationFinder,
                  top_n: int = 20) -> None:
    """Log the best collocations according to several association measures.

    For each of PMI, t-score and Dice, a header line is logged followed by
    one numbered line per collocation showing the collocation and its count
    in ``bigram_finder.ngram_fd``.

    Args:
        bigram_finder: Collocation finder to query.
        top_n: How many collocations to log per measure. Defaults to 20,
            the value that was previously hard-coded.
    """
    # Association measures for bigrams.
    bigram_measures = BigramAssocMeasures()

    # (log header, scoring function) for every ranking to report.
    # NOTE(review): t_score is not defined in this block; presumably a
    # module-level scoring function - confirm.
    rankings = (
        ('Лучшие с/с по PMI:', bigram_measures.pmi),
        ('Лучшие с/с по t-score:', t_score),
        ('Лучшие с/с по Dice:', bigram_measures.dice),
    )

    # Find and log the best collocations under each measure.
    for header, measure in rankings:
        _logger.info(header)
        best = bigram_finder.nbest(measure, top_n)
        for rank, collocation in enumerate(best, start=1):
            _logger.info('%02d. %s (%d)', rank, collocation,
                         bigram_finder.ngram_fd[collocation])
# Dump the 100 strongest collocations of the corpus text.
# Single-argument print(...) emits the same output on Python 2 and Python 3.
print("---------- 100 collocations -----------")
overall_text.collocations(num=100)
print("---------- ---------------- -----------")

# NOTE(review): if overall_text is an nltk.text.Text, concordance() prints its
# own output and returns None, so this line would also print "None" - confirm
# whether the outer print is intended.
print(overall_text.concordance('Imperium'))

# Case-insensitive concordance index over the full token list.
index = nltk.text.ConcordanceIndex(master_tokens, key=lambda s: s.lower())

# NOTE(review): the script terminates here, so everything below is currently
# unreachable. It is kept as-is; remove this exit to run the bigram analysis.
sys.exit(0)

from nltk import bigrams
from nltk import collocations
from nltk import FreqDist
from nltk.collocations import BigramCollocationFinder

# http://nltk.googlecode.com/svn/trunk/doc/howto/collocations.html
# http://stackoverflow.com/questions/9151326/python-nltk-find-collocations-without-dot-separated-words
bigram_measures = collocations.BigramAssocMeasures()
word_fd = FreqDist(master_tokens)
bigram_fd = FreqDist(bigrams(master_tokens))
finder = BigramCollocationFinder(word_fd, bigram_fd)
#finder.apply_word_filter(lambda w: w in ('.', ','))

# only when collocation occurs 3+ times
finder.apply_freq_filter(3)
scored = finder.score_ngrams(bigram_measures.raw_freq)
#print sorted(bigram for bigram, score in scored)

print("=========================================")
# Top 200 bigrams by raw frequency, printed in reverse lexicographic order.
print(sorted(finder.nbest(bigram_measures.raw_freq, 200), reverse=True))