Пример #1
0
def bigrams(unigram_stats, bigram_stats, measure="pmi", freq_filter=20):
    """Score bigram collocations using the requested association measure.

    Args:
        unigram_stats (FreqDist): frequency distribution of single words.
        bigram_stats (FreqDist): frequency distribution of word pairs.
        measure (str): attribute name on ``BigramAssocMeasures`` such as
            "pmi" or "student_t".
        freq_filter (int): bigrams occurring fewer than this many times
            are discarded before scoring.

    Returns:
        list: (bigram, score) pairs as produced by ``score_ngrams``.
    """
    scorer = getattr(BigramAssocMeasures(), measure)
    collocation_finder = BigramCollocationFinder(unigram_stats, bigram_stats)
    collocation_finder.apply_freq_filter(freq_filter)
    return collocation_finder.score_ngrams(scorer)
Пример #2
0
def load_ppmi(monograms, bigrams, pickle=None, cache=True):
    """Compute positive PMI scores for bigrams, with an on-disk cache.

    Args:
        monograms: iterable of unigram observations fed to FreqDist.
        bigrams: iterable of bigram observations fed to FreqDist.
        pickle: cache identifier passed to ``get_path`` to locate the file.
        cache (bool): when True and the cache file exists, load it instead
            of recomputing.

    Returns:
        dict: bigram -> PPMI score (PMI clamped at zero from below).
    """
    path = get_path(pickle)
    if cache and isfile(path):
        return pickle_load(path)

    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder(FreqDist(monograms), FreqDist(bigrams))
    # Positive PMI: negative associations are floored to 0.0.
    ppmi = {pair: max(score, 0.0)
            for pair, score in finder.score_ngrams(bigram_measures.pmi)}

    print("Caching ppmi of length", len(ppmi))
    pickle_dump(ppmi, path)
    return ppmi
Пример #3
0
def getTopFeaturesForClass(documents, noOfFeaturesPerClass=10):
    ''' Feature values are in integer.
    [{document vector}, classId]

    Scores each (feature, classId) pair with PMI and returns up to
    noOfFeaturesPerClass scored features per class.  Python 2 code
    (uses iteritems).
    '''
    classToFeaturesMap = defaultdict(list)
    # Unigram counts: each feature repeated by its in-document count.
    word_fd = nltk.FreqDist(feat for vector, _ in documents
                            for feat, cnt in vector.iteritems()
                            for _ in range(cnt))
    # Count each class label itself so PMI has marginal counts for it.
    for _, cluster_id in documents:
        word_fd[cluster_id] = word_fd.get(cluster_id, 0) + 1
    # "Bigrams" here are (feature, classId) co-occurrence counts.
    bigram_fd = nltk.FreqDist((feat, cid) for vector, cid in documents
                              for feat, cnt in vector.iteritems()
                              for _ in range(cnt))
    finder = BigramCollocationFinder(word_fd, bigram_fd)
    pmi = nltk.collocations.BigramAssocMeasures().pmi
    # Group the scored features by class, preserving score_ngrams order.
    for (feat, cid), score in finder.score_ngrams(pmi):
        classToFeaturesMap[cid].append((feat, score))
    return [(cid, feats[:noOfFeaturesPerClass])
            for cid, feats in classToFeaturesMap.iteritems()]
Пример #4
0
def getTopFeaturesForClass(documents, noOfFeaturesPerClass=10):
    ''' Feature values are in integer.
    [{document vector}, classId]

    Score every (feature, classId) pair with PMI and return, per class,
    the first noOfFeaturesPerClass scored features.  Python 2 code
    (iteritems).  NOTE(review): the final slice keeps entries in
    score_ngrams iteration order — presumably descending by score in
    NLTK; confirm before relying on "top" semantics.
    '''
    classToFeaturesMap = defaultdict(list)
    # Unigram counts: each feature is repeated `count` times so word_fd
    # reflects total occurrences across all documents.
    word_fd = nltk.FreqDist(feature for doc in documents
                            for feature, count in doc[0].iteritems()
                            for i in range(count))
    # Also count each class label itself, so the PMI computation has a
    # marginal count for the classId side of each (feature, classId) pair.
    for document, clusterId in documents:
        if clusterId not in word_fd: word_fd[clusterId] = 0
        word_fd[clusterId] += 1
    # "Bigrams" here are (feature, classId) co-occurrences, again weighted
    # by the feature's in-document count.
    bigram_fd = nltk.FreqDist((feature, doc[1]) for doc in documents
                              for feature, count in doc[0].iteritems()
                              for i in range(count))
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder(word_fd, bigram_fd)
    scored = finder.score_ngrams(bigram_measures.pmi)
    # Group scored features by their class id.
    for (feature, classId), score in scored:
        classToFeaturesMap[classId].append((feature, score))
    returnData = []
    for classId, features in classToFeaturesMap.iteritems():
        returnData.append((classId, features[:noOfFeaturesPerClass]))
    return returnData
Пример #5
0
from nltk.collocations import BigramAssocMeasures
from nltk import FreqDist
from nltk import bigrams
from nltk.metrics import spearman

# Morphological analyzer (pymorphy2) and the verb-noun corpus to score.
analyzer = MorphAnalyzer()
corpus = pd.read_csv("court-V-N.csv", header=None)
measures = BigramAssocMeasures()


def tagger(token):
    """Return (token, POS) using the first (most probable) parse."""
    return (token, analyzer.parse(token.lower().strip())[0].tag.POS)


tagged_corpus = corpus.applymap(tagger).drop(0, axis=1)

# Gold-standard collocations, one whitespace-separated pair per line.
# NOTE(review): `standard` is never used below — presumably meant to feed
# the correlation check; confirm intent.
with open("gold_standard.txt", "r") as infile:
    standard = [tuple(line.split()) for line in infile.readlines()]

wfd = FreqDist(tagged_corpus.values.flatten())
bfd = FreqDist(bigrams(tagged_corpus.values.flatten()))
finder_1 = BigramCollocationFinder(wfd, bfd)


def keep_infinitives(scored):
    """From (bigram, score) pairs keep bigrams whose first token is an
    infinitive (POS == "INFN"), stripping scores and POS tags.

    Renamed from `filter`/rewritten from a lambda: the old name shadowed
    the builtin `filter`.
    """
    return [tuple(token for token, _pos in pair)
            for pair, _score in scored
            if pair[0][1] == "INFN"]


scored_pmi = keep_infinitives(finder_1.score_ngrams(measures.pmi))
scored_student = keep_infinitives(finder_1.score_ngrams(measures.student_t))
pmi_top = scored_pmi[:10]
student_top = scored_student[:10]

# Dump the two top-10 lists, one space-joined bigram per line.
# (File handle renamed from `io`, which shadowed the stdlib module.)
for name, top in [("pmi_top10.txt", pmi_top), ("student_top10.txt", student_top)]:
    with open(name, "w") as outfile:
        joined = [" ".join(x) + "\n" for x in top]
        outfile.writelines(joined)

# NOTE(review): spearman_correlation is applied directly to the two tuple
# lists; verify this matches the ranking input nltk.metrics.spearman expects.
print(spearman.spearman_correlation(pmi_top, student_top))
print("Done")
Пример #6
0
# Print the 100 strongest collocations found in overall_text.
print "---------- 100 collocations -----------"
overall_text.collocations(num=100)
print "---------- ---------------- -----------"

# Show a concordance view for the token 'Imperium'.
# NOTE(review): concordance() prints its output itself, so wrapping it in
# `print` also emits its return value.
print overall_text.concordance('Imperium')
# Case-insensitive concordance index over the full token stream.
index = nltk.text.ConcordanceIndex(master_tokens, key=lambda s:s.lower())
# NOTE(review): execution stops here — everything below sys.exit(0) is
# dead code, presumably kept around for later experimentation.
sys.exit(0)

from nltk import bigrams
from nltk import collocations
from nltk import FreqDist
from nltk.collocations import BigramCollocationFinder

# http://nltk.googlecode.com/svn/trunk/doc/howto/collocations.html
# http://stackoverflow.com/questions/9151326/python-nltk-find-collocations-without-dot-separated-words
bigram_measures = collocations.BigramAssocMeasures()
# Unigram and bigram frequency distributions over the token stream.
word_fd = FreqDist(master_tokens)
bigram_fd = FreqDist(bigrams(master_tokens))
finder = BigramCollocationFinder(word_fd, bigram_fd)

#finder.apply_word_filter(lambda w: w in ('.', ','))
# only when collocation occurs 3+ times
finder.apply_freq_filter(3)

# Score every surviving bigram by raw frequency.
scored = finder.score_ngrams(bigram_measures.raw_freq)
#print sorted(bigram for bigram, score in scored)
print "========================================="
print sorted(finder.nbest(bigram_measures.raw_freq,200),reverse=True)