示例#1
0
import nltk
from nltk.corpus import brown
from nltk.probability import ConditionalFreqDist

# Count, for each Brown genre, how often every word occurs: the condition
# is the genre and the sample is the word.
cond_freq_dist = ConditionalFreqDist(
    (g, w) for g in brown.categories() for w in brown.words(categories=g))

genres = ["government", "fiction", "mystery", "science_fiction", "adventure"]
modal_verbs = ["may", "can", "could", "should", "must", "might", "will"]

# tabulate() prints the table itself and returns None, so wrapping it in
# print() would emit a spurious "None" line after the table.
cond_freq_dist.tabulate(conditions=genres, samples=modal_verbs)

# Conditional frequency distributions are used to record the number of times each sample occurred.
# Conditional frequency distributions are typically constructed by repeatedly running an experiment under a variety of conditions.

## output:
#             may    can  could should   must  might   will
#    government    153    117     38    112    102     13    244
#       fiction      8     37    166     35     55     44     52
#       mystery     13     42    141     29     30     57     20
#science_fiction      4     16     49      3      8     12     16
#     adventure      5     46    151     15     27     58     50
##
示例#2
0
from nltk.probability import ConditionalFreqDist
from nltk.tokenize import word_tokenize

sent = "the the the dog dog some other words that we do not care about"
cfdist = ConditionalFreqDist()

# Condition on word length: cfdist[len(word)] is the FreqDist of all
# tokens with that many characters.
for word in word_tokenize(sent):
    condition = len(word)
    cfdist[condition][word] += 1

print(cfdist[3])              # FreqDist over the 3-letter words
print(cfdist[3].freq('the'))  # relative frequency of 'the' among 3-letter words
print(cfdist[3]['dog'])       # raw count of 'dog'
# tabulate() prints the table itself and returns None; calling it inside
# print() would append a stray "None" line.
cfdist.tabulate()
示例#3
0
from nltk.corpus import brown
from nltk.probability import ConditionalFreqDist



# Tabulate how often each weekday name appears in two Brown genres.
genres = ['news', 'romance']
days = {
    "Monday", "Tuesday", "Wednesday", "Thursday",
    "Friday", "Saturday", "Sunday",
}

# Stream (genre, word) pairs, keeping only weekday tokens.
pairs = (
    (genre, word)
    for genre in genres
    for word in brown.words(categories=genre)
    if word in days
)
cfd = ConditionalFreqDist(pairs)

cfd.tabulate()
def process_corpus(corpus_name):
    """Tokenize, POS-tag, and print summary statistics for a zipped corpus.

    Reads ``<corpus_name>.zip`` via the project helper ``unzip_corpus``,
    writes ``<corpus_name>-pos.txt`` (word/TAG pairs, one story per block),
    ``<corpus_name>-word-freq.txt`` (word frequencies) and
    ``<corpus_name>-pos-word-freq.txt`` (per-tag word frequencies), and
    prints: total word count, vocabulary size, tag frequencies, the most
    similar words for selected tags, and collocations.

    Args:
        corpus_name: base name of the corpus archive (without ``.zip``).
    """
    print(f'1. Corpus name: {corpus_name}')
    input_file = corpus_name + ".zip"
    corpus_contents = unzip_corpus(input_file)

    # Naive sentence split: break on a space that follows a period.
    corpus_sentences = [re.split(r'(?<=\.) ', content)
                        for content in corpus_contents]

    # corpus_words mirrors corpus_sentences but holds token lists;
    # allwords is the flat, lower-cased token stream used for counting.
    corpus_words = []
    allwords = []
    for story_sentences in corpus_sentences:
        tokenized = []
        for sentence in story_sentences:
            tokens = nltk.word_tokenize(sentence)
            tokenized.append(tokens)
            allwords.extend(tok.lower() for tok in tokens)
        corpus_words.append(tokenized)

    # POS-tag every sentence and dump "word/TAG" pairs; `with` guarantees
    # the file is closed even if tagging raises.
    allpos = []
    with open(corpus_name + "-pos.txt", "w") as f:
        for story in corpus_words:
            for sentence in story:
                for token, tag in nltk.pos_tag(sentence):
                    f.write(token + "/" + tag + " ")
                    allpos.append((token, tag))
            f.write("\n\n")

    print(f'\n2. Total words in the corpus: {len(allwords)}')
    numunique = len(set(allwords))
    print(f'\n3. Vocabulary size of the corpus: {numunique}')

    # Tag -> count. The original inverted this dict ({freq: tag}) to sort
    # by count, which silently dropped every tag that tied another on
    # frequency; sorting the items directly keeps all tags.
    posfreq = {}
    for _, tag in allpos:
        posfreq[tag] = posfreq.get(tag, 0) + 1
    tags_by_freq = sorted(posfreq.items(), key=lambda kv: kv[1], reverse=True)

    top_tag, top_freq = tags_by_freq[0]
    print(
        f'\n4. The most frequent part-of-speech tag is {top_tag} with frequency {top_freq}'
    )

    with open(corpus_name + "-word-freq.txt", "w") as f:
        fdist = FreqDist(allwords)
        fdist.pprint(maxlen=numunique, stream=f)

    cfdist = ConditionalFreqDist((tag, token.lower()) for token, tag in allpos)
    print(
        f'\n5. Frequencies and relative frequencies of all part-of-speech tags in the corpus in decreasing order of frequency are: '
    )
    # Relative frequency over the actual token count (the original
    # hard-coded 3676); the loop now covers every tag (the original's
    # range(1, len) skipped the least frequent one).
    total_tags = len(allpos)
    for tag, freq in tags_by_freq:
        print(
            f'{tag} tag has frequency {freq} and relative frequency {round(freq / total_tags, 3)}.'
        )

    with open(corpus_name + "-pos-word-freq.txt", "w") as f:
        with redirect_stdout(f):
            cfdist.tabulate()

    text = nltk.Text(allwords)
    pos_list = ["NN", "VBD", "JJ", "RB"]
    print("\n6.")
    for pos in pos_list:
        # max() gives the most frequent word carrying this tag.
        m = cfdist[pos].max()
        print(
            f'The most frequent word in the POS {pos} is {m} and its most similar words are:'
        )
        text.similar(m)
    print('7. Collocations:')
    text.collocations()