import nltk
from nltk.corpus import brown
from nltk.probability import ConditionalFreqDist

# A conditional frequency distribution records, for each condition, how many
# times each sample occurred.  It is typically built by repeatedly running an
# experiment under a variety of conditions — here the condition is the Brown
# genre and the sample is each word occurring in that genre.
cond_freq_dist = ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre)
)

genres = ["government", "fiction", "mystery", "science_fiction", "adventure"]
modal_verbs = ["may", "can", "could", "should", "must", "might", "will"]

# NOTE: tabulate() prints the table itself and returns None, so it must NOT
# be wrapped in print() (that would emit a spurious trailing "None").
cond_freq_dist.tabulate(conditions=genres, samples=modal_verbs)

## output:
#                  may  can could should must might will
#      government  153  117     38   112  102    13  244
#         fiction    8   37    166    35   55    44   52
#         mystery   13   42    141    29   30    57   20
# science_fiction    4   16     49     3    8    12   16
#       adventure    5   46    151    15   27    58   50
##
from nltk.probability import ConditionalFreqDist
from nltk.tokenize import word_tokenize

sent = "the the the dog dog some other words that we do not care about"

# Condition each token on its length: cfdist[n] is a FreqDist over all
# n-character words seen in the sentence.
cfdist = ConditionalFreqDist()
for word in word_tokenize(sent):
    cfdist[len(word)][word] += 1

print(cfdist[3])              # FreqDist of all 3-letter words
print(cfdist[3].freq('the'))  # relative frequency of 'the' among them
print(cfdist[3]['dog'])       # raw count of 'dog'

# tabulate() prints the table itself and returns None, so calling it inside
# print() would append a spurious "None" line.
cfdist.tabulate()
from nltk.corpus import brown
from nltk.probability import ConditionalFreqDist

# Tally, separately for each genre, how often each weekday name occurs
# in the Brown corpus, then print the counts as a table.
genres = ['news', 'romance']
days = {
    "Monday", "Tuesday", "Wednesday", "Thursday",
    "Friday", "Saturday", "Sunday",
}

cfd = ConditionalFreqDist()
for genre in genres:
    for word in brown.words(categories=genre):
        if word in days:
            cfd[genre][word] += 1

cfd.tabulate()
def process_corpus(corpus_name):
    """Analyse a zipped text corpus: tokenize, POS-tag, and report statistics.

    Writes three files (<corpus_name>-pos.txt, <corpus_name>-word-freq.txt,
    <corpus_name>-pos-word-freq.txt) and prints a numbered report to stdout.

    Relies on module-level names imported/defined elsewhere in this file:
    nltk, re, FreqDist, ConditionalFreqDist, redirect_stdout, unzip_corpus.
    """
    from collections import Counter  # stdlib frequency counting

    print(f'1. Corpus name: {corpus_name}')

    # --- Unzip and tokenize -------------------------------------------------
    corpus_contents = unzip_corpus(corpus_name + ".zip")
    # Naive sentence split on ". " (lookbehind keeps the period attached).
    # NOTE(review): breaks on abbreviations like "Mr. Smith" — acceptable for
    # this assignment-style corpus, but confirm for other inputs.
    corpus_sentences = [re.split(r'(?<=\.) ', content)
                        for content in corpus_contents]

    corpus_words = []   # nested: document -> sentence -> token list
    allwords = []       # flat list of lower-cased tokens across the corpus
    for sentences in corpus_sentences:
        doc_tokens = []
        for sentence in sentences:
            tokens = nltk.word_tokenize(sentence)
            doc_tokens.append(tokens)
            allwords.extend(token.lower() for token in tokens)
        corpus_words.append(doc_tokens)

    # --- POS-tag every sentence and write the word/TAG file ----------------
    allpos = []  # flat list of (word, tag) pairs
    with open(corpus_name + "-pos.txt", "w") as f:
        for story in corpus_words:
            for sentence in story:
                for word, tag in nltk.pos_tag(sentence):
                    f.write(word + "/" + tag + " ")
                    allpos.append((word, tag))
            f.write("\n\n")

    print(f'\n2. Total words in the corpus: {len(allwords)}')

    numunique = len(set(allwords))
    print(f'\n3. Vocabulary size of the corpus: {numunique}')

    # Counter keeps every tag even when two tags share a count; the original
    # freq->tag dict inversion silently dropped such ties.
    posfreq = Counter(tag for _, tag in allpos)
    top_tag, top_count = posfreq.most_common(1)[0]
    print(f'\n4. The most frequent part-of-speech tag is {top_tag} '
          f'with frequency {top_count}')

    with open(corpus_name + "-word-freq.txt", "w") as f:
        fdist = FreqDist(allwords)
        fdist.pprint(maxlen=numunique, stream=f)

    # condition = POS tag, sample = lower-cased word
    cfdist = ConditionalFreqDist((tag, word.lower()) for word, tag in allpos)

    print('\n5. Frequencies and relative frequencies of all part-of-speech '
          'tags in the corpus in decreasing order of frequency are: ')
    # Denominator is the actual number of tagged tokens (the original
    # hard-coded 3676).  most_common() walks ALL tags in decreasing order;
    # the original range(1, len(...)) loop skipped the least frequent one.
    total_tags = len(allpos)
    for tag, count in posfreq.most_common():
        print(f'{tag} tag has frequency {count} and relative frequency '
              f'{round(count / total_tags, 3)}.')

    with open(corpus_name + "-pos-word-freq.txt", "w") as f:
        with redirect_stdout(f):
            cfdist.tabulate()

    # --- Similar words and collocations ------------------------------------
    text = nltk.Text(allwords)
    print("\n6.")
    for pos in ["NN", "VBD", "JJ", "RB"]:
        most_freq_word = cfdist[pos].max()
        print(f'The most frequent word in the POS {pos} is {most_freq_word} '
              f'and its most similar words are:')
        text.similar(most_freq_word)

    print(f'7. Collocations:')
    text.collocations()