/
Girish_Srinivas_ch5a.py
115 lines (95 loc) · 3.95 KB
/
Girish_Srinivas_ch5a.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
"""
AUTHORS: GIRISH SRINIVAS
"""
#REFERENCE LINK:
"""
Here for our program, we have used the reference for Parts of Speech Tag-set
from the website: http://www.comp.leeds.ac.uk/amalgam/tagsets/upenn.html
"""
import nltk, re, pprint
from nltk.corpus import brown as bn
SimpleText='One day, his horse ran away. The neighbors came to express their concern: "Oh, that\'s too bad. How are you going to work the fields now?" The farmer replied: "Good thing, Bad thing, Who knows?" In a few days, his horse came back and brought another horse with her. Now, the neighbors were glad: "Oh, how lucky! Now you can do twice as much work as before!" The farmer replied: "Good thing, Bad thing, Who knows?"'
def exercise1():
print
print "Exercise 1:"
brown_news_tagged_sents = bn.tagged_sents(categories = 'news')
brown_lore_tagged_sents = bn.tagged_sents(categories = 'lore')
unigram_tagger = nltk.UnigramTagger(brown_news_tagged_sents)
brown_news_eval = unigram_tagger.evaluate(brown_news_tagged_sents)
brown_lore_eval = unigram_tagger.evaluate(brown_lore_tagged_sents)
print "Evaluation of the unigram tagger on 'News': %f " % brown_news_eval
print "Evaluation of the unigram tagger on 'Lore': %f " % brown_lore_eval
brown_lore = bn.sents(categories = 'lore')
b_lore = unigram_tagger.tag(brown_lore[200])
print "Tagged words for 200th sentence of 'Brown' corpus of category 'Lore' is: "
print b_lore
print
def exercise2(category):
print
print "For Category: " + category
print "Part 1"
print "Words with the tag 'JJ':"
words = bn.tagged_words(categories = category)
wordlist = bn.words(categories = category)
words_JJ = set(sorted([(word, tag) for (word, tag) in words if tag == 'JJ']))
print len(words_JJ)
print
print "Part 2"
print "Words with tags 'VBZ' -> 3rd Person Singular Verbs or ('NNPS' or 'NNS') -> plural nouns:"
words_VBP_NNPS_NNS = [(word, tag) for (word, tag) in words if tag == 'VBZ' or tag == 'NNPS' or tag == 'NNS']
print words_VBP_NNPS_NNS[:10]
print
sent = ""
print "Part 3"
print "The 3 most frequent 3-word prepositional phrases are:"
words = bn.tagged_words(categories = category)
for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(words):
if(t1.startswith('IN') and t2.startswith('AT') and t3.startswith('NN')):
sent = sent + w1.lower() + " " + w2.lower() + " " + w3.lower() + "."
sent_part = sent.split(".")
fd = nltk.FreqDist(sent_part)
v = fd.most_common(3)
print v
print
print "Part 4"
print "Ratio of Masculine to Feminine is:"
male_pattern = r'\bhe\b|\bhis\b|\bhim\b|\bhimself\b'
female_pattern = r'\bshe\b|\bher\b|\bhers\b|\bherself\b'
male_pronouns = len([w for w in wordlist if re.search(male_pattern, w.lower())])
female_pronouns = len([w for w in wordlist if re.search(female_pattern, w.lower())])
print "Male : Female is -> %d : %d" %(male_pronouns, female_pronouns)
print
def exercise3():
print
print "Exercise 3"
print "Part 1"
count = 0
total_brown_tagged_words = bn.tagged_words()
cfd1 = nltk.ConditionalFreqDist(total_brown_tagged_words)
set1 = set([a for (a, b) in total_brown_tagged_words])
for s in set1:
if(len(cfd1[s].keys()) == 5):
count = count + 1
print "Number of words which have exactly 5 different tags: %d" % count
print
print "Part 2"
print "Words which have the most distinct tags are: "
tags = [b for (a, b) in bn.tagged_words()]
fd = nltk.FreqDist(tags)
ft = fd.keys()
cfd2 = nltk.ConditionalFreqDist((tag, word) for (word, tag) in bn.tagged_words())
for a in ft:
if fd[a] == 1:
print "For POS: " +a
print cfd2[a].keys()
print
print
def main():
exercise1()
print "Exercise 2"
categories = ['humor', 'romance', 'government']
for c in categories:
exercise2(c)
exercise3()
if __name__ == "__main__":
main()