def demo():
    root = Tk()
    root.bind('<Control-q>', lambda e: root.destroy())

    table = Table(root, 'Word Synset Hypernym Hyponym'.split(),
                  column_weights=[0, 1, 1, 1],
                  reprfunc=(lambda i, j, s: ' %s' % s))
    table.pack(expand=True, fill='both')

    from nltk.corpus import wordnet
    from nltk.corpus import brown
    for word, pos in sorted(set(brown.tagged_words()[:500])):
        if pos[0] != 'N':
            continue
        word = word.lower()
        for synset in wordnet.synsets(word):
            hyper = (synset.hypernyms() + [''])[0]
            hypo = (synset.hyponyms() + [''])[0]
            table.append([word,
                          getattr(synset, 'definition', '*none*'),
                          getattr(hyper, 'definition', '*none*'),
                          getattr(hypo, 'definition', '*none*')])

    table.columnconfig('Word', background='#afa')
    table.columnconfig('Synset', background='#efe')
    table.columnconfig('Hypernym', background='#fee')
    table.columnconfig('Hyponym', background='#ffe')
    for row in range(len(table)):
        for column in ('Hypernym', 'Hyponym'):
            if table[row, column] == '*none*':
                table.itemconfig(row, column,
                                 foreground='#666', selectforeground='#666')
    root.mainloop()
def exercise3():
    print
    print "Exercise 3"
    print "Part 1"
    count = 0
    total_brown_tagged_words = bn.tagged_words()
    cfd1 = nltk.ConditionalFreqDist(total_brown_tagged_words)
    set1 = set([a for (a, b) in total_brown_tagged_words])
    for s in set1:
        if len(cfd1[s].keys()) == 5:
            count = count + 1
    print "Number of words which have exactly 5 different tags: %d" % count
    print
    print "Part 2"
    print "Words which have the most distinct tags are: "
    tags = [b for (a, b) in bn.tagged_words()]
    fd = nltk.FreqDist(tags)
    ft = fd.keys()
    cfd2 = nltk.ConditionalFreqDist((tag, word)
                                    for (word, tag) in bn.tagged_words())
    for a in ft:
        if fd[a] == 1:
            print "For POS: " + a
            print cfd2[a].keys()
    print
    print
def exploreTaggedCorpora():
    brown_learned_text = brown.words(categories="learned")
    sorted(set(b for (a, b) in nltk.ibigrams(brown_learned_text)
               if a == "often"))

    brown_lrnd_tagged = brown.tagged_words(categories="learned",
                                           simplify_tags=True)
    tags = [b[1] for (a, b) in nltk.ibigrams(brown_lrnd_tagged)
            if a[0] == "often"]
    fd = nltk.FreqDist(tags)
    fd.tabulate()

    def process(sentence):
        for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence):
            if t1.startswith("V") and t2 == "TO" and t3.startswith("V"):
                print w1, w2, w3

    for tagged_sent in brown.tagged_sents():
        process(tagged_sent)

    brown_news_tagged = brown.tagged_words(categories="news",
                                           simplify_tags=True)
    data = nltk.ConditionalFreqDist((word.lower(), tag)
                                    for (word, tag) in brown_news_tagged)
    for word in data.conditions():
        if len(data[word]) > 3:
            tags = data[word].keys()
            print word, " ".join(tags)
def exercise2(category):
    print
    print "For Category: " + category
    print "Part 1"
    print "Words with the tag 'JJ':"
    words = bn.tagged_words(categories=category)
    wordlist = bn.words(categories=category)
    words_JJ = set(sorted([(word, tag) for (word, tag) in words
                           if tag == 'JJ']))
    print len(words_JJ)
    print

    print "Part 2"
    print "Words with tags 'VBZ' -> 3rd Person Singular Verbs or ('NNPS' or 'NNS') -> plural nouns:"
    words_VBP_NNPS_NNS = [(word, tag) for (word, tag) in words
                          if tag in ('VBZ', 'NNPS', 'NNS')]
    print words_VBP_NNPS_NNS[:10]
    print

    sent = ""
    print "Part 3"
    print "The 3 most frequent 3-word prepositional phrases are:"
    words = bn.tagged_words(categories=category)
    for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(words):
        if t1.startswith('IN') and t2.startswith('AT') and t3.startswith('NN'):
            sent = sent + w1.lower() + " " + w2.lower() + " " + w3.lower() + "."
    sent_part = sent.split(".")
    fd = nltk.FreqDist(sent_part)
    v = fd.most_common(3)
    print v
    print

    print "Part 4"
    print "Ratio of Masculine to Feminine is:"
    male_pattern = r'\bhe\b|\bhis\b|\bhim\b|\bhimself\b'
    female_pattern = r'\bshe\b|\bher\b|\bhers\b|\bherself\b'
    male_pronouns = len([w for w in wordlist if re.search(male_pattern, w.lower())])
    female_pronouns = len([w for w in wordlist if re.search(female_pattern, w.lower())])
    print "Male : Female is -> %d : %d" % (male_pronouns, female_pronouns)
    print
def verb_stem(s):
    """extracts the stem from the 3sg form of a verb, or returns empty string"""
    if re.match(".*[aeiou]ys$", s):
        snew = s[:-1]
    elif re.match(".*([^sxyzaeiou]|[^cs]h)s$", s):
        snew = s[:-1]
    elif re.match("[^aeiou]ies$", s):
        snew = s[:-1]
    elif re.match(".*[^s]ses$", s):
        snew = s[:-1]
    elif re.match(".*[^z]zes$", s):
        snew = s[:-1]
    elif re.match(".*([^iosxzh]|[^cs]h)es$", s):
        snew = s[:-1]
    elif s == "has":
        snew = "have"
    elif len(s) >= 5 and re.match(".*[^aeiou]ies$", s):
        snew = s[:-3] + 'y'
    elif re.match(".*([ox]|[cs]h|ss|zz)es$", s):
        snew = s[:-2]
    else:
        snew = ""
    # reject the stem unless both the stem (VB) and the 3sg form (VBZ)
    # are attested in the Brown corpus
    if snew != "" and snew != "have":
        if not ((snew, "VB") in brown.tagged_words()
                and (s, "VBZ") in brown.tagged_words()):
            snew = ""
    return snew
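# Usage sketch (an addition, not part of the original assignment code):
# assumes re, nltk, and a downloaded Brown corpus. The attestation test
# above scans brown.tagged_words() on every call; precomputing a set once,
# as below, and swapping it into the membership tests makes repeated calls
# much faster.
import nltk
from nltk.corpus import brown

# nltk.download('brown')  # first run only
BROWN_TAGGED = set(brown.tagged_words())  # candidate replacement for the per-call scans

for form in ["flies", "watches", "has", "pays", "dies"]:
    print(form, "->", repr(verb_stem(form)))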
def __init__(self):
    """Initialize your data structures in the constructor."""
    tag_corpus = []
    # from nltk.corpus import treebank
    # corpus = treebank.tagged_words()
    # for (word, tag) in treebank.tagged_words():
    #     tag_corpus.append(tag)
    from nltk.corpus import brown
    corpus = brown.tagged_words()
    for (word, tag) in corpus:
        tag_corpus.append(tag)
    self.wordCounts = collections.defaultdict(int)
    self.tagCounts = collections.defaultdict(int)
    self.wordTagCounts = collections.defaultdict(int)
    self.wordTagList = {}
    self.totalTag = 0
    self.train(corpus)
    # estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    # estimator = lambda fdist, bins: WittenBellProbDist(fdist, 0.2)
    estimator = _estimator
    self.tagLM = NgramModel(2, tag_corpus, estimator)
def exercise1():
    # part-of-speech tagging
    text = nltk.word_tokenize("You are a good man, but i don't like you!")
    print(text)
    print(nltk.pos_tag(text))

    words_tag = brown.tagged_words(categories='news')
    print(words_tag[:30])
    words_tag = brown.tagged_words(categories='news', tagset='universal')
    print(words_tag[:30])
    # note: only the 'universal' mapping ships with NLTK; targets such as
    # 'wsj' or 'brown' may not be supported
    words_tag = brown.tagged_words(categories='news', tagset='wsj')
    print(words_tag[:30])
    words_tag = brown.tagged_words(categories='news', tagset='brown')
    print(words_tag[:30])

    words_tag = sinica_treebank.tagged_sents()
    print(words_tag)

    raw = "You are a good man, but i don't love you!"
    tokens = nltk.word_tokenize(raw)
    default_tagger = nltk.DefaultTagger('NN')
    tagged_words = default_tagger.tag(tokens)
    print(tagged_words)
    tagged_sents = brown.tagged_sents(categories='news')
    print(default_tagger.evaluate(tagged_sents))
def verb_stem(s):
    """extracts the stem from the 3sg form of a verb, or returns empty string"""
    # goes through the rules outlined in the handout; the "but not ..."
    # exclusions are implemented by the elif ordering, since re has no
    # "!" negation syntax
    if re.match("has$", s):
        toReturn = 'have'
    elif re.match(".*[aeiou]ys$", s):
        toReturn = s[:-1]
    elif re.match(".*ies$", s):
        if len(s) == 4:
            toReturn = s[:-1]
        else:
            toReturn = s[:-3] + "y"
    elif re.match(".*(oes|xes|ches|shes|sses|zzes)$", s):
        toReturn = s[:-2]
    elif re.match(".*(ses|zes)$", s):  # sses/zzes already caught above
        toReturn = s[:-1]
    elif re.match(".*es$", s):  # ies/oes/xes/ches/shes already caught above
        toReturn = s[:-1]
    elif re.match(".*([^sxyzh]|[^cs]h)s$", s):  # not ss/xs/ys/zs/chs/shs
        toReturn = s[:-1]
    else:
        toReturn = ''
    # will check if the original 3sg form or the created stem is in the Brown corpus
    if (s, 'VBZ') not in brown.tagged_words():
        if (toReturn, 'VB') not in brown.tagged_words():
            return ''
        else:
            return toReturn
    else:
        return toReturn
def verb_stem(s):
    """extracts the stem from the 3sg form of a verb, or returns empty string"""
    vowel_s = "aieou"
    verb = ""
    if s == "has":  # checked first so "has" is not clipped by the generic rules
        verb = "have"
    elif re.match(".*ies$", s):
        if len(s) == 4 and s[0] not in vowel_s:
            verb = s[:-1]
        else:
            verb = s[:-3] + 'y'
    elif re.match(".*es$", s):
        if re.match(".*(o|x|sh|ch|ss|zz)es$", s):
            verb = s[:-2]
        elif re.match(".*[^sxyz]es$", s) and s[-4:-2] != "sh" and s[-4:-2] != "ch":
            verb = s[:-1]
        elif re.match(".*(([^s]s)|([^z]z))es$", s):
            verb = s[:-1]
    elif re.match(".*s$", s):
        if s[-2] == 'y' and s[-3] in vowel_s:
            verb = s[:-1]
        elif re.match(".*[^sxyz]s$", s) and s[-4:-2] != "sh" and s[-4:-2] != "ch":
            verb = s[:-1]
    else:
        return s
    # both the 3sg form and the stem must be attested in the Brown corpus
    tagged = set(brown.tagged_words())
    if not ((s, "VBZ") in tagged and (verb, "VB") in tagged):
        verb = ""
    return verb
def tagged_token_representation():
    print nltk.tag.str2tuple("fly/NN")
    from nltk.corpus import brown
    print brown.tagged_words()
    # distribution of tags
    brown_news_tagged = brown.tagged_words(categories="news",
                                           simplify_tags=True)
    tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
    print tag_fd
    tag_fd.plot(cumulative=True)
    # distribution of POS+N pairs
    word_tag_pairs = nltk.bigrams(brown_news_tagged)
    print nltk.FreqDist(a[1] for (a, b) in word_tag_pairs if b[1] == "N")
def automaticTagging():
    from nltk.corpus import brown

    print "=============== The Default Tagger ==============="
    brown_tagged_sents = brown.tagged_sents(categories='news')
    print brown_tagged_sents[0:3]
    brown_sents = brown.sents(categories='news')
    print brown_sents[0:3]
    tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
    print nltk.FreqDist(tags).max()
    raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
    tokens = nltk.word_tokenize(raw)
    default_tagger = nltk.DefaultTagger('NN')
    print default_tagger.tag(tokens)
    print default_tagger.evaluate(brown_tagged_sents)

    print "=============== The Regular Expression Tagger ==============="
    patterns = [
        (r'.*ing$', 'VBG'),
        (r'.*ed$', 'VBD'),
        (r'.*es$', 'VBZ'),
        (r'.*ould$', 'MD'),
        (r'.*\'s$', 'NN$'),
        (r'.*s$', 'NNS'),
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),
        (r'.*', 'NN')
    ]
    regexp_tagger = nltk.RegexpTagger(patterns)
    print regexp_tagger.tag(brown_sents[3])
    print regexp_tagger.evaluate(brown_tagged_sents)

    print "=============== The Lookup Tagger ==============="
    fd = nltk.FreqDist(brown.words(categories='news'))
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    most_freq_words = fd.keys()[:100]
    print most_freq_words
    likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
    baseline_tagger = nltk.UnigramTagger(model=likely_tags)
    print baseline_tagger
    print baseline_tagger.evaluate(brown_tagged_sents)

    def performance(cfd, wordlist):
        lt = dict((word, cfd[word].max()) for word in wordlist)
        baseline_tagger = nltk.UnigramTagger(model=lt,
                                             backoff=nltk.DefaultTagger('NN'))
        return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))

    def display():
        import pylab
        words_by_freq = list(nltk.FreqDist(brown.words(categories='news')))
        cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
        sizes = 2 ** pylab.arange(15)
        perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
        pylab.plot(sizes, perfs, '-bo')
        pylab.title('Lookup Tagger Performance with Varying Model Size')
        pylab.xlabel('Model Size')
        pylab.ylabel('Performance')
        pylab.show()

    display()
def primary():
    print("Training on the adventure category ...")
    train_corpus = brown.tagged_words(categories='adventure', tagset='universal')
    wprobs, tprobs = training([y[0] for y in train_corpus],
                              [y[1] for y in train_corpus])
    print("Corpus Trained")
    wprobs, wprobs1, tprobs, tprobs1, tsquash, tsquash1 = seperate_training(
        wprobs, tprobs)
    test_cats = brown.categories()
    vit_matrix = np.zeros((NUM_TAGS, NUM_TAGS))
    bi_matrix = np.zeros((NUM_TAGS, NUM_TAGS))
    for test_cat in test_cats:
        print("Testing on " + test_cat + "...")
        test_corpus = brown.tagged_words(categories=test_cat, tagset='universal')
        test_words = list_to_int([y[0] for y in test_corpus], UNIQUE_WORDS)
        test_tags = list_to_int([y[1] for y in test_corpus], UNIQUE_TAGS)
        fwd_vit_acc, vit_set = fwd_viterbi(test_words, test_tags,
                                           wprobs1, tprobs1, tsquash1)
        print("Forward Viterbi resulted in accuracy: " + str(fwd_vit_acc))
        bi_dir_acc, bi_set = bi_dir_method(test_words, test_tags,
                                           wprobs, tprobs, tsquash)
        print("Bidirectional method resulted in accuracy: " + str(bi_dir_acc))
        for i in range(len(test_tags)):
            vit_matrix[test_tags[i], vit_set[i]] += 1
            bi_matrix[test_tags[i], bi_set[i]] += 1
    np.set_printoptions(suppress=True)
    print("The Confusion Matrix for Viterbi:")
    print(vit_matrix)
    print("The Confusion Matrix for Bidirectional:")
    print(bi_matrix)
    for i in range(NUM_TAGS):
        print("Accuracy of '" + UNIQUE_TAGS[i] + "' tagging on Viterbi: " +
              str(vit_matrix[i, i] / sum(vit_matrix[i])))
        print("Accuracy of '" + UNIQUE_TAGS[i] + "' tagging on Bidirectional: " +
              str(bi_matrix[i, i] / sum(bi_matrix[i])))
        print("Misguess rate of '" + UNIQUE_TAGS[i] + "' on Viterbi: " +
              str(1 - vit_matrix[i, i] / sum(vit_matrix[:, i])))
        print("Misguess rate of '" + UNIQUE_TAGS[i] + "' on Bidirectional: " +
              str(1 - bi_matrix[i, i] / sum(bi_matrix[:, i])))
def test_ex5():
    tagged_words = brown.tagged_words(categories='news', tagset='universal')
    (emission_FD, top_NN, emission_PD, p_NN, p_DT) = ex3(tagged_words)
    tagged_sentences = brown.tagged_sents(categories='news', tagset='universal')
    (transition_FD, transition_PD, p_VBD_NN, p_DT_NN) = ex4(tagged_sentences)
    states = list(set(pos for (word, pos) in
                      brown.tagged_words(categories='news', tagset='universal')))
    sentence = [tp[0] for tp in tagged_sentences[42]]
    tag_sequence = viterbi(sentence, states, emission_PD, transition_PD)
    print "Viterbi tag sequence: " + ' '.join(tag_sequence)
    print "Gold tag sequence: " + ' '.join([tp[1] for tp in tagged_sentences[42]])
def exercise3c(category):
    print
    print "For category: " + category
    brown_tag_words = bn.tagged_words(categories=category)
    tag_fd = nltk.FreqDist(t for (w, t) in brown_tag_words)
    print tag_fd.keys()[:10]
    print
def ch05_20_brown_corpus_words_phrases_by_tag():
    from nltk.corpus import brown
    tagged_words = brown.tagged_words(categories="news")
    # produce alpha sorted list of distinct words tagged MD
    print sorted(set([w.lower() for (w, t) in
                      filter(lambda (w, t): t == "MD", tagged_words)]))
    # identify words that can be plural (NRS, NPS*, NNS*) or
    # third person singular verbs (BEDZ*, BEZ*, DOZ*, *BEZ)
    # AND the ones ending with "s"
    print set([w for (w, t) in tagged_words
               if w.lower().endswith("s") and
               (t == "NRS" or t.startswith("NPS") or t.startswith("NNS") or
                t.startswith("BEDZ") or t.startswith("BEZ") or
                t.startswith("DOZ") or t.endswith("BEZ"))])
    # identify 3 word prepositional phrases IN+DET+NN
    tagged_word_trigrams = nltk.trigrams(tagged_words)
    print tagged_word_trigrams[:10]
    print set([" ".join([w1, w2, w3])
               for (w1, t1), (w2, t2), (w3, t3) in tagged_word_trigrams
               if t1 == "IN" and t2 == "DET" and t3 == "NN"])
    # ratio of masculine to feminine pronouns
    num_masc_pn = len([w for (w, t) in tagged_words if w.lower() == "he"])
    num_fem_pn = len([w for (w, t) in tagged_words if w.lower() == "she"])
    print "masc/fem = ", (float(num_masc_pn) / num_fem_pn)
def partb():
    print
    print
    tags = [b for (a, b) in bn.tagged_words()]
    fd = nltk.FreqDist(tags)
    ft = fd.keys()
    cfd2 = nltk.ConditionalFreqDist((tag, word)
                                    for (word, tag) in bn.tagged_words())
    for a in ft:
        if fd[a] == 1:
            print "For POS: " + a
            print cfd2[a].keys()
    print
    print
    print
def ch05_21_qualifiers_before_adore_love_like_prefer():
    from nltk.corpus import brown
    tagged_words = brown.tagged_words(categories="news")
    tagged_word_bigrams = nltk.bigrams(tagged_words)
    allp = set(["adore", "love", "like", "prefer"])
    # collect the qualifier (w1) preceding each of the target verbs
    print set([w1 for (w1, t1), (w2, t2) in tagged_word_bigrams
               if t1 == "QL" and w2.lower() in allp])
def lookupTagger():
    fd = nltk.FreqDist(brown.words(categories='news'))
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    most_freq_words = fd.keys()[:100]
    likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
    baseline_tagger = nltk.UnigramTagger(model=likely_tags)
    baseline_tagger.evaluate(brown_tagged_sents)
    sent = brown.sents(categories='news')[3]
    baseline_tagger.tag(sent)
    baseline_tagger = nltk.UnigramTagger(model=likely_tags,
                                         backoff=nltk.DefaultTagger('NN'))

    def performance(cfd, wordlist):
        lt = dict((word, cfd[word].max()) for word in wordlist)
        baseline_tagger = nltk.UnigramTagger(model=lt,
                                             backoff=nltk.DefaultTagger('NN'))
        return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))

    def display():
        import pylab
        words_by_freq = list(nltk.FreqDist(brown.words(categories='news')))
        cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
        sizes = 2 ** pylab.arange(15)
        perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
        pylab.plot(sizes, perfs, '-bo')
        pylab.title('Lookup Tagger Performance with Varying Model Size')
        pylab.xlabel('Model Size')
        pylab.ylabel('Performance')
        pylab.show()
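# Note (an addition): the snippet above assumes NLTK 2.x, where
# FreqDist.keys() returned samples in decreasing frequency order. Under
# NLTK 3, a sketch of the same lookup tagger would use most_common():
import nltk
from nltk.corpus import brown

fd = nltk.FreqDist(brown.words(categories='news'))
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
# NLTK 3: keys() is no longer frequency-ordered, so take most_common()
most_freq_words = [w for w, _ in fd.most_common(100)]
likely_tags = {word: cfd[word].max() for word in most_freq_words}
baseline_tagger = nltk.UnigramTagger(model=likely_tags,
                                     backoff=nltk.DefaultTagger('NN'))
print(baseline_tagger.tag(brown.sents(categories='news')[3]))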
def ch05_34_num_words_with_1to10_distinct_tags():
    from nltk.corpus import brown
    tagged_words = brown.tagged_words(categories="news")
    # number of distinct tags and number of words in corpus for this
    dd = nltk.defaultdict(set)
    for w, t in tagged_words:
        dd[w].add(t)
    for i in range(1, 10):
        print i, len(filter(lambda x: len(dd[x]) == i, dd.keys()))
    # for the word with greatest number of tags, print out concordance
    # one for each tag
    maxtags = 6
    word = None
    tags = None
    for w in dd.keys():
        if len(dd[w]) >= maxtags:
            word = w
            tags = dd[w]
            break
    poss = []
    pos = 0
    for w, t in tagged_words:
        if w == word and t in tags:
            poss.append((t, pos))
            tags.remove(t)
        pos += 1
    for t, pos in poss:
        print t, " ".join(w for w, t in tagged_words[pos - 10:pos + 10])
def nltk_simplify_brown_tag():
    '''
    Produces ~36 POS tags
    '''
    other_pos_tagging = set(
        [sbt(tag) for (_, tag) in brown.tagged_words()[:600000]])
    print len(other_pos_tagging)
    print other_pos_tagging
    return other_pos_tagging
def MostLikelyTag(word):
    print word
    text = brown.tagged_words()
    saved = {}
    # collect counts for every tag seen with this word
    for w in text:
        if w[0].lower() == word.lower():
            tag = w[1]
            try:
                saved[tag] = saved[tag] + 1  # this tag has been seen before
            except KeyError:
                saved[tag] = 1  # first time we've seen the tag
    # now find the tag seen most often
    maxnum = 0
    maxtag = None
    print saved.keys()
    for t in saved.keys():
        if saved[t] > maxnum:
            maxnum = saved[t]
            maxtag = t
    print 'maxtag', maxtag
    if maxtag is None:
        if word == "n't":
            return '*'  # "n't" wasn't being tagged correctly, so handle it manually
        else:
            return 'UNK'  # unknown words!
    else:
        return maxtag
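# For comparison (an addition): the same most-likely-tag lookup via a
# ConditionalFreqDist built once up front, instead of scanning the corpus
# on every call. Assumes NLTK 3 and a downloaded Brown corpus.
import nltk
from nltk.corpus import brown

cfd = nltk.ConditionalFreqDist(
    (word.lower(), tag) for word, tag in brown.tagged_words())

def most_likely_tag(word, default='UNK'):
    # FreqDist.max() raises on an empty distribution, so guard first
    return cfd[word.lower()].max() if cfd[word.lower()] else default

print(most_likely_tag('run'))    # most frequent Brown tag for "run"
print(most_likely_tag('zzzz'))   # 'UNK'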
def pre_processing():
    global tag_count
    global tag_set
    global modified_tagged_sents
    global min_tag_count
    # counting no of occurrences of each tag
    print "__________________counting occurrences of tags_____________________"
    for (word, tag) in brown.tagged_words():
        tag_count[tag] += 1
    for key in tag_count:
        tag_set.append(key)
    # if tag_count[key] is at most min_tag_count, replace the tag with NONE
    print "_______________changing low occurring tags to NONE___________________"
    for sent in brown.tagged_sents():
        modified_sent = []
        for index, word in enumerate(sent):
            tag = word[1]
            if tag_count[word[1]] <= min_tag_count:
                tag = 'NONE'
            modified_sent.append([word[0], tag])
        modified_tagged_sents.append(modified_sent)
    # rebuild tag_set & tag_count from the modified sentences
    print "___________________creating tag_set & tag_count_____________________"
    tag_count = defaultdict(int)
    tag_set = []
    for sent in modified_tagged_sents:
        for word in sent:
            tag = word[1]
            tag_count[tag] += 1
    for key in tag_count:
        tag_set.append(key)
def ch05_11_train_test_affix_tagger():
    from nltk.corpus import brown
    fd = nltk.FreqDist(brown.words(categories="news"))
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories="news"))
    most_freq_pos = dict((word, cfd[word].max()) for word in fd.keys())
    affix_tagger = nltk.AffixTagger(model=most_freq_pos)
    print affix_tagger.evaluate(brown.tagged_sents(categories="editorial"))
def word_count():
    from nltk.corpus import brown
    counts = nltk.defaultdict(int)
    # despite the name, this counts occurrences of each tag, not each word
    for (word, tag) in brown.tagged_words(categories="news"):
        counts[tag] += 1
    from operator import itemgetter
    print sorted(counts.items(), key=itemgetter(1), reverse=True)
def ch05_15_brown_corpus_trivia():
    from nltk.corpus import brown
    tagged_words = brown.tagged_words(categories="news")
    # which nouns are more common in plural form than singular?
    # NNS - plural, NN - singular. Calculate plural = singular + s
    s_nouns = [w for (w, t) in tagged_words if t == "NN"]
    plurals = set([w + "s" for w in s_nouns])
    p_nouns = [w for (w, t) in tagged_words if t == "NNS" and w in plurals]
    s_fd = nltk.FreqDist(s_nouns)
    p_fd = nltk.FreqDist(p_nouns)
    print "words where plural > singular=", \
        filter(lambda word: s_fd[word] < p_fd[word], p_fd.keys())[:50]
    # which word has the greatest number of distinct tags
    word_tags = nltk.defaultdict(lambda: set())
    for word, token in tagged_words:
        word_tags[word].add(token)
    ambig_words = sorted([(k, len(v)) for (k, v) in word_tags.items()],
                         key=itemgetter(1), reverse=True)[:50]
    print [(word, numtoks, word_tags[word]) for (word, numtoks) in ambig_words]
    # list top 20 (by frequency) tags
    token_fd = nltk.FreqDist([token for (word, token) in tagged_words])
    print "top_tokens=", token_fd.keys()[:20]
    # which tags are nouns most commonly found after
    tagged_word_bigrams = nltk.bigrams(tagged_words)
    fd_an = nltk.FreqDist([t1 for (w1, t1), (w2, t2) in tagged_word_bigrams
                           if t2.startswith("NN")])
    print "nouns commonly found after these tags:", fd_an.keys()
def construct_dataset(nvocab):
    dataset = [(a.lower(), b)
               for a, b in brown.tagged_words(tagset='universal')
               if a not in '!"#$%&()*+-,./:;<=>?@[\\]^_`{|}~\t\n']
    word_list, tag_list = zip(*dataset)
    word_set = set(word_list)
    tag_set = set(tag_list)
    tag_dict = {tag: i for i, tag in enumerate(tag_set)}
    c = Counter(word_list)
    c = dict(c.most_common(nvocab))
    inv_c = [(count, word) for word, count in c.items()]
    inv_c = sorted(inv_c, reverse=True)
    _, sorted_words = zip(*inv_c)
    word_dict = {word: i for i, word in enumerate(sorted_words)}
    result = [(word_dict[word], tag_dict[tag])
              for word, tag in dataset if word in c]
    if __debug__:
        print('Num Words : {}, Total Words : {}'.format(
            len(word_set), len(word_list)))
    return result
def ch05_18_brown_corpus_statistics():
    from nltk.corpus import brown
    tagged_words = brown.tagged_words(categories="news")
    vocab_size = len(set([w for (w, t) in tagged_words]))
    cfd = nltk.ConditionalFreqDist(tagged_words)
    # proportion of word types always assigned the same part-of-speech,
    # ie words with a single POS
    num_single_pos_words = sum(
        len(cfd[word].hapaxes()) for word in cfd.conditions())
    print "prop of word types with single POS=", \
        float(num_single_pos_words) / vocab_size
    # how many words are ambiguous, ie with >= 2 POS tags
    ambig_words = [
        w for w in cfd.conditions()
        if len(filter(lambda x: cfd[w][x] >= 2, cfd[w].keys())) >= 2
    ]
    num_ambig_words = len(ambig_words)
    print "prop of ambiguous words (>= 2 POS)=", \
        float(num_ambig_words) / vocab_size
    # percentage of word tokens in the brown corpus that involve
    # ambiguous words
    token_size = len(set([t for (w, t) in tagged_words]))
    unique_tokens = set()
    for w in ambig_words:
        unique_tokens.update(set([t for t in cfd[w].keys()]))
    print "prop of ambig tokens=", float(len(unique_tokens)) / token_size
def verb_stem(s):
    """extracts the stem from the 3sg form of a verb, or returns empty string"""
    if re.match(r"\w*([^aeiousxyzh]|[^cs]h)s$", s):
        stem = s[:-1]
    elif re.match(r"(\w*)[aeiou]ys$", s):
        stem = s[:-1]
    elif re.match(r"\w+[^aeiou]ies$", s):
        stem = s[:-3] + 'y'
    elif re.match(r"[^aeiou]ies$", s):
        stem = s[:-1]
    elif re.match(r"\w*([ox]|ch|sh|ss|zz)es$", s):
        stem = s[:-2]
    elif re.match(r"\w*(([^s]se)|([^z]ze))s$", s):
        stem = s[:-1]
    elif re.match(r"has$", s):
        stem = "have"
    elif re.match(r"\w*([^iosxzh]|[^cs]h)es$", s):
        stem = s[:-1]
    else:
        stem = ""
    # accept the stem only if it is attested as a verb in the Brown corpus
    if stem != "":
        for (word, tag) in brown.tagged_words():
            if word == stem and tag in ('VB', 'VBZ'):
                return stem
    return ""
def run(self, model):
    print('Testing...')
    perplexity = 1
    genres = [
        'adventure', 'belles_lettres', 'editorial', 'fiction', 'government',
        'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion',
        'reviews', 'romance', 'science_fiction'
    ]
    total = 0
    for i, genre in enumerate(genres):
        print(repr(i + 1) + '/' + repr(len(genres)))
        corpus = brown.tagged_words(categories=genre)
        size = int(len(corpus) * 0.90)
        corpus = corpus[size:]
        trigrams = nltk.trigrams(corpus)
        for ((word2, tag2), (word1, tag1), (word0, tag0)) in trigrams:
            total += 1
            score = model.get_score(word2, tag2, word1, tag1, word0, tag0)
            perplexity += math.log(score, 2)
    perplexity = perplexity / total
    perplexity = math.pow(2, -perplexity)
    print(perplexity)
def partOfSpeechTagging():
    from nltk.corpus import brown
    suffix_fdist = nltk.FreqDist()
    for word in brown.words():
        word = word.lower()
        suffix_fdist.inc(word[-1:])
        suffix_fdist.inc(word[-2:])
        suffix_fdist.inc(word[-3:])
    common_suffixes = suffix_fdist.keys()[:100]
    print common_suffixes

    def pos_features(word):
        features = {}
        for suffix in common_suffixes:
            features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
        return features

    tagged_words = brown.tagged_words(categories='news')
    featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
    classifier = nltk.DecisionTreeClassifier.train(train_set)
    nltk.classify.accuracy(classifier, test_set)
    classifier.classify(pos_features('cats'))
    print classifier.pseudocode(depth=4)
def question2(category):
    # print
    # print "For Category: " + category
    # print "Words with the tag 'JJ':"
    # print
    words = bn.tagged_words(categories=category)
    wordlist = bn.words(categories=category)
    words_JJ = set(sorted([(word, tag) for (word, tag) in words
                           if tag == 'JJ']))
    print len(words_JJ)
    print
    print
    print "Words with tags 'VBZ' -> 3rd Person Singular Verbs or ('NNPS' or 'NNS') -> plural nouns:"
    print
    words_VBP_NNPS_NNS = [(word, tag) for (word, tag) in words
                          if tag in ('VBZ', 'NNPS', 'NNS')]
    print words_VBP_NNPS_NNS[:10]
    print
    print
    print "Ratio"
    print
    male_pattern = r'\bhe\b|\bhis\b|\bhim\b|\bhimself\b'
    female_pattern = r'\bshe\b|\bher\b|\bhers\b|\bherself\b'
    male_pronouns = len([w for w in wordlist if re.search(male_pattern, w.lower())])
    female_pronouns = len([w for w in wordlist if re.search(female_pattern, w.lower())])
    print "Male : Female is -> %d : %d" % (male_pronouns, female_pronouns)
    print
    print
    sent = ""
    print "3 word prepositional phrases are:"
def prepare_data_sentences(training_size):
    # take a subset of the corpus
    brown_words = list(itertools.islice(brown.words(), training_size))
    brown_tags = [pair[1] for pair in brown.tagged_words(tagset='universal')]
    word_encoder = sklearn.preprocessing.LabelEncoder()
    pos_encoder = sklearn.preprocessing.LabelEncoder()
    brown_words_num = word_encoder.fit_transform(brown_words)
    brown_tags_num = pos_encoder.fit_transform(brown_tags)
    x_data_sents, y_data_sents = [], []
    x_data_sent, y_data_sent = [], []
    dot_label = word_encoder.transform(['.'])[0]
    dot_label_tags = pos_encoder.transform(['.'])[0]
    # split on sentences
    for word, tag in zip(brown_words_num, brown_tags_num):
        if word == dot_label and tag == dot_label_tags:
            if len(x_data_sent) > 0:
                x_data_sents.append(x_data_sent)
                y_data_sents.append(y_data_sent)
                x_data_sent, y_data_sent = [], []
        x_data_sent.append(word)
        y_data_sent.append(tag)
    input_dim = len(word_encoder.classes_)
    output_dim = len(pos_encoder.classes_)
    return input_dim, output_dim, x_data_sents, y_data_sents
class data:
    content_words = ["main", "content", "body"]
    tags = [
        "<!DOCTYPE>", "<a>", "<abbr>", "<acronym>", "<address>", "<applet>",
        "<area>", "<article>", "<aside>", "<audio>", "<b>", "<base>",
        "<basefont>", "<bdi>", "<bdo>", "<big>", "<blockquote>", "<body>",
        "<br>", "<button>", "<canvas>", "<caption>", "<center>", "<cite>",
        "<code>", "<col>", "<colgroup>", "<datalist>", "<dd>", "<del>",
        "<details>", "<dfn>", "<dialog>", "<dir>", "<div>", "<dl>", "<dt>",
        "<em>", "<embed>", "<fieldset>", "<figcaption>", "<figure>", "<font>",
        "<footer>", "<form>", "<frame>", "<frameset>", "<h1>", "<h2>", "<h3>",
        "<h4>", "<h5>", "<h6>", "<head>", "<header>", "<hr>", "<html>", "<i>",
        "<iframe>", "<img>", "<input>", "<ins>", "<kbd>", "<keygen>",
        "<label>", "<legend>", "<li>", "<link>", "<main>", "<map>", "<mark>",
        "<menu>", "<menuitem>", "<meta>", "<meter>", "<nav>", "<noframes>",
        "<noscript>", "<object>", "<ol>", "<optgroup>", "<option>",
        "<output>", "<p>", "<param>", "<picture>", "<pre>", "<progress>",
        "<q>", "<rp>", "<rt>", "<ruby>", "<s>", "<samp>", "<script>",
        "<section>", "<select>", "<small>", "<source>", "<span>", "<strike>",
        "<strong>", "<style>", "<sub>", "<summary>", "<sup>", "<table>",
        "<tbody>", "<td>", "<textarea>", "<tfoot>", "<th>", "<thead>",
        "<time>", "<title>", "<tr>", "<track>", "<tt>", "<u>", "<ul>",
        "<var>", "<video>", "<wbr>"
    ]
    # maps each Brown word to one tag; later (word, tag) pairs overwrite earlier ones
    brown_words = dict(brown.tagged_words())
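# Caveat (an addition): dict(brown.tagged_words()) keeps exactly one tag per
# word form, the last pair encountered, so data.brown_words is a single-tag
# lookup rather than a tag distribution. A tiny check, assuming the Brown
# corpus is downloaded:
from nltk.corpus import brown

lookup = dict(brown.tagged_words())
print(lookup.get('the'))         # a single Brown tag
print(lookup.get('nosuchword'))  # None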
def main():
    tagged_words = brown.tagged_words()
    words_corpus = brown.words()

    word2vec = Word2Vec()
    word2vec.train(words_corpus)
    word_vecs = [word2vec.word2vec(word) for word in words_corpus]

    n_clusters = 10  # random number for now
    kmeans = KMeans(n_clusters)
    kmeans.compute(word_vecs)

    # word-cluster HMM
    p_word = {}
    p_cluster = {}
    p_cluster_given_word = None  # softmax
    p_word_given_cluster = None  # joint probability formula
    p_transition_cluster = None  # count
    p_initial_cluster = None  # count

    # cluster-tag HMM
    p_cluster_given_tag = None  # softmax
    p_transition_tag = None  # count from tagged data
    p_initial_tag = None  # count from tagged data

    hmm_word_cluster = HMM(p_initial_cluster, p_transition_cluster,
                           p_word_given_cluster)
    hmm_cluster_tag = HMM(p_initial_tag, p_transition_tag,
                          p_cluster_given_tag)

    words = []
    clusters = hmm_word_cluster.viterbi(words)
    tags = hmm_cluster_tag.viterbi(clusters)
def corpusBigrams():
    from nltk.corpus import brown
    corpus = brown.tagged_words()
    bigrams = []
    corpLen = len(corpus)
    for i in range(1, len(corpus)):
        tempStr = str(i) + "/" + str(corpLen)
        print tempStr
        tally = 1
        newBigram = [corpus[i - 1], corpus[i], tally]
        bigrams.append(newBigram)
    print("sorting")
    sortedBigrams = sorted(bigrams, key=itemgetter(0, 1))
    toReturn = []
    bigramToMatch = sortedBigrams[0]
    tally = 1
    listLen = len(sortedBigrams)
    print len(sortedBigrams)
    for i in range(1, len(sortedBigrams)):
        tempStr = str(i) + "/" + str(listLen)
        print(tempStr)
        if sortedBigrams[i] == bigramToMatch:
            tally += 1
        else:
            temp = bigramToMatch
            temp[2] = tally
            toReturn += [temp]
            bigramToMatch = sortedBigrams[i]
            tally = 1
    file1 = open('bigramGrammar.txt', 'w')
    for b in toReturn:
        file1.write(str(b) + "\n")
    file1.close()
    return bigrams
def rate(self):
    wor_tag = brown.tagged_words(fileids=['ca01'])
    sentence = []
    anse = []
    pres = 0
    presb = 0
    for word in wor_tag:
        if word[0] == ".":
            ans = self.hmm(sentence)
            # print(ans)
            for i in range(len(ans)):
                parts = anse[i].split("+")
                parts = anse[i].split("-")  # only this split takes effect
                # print(anse[i])
                # print(ans[i])
                if ans[i] in parts:
                    pres = pres + 1
                    presb += 1
                else:
                    pres += 1  # the most important place to modify
            sentence = []
            anse = []
            print(presb / pres)
            pres = 0
            presb = 0
            break  # only the first sentence is rated
        else:
            # print("hello")
            sentence.append(word[0])
            anse.append(word[1])
def exercise3():
    cfd = nltk.ConditionalFreqDist(
        (word.lower(), tag)
        for genre in brown.categories()
        for (word, tag) in brown.tagged_words(categories=genre))
    result = {'part1': {}, 'part2': {}}

    print("Part 1")
    for word in sorted(cfd.conditions()):
        tags = set(cfd[word])
        if len(tags) == 5:
            if word not in result['part1']:
                result['part1'][word] = tags
    print("Number of words which have exactly 5 possible tags : ",
          len(result['part1']))
    print("Words which have exactly 5 possible tags : ", result['part1'])

    print("Part 2")
    possible_tags = ['CS', 'WPS', 'DT', 'QL', 'NIL']
    distinct_word = 'that'
    print("The distinct word is : ", distinct_word)
    for sentence in brown.tagged_sents():
        for pair in sentence:
            if pair[0] == distinct_word and len(possible_tags) > 0:
                if pair[1] == possible_tags[0]:
                    print("Sentence : ", " ".join([w for (w, t) in sentence]))
                    if len(possible_tags) > 0:
                        possible_tags.remove(possible_tags[0])
def analysis_using_word_and_prev_pos():
    from nltk.corpus import brown
    pos = nltk.defaultdict(lambda: nltk.defaultdict(int))
    brown_news_tagged = brown.tagged_words(categories="news",
                                           simplify_tags=True)
    for ((w1, t1), (w2, t2)) in nltk.bigrams(brown_news_tagged):
        pos[(t1, w2)][t2] += 1
    print pos[("DET", "right")]
def ch05_33_list_pos_of_word_given_word_and_pos():
    from nltk.corpus import brown
    tagged_words = brown.tagged_words(categories="news")
    tagged_word_bigrams = nltk.bigrams(tagged_words)
    dd = nltk.defaultdict(dict)
    for (w1, t1), (w2, t2) in tagged_word_bigrams:
        dd[w1][t1] = t2
    print dd
def retag_brown_words(tag_map):
    wordpos_fd = nltk.FreqDist()
    for word, tag in brown.tagged_words():
        if tag_map.has_key(tag):
            normed_pos = tag_map[tag]
            retagged_word = DELIM.join([word.lower(), normed_pos])
            wordpos_fd.inc(retagged_word)
    return wordpos_fd
def main():
    brown_news_tagged = brown.tagged_words(categories='news',
                                           simplify_tags=True)
    tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
    print tag_fd.keys()
    print
    print tag_fd
    print
def setup(self):
    # bucket Brown news words by their POS tag
    for word, tag in brown.tagged_words(categories='news'):
        if tag == 'NN':
            self.nouns.append(word)
        elif tag == 'VB':
            self.verbs.append(word)
        elif tag == 'JJ':
            self.adjs.append(word)
def lookup_tagger():
    fd = nltk.FreqDist(brown.words(categories='news'))
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    most_freq_words = fd.keys()[:100]
    likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
    # return nltk.UnigramTagger(model=likely_tags)
    return nltk.UnigramTagger(model=likely_tags,
                              backoff=nltk.DefaultTagger('NN'))
def complexDict():
    pos = nltk.defaultdict(lambda: nltk.defaultdict(int))
    brown_news_tagged = brown.tagged_words(categories="news",
                                           simplify_tags=True)
    for ((w1, t1), (w2, t2)) in nltk.ibigrams(brown_news_tagged):
        pos[(t1, w2)][t2] += 1
    pos[("DET", "right")]
def ch05_35_must_contexts():
    from nltk.corpus import brown
    tagged_words = brown.tagged_words(categories="news")
    tagged_word_bigrams = nltk.bigrams(tagged_words)
    fd = nltk.FreqDist((w1, t2) for (w1, t1), (w2, t2) in tagged_word_bigrams
                       if w1 == "must")
    for t in fd.keys():
        print t, fd[t]
def importingBrownCorpusFromNLTK(outF):
    "imports the tagged Brown corpus from NLTK and writes it to the file outF"
    outF = open(outF, 'w')
    from nltk.corpus import brown
    brown_news_tagged = brown.tagged_words(categories='news',
                                           simplify_tags=True)
    print 'size', len(brown_news_tagged)
    for i in brown_news_tagged:
        outF.write(i[0] + '\t' + i[1] + '\n')
    outF.close()
def find_highly_ambiguous_words():
    from nltk.corpus import brown
    brown_news_tagged = brown.tagged_words(categories="news",
                                           simplify_tags=True)
    cfd = nltk.ConditionalFreqDist((word.lower(), tag)
                                   for (word, tag) in brown_news_tagged)
    for word in cfd.conditions():
        if len(cfd[word]) > 3:
            tags = cfd[word].keys()
            print word, ":", " ".join(tags)
def category_by_pos():
    from nltk.corpus import brown
    from nltk import FreqDist
    from nltk import DecisionTreeClassifier
    from nltk import NaiveBayesClassifier
    from nltk import classify

    suffix_fdist = FreqDist()
    for word in brown.words():
        word = word.lower()
        suffix_fdist.inc(word[-1:])
        suffix_fdist.inc(word[-2:])
        suffix_fdist.inc(word[-3:])
    common_suffixes = suffix_fdist.keys()[:100]
    # print common_suffixes

    def pos_features(word):
        features = {}
        for suffix in common_suffixes:
            features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
        return features

    tagged_words = brown.tagged_words(categories='news')
    featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
    # classifier = DecisionTreeClassifier.train(train_set)
    # print 'Decision Tree %f' % classify.accuracy(classifier, test_set)
    classifier = NaiveBayesClassifier.train(train_set)
    print 'NaiveBay %f' % classify.accuracy(classifier, test_set)