def exploreTaggedCorpora(): brown_learned_text = brown.words(categories='learned') sorted(set(b for (a, b) in nltk.ibigrams(brown_learned_text) if a == 'often')) brown_lrnd_tagged = brown.tagged_words(categories='learned', simplify_tags=True) tags = [b[1] for (a, b) in nltk.ibigrams(brown_lrnd_tagged) if a[0] == 'often'] fd = nltk.FreqDist(tags) fd.tabulate() def process(sentence): for (w1,t1), (w2,t2), (w3,t3) in nltk.trigrams(sentence): if (t1.startswith('V') and t2 == 'TO' and t3.startswith('V')): print w1, w2, w3 for tagged_sent in brown.tagged_sents(): process(tagged_sent) brown_news_tagged = brown.tagged_words(categories='news', simplify_tags=True) data = nltk.ConditionalFreqDist((word.lower(), tag) for (word, tag) in brown_news_tagged) for word in data.conditions(): if len(data[word]) > 3: tags = data[word].keys() print word, ' '.join(tags)
def exploreTaggedCorpora(): brown_learned_text = brown.words(categories="learned") sorted(set(b for (a, b) in nltk.ibigrams(brown_learned_text) if a == "often")) brown_lrnd_tagged = brown.tagged_words(categories="learned", simplify_tags=True) tags = [b[1] for (a, b) in nltk.ibigrams(brown_lrnd_tagged) if a[0] == "often"] fd = nltk.FreqDist(tags) fd.tabulate() def process(sentence): for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence): if t1.startswith("V") and t2 == "TO" and t3.startswith("V"): print w1, w2, w3 for tagged_sent in brown.tagged_sents(): process(tagged_sent) brown_news_tagged = brown.tagged_words(categories="news", simplify_tags=True) data = nltk.ConditionalFreqDist((word.lower(), tag) for (word, tag) in brown_news_tagged) for word in data.conditions(): if len(data[word]) > 3: tags = data[word].keys() print word, " ".join(tags)
def bgram_by_word(words, rel_word):
    """Return the distinct, lower-cased words that immediately follow
    ``rel_word`` in the word sequence ``words``."""
    followers = set()
    for (first, second) in nltk.ibigrams(words):
        if first == rel_word:
            followers.add(second.lower())
    return list(followers)
def nextags(tagged, word):
    """Return the distinct tags carried by tokens that directly follow
    ``word`` in the tagged sequence ``tagged``."""
    following = [t2 for ((w1, _t1), (_w2, t2)) in nltk.ibigrams(tagged)
                 if w1 == word]
    return list(set(following))
def tokenize_and_ngram(text):
    """Lower-case and tokenize ``text``, split it at stopwords, and return
    the unigrams, bigrams and trigrams of every stopword-free run."""
    lowered = [token.lower() for token in twtokenize.word_tokenize(text)]
    # Splitting on stopwords both removes them and prevents n-grams from
    # spanning words that were not actually adjacent in the input.
    runs = list(splitlist(lowered, stopwords))
    grams = [nltk.ibigrams(run) for run in runs]
    grams += [nltk.itrigrams(run) for run in runs]
    return list(chain(*(runs + grams)))
def exercise1(): print("Part a") #Which nouns are more common in their plural form, rather than their singular form #(Use the parts of speech tags in the corpus to identify plural versus singular nouns # and use nltk.WordNetLemmatizer() to get the singular form of a noun from its plural form). # List the five most frequent nouns that feature this property. brown_words = brown.words() brown_tags = brown.tagged_words() tagdict = findtags( 'NNS', brown_tags) # a dictionary with a list of words have 'NNS' tag lem = nltk.WordNetLemmatizer() words = [] cf = nltk.FreqDist(brown_words) #print brown_words[:50] tag = 'NNS' for plural in tagdict[tag]: singular = lem.lemmatize(plural) #print plural, singular freq_sing = cf[singular] freq_plur = cf[plural] if freq_plur > freq_sing: words.append(plural) words.sort(key=lambda a: cf[a], reverse=True) print tag, "5 plural nouns more common:", words[: 5] # last elements #print cf['years'], cf['yaks'] print("Part b") # List the 5 most frequent tags in order of decreasing frequency. What do the tags represent? tags = [b[1] for (a, b) in nltk.ibigrams(brown_tags)] fd = nltk.FreqDist(tags) print fd.keys()[:10] print fd.values()[:10] i = 0 for i in range(0, 5): print fd.keys()[i], fd.values()[i] #print " 'NN' is Noun, 'AT' prepositions, 'IN' prepositions, 'JJ' adverb" #for key in fd.keys(): #if key != '.' or key != ',': #print key, fd.values()[counter] #counter = counter + 1 #if counter > 5: #break print("The tags represent the decrease in frequency.") print("Part c") # Which three tags precede nouns tagged with the 'NN' tag most commonly? What do these three tags represent? # Report your findings separately for the following categories of Brown corpus: humor, romance, government. 
categories = ['humor', 'romance', 'government'] for category in categories: #tagged_dict = findtags('NN', brown.tagged_words(categories=category)) category_tags = brown.tagged_words(categories=category) tagList = [ b[1] for (a, b) in nltk.ibigrams(category_tags) if b[1].startswith('N') and b[1] != 'N' ] #print category, tagList[:20] fd = nltk.FreqDist(tagList) print category, ', '.join(fd.keys()[:3])
import nltk
from nltk.corpus import brown

# Tag distribution of the word immediately following 'often' in the
# Brown 'learned' category (simplified tagset).
brown_learned_text_tagged = brown.tagged_words(categories='learned',
                                               simplify_tags=True)
# a is the ('often', tag) pair; b is the (word, tag) pair that follows it,
# so b[1] is the part of speech of the word after 'often'.
tags = [b[1] for (a, b) in nltk.ibigrams(brown_learned_text_tagged)
        if a[0] == 'often']
freq_dist = nltk.FreqDist(tags)
# FreqDist.tabulate() prints the table itself and returns None; the
# original assigned its result and printed it, emitting a stray "None"
# line after the table.
freq_dist.tabulate()
def nextags(tagged, word):
    """Return the distinct tags on the token directly after ``word``."""
    seen = set()
    for ((w1, _t1), (_w2, t2)) in nltk.ibigrams(tagged):
        if w1 == word:
            seen.add(t2)
    return list(seen)
def ibgram_by_word(words, rel_word):
    """Return the distinct lower-cased successors of ``rel_word`` in
    the word sequence ``words``."""
    successors = {w2.lower() for (w1, w2) in nltk.ibigrams(words)
                  if w1 == rel_word}
    return list(successors)
# Scratch notes from NLTK book chapters 4-5.  Many names (text, cut,
# word, b, c, brown_learned_text, counts, train_sents, brown_sents) are
# assumed to exist in an interactive session and are not defined here.
training_data, test_data = text[:cut], text[cut:]

# Sort words in a string by their length.
words = 'I turned off the spedsdsd'.split()
wordlens = [(len(word), word) for word in words]
wordlens.sort()
' '.join(w for (_, w) in wordlens)

# Parameter checking.
assert isinstance(word, basestring), "argument must be a string"

# Identity vs. equality (page 157).
b is c
b == c

# Chapter 5.
nltk.pos_tag(text)

# Simplified POS tagset (page 183).
sorted(set(b for (a, b) in nltk.ibigrams(brown_learned_text) if a == 'often'))
# NOTE(review): 'brown_Irnd_tagged' (capital I) looks like a typo for the
# 'brown_lrnd_tagged' used elsewhere in this file -- neither is defined
# in these notes; confirm against the session they came from.
tags = [b[1] for (a, b) in nltk.ibigrams(brown_Irnd_tagged) if a[0] == 'often']
fd = nltk.FreqDist(tags)  # fixed typo: was 'ntlk.FreqDist' (NameError)
pos.keys()
pos.values()
pos = nltk.defaultdict(lambda: 'N')
sorted(counts.items(), key=itemgetter(1), reverse=True)

# Inverting a dictionary.
pos = {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V'}
pos2 = dict((value, key) for (key, value) in pos.items())

# Dictionary methods summary is on page 198.
bigram_tagger = nltk.BigramTagger(train_sents)
bigram_tagger.tag(brown_sents[2007])
# Tagging: default, regex, unigram and n-gram taggers; set backoff to
# combine taggers.
import nltk
from nltk.corpus import brown

# Duplicate of the 'often' exploration: tag distribution of the word
# immediately following 'often' in the 'learned' category.
brown_learned_text_tagged = brown.tagged_words(categories='learned',
                                               simplify_tags=True)
# a[0] is 'often'; a[1] its tag; b[0]/b[1] are the next word and its tag.
tags = [b[1] for (a, b) in nltk.ibigrams(brown_learned_text_tagged)
        if a[0] == 'often']
freq_dist = nltk.FreqDist(tags)
# tabulate() prints and returns None; assigning and printing the result
# (as the original did) just emits a stray "None" line.
freq_dist.tabulate()
def bgram_by_word(words, rel_word):
    """Return the distinct lower-cased words appearing right after
    ``rel_word`` in the word sequence ``words``."""
    result = set()
    for pair in nltk.ibigrams(words):
        if pair[0] == rel_word:
            result.add(pair[1].lower())
    return list(result)
cfd2 = nltk.ConditionalFreqDist((tag, word) for (word, tag) in wsj) cfd2['VN'].keys() # pg. 187 def findtags(tag_prefix, tagged_text): cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in tagged_text if tag.startswith(tag_prefix)) return dict((tag, cfd[tag].keys()[:5]) for tag in cfd.conditions()) tagdict = findtags('NN', nltk.corpus.brown.tagged_words(categories='news')) for tag in sorted(tagdict): print tag, tagdict[tag] from nltk.corpus import brown brown_learned_text = brown.words(categories='learned') sorted(set(b for (a, b) in nltk.ibigrams(brown_learned_text) if a == 'often')) # pg. 188 brown_lrnd_tagged = brown.tagged_words(categories='learned', simplify_tags=True) tags = [b[1] for (a, b) in nltk.ibigrams(brown_lrnd_tagged) if a[0] == 'often'] fd = nltk.FreqDist(tags) fd.tabulate() from nltk.corpus import brown def process(sentence): for (w1, t1), (w2,t2), (w3,t3) in nltk.trigrams(sentence): if (t1.startswith('V') and t2 == 'TO' and t3.startswith('V')): print w1, w2, w3 for tagged_sent in brown.tagged_sents():
# pg. 187 def findtags(tag_prefix, tagged_text): cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in tagged_text if tag.startswith(tag_prefix)) return dict((tag, cfd[tag].keys()[:5]) for tag in cfd.conditions()) tagdict = findtags('NN', nltk.corpus.brown.tagged_words(categories='news')) for tag in sorted(tagdict): print tag, tagdict[tag] from nltk.corpus import brown brown_learned_text = brown.words(categories='learned') sorted(set(b for (a, b) in nltk.ibigrams(brown_learned_text) if a == 'often')) # pg. 188 brown_lrnd_tagged = brown.tagged_words(categories='learned', simplify_tags=True) tags = [b[1] for (a, b) in nltk.ibigrams(brown_lrnd_tagged) if a[0] == 'often'] fd = nltk.FreqDist(tags) fd.tabulate() from nltk.corpus import brown def process(sentence): for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence): if (t1.startswith('V') and t2 == 'TO' and t3.startswith('V')):
word_tag_fd = nltk.FreqDist(wsj) idx1 = wsj.index(("kicked", "VD")) def findtags(tag_prefix, tagged_text): cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in tagged_text if tag.startswith(tag_prefix)) return dict((tag, cfd[tag].keys()[:5]) for tag in cfd.conditions()) tagdict = findtags("NN", nltk.corpus.brown.tagged_words(categories="news")) for tag in sorted(tagdict): print tag, tagdict[tag] brown_learned_text = brown.words(categories="learned") sorted(set(b for (a, b) in nltk.ibigrams(brown_learned_text) if a == "often")) brown_lrnd_tagged = brown.tagged_words(categories="learned", simplify_tags=True) tags = [b[1] for (a, b) in nltk.ibigrams(brown_lrnd_tagged) if a[0] == "often"] fd = nltk.FreqDist(tags) fd.tabulate() def process(sentence): for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence): if t1.startswith("V") and t2 == "TO" and t3.startswith("V"): print w1, w2, w3 for tagged_sent in brown.tagged_sents(): process(tagged_sent)
def bgram_by_word(words, rel_word):
    """Distinct lower-cased tokens immediately following ``rel_word``."""
    return list({second.lower()
                 for (first, second) in nltk.ibigrams(words)
                 if first == rel_word})