Exemplo n.º 1
0
def exploreTaggedCorpora():

    brown_learned_text = brown.words(categories='learned')
    sorted(set(b for (a, b) in nltk.ibigrams(brown_learned_text) if a == 'often'))

    brown_lrnd_tagged = brown.tagged_words(categories='learned', simplify_tags=True)
    tags = [b[1] for (a, b) in nltk.ibigrams(brown_lrnd_tagged) if a[0] == 'often']
    fd = nltk.FreqDist(tags)
    fd.tabulate()


    def process(sentence):
        for (w1,t1), (w2,t2), (w3,t3) in nltk.trigrams(sentence): 
            if (t1.startswith('V') and t2 == 'TO' and t3.startswith('V')):
                print w1, w2, w3 


    for tagged_sent in brown.tagged_sents():
        process(tagged_sent)


    brown_news_tagged = brown.tagged_words(categories='news', simplify_tags=True)
    data = nltk.ConditionalFreqDist((word.lower(), tag)
            for (word, tag) in brown_news_tagged)

    for word in data.conditions():
        if len(data[word]) > 3:
            tags = data[word].keys()
            print word, ' '.join(tags)
Exemplo n.º 2
0
def exploreTaggedCorpora():

    brown_learned_text = brown.words(categories="learned")
    sorted(set(b for (a, b) in nltk.ibigrams(brown_learned_text) if a == "often"))

    brown_lrnd_tagged = brown.tagged_words(categories="learned", simplify_tags=True)
    tags = [b[1] for (a, b) in nltk.ibigrams(brown_lrnd_tagged) if a[0] == "often"]
    fd = nltk.FreqDist(tags)
    fd.tabulate()

    def process(sentence):
        for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence):
            if t1.startswith("V") and t2 == "TO" and t3.startswith("V"):
                print w1, w2, w3

    for tagged_sent in brown.tagged_sents():
        process(tagged_sent)

    brown_news_tagged = brown.tagged_words(categories="news", simplify_tags=True)
    data = nltk.ConditionalFreqDist((word.lower(), tag) for (word, tag) in brown_news_tagged)

    for word in data.conditions():
        if len(data[word]) > 3:
            tags = data[word].keys()
            print word, " ".join(tags)
Exemplo n.º 3
0
def bgram_by_word(words, rel_word):
    """
    Return the distinct, lower-cased words that directly follow `rel_word`.

    `words` may be any iterable of strings.  It is walked once, pairing each
    item with its successor, so the function does not depend on
    ``nltk.ibigrams`` (which is not available in NLTK 3).  The comparison
    with `rel_word` is case-sensitive; only the collected followers are
    lower-cased.  The result is a list without duplicates, in no particular
    order; it is empty when `rel_word` never has a successor.
    """
    stream = iter(words)
    try:
        previous = next(stream)
    except StopIteration:
        # Empty input: there are no adjacent pairs at all.
        return []

    followers = set()
    for current in stream:
        if previous == rel_word:
            followers.add(current.lower())
        previous = current
    return list(followers)
Exemplo n.º 4
0
def nextags(tagged, word):
    """
    Return the distinct tags of the tokens that directly follow `word`.

    `tagged` is an iterable of (token, tag) pairs.  It is scanned once
    without ``nltk.ibigrams`` (not available in NLTK 3): a flag remembers
    whether the previous token was `word`, and if so the current tag is
    recorded.  The result is a list without duplicates, in no particular
    order; it is empty when `word` never has a successor.
    """
    following = set()
    after_target = False
    for (token, tag) in tagged:
        if after_target:
            following.add(tag)
        # Remember for the next iteration whether this token is the target.
        after_target = (token == word)
    return list(following)
Exemplo n.º 5
0
def nextags(tagged, word):
    """
    Collect the tag of every token that comes right after `word`.

    `tagged` is an iterable of (token, tag) pairs.  The pairs are consumed
    through a single iterator, keeping only the previous token, so the
    function no longer needs ``nltk.ibigrams`` (not available in NLTK 3).
    Returns the tags as a duplicate-free list in no particular order, or an
    empty list when `tagged` is empty or `word` has no successor.
    """
    pairs = iter(tagged)
    try:
        prev_token, _ = next(pairs)
    except StopIteration:
        # Nothing to pair up.
        return []

    found = set()
    for (token, tag) in pairs:
        if prev_token == word:
            found.add(tag)
        prev_token = token
    return list(found)
Exemplo n.º 6
0
def tokenize_and_ngram(text):
    """Return the unigram, bigram and trigram terms extracted from `text`.

    The text is tokenized with ``twtokenize.word_tokenize`` and lower-cased,
    then handed to ``splitlist`` together with ``stopwords``.  Per the
    original author's note, that call cuts the token list at stopwords and
    drops them, so n-grams are only built from words that were actually
    adjacent.  The returned list holds the tokens of every run first, then
    the bigrams of every run, then the trigrams of every run.
    """
    lowered = [tok.lower() for tok in twtokenize.word_tokenize(text)]
    runs = list(splitlist(lowered, stopwords))
    pair_iters = [nltk.ibigrams(run) for run in runs]
    triple_iters = [nltk.itrigrams(run) for run in runs]
    # Flatten runs, bigram iterators and trigram iterators into one list.
    return list(chain.from_iterable(runs + pair_iters + triple_iters))
def exercise1():
    print("Part a")
    #Which nouns are more common in their plural form, rather than their singular form
    #(Use the parts of speech tags in the corpus to identify plural versus singular nouns
    # and use nltk.WordNetLemmatizer() to get the singular form of a noun from its plural form).
    # List the five most frequent nouns that feature this property.
    brown_words = brown.words()
    brown_tags = brown.tagged_words()
    tagdict = findtags(
        'NNS', brown_tags)  # a dictionary with a list of words have 'NNS' tag

    lem = nltk.WordNetLemmatizer()
    words = []
    cf = nltk.FreqDist(brown_words)
    #print brown_words[:50]

    tag = 'NNS'
    for plural in tagdict[tag]:
        singular = lem.lemmatize(plural)
        #print plural, singular
        freq_sing = cf[singular]
        freq_plur = cf[plural]
        if freq_plur > freq_sing:
            words.append(plural)
            words.sort(key=lambda a: cf[a], reverse=True)
            print tag, "5 plural nouns more common:", words[:
                                                            5]  # last elements
    #print cf['years'], cf['yaks']

    print("Part b")
    # List the 5 most frequent tags in order of decreasing frequency. What do the tags represent?
    tags = [b[1] for (a, b) in nltk.ibigrams(brown_tags)]
    fd = nltk.FreqDist(tags)
    print fd.keys()[:10]
    print fd.values()[:10]
    i = 0
    for i in range(0, 5):
        print fd.keys()[i], fd.values()[i]
    #print " 'NN' is Noun, 'AT' prepositions, 'IN' prepositions, 'JJ' adverb"
    #for key in fd.keys():
    #if key != '.' or key != ',':
    #print key, fd.values()[counter]
    #counter = counter + 1
    #if counter > 5:
    #break

    print("The tags represent the decrease in frequency.")

    print("Part c")
    # Which three tags precede nouns tagged with the 'NN' tag most commonly? What do these three tags represent?
    # Report your findings separately for the following categories of Brown corpus: humor, romance, government.
    categories = ['humor', 'romance', 'government']

    for category in categories:

        #tagged_dict = findtags('NN', brown.tagged_words(categories=category))
        category_tags = brown.tagged_words(categories=category)
        tagList = [
            b[1] for (a, b) in nltk.ibigrams(category_tags)
            if b[1].startswith('N') and b[1] != 'N'
        ]
        #print category, tagList[:20]
        fd = nltk.FreqDist(tagList)
        print category, ', '.join(fd.keys()[:3])
Exemplo n.º 8
0
import nltk
from nltk.corpus import brown

# brown_learned_text = brown.words(categories='learned')
# brown_sorted = sorted(set(b for (a, b) in nltk.ibigrams(brown_learned_text) if a == 'often'))

# Tagged words of the 'learned' category, requesting the simplified tagset.
brown_learned_text_tagged = brown.tagged_words(categories='learned', simplify_tags=True)
## In every bigram (a, b) of (word, tag) pairs:
## b[0] is the word after a
## b[1] is part of speech of the word after a
## a[0] is, in this case, 'often'
## a[1] is, in this case, the part of speech of 'often'
tags = [b[1] for (a, b) in nltk.ibigrams(brown_learned_text_tagged) if a[0] == 'often']

freq_dist = nltk.FreqDist(tags)
# tabulate() prints the table itself and returns None, so capturing and
# printing its return value only added a stray "None" line to the output.
freq_dist.tabulate()
Exemplo n.º 9
0
def nextags(tagged, word):
    """Return the distinct tags of the tokens that immediately follow `word`.

    `tagged` is an iterable of (token, tag) pairs.  Adjacent pairs are
    produced by a small local generator instead of ``nltk.ibigrams`` (not
    available in NLTK 3), so the function works on any iterable and with any
    NLTK version.  The result is a duplicate-free list in no particular
    order.
    """
    def _neighbours(items):
        # Yield (left, right) for every pair of consecutive items.
        missing = object()
        left = missing
        for right in items:
            if left is not missing:
                yield left, right
            left = right

    return list(set(t2 for ((w1, t1), (w2, t2)) in _neighbours(tagged) if w1 == word))
Exemplo n.º 10
0
def ibgram_by_word(words, rel_word):
    """Return the distinct, lower-cased words that directly follow `rel_word`.

    `words` is any iterable of strings.  A flag records whether the previous
    item equalled `rel_word` (case-sensitive); when it did, the current item
    is lower-cased and collected.  This replaces the call to
    ``nltk.ibigrams``, which is not available in NLTK 3.  The result is a
    duplicate-free list in no particular order.
    """
    followers = set()
    take_next = False
    for token in words:
        if take_next:
            followers.add(token.lower())
        # Decide whether the *next* token should be collected.
        take_next = (token == rel_word)
    return list(followers)
Exemplo n.º 11
0
# Split a sequence into training and test portions at index `cut`
# (`text` and `cut` are expected to be defined beforehand).
training_data, test_data = text[:cut], text[cut:]
# Sort the words of a string by their length
words = 'I turned off the spedsdsd'.split()
wordlens = [(len(word), word) for word in words]
wordlens.sort()
' '.join(w for (_, w) in wordlens)
# Parameter checking
assert isinstance(word, basestring), "argument must be a string"
# Weird function bug on page 157: identity versus equality
b is c
b == c

# Chapter 5
nltk.pos_tag(text)
# Simplified POS tagset on page 183
sorted(set(b for (a, b) in nltk.ibigrams(brown_learned_text) if a == 'often'))
# `brown_lrnd_tagged` was misspelled with a capital I, and `nltk` as `ntlk`.
tags = [b[1] for (a, b) in nltk.ibigrams(brown_lrnd_tagged) if a[0] == 'often']
fd = nltk.FreqDist(tags)
# Create the dictionary before querying it; missing keys default to 'N'
pos = nltk.defaultdict(lambda: 'N')
pos.keys()
pos.values()
# Items ordered by their second field, largest first
sorted(counts.items(), key=itemgetter(1), reverse=True)
# Inverting a dictionary
pos = {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V'}
pos2 = dict((value, key) for (key, value) in pos.items())
# Dictionary methods summary is on page 198
bigram_tagger = nltk.BigramTagger(train_sents)
bigram_tagger.tag(brown_sents[2007])
# Tagging: default tagger, regex tagger, unigram tagger and n-gram tagger
# We can set backoff to combine taggers
Exemplo n.º 12
0
import nltk
from nltk.corpus import brown

# brown_learned_text = brown.words(categories='learned')
# brown_sorted = sorted(set(b for (a, b) in nltk.ibigrams(brown_learned_text) if a == 'often'))

# Tagged words of the 'learned' category, requesting the simplified tagset.
brown_learned_text_tagged = brown.tagged_words(categories='learned',
                                               simplify_tags=True)
## In every bigram (a, b) of (word, tag) pairs:
## b[0] is the word after a
## b[1] is part of speech of the word after a
## a[0] is, in this case, 'often'
## a[1] is, in this case, the part of speech of 'often'
tags = [
    b[1] for (a, b) in nltk.ibigrams(brown_learned_text_tagged)
    if a[0] == 'often'
]

freq_dist = nltk.FreqDist(tags)
# tabulate() does its own printing and returns None; printing that return
# value only produced an extra "None" line, so the table is emitted directly.
freq_dist.tabulate()
Exemplo n.º 13
0
def bgram_by_word(words, rel_word):
    """
    Return the distinct, lower-cased successors of `rel_word` in `words`.

    `words` is any iterable of strings.  Consecutive items are paired by a
    local generator rather than ``nltk.ibigrams`` (not available in NLTK 3).
    Matching against `rel_word` is case-sensitive; the collected successors
    are lower-cased.  The result is a duplicate-free list in no particular
    order.
    """
    def _adjacent_pairs(items):
        # Yield (left, right) for every pair of consecutive items.
        marker = object()
        left = marker
        for right in items:
            if left is not marker:
                yield left, right
            left = right

    return list(set(nxt.lower() for (cur, nxt) in _adjacent_pairs(words) if cur == rel_word))
Exemplo n.º 14
0
# Swap each (word, tag) pair so the tag becomes the condition and the word the
# counted sample.  `wsj` is defined elsewhere -- presumably a sequence of
# (word, tag) pairs; confirm against its definition.
cfd2 = nltk.ConditionalFreqDist((tag, word) for (word, tag) in wsj)
# Words recorded under the 'VN' condition; the value is not stored, so this
# line only shows something in an interactive session.
cfd2['VN'].keys()

# pg. 187

def findtags(tag_prefix, tagged_text):
    """Map every tag starting with `tag_prefix` to the first five words of its distribution.

    `tagged_text` is an iterable of (word, tag) pairs.  The words are counted
    per tag in a ConditionalFreqDist, and for each tag the first five keys of
    its frequency distribution are kept.
    NOTE(review): those are presumably the five most frequent words, which
    holds only if FreqDist.keys() is frequency-ordered -- confirm.
    """
    matching = ((tag, word) for (word, tag) in tagged_text if tag.startswith(tag_prefix))
    cfd = nltk.ConditionalFreqDist(matching)
    result = {}
    for tag in cfd.conditions():
        result[tag] = cfd[tag].keys()[:5]
    return result

# For every tag beginning with 'NN' in the 'news' category, print the tag and
# the words findtags kept for it, in sorted tag order.
tagdict = findtags('NN', nltk.corpus.brown.tagged_words(categories='news'))
for tag in sorted(tagdict):
	print tag, tagdict[tag] 

from nltk.corpus import brown
brown_learned_text = brown.words(categories='learned')
# Distinct words that follow 'often', sorted; the result is not stored.
sorted(set(b for (a, b) in nltk.ibigrams(brown_learned_text) if a == 'often'))

# pg. 188

# Same question on the tagged text: collect the tag (b[1]) of each token whose
# predecessor's word (a[0]) is 'often', then tabulate the tag counts.
brown_lrnd_tagged = brown.tagged_words(categories='learned', simplify_tags=True)
tags = [b[1] for (a, b) in nltk.ibigrams(brown_lrnd_tagged) if a[0] == 'often']
fd = nltk.FreqDist(tags)
fd.tabulate()

from nltk.corpus import brown
def process(sentence):
	for (w1, t1), (w2,t2), (w3,t3) in nltk.trigrams(sentence):
		if (t1.startswith('V') and t2 == 'TO' and t3.startswith('V')):
			print w1, w2, w3

for tagged_sent in brown.tagged_sents():
Exemplo n.º 15
0
# pg. 187


def findtags(tag_prefix, tagged_text):
    """Return a dict from each tag starting with `tag_prefix` to five of its words.

    The (word, tag) pairs of `tagged_text` whose tag has the given prefix are
    counted per tag; each tag is then mapped to the first five keys of its
    frequency distribution.
    NOTE(review): presumably the five most frequent words, assuming
    FreqDist.keys() is ordered by frequency -- confirm.
    """
    by_tag = nltk.ConditionalFreqDist(
        (tag, word)
        for (word, tag) in tagged_text
        if tag.startswith(tag_prefix)
    )
    return {tag: by_tag[tag].keys()[:5] for tag in by_tag.conditions()}


# For every tag beginning with 'NN' in the 'news' category, print the tag and
# the words findtags kept for it, in sorted tag order.
tagdict = findtags('NN', nltk.corpus.brown.tagged_words(categories='news'))
for tag in sorted(tagdict):
    print tag, tagdict[tag]

from nltk.corpus import brown
brown_learned_text = brown.words(categories='learned')
# Distinct words that follow 'often', sorted; the result is not stored.
sorted(set(b for (a, b) in nltk.ibigrams(brown_learned_text) if a == 'often'))

# pg. 188

# Same question on the tagged text: collect the tag (b[1]) of each token whose
# predecessor's word (a[0]) is 'often', then tabulate the tag counts.
brown_lrnd_tagged = brown.tagged_words(categories='learned',
                                       simplify_tags=True)
tags = [b[1] for (a, b) in nltk.ibigrams(brown_lrnd_tagged) if a[0] == 'often']
fd = nltk.FreqDist(tags)
fd.tabulate()

from nltk.corpus import brown


def process(sentence):
    for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence):
        if (t1.startswith('V') and t2 == 'TO' and t3.startswith('V')):
Exemplo n.º 16
0
# Count how often each item of `wsj` occurs.  `wsj` is defined elsewhere --
# presumably a list of (word, tag) pairs, given the lookup below; confirm.
word_tag_fd = nltk.FreqDist(wsj)

# Position of the first ("kicked", "VD") entry in `wsj`.
idx1 = wsj.index(("kicked", "VD"))


def findtags(tag_prefix, tagged_text):
    """Group words by tag for tags starting with `tag_prefix`; keep five words per tag.

    `tagged_text` is an iterable of (word, tag) pairs.  The return value maps
    each matching tag to the first five keys of that tag's frequency
    distribution.
    NOTE(review): presumably the five most frequent words, assuming
    FreqDist.keys() is ordered by frequency -- confirm.
    """
    cfd = nltk.ConditionalFreqDist(
        (tag, word) for (word, tag) in tagged_text if tag.startswith(tag_prefix)
    )
    tags_seen = cfd.conditions()
    leading_words = [cfd[tag].keys()[:5] for tag in tags_seen]
    return dict(zip(tags_seen, leading_words))


# For every tag beginning with "NN" in the "news" category, print the tag and
# the words findtags kept for it, in sorted tag order.
tagdict = findtags("NN", nltk.corpus.brown.tagged_words(categories="news"))
for tag in sorted(tagdict):
    print tag, tagdict[tag]

brown_learned_text = brown.words(categories="learned")
# Distinct words that follow "often", sorted; the result is not stored.
sorted(set(b for (a, b) in nltk.ibigrams(brown_learned_text) if a == "often"))

# Same question on the tagged text: collect the tag (b[1]) of each token whose
# predecessor's word (a[0]) is "often", then tabulate the tag counts.
brown_lrnd_tagged = brown.tagged_words(categories="learned", simplify_tags=True)
tags = [b[1] for (a, b) in nltk.ibigrams(brown_lrnd_tagged) if a[0] == "often"]
fd = nltk.FreqDist(tags)
fd.tabulate()


def process(sentence):
    for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence):
        if t1.startswith("V") and t2 == "TO" and t3.startswith("V"):
            print w1, w2, w3


# Run the verb / "TO" / verb search over every tagged sentence of the Brown corpus.
for tagged_sent in brown.tagged_sents():
    process(tagged_sent)
Exemplo n.º 17
0
def bgram_by_word(words, rel_word):
    """Return the distinct, lower-cased words found right after `rel_word`.

    `words` is any iterable of strings.  The loop keeps only the previous
    item, so no bigram helper is needed (``nltk.ibigrams`` is not available
    in NLTK 3).  Matching against `rel_word` is case-sensitive; collected
    words are lower-cased.  The result is a duplicate-free list in no
    particular order.
    """
    unset = object()  # marks "no previous item yet"
    previous = unset
    collected = set()
    for item in words:
        if previous is not unset and previous == rel_word:
            collected.add(item.lower())
        previous = item
    return list(collected)