Example No. 1
def chapter2_exercise4():
    # Read in the texts of the State of the Union addresses, using the state_union corpus reader. Count occurrences
    # of men, women, and people in each document. What has happened to the usage of these words over time?
    files = state_union.fileids()
    men = dict()
    women = dict()
    people = dict()
    for index, file in enumerate(files):
        words = sorted(state_union.words(fileids=[file]))
        men[file] = words.count("men")
        women[file] = words.count("women")
        people[file] = words.count("people")
        print(file[:4], men[file], women[file], people[file], end="      ")
        if index % 6 == 5:
            print()
    print("\nMEN")
    for file, men_c in men.items():
        print(file[:4], men_c)
    print("\nWOMEN")
    for file, women_c in women.items():
        print(file[:4], women_c)
    print("\nPERSON")
    for file, person_c in people.items():
        print(file[:4], person_c)
    print("men:", sum(men.values()))
    print("women:", sum(women.values()))
    print("people:", sum(people.values()))
Example No. 2
def state_union_men_stat():
    cfd = nltk.ConditionalFreqDist((target,year[:4])
        for year in state_union.fileids()
        for w in state_union.words(year)
        for target in ['men','women','people']
        if w.lower().startswith(target)
    )
    cfd.plot()
def all_documents():
    documents = []
    for document in state_union.fileids():
        text = ""
        for word in state_union.words(document):
            text = text + " " + word
        documents.append((text, extract_president(document)))
    return documents
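# The extract_president helper used above is not included in this snippet; a
# minimal sketch (an assumption about its behavior, based on state_union fileids
# such as '1946-Truman.txt' or '2001-GWBush-1.txt'):
def extract_president(fileid):
    # keep only the alphabetic part of the fileid, e.g. 'Truman' or 'GWBush'
    return [part for part in fileid.split('.')[0].split('-') if part.isalpha()][-1]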
def init():
    train = []
    test = []
    filenames = state_union.fileids()
    for i in range(0,len(filenames)):
        if (i % 2 == 0):
            train.append(filenames[i])
        else:
            test.append(filenames[i])    
    return (train, test)
Example No. 5
def main():
    nlp = spacy.load('en')
    text = ''
    for file in state_union.fileids():
        text += state_union.raw(file)
    result_dictionary = bigram_text(text, nlp)
    i = 0
    for occurrences, bigram in result_dictionary.items():
        print(bigram, occurrences)
        i = i +1
        if i > 100:
            break
Example No. 6
def ex4():
    from nltk.corpus import state_union
    tags = ["men", "women", "people"]
    #  for fileid in state_union.fileids():
    #    words = state_union.words(fileid)
    #    fdist = nltk.FreqDist([w.lower() for w in words])
    #    print fileid + ": ",
    #    for tag in tags:
    #      print tag + "=" + str(fdist[tag]) + " ",
    #    print
    cfd = nltk.ConditionalFreqDist((target, fileid[0:4])
                                   for fileid in state_union.fileids()
                                   for w in state_union.words(fileid)
                                   for target in tags if w.lower() == target)
    cfd.plot()
Example No. 7
def ex4():
  from nltk.corpus import state_union
  tags = ["men", "women", "people"]
#  for fileid in state_union.fileids():
#    words = state_union.words(fileid)
#    fdist = nltk.FreqDist([w.lower() for w in words])
#    print fileid + ": ",
#    for tag in tags:
#      print tag + "=" + str(fdist[tag]) + " ",
#    print
  cfd = nltk.ConditionalFreqDist(
    (target, fileid[0:4])
    for fileid in state_union.fileids()
    for w in state_union.words(fileid)
    for target in tags if w.lower() == target)
  cfd.plot()
import glob

from nltk import word_tokenize
from nltk.corpus import brown, reuters, state_union

path = dir_path + "/*.txt"
list_txt = glob.glob(path)
all_toks_china = list()

for txt in list_txt:
	file_y = open(txt).read()
	tokens = word_tokenize(file_y)
	all_toks_china = all_toks_china + tokens

brown_cats = brown.categories()
all_toks_brown = list()

reuters_cats = reuters.categories()
all_toks_reuters = list()

state_union_cats = state_union.fileids()
all_toks_state_union = list()

complete_toks = list()

linux_words = open("../ref/words").read().split('\n')
linux_set = set(linux_words)

for cat in brown_cats:
	words = brown.words(categories=cat)
	tokens = [w.lower() for w in words]
	all_toks_brown = all_toks_brown + tokens
	complete_toks = complete_toks + tokens

for cat in reuters_cats:
	words = reuters.words(categories=cat)
    return entity_names

def extract_entities(taggedText):
    '''
    Collect the named entities found in NE-chunked text
    :param taggedText: Parsed text (output of ne chunker) in tree form
    :return: list of entity names found across all trees
    '''
    entity_names = []
    for tree in taggedText:
        entity_names.extend(extract_entity_names(tree))
    return entity_names
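# The extract_entity_names helper referenced above is not part of this snippet; a
# minimal sketch, assuming the trees come from nltk.ne_chunk(..., binary=True) so
# that every named entity subtree carries the 'NE' label:
def extract_entity_names(tree):
    names = []
    if hasattr(tree, 'label') and tree.label() == 'NE':
        # an 'NE' subtree's leaves are (word, tag) pairs; join the words
        names.append(' '.join(leaf[0] for leaf in tree.leaves()))
    else:
        for child in tree:
            if hasattr(child, 'label'):
                names.extend(extract_entity_names(child))
    return names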


#get year and words for each file
extracted= [(state_union.raw(fileid), int(fileid[:4])) for fileid in state_union.fileids()]
docs, years = zip(*extracted)

#break text down into sentences, tokens
tokens = [nltk.word_tokenize(text) for text in docs]
sents = [nltk.sent_tokenize(text.replace("\n", " ")) for text in docs]
senttokens = [[nltk.word_tokenize(sent) for sent in entry] for entry in sents]

#get counts of unique words and plot over time
unique = [len(set(words)) for words in tokens]
plt.scatter(years, unique)
plt.show()

#get unique/total ratio
ratios = [(float(len(set(words)))/float(len(words))) for words in tokens]
plt.scatter(years, ratios)
Example No. 10
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908
>>> 
>>> # problem 1
>>> from nltk.corpus import state_union
>>> cfd = nltk.ConditionalFreqDist((text, word)
			       for text in state_union.fileids()
			       for word in state_union.words( fileids = text ))

>>> text = state_union.fileids()
>>> contexts = ['men', 'women', 'people']
>>> cfd.tabulate(conditions=text, samples=contexts)
                       men  women people 
    1945-Truman.txt      2      2     10 
    1946-Truman.txt     12      7     49 
    1947-Truman.txt      7      2     12 
    1948-Truman.txt      4      1     22 
    1949-Truman.txt      2      1     15 
    1950-Truman.txt      6      2     15 
    1951-Truman.txt      8      2      9 
1953-Eisenhower.txt      3      0     17 
1954-Eisenhower.txt      2      0     15 
Example No. 11
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 13 11:30:48 2018

@author: vpapg
"""

# Read in the texts of the State of the Union addresses, using the state_union
# corpus reader. Count occurrences of men, women, and people in each document.
# What has happened to the usage of these words over time?

from nltk.corpus import state_union
from nltk import ConditionalFreqDist, Text

text = state_union.words()
print("Men:", text.count("men"))
print("Women:", text.count("women"))

Text(text).dispersion_plot(["men", "women"])

cfd = ConditionalFreqDist((target, fileid) for fileid in state_union.fileids()
                          for w in state_union.words(fileid)
                          for target in ['men', 'women']
                          if w.lower().startswith(target))
cfd.plot()

# The word 'women' appears more often in recent documents, so its usage has
# increased over time
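# One way to check the trend noted above is to group the same counts by decade;
# a minimal sketch (the decade bucketing below is illustrative, not from the
# original snippet):
decade_cfd = ConditionalFreqDist(
    (target, fileid[:3] + "0s")   # '1945-Truman.txt' -> '1940s'
    for fileid in state_union.fileids()
    for w in state_union.words(fileid)
    for target in ['men', 'women']
    if w.lower().startswith(target))
decade_cfd.tabulate()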
#Importing NLTK and download : tokenizer, tagger, stopwords, corpus
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords, state_union
from nltk.tokenize import word_tokenize
from nltk.collocations import TrigramCollocationFinder
from nltk import pos_tag
nltk.download('state_union')
nltk.download('stopwords')
nltk.download('tagsets')


#build the list of raw documents
corpusList = []
for fileid in state_union.fileids():
    corpusList.append(state_union.raw(fileid))

#concatenate all raw texts within corpusList
allTexts = " ".join(corpusList)

#get english stop words
stop_words = set(stopwords.words('english'))

#tokenize
tokens = word_tokenize(allTexts)

#tag tokens
tagged = pos_tag(tokens)

#convert tagged tuple into dataframe for the ease of manipulation
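# The snippet stops here; a minimal sketch of the step the comment describes,
# assuming the (token, tag) pairs go into columns named 'word' and 'tag':
taggedDF = pd.DataFrame(tagged, columns=['word', 'tag'])
print(taggedDF.head())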
Example No. 13
import sys

from nltk.corpus import gutenberg, state_union
from sklearn.feature_extraction.text import TfidfVectorizer

print 'Number of arguments:', len(sys.argv), 'arguments.'
print 'Argument List:', str(sys.argv)

phrase = sys.argv[1]
corpora = sys.argv[2]
corpus = []

#Check corpus
if corpora == "gutenberg":
    titles = gutenberg.fileids()
    for title in titles:
        corpus.append(gutenberg.raw(title))

elif corpora == "state_union":
    titles = state_union.fileids()
    for title in titles:
        corpus.append(state_union.raw(title))
else:
    print "Choose from gutenberg or state_union"
    exit(0)

vectorizer = TfidfVectorizer(min_df=1, stop_words="english")
X = vectorizer.fit_transform(corpus)

XA = X.toarray()
# print vectorizer.vocabulary_
print 'The dimensions of the TF.IDF matrix: '
print XA.shape

print 'TF.IDF computation for the ' + corpora + ' corpus is completed\n'
Example No. 14
def get_text():
    text = ''
    for file in state_union.fileids():
        text += state_union.raw(file)
    return text
Example No. 15
File: sou.py  Project: davidar/polya
import nltk
from nltk.corpus import state_union

test  = [fid for fid in state_union.fileids() if 'Johnson' in fid]
train = [fid for fid in state_union.fileids() if fid not in test]

print 'TEST:', ', '.join(test)

f = open('sou.test.txt','w')
for w in state_union.words(test): print>>f, w
f.close()

f = open('sou.norm.test.txt','w')
for s in state_union.sents(test):
	s = ' '.join(s).lower()
	s = s.replace("' s ","'s ").replace(' .','.')
	s = ' '.join(nltk.word_tokenize(s))
	print>>f, s
f.close()

print 'TRAIN:', ', '.join(train)

f = open('sou.train.txt','w')
for w in state_union.words(train): print>>f, w
f.close()

f = open('sou.norm.train.txt','w')
for s in state_union.sents(train):
	s = ' '.join(s).lower()
	s = s.replace("' s ","'s ").replace(' .','.')
	s = ' '.join(nltk.word_tokenize(s))
	print>>f, s
f.close()
23.	RP	Particle
24.	SYM	Symbol
25.	TO	to
26.	UH	Interjection
27.	VB	Verb, base form
28.	VBD	Verb, past tense
29.	VBG	Verb, gerund or present participle
30.	VBN	Verb, past participle
31.	VBP	Verb, non-3rd person singular present
32.	VBZ	Verb, 3rd person singular present
33.	WDT	Wh-determiner
34.	WP	Wh-pronoun
35.	WP$	Possessive wh-pronoun
36.	WRB	Wh-adverb
'''
state_union.fileids()

text = state_union.raw('2006-GWBush.txt')

# train a custom Punkt sentence tokenizer on the 2005 speech, then apply it to the 2006 speech
train_text = state_union.raw('2005-GWBush.txt')
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized1 = custom_sent_tokenizer.tokenize(text)
tagged1 = []

# for comparison, the same text split with NLTK's default sentence tokenizer
tokenized2 = sent_tokenize(text)
tagged2 = []

for sent in tokenized1:
    words = word_tokenize(sent)
    tagged = nltk.pos_tag(words)
    tagged1.append(tagged)
Example No. 17
# -*- coding: utf-8 -*-
import matplotlib

matplotlib.use('TkAgg')
import nltk
'''
☼ Read in the texts of the State of the Union addresses, using the
state_union corpus reader.  Count occurrences of men, women,
and people in each document.  What has happened to the usage of these
words over time?
'''

from nltk.corpus import state_union
#print state_union.fileids()
targets = ['men', 'women', 'people']
pair = [(target, fileid[:4]) for fileid in state_union.fileids()
        for word in state_union.words(fileid) for target in targets
        if word.lower() == target]
print pair
cfd = nltk.ConditionalFreqDist(pair)
cfd.plot()
Example No. 18
#Assignment:    03
#Due Date:      January 31st, 2018

import nltk

#Number 1 (2.4) in HW3

print('################ Number 1 ################')

#Generating list for each of the words through time
from nltk.corpus import state_union as su
total = []
men = []
women = []
people = []
for s in su.fileids():
    length_women = 0
    length_men = 0
    length_people = 0
    length = 0
    for w in su.words(s):
        if w.lower() == 'women':
            length_women += 1
            length += 1
        elif w.lower() == 'men':
            length_men += 1
            length += 1
        elif w.lower() == 'people':
            length_people += 1
            length += 1
    men.append(length_men)
    women.append(length_women)
    people.append(length_people)
    total.append(length)
Example No. 19
from nltk.corpus import inaugural, state_union
from nltk.lm import Vocabulary

president_vocabulary = {}

for president in inaugural.fileids():
    vocab = Vocabulary(inaugural.words(president), unk_cutoff=2)
    president_vocabulary[president] = len(vocab)

inverse_vocabulary = [(value, key)
                      for key, value in president_vocabulary.items()]
print(max(inverse_vocabulary)[1],
      max(inverse_vocabulary)[0])  #richest vocabulary for Harrison in 1841
print(min(inverse_vocabulary)[1],
      min(inverse_vocabulary)[0])  #poorest vocabulary for Washington in 1793

president_vocabulary_state_union = {}

for president in state_union.fileids():
    vocab = Vocabulary(state_union.words(president), unk_cutoff=2)
    president_vocabulary_state_union[president] = len(vocab)

inverse_vocabulary_state_union = [
    (value, key) for key, value in president_vocabulary_state_union.items()
]
print(max(inverse_vocabulary_state_union)[1],
      max(inverse_vocabulary_state_union)[0])  #richest vocabulary for Truman in 1946
print(min(inverse_vocabulary_state_union)[1],
      min(inverse_vocabulary_state_union)[0])  #poorest vocabulary for Johnson in 1963
Example No. 20
def state_union_ts(word_list):
	cfd = nltk.ConditionalFreqDist((word.lower(), fileid[:4]) 
		for fileid in state_union.fileids()
		for word in state_union.words(fileid) if word.lower() in word_list)
	return cfd
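# Usage sketch (not part of the original snippet); assumes
# "import nltk" and "from nltk.corpus import state_union" at module level:
cfd = state_union_ts(['men', 'women', 'people'])
cfd.plot()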
Example No. 21
from nltk.corpus import gutenberg
gutenberg.fileids()
gutenberg.words('austen-emma.txt')
# word tokens
len([w.lower() for w in gutenberg.words('austen-emma.txt') if w.isalpha()])
#words
len(list(set([w.lower() for w in gutenberg.words('austen-emma.txt') if w.isalpha()])))

#3
from nltk.corpus import brown
brown.categories()
brown.words(categories='science_fiction')

#4
from nltk.corpus import state_union
state_union.fileids()
words=['men', 'women', 'people']
from nltk import ConditionalFreqDist
cfd = ConditionalFreqDist((word, fileid)
                          for fileid in state_union.fileids()
                          for word in state_union.words(fileid))
cfd.plot(conditions=words)

#5
word='life'
from nltk.corpus import wordnet as wn
for syn in wn.synsets(word): 
    for mer in syn.part_meronyms():
        print("Synset '{2}':\n\t{0}\n\npart meronym '{1}':\n\t{3} ".format(syn.definition(),
              mer.lemma_names()[0],syn.lemma_names()[0],mer.definition()))
        
    for mer in syn.member_meronyms():
        print("Synset '{2}':\n\t{0}\n\nmember meronym '{1}':\n\t{3} ".format(syn.definition(),
Example No. 22
def question1():
    a = nltk.ConditionalFreqDist((x, id[:4]) for id in state_union.fileids()
                                 for w in state_union.words(id)
                                 for x in ['men', 'women', 'people']
                                 if w.lower().startswith(x))
    a.plot()
Example No. 23
#pres_avg_length = {}

def getPresFromSpeech(speech_id):
    # 2001-GWBush-1.txt
    words = speech_id.split('.')

    if len(words) > 0:
        single_words = words[0].split('-')
        if len(single_words) > 0:
            for word in single_words:
                if word.isalpha():
                    return word
    return ""

all_words = {}
for speech_id in state_union.fileids():
    text = state_union.raw(speech_id)
    words = word_tokenize(text)
    for word in words:
        if word not in all_words.keys():
            all_words[word] = 1
        else:
            all_words[word] += 1

sent_len = []
word_len = []

pres_list = []
pres_sent_total = {}
pres_word_total = {}
pres_char_total = {}
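# The aggregation that fills the dictionaries above is cut off in the original;
# a minimal sketch of what they appear intended for (an assumption), using the
# getPresFromSpeech helper defined earlier:
from nltk.tokenize import sent_tokenize

for speech_id in state_union.fileids():
    pres = getPresFromSpeech(speech_id)
    if pres not in pres_list:
        pres_list.append(pres)
    raw = state_union.raw(speech_id)
    sents = sent_tokenize(raw)
    tokens = word_tokenize(raw)
    pres_sent_total[pres] = pres_sent_total.get(pres, 0) + len(sents)
    pres_word_total[pres] = pres_word_total.get(pres, 0) + len(tokens)
    pres_char_total[pres] = pres_char_total.get(pres, 0) + sum(len(t) for t in tokens)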
Example No. 24
#4

import nltk
from nltk.corpus import state_union

for speech in state_union.fileids():
    words = state_union.words(fileids=[speech])
    fdist = nltk.FreqDist(w.lower() for w in words)
    print(speech)
    print("she: ", fdist["she"], end='\n')
    print("he: ", fdist["he"], end='\n')
    print("people: ", fdist["people"], end='\n')

Example No. 25
def tabulate(cfdist, words, categories):
    print('%-16s' % 'Category', end=' ')
    for word in words:
        print('%6s' % word, end=' ')
    print()
    for category in categories:
        print('%-16s' % category, end=' ')
        for word in words:
            print('%6d' % cfdist[category][word], end=' ')
        print()
        
        
cfd = nltk.ConditionalFreqDist(
    (fileid, word)
    for fileid in state_union.fileids()
    for word in state_union.words(fileid))


# In[47]:

tabulate(cfd, ['men', 'women', 'people'], state_union.fileids())


# In[55]:

#5. Investigate the holonym-meronym relations for some nouns. Remember that there are three kinds of holonym-meronym relation, so you need to use: member_meronyms(), part_meronyms(),  substance_meronyms(), member_holonyms(), part_holonyms(), and substance_holonyms().

wordnet.synset('book.n.01').part_holonyms()
wordnet.synset('book.n.01').substance_holonyms()
wordnet.synset('book.n.01').member_holonyms()
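# For completeness, the meronym direction of the same relation (the 'tree'
# synset here is just an illustrative choice, not from the original snippet):
wordnet.synset('tree.n.01').part_meronyms()
wordnet.synset('tree.n.01').substance_meronyms()
wordnet.synset('tree.n.01').member_meronyms()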
Example No. 26
print(tempPhrase[-4:])
print(sorted(w.lower()
             for w in set(tempPhrase)))  #lowercase before sorting, since a plain sort puts capitalized words first

#2 Use the corpus module to explore austen-persuasion.txt. How many word tokens does this book have? How many word types?
austen_persuasion = gutenberg.words('austen-persuasion.txt')
print("Number of word tokens = ", len(austen_persuasion))
print("Number of word types = ", len(set(austen_persuasion)))

#3 Use the Brown corpus reader nltk.corpus.brown.words() or the Web text corpus reader nltk.corpus.webtext.words() to access some sample text in two different genres.
print(brown.categories())
news_data = brown.words(categories='news')
religion_data = brown.words(categories='religion')

#4 Read in the texts of the State of the Union addresses, using the state_union corpus reader. Count occurrences of men, women, and people in each document. What has happened to the usage of these words over time?
print(state_union.fileids())
#cfd over the State of the Union addresses showing counts of 'men' and 'women' per year
cfd = nltk.ConditionalFreqDist((target, fileid[:4])
                               for fileid in state_union.fileids()
                               for w in state_union.words(fileid)
                               for target in ['men', 'women']
                               if w.lower().startswith(target))
#cfd.plot()

#5 Investigate the holonym-meronym relations for some nouns. Remember that there are three kinds of holonym-meronym relation, so you need to use: member_meronyms(), part_meronyms(), substance_meronyms(), member_holonyms(), part_holonyms(), and substance_holonyms().
house = wn.synsets('house')
print(house)
house = wn.synset('house.n.01')
print(house.lemma_names())
print(house.definition())
print(house.examples())
Example No. 27
for word in words:
    if word[:2] == "sh":
        print(word, end=" ")
print("\n")

# b
print("Words longer than 4 characters:")
for word in words:
    if len(word) > 4:
        print(word, end=" ")
print("\n")

# Exercise 2

# a
files = list(state_union.fileids())
terms = ["men", "women", "people"]
statistics = nltk.ConditionalFreqDist((file, word)
                                      for file in state_union.fileids()
                                      for word in state_union.words(file)
                                      for term in terms
                                      if word.lower() == term)
statistics.tabulate(conditions=files, samples=terms)

# b
years_raw = sorted(list(set([int(year[:4])
                             for year in state_union.fileids()])))
years = [str(year) for year in years_raw]
year_statistics = nltk.ConditionalFreqDist(
    (word.lower(), fileid[:4]) for fileid in state_union.fileids()
    for word in state_union.words(fileid) for term in terms
    if word.lower() == term)
Example No. 28
import nltk
from nltk.corpus import state_union

# Plot usage of words over time
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in state_union.fileids()
    for w in state_union.words(fileid)
    for target in ['men', 'women', 'people']
    if w.lower().startswith(target))
cfd.plot()
Example No. 29
# In[1]:

# NLTK imports
import nltk
from nltk.corpus import webtext
from nltk.corpus import state_union
import numpy as np

nltk.download('state_union')
nltk.download('stopwords')
nltk.download('punkt')

print("\n\n")
print('The fields are: ')
print(state_union.fileids())

# # TF.IDF Representation
# Computing the TF.IDF value of each word of each text in the corpus

# In[ ]:


# Compute the TF value of each word from a bag of words (bow)
def computeTF(wordDict, bow):
    tfDict = {}
    bowCount = len(bow)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bowCount)
    return tfDict
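# The original snippet stops after computeTF; a minimal sketch of the matching
# IDF and TF.IDF steps (names below are illustrative, not from the original):
import math

def computeIDF(docList):
    # docList: one word->count dict per document
    idfDict = dict.fromkeys(set().union(*docList), 0)
    for doc in docList:
        for word, count in doc.items():
            if count > 0:
                idfDict[word] += 1
    n = len(docList)
    return {word: math.log(n / float(df)) for word, df in idfDict.items()}

def computeTFIDF(tfDict, idfDict):
    return {word: tf * idfDict[word] for word, tf in tfDict.items()}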
Example No. 30
# Author: Jack Keane
# Date: 3/25/20
# Description: Convert state of the union speeches into csv

# Libraries
from nltk.corpus import state_union
from nltk.tokenize import sent_tokenize
import string

# Code
speeches = state_union.fileids()
f = open("../acronym_data/state_union_data.csv", "w")

for s in speeches:
    speech = state_union.raw(s)
    sentences = sent_tokenize(speech.lower().replace("\n", " "))
    for sen in sentences:
        f.write(
            sen.translate(str.maketrans('', '', string.punctuation)) + "\n")

f.close()
Example No. 31
# read texts from the State of the Union addresses using the state_union module
# determine the frequency of use of the words "men", "women", "people" in each document
import nltk
from nltk.corpus import state_union

state_files = state_union.fileids()
words = ['men', 'women', 'people']

cfd = nltk.ConditionalFreqDist(
    (text, word) for text in state_files for word in state_union.words(text))
cfd.tabulate(conditions=state_files, samples=words)

cfd = nltk.ConditionalFreqDist((target, fileid[:4]) for fileid in state_files
                               for word in state_union.words(fileid)
                               for target in words
                               if word.lower().startswith(target))
cfd.plot()

# analyze the frequency chart of modal verbs for different genres
# find other word use classes that also differ in different genres
import nltk
import nltk.corpus

corpus_name = nltk.corpus.brown
files = corpus_name.fileids()
modals = ['can', 'could', 'may', 'might', 'must', 'will']
commons = ['the', 'be', 'to', 'of', 'and', 'in', 'that']
adjectives = ['good', 'new', 'first', 'last', 'long']
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']

cfd = nltk.ConditionalFreqDist((genre, word)
                               for genre in genres
                               for word in corpus_name.words(categories=genre))
cfd.tabulate(conditions=genres, samples=modals)
Example No. 32
import nltk
from nltk.corpus import state_union
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from gensim import corpora, models

_files_all_speechs = state_union.fileids()
all_raw_speeches = []
for _file_ in _files_all_speechs:
    all_raw_speeches.append(state_union.raw(_file_))

#print('Number of Speeches:', len(all_raw_speeches))
all_categories = [x.split('-')[1].split('.')[0] for x in _files_all_speechs]
#print(all_categories)

stopwords = nltk.corpus.stopwords
eng_stopwords = stopwords.words('english')
wordnet_lemmatizer = WordNetLemmatizer()


def basic_preprocessing(text):
    text = text.lower()  #lowering
    text = re.sub(
        r'\[.*?\]', '', text
    )  #removing all instances of citation brackets found in wiki articles
    text = word_tokenize(text)
    text = [word for word in text
            if word not in eng_stopwords]  #removing stop words
    text = [word for word in text
            if len(word) > 1]  #removing single character tokens
Example No. 33
len(persuasion)
len(set(persuasion))

#3.
from nltk.corpus import brown
brown.fileids()
brown.categories()
brown.words(categories='adventure')

#4.
from nltk.corpus import state_union

text = state_union.words()
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in state_union.fileids()
    for w in state_union.words(fileid)
    for target in ['men','women']
    if w.lower().startswith(target))
cfd.plot()

#5.
wn.synset('fish.n.01').part_meronyms()
wn.synset('fish.n.01').member_meronyms()
wn.synset('leaf.n.01').substance_meronyms()
wn.synset('fish.n.01').member_holonyms()
wn.synset('leaf.n.01').substance_holonyms()

#6. swadesh cannot translate across three languages at once, so chain the translations with a loop
from nltk.corpus import swadesh
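# A minimal sketch of the chained-translation idea described above (the original
# snippet is cut off here; the language choices are illustrative):
fr2en = dict(swadesh.entries(['fr', 'en']))
en2de = dict(swadesh.entries(['en', 'de']))
fr2de = {fr: en2de[en] for fr, en in fr2en.items() if en in en2de}
print(list(fr2de.items())[:5])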
Example No. 34
    def __init__(self):
        self.number_id = 40
        self.source_id = "state_union"
        self.titles = [name for name in state_union.fileids()]
        self.data = [state_union.raw(name) for name in self.titles]
Example No. 36
    #Gives an accuracy of 88% on test data
    return [clf_bern, clf_tree, vectorizer]


[clf_bern, clf_tree, vectorizer] = train_questions()


#this method classifies to which category a new question belongs
def classify_question(question, vectorizer=vectorizer, clf=clf_bern):
    b = vectorizer.transform([question])
    b = b.toarray()
    return clf.predict(b)[0]


#loading the data set of different minutes of the meets
lisa = state_union.fileids()
dataset = []
for ele in lisa:
    dataset.append(state_union.raw(ele))
for i in range(len(dataset)):
    dataset[i] = dataset[i].encode('utf-8')


#this function finds the most important words in the nth meet.
def important_words(n, dataset=dataset):
    data = dataset
    #removing punctuations and \n from the data
    for i in range(len(data)):
        data[i] = data[i].translate(None, string.punctuation)
        data[i] = data[i].translate(None, "\n")
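# The important_words function is cut off here; a minimal sketch of one way to
# finish the idea (an assumption, not the original implementation), ranking the
# words of document n by TF-IDF score:
from sklearn.feature_extraction.text import TfidfVectorizer

def important_words_sketch(n, dataset=dataset, top_k=10):
    vec = TfidfVectorizer(stop_words='english')
    tfidf = vec.fit_transform(dataset)
    row = tfidf[n].toarray().ravel()
    terms = vec.get_feature_names_out()
    return [terms[i] for i in row.argsort()[::-1][:top_k]]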