"""
tokenizeSummary.py
Created by Aaron Erlich on 2013-02-01.
"""
import sys
import os
import pickle
import nltk
import numpy
import scipy
import matplotlib
import textmining
import pandas
import string
path = "" #insert your path here
os.chdir(path)
#os.chdir('/Volumes/Optibay-1TB/Python/scrapingCode/immigCongRec/')
pkl_file = open('summary.pkl', 'rb')
billSummaries = pickle.load(pkl_file)
#There are lots of different ways to read in and deal with our texts and interact with the NLTK package
#We will cover several different ways
#normalize text
#lemmatize
#stem
#how do i convert to an nltk text?
#you can load the built-in, machine-learned (Punkt) sentence tokenizer for English
sent_tokenize = nltk.data.load('tokenizers/punkt/english.pickle')
#####TOKENIZE###############
#tokenize by sentence - I will maintain the document structure because maybe we want it for something
bill_sents = []
for bill in billSummaries:
    bill_sents.append(sent_tokenize.tokenize(bill))
bill_sents[1][-1]
#tokenize by word
from nltk.tokenize import word_tokenize #there are different word tokenizers -- this one keeps the period at the end of the sentence with the word
from nltk.tokenize import wordpunct_tokenize #there are different word tokenizers -- this one doesn't
#see http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
bill_words = [[word_tokenize(sent) for sent in bill] for bill in bill_sents]
bill_words[1][-1][:] # all words in last sentence of second bill
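#a quick comparison (added sketch, with a made-up example sentence): run both
#tokenizers on the same string to see how each handles punctuation
example_sent = "The committee amends Sec. 287(g) of the Act." #hypothetical example
word_tokenize(example_sent)
wordpunct_tokenize(example_sent)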
#LEMMATIZE/NORMALIZE##########
bill_words2 = [[[word.lower() for word in sent] for sent in bill] for bill in bill_words] #make it lowercase and keep structure
bill_words2[1][-1][:] #now lower case
from nltk.corpus import stopwords
print(stopwords.words('english')) # look at the stopwords
#we now remove stopwords and any word shorter than 4 characters
bill_words3 = [[[word for word in sent if word not in stopwords.words('english') and len(word) > 3] for sent in bill] for bill in bill_words2]
bill_words3[1][-1][:] #now lower-cased, with stopwords and short words removed
#now we stem
Pstemmer = nltk.stem.PorterStemmer()
bill_words4 = [[[Pstemmer.stem(word) for word in sent] for sent in bill] for bill in bill_words3]
bill_words4[1]
bill_words4[1][-1][:]
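#the block above stems; a minimal sketch of lemmatizing instead (not done in the
#original script -- assumes the wordnet corpus has been installed via nltk.download())
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
bill_lemmas = [[[lemmatizer.lemmatize(word) for word in sent] for sent in bill] for bill in bill_words3]
bill_lemmas[1][-1][:] #compare with the stemmed version above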
#we have done this to maintain the structure -- but let's say we just want to look at all the bills together
allBills= ' '.join(billSummaries)
type(allBills)
allWords = wordpunct_tokenize(allBills)
allWords = [word.lower() for word in allWords]
#we could also recognize parts of speech, proper names and many other things here if we wanted
allWords2 = filter(lambda word: word not in stopwords.words('english') and len(word) >3, allWords)
allWords3 = [Pstemmer.stem(word) for word in allWords2]
sorted(set(allWords3))
billsVocab = sorted(set(allWords3))
len(sorted(set(allWords3))) #number of unique stems (punctuation and very short tokens already removed)
len(allWords)
allWords3.count("withheld")
allWords3.count("withdrawn")
allWords3.count("worthless")
allWords3.count("solut")
from __future__ import division #note: if running this file as a script rather than interactively, __future__ imports must appear at the top
len(allWords3)/len(set(allWords3)) #lexical diversity -- can compare
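#a small helper (added sketch) so the same ratio can be computed for any token list
#and compared across corpora, as suggested above
def lexical_diversity(tokens):
    return len(tokens) / len(set(tokens))
lexical_diversity(allWords3)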
fd = nltk.FreqDist(allWords3) #create an nltk frequency distribution from our list
# a frequency distribution is essentially a special dictionary with the words as keys and their counts as values
type(fd)
fd.keys() #shows you all the words
fd.values() #shows you all the values
fd.plot(50, cumulative=True) #show the 50 most common words on a graph
fd.plot(50, cumulative=False)
fd.tabulate(50) # I really should have removed the word title but since it's in all bills it's not really going to matter
fd.tabulate(10,20)
fd.hapaxes() #words (or what linguists call samples) that only occur once
fd.Nr(20) #number of words occurring exactly 20 times
fd.Nr(1)
fd.N() == len(allWords3)#same as length for list of all words
fd.B() #same as len(unique vocab)
fd.samples() #returns unique vocab
fd["worthless"] #returns the countt
fd.max()
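#FreqDist can also report relative frequencies directly -- fd.freq() divides a
#word's count by the total number of tokens (a small addition, same fd as above)
fd.freq("worthless") #proportion of all tokens that are "worthless"
fd.freq(fd.max()) #proportion taken up by the most common stem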
counter = 0
#let's page through the 200 most frequent stems and their counts, 10 at a time
for i in range(20): #loop 20 times
    fd.tabulate(counter, counter + 10)
    raw_input("Hit Enter")
    counter += 10
#############################################################
#or we might want to create an nltk corpus
#############################################################
from nltk.corpus import PlaintextCorpusReader
#http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.plaintext.PlaintextCorpusReader-class.html
summaries_root = "/Volumes/Optibay-1TB/Python/scrapingCode/billSummaries/" #directory where your text files are
billsCorpora = PlaintextCorpusReader(summaries_root, r'h[1-9].*\.txt') #read it into the corpora
dir(billsCorpora) # see what methods can be used on the corpora
rawBills = billsCorpora.raw() #all the raw text
billsCorpora.fileids() #show the files in the corpora
billsCorpora.fileids()[0] #show the first fileid
billsCorpora.raw()[1:30]
len(billsCorpora.fileids()) #show how many files
len(billsCorpora.sents())
#billsCorpora.fileids('h1.txt')
billsCorpora.raw() #the whole corpora
billsCorpora.sents() #all of the sentences in the corpora
billsCorpora.sents()[1]
billsCorpora.words() #all of the words in all of the corpora
billsCorpora.words('h1.txt') #all of the words in one file of the corpus
twoBillsWords = billsCorpora.words(['h1.txt', 'h1447.txt'])
twoBillsWords[500:525] #note that punctuation is included and the bills have not been lemmatized
billWords = billsCorpora.words()
#taken from NLTK blog -- and then modified. Create a frequency distribution from a corpus
def count_stems(corpus):
    stemmer = nltk.stem.PorterStemmer()
    stopset = set(nltk.corpus.stopwords.words('english')) | set(string.punctuation)
    fd = nltk.probability.FreqDist()
    for word in corpus.words():
        w = word.lower()
        if w in stopset: continue
        fd.inc(stemmer.stem(w))
    return fd
#a very simple function to get dictionary counts
def make_count(corpus, wordSet):
    billCounts = []
    for fileName in corpus.fileids():
        #print(fileName)
        counter = 0
        docWords = corpus.words(fileName)
        for word in docWords:
            #print(word)
            w = word.lower()
            if w in wordSet:
                #the fd.inc approach would also work here and is probably better -- just showing another option
                print(w + " is in " + fileName)
                counter += 1
        billCounts.append(counter)
    return billCounts
from nltk.corpus.reader import WordListCorpusReader
path = "/Volumes/Optibay-1TB/Dropbox/Content_Wilker/Gonzalez_Project/Gonzalez_Keywords"
reader = WordListCorpusReader(path, ['crime.txt']) #make an nltk word list
crime = reader.words()
crime = [word.lower().strip() for word in crime]
crimeSet = set([w.lower() for w in crime])
crimeCount = make_count(billsCorpora, crimeSet)
fd = count_stems(billsCorpora)
#you could use the csv module's writer methods, or this approach, which is a bit hacky
mywordlist = numpy.asarray([billsCorpora.fileids(), crimeCount])
mywordlist[0][1] #name
mywordlist[1][1] #count
wordlistdf = pandas.DataFrame(mywordlist.transpose(), columns=["fileName", "count"])
wordlistdf['fileName'] #access a column
wordlistdf.fileName[5] #another way to access a column - now the fifth element
wordlistdf[:5] #access rows with slicing
#change to your directory
wordlistdf.to_csv('/Volumes/Optibay-1TB/Python/scrapingCode/immigCongRec/crimeCounts.csv', index=True, header=True, sep=',')
a = numpy.asarray([fd.keys(), fd.values()])
df = pandas.DataFrame(a.transpose(), columns=["word", "count"])
df.to_csv('/Volumes/Optibay-1TB/Python/scrapingCode/immigCongRec/billFrequencies.csv', index=True, header=True, sep=',')
#lots of ways to make a term document matrix -- let's choose the simplest
#often there are cutoffs for how often the word has to appear
bills5 = [[' '.join(sentence) for sentence in bill] for bill in bill_words4]
#http://text-processing.com/demo/tag/
tdm = textmining.TermDocumentMatrix()
for bill in bills5:
    tdm.add_doc(bill[0]) #note: this adds only the first sentence of each bill; use ' '.join(bill) to add the full summary
tdm.write_csv('CongMatrix.csv', cutoff=1)
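#a sketch of loading the matrix back for analysis with pandas (an assumption about
#the CSV layout: terms as the header row, one document per row)
tdmDF = pandas.read_csv('CongMatrix.csv')
tdmDF.shape #documents by terms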
#see Andrew Hall's page at Harvard for a slightly more complex implementation
# http://www.andrewbenjaminhall.com/wp-content/uploads/2011/12/Gen_TDM.txt