preprocess_data.py
import gc
import re
import copy
import json
import pickle
import numpy as np
import pandas as pd
import scipy.sparse
from collections import defaultdict
from scipy.sparse import csr_matrix, load_npz
from sklearn.feature_extraction.text import TfidfVectorizer
"""NOTE: this file contains a lot of random stuff from attempts to vectorize
the data. At present, only the vectorize function (immedietely below) and vectorize reddit are used.
There contains code to create a custom tfidf matrix but it is slow. There also
exists code to tfidf the all the news dataset."""
def vectorize():
"""
The proper method to run to vectorize the reuters headlines.
Handles both data cleaning and creating and saving the data neccesary
to create the tfidf matrix a vectorize future queries.
"""
    news = pd.read_csv('data/reu_identifiers.csv', names=['date', 'id', 'title'], usecols=['id', 'title'])
    news = news[news['title'].notnull()]
    news = news.drop_duplicates(subset='title')
    news = news[~news.title.str.contains('UPDATE')]
    news = news[~news.title.str.contains('CORRECTED')]
    news = news[~news.title.str.contains('CORRECTION')]
    news.index = np.arange(len(news))
    gc.collect()
    # TfidfVectorizer needs a floating point dtype; float32 keeps memory low.
    vectorizer = TfidfVectorizer(min_df=3, stop_words='english', dtype=np.float32)
    gc.collect()
    X = vectorizer.fit_transform(news['title']).astype(np.float16)
gc.collect()
matrix_ix_to_id = {}
for ix, row in news.iterrows():
matrix_ix_to_id[ix] = row['id']
gc.collect()
scipy.sparse.save_npz('tfidf_mat.npz', X)
gc.collect()
#pickle.dump(id_to_vec, open('reu_tfidf.p', 'wb'))
with open('matrix_ix_to_id.json', 'w') as f:
json.dump(matrix_ix_to_id, f)
gc.collect()
with open('vocab_to_ix.json', 'w') as f:
json.dump(vectorizer.vocabulary_, f)
gc.collect()
np.save('idf_vals.npy', vectorizer.idf_)
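# A minimal sketch (not part of the original pipeline) of how a future query could be
# vectorized with the artifacts saved by vectorize(): vocab_to_ix.json holds the
# vocabulary mapping and idf_vals.npy the idf weights. The helper name and the
# regex tokenization below are assumptions, not code from the original file.
def vectorize_query(query):
    with open('vocab_to_ix.json', 'r') as f:
        vocab_to_ix = json.load(f)
    idf_vals = np.load('idf_vals.npy')
    vec = np.zeros(len(vocab_to_ix), dtype=np.float32)
    for tok in re.findall(r'\b\w\w+\b', query.lower()):  # sklearn's default token pattern
        ix = vocab_to_ix.get(tok)
        if ix is not None:
            vec[ix] += 1.0
    return vec * idf_vals  # raw counts scaled by idf; no normalization applied here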
def vectorize_reddit():
"""
Method used to vectorize reddit data. Creates tfidf matrix,
and saves useful information corresponding to the tfidf matrix.
"""
data = pd.read_csv('data/reddit_data.csv', names=['date', 'score', 'number of comments', 'title', 'url'], skiprows=1)
vectorizer = TfidfVectorizer(min_df=3, stop_words='english')
titles = data['title']
    reddit_tfidf = vectorizer.fit_transform(titles)
scipy.sparse.save_npz('reddit_tfidf_mat.npz', reddit_tfidf)
mat_idx_to_tup = {}
for idx, row in data.iterrows():
tup = (row['date'], row['title'], row['score'], row['number of comments'], row['url'])
mat_idx_to_tup[idx] = tup
    with open('reddit_ix_to_tup.json', 'w') as f:
        json.dump(mat_idx_to_tup, f)
    with open('reddit_vocab_to_ix.json', 'w') as f:
        json.dump(vectorizer.vocabulary_, f)
np.save('reddit_idf_vals.npy', vectorizer.idf_)
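# A minimal sketch (hypothetical helper, not in the original file) of how the saved
# Reddit artifacts might be used to rank titles for an already-vectorized query.
# `query_vec` is assumed to be a dense 1-D array in the same vocabulary order as
# reddit_vocab_to_ix.json, e.g. built the same way as vectorize_query() above.
def rank_reddit_titles(query_vec, top_k=5):
    reddit_mat = load_npz('reddit_tfidf_mat.npz')   # docs x vocab
    with open('reddit_ix_to_tup.json', 'r') as f:
        ix_to_tup = json.load(f)                    # JSON keys are strings
    scores = reddit_mat.dot(query_vec)              # unnormalized dot-product scores
    best = np.argsort(scores)[::-1][:top_k]
    return [ix_to_tup[str(ix)] for ix in best]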
def reu_id_to_title(news):
    """Save a mapping from Reuters article id to headline; expects the cleaned
    headlines DataFrame built in vectorize()."""
    id_to_title = {}
    for ix, row in news.iterrows():
        id_to_title[row['id']] = row['title']
    with open('id_to_reu_headline.json', 'w') as f:
        json.dump(id_to_title, f)
def vectorize_reu_iden():
    """Tokenize the Reuters headlines (2016 on) with sklearn's tokenizer and
    build per-article term frequencies plus overall document frequencies for
    the custom tf-idf code below."""
    helper = TfidfVectorizer(min_df=3, stop_words='english')
    tfidf_preprocessor = helper.build_preprocessor()
    tfidf_tokenizer = helper.build_tokenizer()
    news = pd.read_csv('data/reu_identifiers.csv', names=['date', 'id', 'title'], usecols=['id', 'title'])
    news = news[news['title'].notnull()]
    news = news[2283884:]  # 2016 on
    news = news.reset_index(drop=True)  # assign the result; reindexing is not in place
gc.collect()
article_tf = {}
doc_freq = defaultdict(lambda : 0)
unique_toks = set()
for ix, story in news.iterrows():
tf_dict = defaultdict(lambda : 0)
tokens = tfidf_tokenizer(story['title'])
story_unique_toks = set(tokens)
for tok in tokens:
tf_dict[tok] += 1
for tok in story_unique_toks:
unique_toks.add(tok)
doc_freq[tok] += 1
article_tf[story['id']] = tf_dict
gc.collect()
return article_tf, doc_freq, unique_toks
def create_tfidf(article_tf, doc_freq, unique_toks):
    """Turn the term/document frequencies from vectorize_reu_iden() into a
    {doc id -> {word index -> weight}} dictionary, where the weight is
    term frequency divided by document frequency."""
    word_to_ix = {}
    for ix, word in enumerate(list(unique_toks)):
        word_to_ix[word] = ix
    gc.collect()
    tfidf_dict = {}
    # iterate over (doc id, term-frequency dict) pairs
    for doc, term_freq in article_tf.items():
        tfidf_dict[doc] = {}
        for word in term_freq:
            tfidf_weight = term_freq[word] / doc_freq[word]
            tfidf_dict[doc][word_to_ix[word]] = tfidf_weight
gc.collect()
return tfidf_dict, word_to_ix, doc_freq
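# A small sketch (hypothetical helper, not in the original file) showing how the
# nested dict produced by create_tfidf() could be packed into a scipy CSR matrix,
# matching the sparse format used by the sklearn-based functions above.
def tfidf_dict_to_csr(tfidf_dict, word_to_ix):
    doc_ids = list(tfidf_dict.keys())
    rows, cols, vals = [], [], []
    for r, doc in enumerate(doc_ids):
        for word_ix, weight in tfidf_dict[doc].items():
            rows.append(r)
            cols.append(word_ix)
            vals.append(weight)
    mat = csr_matrix((vals, (rows, cols)), shape=(len(doc_ids), len(word_to_ix)))
    return mat, doc_ids  # doc_ids[r] gives the article id for row r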
def tokenize(text):
    """Returns a list of the words that make up the text.
    Note: for simplicity, lowercase everything.
    Requirement: use regex to satisfy this function.
    Params: {text: String}
    Returns: List
    """
    lower = text.lower()
    tokens = re.findall('[a-z]+', lower)
    return tokens
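# Example: tokenize("U.S. stocks end HIGHER, 2018") -> ['u', 's', 'stocks', 'end', 'higher']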
def make_tf_dict(data):
    """Build per-article term-frequency dicts and the set of unique tokens for
    one chunk of the All the News dataset."""
    article_tf = {}
    unique_toks = set()
for ix, article in data.iterrows():
tf_dict = defaultdict(lambda: 0)
tokens = tokenize(article['content'])
for tok in tokens:
tf_dict[tok] += 1
unique_toks.add(tok)
article_tf[article['id']] = tf_dict
return article_tf, unique_toks
def iterate_data():
    """Build term and document frequencies across the All the News dataset
    (articles1.csv - articles3.csv)."""
dat1 = pd.read_csv('data/articles1.csv')
dat2 = pd.read_csv('data/articles2.csv')
dat3 = pd.read_csv('data/articles3.csv')
article_tf1, unique_toks1 = make_tf_dict(dat1)
article_tf2, unique_toks2 = make_tf_dict(dat2)
article_tf3, unique_toks3 = make_tf_dict(dat3)
unique_toks = unique_toks1.union(unique_toks2).union(unique_toks3)
print('Unique Tokens:', len(unique_toks))
    # dict.update() returns None, so merge the per-chunk dicts in place
    article_tf = copy.copy(article_tf1)
    article_tf.update(article_tf2)
    article_tf.update(article_tf3)
    doc_freq = defaultdict(lambda: 0)
    for article in article_tf:
        # keys of each tf dict are the article's unique words
        for word in article_tf[article]:
            doc_freq[word] += 1
    return article_tf, doc_freq, unique_toks
def custom_tfidf():
    """Run the custom (non-sklearn) tf-idf pipeline and save its outputs as JSON."""
article_tf, doc_freq, unique_toks = vectorize_reu_iden()
tfidf_dict, word_to_ix, doc_freq = create_tfidf(article_tf, doc_freq, unique_toks)
with open('tfidf_dict.json', 'w') as f:
json.dump(tfidf_dict, f)
with open('word_to_ix.json', 'w') as f:
json.dump(word_to_ix, f)
with open('doc_freq.json', 'w') as f:
json.dump(doc_freq, f)
def generate_autocomplete_vocab():
    """Build a bigram vocabulary for autocomplete: take the 1,000 terms with the
    highest total tf-idf weight in the Reuters matrix, pair each with its two most
    strongly co-occurring terms, and pickle the resulting bigram list."""
    with open("vocab_to_ix.json", 'r') as f:
        data = json.load(f)
        vocab = [str(key) for key in data]
    # load_npz expects a binary file, so pass the path rather than a text-mode handle
    tfidf_mat = load_npz('tfidf_mat.npz')
    co_occurence_mat = (tfidf_mat.T) * tfidf_mat
    with open("ix_to_vocab.json", 'r') as f2:
        ix_to_vocab = json.load(f2)
    # rank vocabulary indices by their total tf-idf weight across all documents
    sum_arr = csr_matrix.sum(tfidf_mat, axis=0)
    x = np.argsort(sum_arr)
    words_arr = []
    for ix in range(sum_arr.shape[1]):
        val = x[0, ix]
        word = ix_to_vocab[str(val)]
        words_arr.append(word)
    # keep the 1,000 heaviest terms and drop pure numbers
    low_bound = len(words_arr) - 1000
    word_refined = words_arr[low_bound:]
    no_ints = [word for word in word_refined if not word.isdigit()]
    with open("vocab_to_ix.json", 'r') as f:
        vocab_to_ix = json.load(f)
    bigrams = []
    for word in no_ints:
        ix = vocab_to_ix[word]
        sorted_row = np.argsort(co_occurence_mat[ix, :].toarray()[0])[::-1]
        # first index is the same word, so take the second and third words
        ix1, ix2 = sorted_row[1], sorted_row[2]
        bigram_1 = word + " " + ix_to_vocab[str(ix1)]
        bigram_2 = word + " " + ix_to_vocab[str(ix2)]
        bigrams.append(bigram_1)
        bigrams.append(bigram_2)
    with open("autocomplete_bigram_vocab.pickle", "wb") as outfile:
        pickle.dump(bigrams, outfile)
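# A minimal sketch (hypothetical, not part of the original file) of how the pickled
# bigram vocabulary might be consumed by an autocomplete endpoint: load it once and
# return the bigrams that start with whatever prefix the user has typed so far.
def suggest(prefix, max_results=10):
    with open("autocomplete_bigram_vocab.pickle", "rb") as f:
        bigrams = pickle.load(f)
    prefix = prefix.lower()
    return [b for b in bigrams if b.startswith(prefix)][:max_results]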
if __name__ == '__main__':
    generate_autocomplete_vocab()