/
SimpleTimeline.py
254 lines (202 loc) · 8.97 KB
/
SimpleTimeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
"""Functions for calculating timelines based on counting"""
from collections import Counter
from collections import defaultdict
from time import gmtime, strftime
import DateArticlePair
import operator
import timex
import re
from nltk.tokenize import word_tokenize
import math
import TextRank
import FrequencySummarizer
def add_date(string, counter, timespan=0):
    """ Adds the date string to the counter.

    timespan defines the granularity that the timeline is using:
        0 - Days
        1 - Weeks
        2 - Months
        3 - Years
    Only timespan == 0 is implemented: dates with full day precision
    ("YYYY-MM-DD", i.e. length 10) are counted and anything less precise
    is silently dropped.  Other timespans currently just print a notice.
    """
    if timespan == 0:
        # Only count strings with the day specified ("YYYY-MM-DD" is 10 chars);
        # coarser strings ("YYYY" or "YYYY-MM") are discarded.
        if len(string) == 10:
            counter[string] += 1
    else:
        # Single-argument print() is valid under both Python 2 and Python 3.
        print("Timespans of weeks, months, and years are not implemented yet")
def remove_invalid_dates(should_filter, string, counter, valid_years=(1987,), valid_months=(3, 4)):
    """ Checks whether the date string is valid and, if so, counts it.

    If should_filter is False the string is counted unconditionally.
    Otherwise the year and month are parsed out of the "YYYY-MM-..." prefix
    and the date is forwarded to add_date only when the year is in
    valid_years and the month is in valid_months.  Malformed strings are
    silently dropped.

    The defaults (1987, March/April) preserve the original hard-coded
    Reuters-corpus range; callers can now pass a different range.
    """
    if not should_filter:
        counter[string] += 1
        return
    try:
        year = int(string[0:4])
        month = int(string[5:7])
    except ValueError:
        # Not a parseable "YYYY-MM-..." string: drop it.
        return
    if year in valid_years and month in valid_months:
        add_date(string, counter)
def count_word_word_matrix(corpus, dist=7):
    """ Builds a word-word co-occurrence matrix from the corpus.

    Returns a dictionary of dictionaries, both indexed on words.  Each
    secondary dictionary counts how many times its key occurs within
    ``dist`` tokens of the primary key across the training corpus.
    Tokens containing a digit are collapsed into the single token
    "<NUMBER>", and window positions that fall off either end of an
    article are counted under the padding key " ".

    corpus -- iterable of (date, text) pairs; only the text (index 1)
              is tokenized.
    dist   -- window radius on each side of the current token.
    """
    matrix = {}
    is_number = re.compile(r"\d")
    for pair in corpus:
        tokens = word_tokenize(pair[1])
        n = len(tokens)
        for count, raw_word in enumerate(tokens):
            word = raw_word.lower()
            # Collapse anything containing a digit into one bucket.
            if is_number.search(word):
                word = "<NUMBER>"
            if word not in matrix:
                matrix[word] = defaultdict(int)
            row = matrix[word]
            low = count - dist
            high = count + dist
            # Pad for window positions outside the token array.
            if low < 0:
                row[" "] -= low
                # BUGFIX: low was left negative, so indexing wrapped around
                # to the END of the token list via Python negative indexing.
                low = 0
            if high >= n:
                # BUGFIX: this padding used to be added unconditionally,
                # SUBTRACTING counts whenever the window fit inside the article.
                row[" "] += high - n + 1
                high = n - 1
            # BUGFIX: the loop used to run to the end of the article instead
            # of stopping at the window edge count+dist.
            for pos in range(low, high + 1):
                neighbor = tokens[pos]
                if is_number.search(neighbor):
                    row["<NUMBER>"] += 1
                else:
                    row[neighbor.lower()] += 1
    return matrix
def calc_word_similarity(first_word, second_word, matrix):
    ''' Returns the cosine similarity between the two words' co-occurrence
    vectors; the value lies between 0 and 1.

    Returns 0 when either word is missing from the matrix, or when either
    word's vector is all zeros (which would otherwise divide by zero).
    '''
    first_word = first_word.lower()
    second_word = second_word.lower()
    if first_word not in matrix or second_word not in matrix:
        return 0
    first_vec = matrix[first_word]
    second_vec = matrix[second_word]
    # Iterate over the smaller vector so the sparse dot product stays cheap.
    if len(first_vec) > len(second_vec):
        smaller, larger = second_vec, first_vec
    else:
        smaller, larger = first_vec, second_vec
    first_dot_second = sum(smaller[k] * larger[k] for k in smaller if k in larger)
    first_length = math.sqrt(sum(v ** 2 for v in first_vec.values()))
    second_length = math.sqrt(sum(v ** 2 for v in second_vec.values()))
    if first_length == 0 or second_length == 0:
        # BUGFIX: an all-zero vector used to raise ZeroDivisionError.
        return 0
    return first_dot_second / (first_length * second_length)
def calc_sentence_similarity(first_sentence, second_sentence, matrix):
    ''' Calculates the total similarity of two sentences.

    For every token in the first sentence, takes the best word similarity
    against any token of the second sentence, sums those, and returns the
    total divided by the number of tokens in the first sentence.

    Returns 0.0 when the first sentence has no tokens.
    '''
    first_tokens = word_tokenize(first_sentence)
    second_tokens = word_tokenize(second_sentence)
    if not first_tokens:
        # BUGFIX: an empty/whitespace-only first sentence used to raise
        # ZeroDivisionError on the final division.
        return 0.0
    total_similarity = 0.0
    for token in first_tokens:
        best_word_similarity = 0.0
        for candidate in second_tokens:
            similarity = calc_word_similarity(token, candidate, matrix)
            if similarity > best_word_similarity:
                best_word_similarity = similarity
        total_similarity += best_word_similarity
    return total_similarity / len(first_tokens)
def summarize_by_word_similarity(sentences, matrix):
    ''' Returns the best sentence to summarize the text.

    Scores each candidate by summing its sentence similarity against every
    sentence in the collection, and returns the first candidate with the
    strictly highest score (empty string if no score beats zero).
    '''
    best_sentence = ""
    best_score = 0
    for candidate in sentences:
        score = sum(calc_sentence_similarity(candidate, other, matrix)
                    for other in sentences)
        if score > best_score:
            best_score = score
            best_sentence = candidate
    return best_sentence
def summarize_with_TextRank(sentences, matrix):
    """Summarization backend that delegates to the TextRank module.

    ``matrix`` is unused here; the parameter exists so every summarization
    function shares the (sentences, matrix) signature that
    select_best_dates expects.
    """
    return TextRank.extractSentencesFromSentenceTokens(sentences)
def summarize_with_FrequencySummarizer(sentences, matrix, num_sent=1):
    """Summarization backend that delegates to the FrequencySummarizer module.

    ``matrix`` is unused here; it is kept so the signature matches the
    other summarization functions.  num_sent is passed through to
    FrequencySummarizer.summarize (presumably the number of sentences to
    return -- verify against that module).
    """
    return FrequencySummarizer.summarize(sentences, num_sent)
def select_best_dates(path, num_dates=None, use_article_date=1, filter_dates=False, summarization_function=summarize_with_TextRank):
    """ Returns a list of (date, count, summary) triples for the most common
    dates referenced by the articles contained in ``path``, ordered by count.

    num_dates -- how many dates to return; None means all of them.
    use_article_date
        0 - never use the article's publication date
        1 - use it only when the article contains no other dates
        2 - always use it
    filter_dates -- when True, drop dates outside the valid range
                    (see remove_invalid_dates).
    summarization_function -- callable (sentences, matrix) -> summary.
    """
    date_article_pairs = DateArticlePair.read_reuters(path)
    # Single-argument print() works under both Python 2 and Python 3.
    print("Articles read")
    matrix = []
    # The word-word matrix is expensive, so build it only when the chosen
    # summarizer actually uses it.
    if summarization_function == summarize_by_word_similarity:
        matrix = count_word_word_matrix(date_article_pairs)
    date_counter = Counter()
    # BUGFIX: a plain dict used to raise KeyError below whenever a date was
    # counted from the article's publication date but never appeared in any
    # sentence; defaultdict yields an empty sentence list instead.
    sentence_list = defaultdict(list)
    # Collect date references and the sentences that mention them.
    for (article_date, article_text) in date_article_pairs:
        date_sentence_pairs = timex.extract_dates(article_text, article_date)
        # Fall back to the publication date according to the flags above.
        if use_article_date == 2 or (use_article_date == 1 and not date_sentence_pairs):
            remove_invalid_dates(filter_dates, "%s" % (article_date.date()), date_counter)
        for (date, sentence) in date_sentence_pairs:
            remove_invalid_dates(filter_dates, date, date_counter)
            sentence_list[date].append(sentence)
    # most_common(None) already returns every entry, so no special case.
    dates_to_return = date_counter.most_common(num_dates)
    return [(date, count, summarization_function(sentence_list[date], matrix))
            for (date, count) in dates_to_return]
def count_article_dates(path):
    """
    Prints a list of dates led by the most common dates, and features the
    fraction of articles written on that date relative to all articles
    counted, given a particular path to articles.

    Returns this list, but with only the dates/counts of those dates; the
    percentage calculation is done in-method.
    """
    dates = Counter()
    count = 0
    for (article_date, article_text) in DateArticlePair.read_reuters(path):
        # Drop obviously-bogus parses outside a sane year range.
        if 1000 < article_date.year < 2100:
            dates[article_date] += 1
            count += 1
    # most_common() sorts by count descending, same as the old manual sort.
    ret = dates.most_common()
    for (date, freq) in ret:
        # Single-argument print() works under both Python 2 and Python 3;
        # the output text is unchanged.
        print(str(date.month) + "/" + str(date.day) + "/" + str(date.year) + ", " + str(freq) + ", " + str(freq / float(count)))
    return ret
if __name__ == '__main__':
    # NOTE(review): an earlier attempt wrote the results to a timestamped
    # output file via strftime but hit a runtime error; left as a TODO.
    # count_article_dates('reuters/') was used to evaluate publication dates.
    best = select_best_dates('reuters/', filter_dates=True)
    for entry in best:
        # Equivalent of the old Python 2 `print entry, "\n\n"`, but valid
        # under both Python 2 and Python 3.
        print("%s \n\n" % (entry,))