/
Slistener.py
189 lines (176 loc) · 8.29 KB
/
Slistener.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
from tweepy import StreamListener
from Tweets import *
import Cosine_Sim
from collections import Counter
import json, time, sys,traceback
from datetime import datetime
from email.utils import parsedate_tz, mktime_tz
import os
import glob
class SListener(StreamListener):
def __init__(self, api,EventDtct):
self.api = api
self.EventDtct = EventDtct
self.writeTime_InHours = 0.5
self.ClusterActiveTime_InHours = 2.5
# last time we save the clusters to files
#self.EventDtct.last_time_print = None
def on_data(self,data):
data2 = json.loads(data)
try:
user_id = data2['user']['id_str'].encode('ascii','ignore')
country = ""
lang = None
text = ""
if data2[u'text'] is not None:
text = (data2[u'text']).encode('ascii','ignore')
if data2['place'] is not None:
country = data2['place']['country']
if data2['lang'] is not None:
lang = data2['lang']
if lang == "en" and str.strip(text) != "" and user_id is not None and str.strip(user_id) != "":
text = text.replace('\n', ' ').replace('\r', '')
tweet_time = data2['created_at']
timestamp = mktime_tz(parsedate_tz(tweet_time))
tweet_time = str(datetime.fromtimestamp(timestamp))
t = Tweet(text,user_id,tweet_time,country,lang)
#print t
self.clusterTweet(t)
except:
print "Bad Tweet"
print traceback.print_exc(file=sys.stdout)
#return True
return True
def on_error(self,status):
print status
return True
def clusterTweet(self,tweet):
if len(self.EventDtct.clusters) == 0:
c = Cluster(tweet)
c.id = str(id(c))+str(int(round(time.time())))
self.EventDtct.clusters.append(c)
print "First cluster created !"
else:
#find the most similar cluster to add the tweet to it
print "checking current clusters"
max_simialiry = 0
cluster_index = -1
active_clusters = []
for i in range(0,len(self.EventDtct.clusters)):
print "checking cluster ", i , " for time diff"
if self.isClusterActive(self.EventDtct.clusters[i],self.ClusterActiveTime_InHours):
print "cluster was active checkig sim"
active_clusters.append(self.EventDtct.clusters[i])
cur_sim = self.ClusterTweetSim(self.EventDtct.clusters[i],tweet)
if max_simialiry < cur_sim and cur_sim < 1.0:
max_simialiry = cur_sim
cluster_index = i
# check if the most similar cluster is above threshold else create new cluster
print "most similar cluster have sim = ", max_simialiry
if max_simialiry >= self.EventDtct.dist_threshold and cluster_index != -1:
self.EventDtct.clusters[cluster_index].addTweet(tweet)
clusterSize = len(self.EventDtct.clusters[cluster_index].data.index)
if (self.EventDtct.clusters[cluster_index].userDiversity > self.EventDtct.divers_threshold) and (clusterSize >= 10):
self.EventDtct.Events.append(self.EventDtct.clusters[cluster_index])
self.writeToEventsFile(self.EventDtct.clusters[cluster_index])
print "Event has been detected !!!!!!!!!!!!!!!!!!!!"
#else:
#print "NOT EVENT !!!!!!!", str(self.EventDtct.clusters[cluster_index].userDiversity), str(len(self.EventDtct.clusters[cluster_index].data.index))
print "Tweets is added to a current cluster"
else:
print "Created new cluster for the tweet"
c = Cluster(tweet)
c.id = str(id(c))+str(int(round(time.time())))
active_clusters.append(c)
self.EventDtct.clusters = list(active_clusters)
# decide to write the clusters or not ?
print "checking whether to write the clusters to the files, clusters count : " , len(active_clusters)
if self.EventDtct.last_time_print is None and len(self.EventDtct.clusters) >= 50:
self.writeClusters()
print "write the clusters for the first time "
elif self.EventDtct.last_time_print is not None:
tdelta = datetime.now() - self.EventDtct.last_time_print
secs = tdelta.total_seconds()
hours = abs(float(secs / 3600))
if hours >= self.writeTime_InHours:
self.writeClusters()
print "write the clusters after write time exceeded"
def ClusterTweetSim(self,cluster,tweet):
tweet_txt = tweet.text
min_similariy = 1.0
total_sim = 0
for index, row in cluster.data.iterrows():
curr_sim = Cosine_Sim.get_cosine(tweet_txt,row['tweet_text'])
if round(curr_sim,2) >= 0.99:# duplicate tweet
return 1 # return once you find the duplicate
else:
total_sim = total_sim + curr_sim
#average similarity
min_similariy = (total_sim*1.0/len(cluster.data.index))
return round(min_similariy,2)
# def ClusterTweetSim(self,cluster,tweet):
# tweet_txt = tweet.text
# min_similariy = 1.0
# for index, row in cluster.data.iterrows():
# curr_sim = Cosine_Sim.get_cosine(tweet_txt,row['tweet_text'])
# if round(curr_sim,2) >= 0.99:# duplicate tweet
# return 1 # return once you find the duplicate
# elif min_similariy > curr_sim:
# min_similariy = curr_sim
# return round(min_similariy,2)
def isClusterActive(self,cluster,active_time_inHours):
c_active_time = datetime.strptime(cluster.last_activity, '%Y-%m-%d %H:%M:%S')
time_diff = abs(float((datetime.now()-c_active_time).total_seconds()/3600))
#print "Time diff for cluster", abs(time_diff) , round(abs(time_diff),2)
if round(abs(time_diff),2) <= active_time_inHours:
#print "It is activeeeeeeeeeeeeeeeee"
# check number of unique users in the cluster are diverse
clusterSize = len(cluster.data.index)
if (clusterSize >= 10) and (math.isnan(cluster.userDiversity) or round(cluster.userDiversity,2) <= 0.15):
# not enought unique users
return False
else:
return True
else:
#print "Notttttttttttttttttt activeeeeeeeeeeeeeeeee"
return False
def writeClusters(self):
print "Writing Clusters to a file"
# delete old clusters
files = glob.glob("clusters/*.csv")
for f in files:
os.remove(f)
## write data
cnt = 1
for c in self.EventDtct.clusters:
try:
st = 'clusters/cluster_'+str(c.id)+'.csv'
c.data.to_csv(st,index=False,encoding='utf-8')
cnt = cnt + 1
except:
print "Error in writing cluster :::::::::"
print c.data
print traceback.print_exc(file=sys.stdout)
print "Clusters Have been written to files"
self.EventDtct.last_time_print = datetime.now()
def writeToEventsFile(self,event_clust):
print "Writing Events to a file"
words_list = []
for index, row in event_clust.data.iterrows():
words_list += Cosine_Sim.tokenize_only(row['tweet_text'])
word_counts = Counter(words_list)
most_common = word_counts.most_common(10)
text_file = open("events/Events.txt", "a")
text_file.write("Cluster Id ="+str(event_clust.id)+" ,")
for word, count in most_common:
text_file.write("{0} : {1} ,".format(word, count))
text_file.write("\n")
text_file.close()
# write cluster to csv file
clust_file = 'events/cluster_'+str(event_clust.id)+'.csv'
if os.path.exists(clust_file):
os.remove(clust_file)
try:
event_clust.data.to_csv(clust_file,index=False,encoding='utf-8')
except:
print " Error writing the Event File"