# forked from geosoco/twitter_capture_client
# client.py
#!/usr/bin/env python
"""Twitter capture client code."""
import argparse
import logging
import logging.config
from time import sleep
from datetime import datetime
import signal
from listeners.file import RotatingFileListener
from configfile import ConfigFile
from server_messenger import ServerMessenger, CaptureStatus
from streamer import Streamer
# client version reported to / compared against the server
VERSION = 0.1

# capture job status codes (numeric values mirror the server-side states;
# CaptureStatus wraps these for comparisons)
STATUS_UNKNOWN = 0
STATUS_CREATED = 1
STATUS_STARTING = 2
STATUS_STARTED = 3
STATUS_STOPPING = 4
STATUS_STOPPED = 5
STATUS_UNRESPONSIVE = 6
STATUS_DEAD = 7

# module-level logger
log = logging.getLogger(__name__)

# global run flag; cleared by on_interrupt() to stop every polling loop
running = True
def on_interrupt(sig, stack):
    """handle interrupt signaler."""
    # Installed for SIGINT/SIGHUP/SIGQUIT/SIGTERM in __main__: clears the
    # module-level run flag so the polling loops in Client exit cleanly.
    global running
    log.info("got interrupt %s", sig)
    running = False
    log.info("set running to false (%s)", running)
class Client(object):
    """Basic Client class for polling server for jobs.

    Polls the capture server for an active job, then runs a twitter
    stream collection for that job until it is stopped or archived.

    NOTE(review): relies on ``JobChecker`` and ``TermChecker`` being in
    scope, but neither is imported at the top of this file -- confirm
    where they are defined and add the import.
    """

    def __init__(
            self,
            base_url,
            token,
            ping_interval,
            update_interval,
            twitter_auth,
            output_config,
            source_addr=None):
        """construct client.

        Args:
            base_url: base URL of the capture server API.
            token: auth token passed to the ServerMessenger.
            ping_interval: seconds to sleep between server polls/pings.
            update_interval: seconds between full count updates to server.
            twitter_auth: dict with api_key, api_secret, access_token,
                access_token_secret (used by start_collection).
            output_config: kwargs forwarded to RotatingFileListener.
            source_addr: optional local source address for the stream.
        """
        self.ping_interval = ping_interval
        self.update_interval = update_interval
        # BUG FIX: twitter_auth was never stored, but start_collection()
        # reads self.twitter_auth -- this crashed with AttributeError.
        self.twitter_auth = twitter_auth
        self.output_config = output_config
        self.source_addr = source_addr
        self.active_job = None
        self.active_job_id = None
        self.log = logging.getLogger("Client")
        self.sm = ServerMessenger(
            base_url=base_url,
            token=token
        )
        self.job_checker = JobChecker(self.sm)

    def wait_for_job(self):
        """poll for an active job assignment."""
        while running is True:
            active_job = self.job_checker.getActiveJob()
            if active_job is not None:
                # store data
                self.active_job = active_job
                self.active_job_id = active_job["id"]

                # check status; return as soon as the job is runnable
                status = CaptureStatus(self.active_job["status"])
                if status.running():
                    return

            # sleep before polling again
            sleep(self.ping_interval)

    def start_collection(self):
        """Start collection.

        Creates the rotating-file listener and the twitter Streamer for
        the active job, and tells the server messenger which job we are
        reporting on.
        """
        # create our streamer
        collection_name = self.active_job["name"]
        self.listener = RotatingFileListener(
            collection_name=collection_name,
            **self.output_config
        )
        self.stream = Streamer(
            listener=self.listener,
            api_key=self.twitter_auth["api_key"],
            api_secret=self.twitter_auth["api_secret"],
            access_token=self.twitter_auth["access_token"],
            access_token_secret=self.twitter_auth["access_token_secret"],
            source_addr=self.source_addr
        )

        # set job id in server messenger
        self.sm.active_job_id = self.active_job_id

    def run_collection(self):
        """Run collection.

        Main loop for one job: tracks server status transitions,
        restarts the stream on keyword changes, pushes count updates,
        and exits when the job is archived or the client is interrupted.
        """
        # start the collection
        self.start_collection()

        # make sure we got a valid stream
        if self.stream is None:
            log.error("stream was not started")
            return

        # old state defaults
        old_status = CaptureStatus(CaptureStatus.STATUS_UNKNOWN)

        # initialize keyword details
        # BUG FIX: was TermChecker(self.server_messenger) -- no such
        # attribute exists; the server messenger is stored as self.sm.
        term_checker = TermChecker(self.sm)

        # we haven't updated yet
        last_update = None

        # archived date; loop ends once the server reports one
        archived_date = None

        # run while valid
        while running is True and archived_date is None:
            # get job status from server
            status_msg = self.sm.getStatus()

            # if we got null, there's a problem with the server,
            # sleep and continue
            if status_msg is None:
                # BUG FIX: was sleep(update_interval) -- a module global
                # that only exists when run as a script.
                sleep(self.update_interval)
                continue

            # set up the status
            status = CaptureStatus(
                status_msg['status'] if 'status'
                in status_msg
                else STATUS_UNKNOWN)
            self.log.debug("got status: %d", status_msg['status'])

            # look for archived date and bail immediately
            archived_date = status_msg["archived_date"]
            if archived_date is not None:
                continue

            # are there any keyword changes?
            term_checker.checkTerms()

            # has the status changed?
            if old_status != status:
                self.log.info("changing status#1 %d -> %d", old_status, status)
                if status.isRunning():
                    if not self.stream.isRunning():
                        self.log.info("Starting stream")
                        self.stream.track_list = term_checker.terms
                        sm_total_count = status_msg['total_count']
                        if sm_total_count is not None:
                            self.listener.total = sm_total_count
                        self.stream.start()
                        # ackknowledge that we have the newest keywords in here
                        term_checker.resetTermsChanged()
                elif status.isStopped():
                    if self.stream.isRunning():
                        self.log.info("Stopping stream")
                        self.stream.stop()
            elif term_checker.haveTermsChanged():
                # BUG FIX: was self.term_checker.terms -- the checker is a
                # local variable, not an attribute.
                self.stream.track_list = term_checker.terms
                if self.stream.isRunning():
                    self.log.debug("restarting streams for keywords")
                    self.stream.stop()
                    # BUG FIX: was sleep(ping_interval) (module global)
                    sleep(self.ping_interval)
                    self.stream.start()
                term_checker.resetTermsChanged()

            # sleep between polls
            sleep(self.ping_interval)

            # derive the status we should report from the stream state
            new_status = CaptureStatus(STATUS_UNKNOWN)
            if self.stream.isRunning():
                self.log.debug("stream exists and is running")
                if status != STATUS_STOPPING:
                    new_status = STATUS_STARTED
            else:
                if status != STATUS_STARTING:
                    self.log.debug(
                        "stream exists but is not running (forcing %d -> %d",
                        status,
                        new_status)
                    new_status = STATUS_STOPPED

            # if there's a discrepancy, push our real state to the server
            if new_status != status and new_status != STATUS_UNKNOWN:
                self.log.info("changing status#2 %d -> %d", status, new_status)
                self.sm.updateStatus(new_status)

            # update the old status
            old_status = new_status

            # send update status to server if we're running
            if self.stream.isRunning():
                self.listener.print_status()
                self.sm.pingServer(self.listener.total, self.listener.rate)

                # throttle full updates to every update_interval seconds
                do_update = False
                if last_update is None:
                    self.log.debug("initial update")
                    do_update = True
                else:
                    delta = datetime.now() - last_update
                    if delta.total_seconds() > self.update_interval:
                        self.log.debug(
                            "update delta: %f",
                            delta.total_seconds())
                        do_update = True

                # update to server
                if do_update is True:
                    self.sm.putUpdate(
                        self.listener.received,
                        self.listener.total,
                        self.listener.rate
                    )
                    last_update = datetime.now()
                else:
                    self.log.debug("waiting for update")

        # wait for stream to stop
        if self.stream.isRunning():
            self.log.info("Stopping...")
            self.stream.stop()
            while self.stream.isRunning():
                self.log.info("Waiting for self.logger to stop")
                sleep(1)

        # allow our listener and stream to be deleted
        self.stream = None
        self.listener = None

    def run(self):
        """Start up the client running machine."""
        while running is True:
            # wait for an active job
            self.wait_for_job()

            if self.active_job is not None:
                self.log.info(
                    "working on job (id: %d, name: %s)",
                    self.active_job_id,
                    self.active_job["name"]
                )
                # start collection
                self.run_collection()
            else:
                sleep(self.ping_interval)
if __name__ == "__main__":
    # parse command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--configfile",
        default="config.json",
        help="config file to use"
    )
    args = parser.parse_args()

    # config file
    config = ConfigFile(args.configfile)

    # configure the logging
    logging_config = config.getValue(
        "client.logging",
        default=None,
        alternatePaths="logging")

    # BUG FIX: removed leftover debug code that printed the logging config
    # with json.dumps(..., index=2) (invalid keyword, raised TypeError) and
    # then called quit() -- the client always exited before starting.
    if logging_config is not None:
        logging.config.dictConfig(logging_config)
    else:
        # set up some default logging options
        logging.basicConfig(
            format="%(asctime)s|%(levelname)s|%(name)s - %(message)s",
            level=logging.DEBUG
        )
        logging.getLogger("twitter_monitor.listener").setLevel(
            logging.getLevelName('WARN')
        )

    # extract new args
    base_url = config.getValue("server.base_url", None)
    auth_token = config.getValue("server.auth_token", None)
    client_id = config.getValue("server.client_id", None)
    ping_interval = config.getValue("server.ping_interval", 5.0)
    update_interval = config.getValue("server.update_interval", 60.0)
    twitter_auth = config.getValue("twitter_auth", None)
    output_conf = config.getValue("output", None)
    source_addr = config.getValue("source_addr", None)

    # validate required config values; the exception carries the name of
    # the missing key so it can be reported below
    try:
        if not base_url:
            raise Exception("base_url")
        if not auth_token:
            raise Exception("auth_token")
        if not client_id:
            raise Exception("id")
        if not twitter_auth:
            raise Exception("twitter_auth")
        if not output_conf:
            raise Exception("output")
        if "base_dir" not in output_conf:
            raise Exception("output.base_dir")
    except Exception as e:
        # BUG FIX: "except Exception, e" is Python-2-only syntax and
        # e.message is deprecated; "as e" + str(e) works on 2.6+ and 3.x.
        msg = "%s was not specified in the config file" % (str(e))
        log.error(msg)
        quit()

    # set signal handlers so the polling loops can shut down cleanly
    signal.signal(signal.SIGINT, on_interrupt)
    signal.signal(signal.SIGHUP, on_interrupt)
    signal.signal(signal.SIGQUIT, on_interrupt)
    # signal.signal(signal.SIGKILL, on_interrupt)  # SIGKILL can't be caught
    signal.signal(signal.SIGTERM, on_interrupt)

    # create and run the client
    # BUG FIX: source_addr was read from the config but never passed on,
    # even though Client accepts it; forward it now.
    client = Client(
        base_url,
        auth_token,
        ping_interval,
        update_interval,
        twitter_auth,
        output_conf,
        source_addr)
    client.run()
    quit()