def __init__(self, project_dir=None):
    """
    Initialise a Twicorder instance.

    Optionally points the project manager at a project directory, creates
    the module level logger, verifies that config, tasks and authentication
    can be loaded, and finally creates the task manager, the worker thread
    and an empty query type registry.

    Keyword Args:
        project_dir (str): Path to Twicorder project directory

    """
    if project_dir:
        ProjectManager.project_dir = project_dir

    # Todo: Only import logger after project dir is set, to ensure logging
    #       to project dir. This is ugly and needs a better solution.
    from twicorder.utils import TwiLogger
    global logger
    logger = TwiLogger()

    # Verify the setup before continuing. The imports live inside the try
    # block so that a TwicorderException raised while importing is handled
    # the same way as one raised by the calls below.
    try:
        from twicorder.config import Config
        from twicorder.tasks import TaskManager
        from twicorder.auth import Auth
        Config.get()
        TaskManager.load()
        Auth.session()
    except TwicorderException as setup_error:
        logger.critical(setup_error)
        sys.exit(1)
        return

    # TaskManager is already bound by the import in the try block above.
    self._task_manager = TaskManager()
    self._worker_thread = WorkerThread()
    self._query_types = {}
def _read_loop(self, resp):
    """
    Read length-prefixed messages from a streaming response.

    The text encoding is taken from the ``charset`` part of the response's
    content-type header, falling back to UTF-8. Each message is expected to
    be preceded by a line containing its length in digits; blank lines in
    between are reported to the listener as keep-alives. Every message read
    is handed to ``self._data``. Errors while reading are logged and the
    loop carries on. When the raw response has been closed,
    ``self.on_closed`` is called.

    Args:
        resp: Streaming response exposing ``headers`` and ``raw``
            (presumably a requests response - confirm against the caller)

    """
    content_type = resp.headers.get('content-type', default='')
    # Raw string: '\S' in a plain literal is an invalid escape sequence.
    enc_search = re.search(r'charset=(?P<enc>\S*)', content_type)
    # "\S*" can match nothing (e.g. "charset="); an empty encoding name
    # can not be used for decoding, so fall back to UTF-8 in that case too.
    encoding = (enc_search.group('enc') if enc_search else '') or 'utf-8'

    buf = ReadBuffer(resp.raw, self.chunk_size, encoding=encoding)

    while self.running and not resp.raw.closed:
        length = 0
        try:
            # Skip keep-alives until the length line of the next message.
            while not resp.raw.closed:
                line = buf.read_line() or ''
                stripped_line = line.strip()
                if not stripped_line:
                    # keep-alive new lines are expected
                    self.listener.keep_alive()
                elif stripped_line.isdigit():
                    length = int(stripped_line)
                    break
                else:
                    raise TweepError(
                        'Expecting length, unexpected value found'
                    )

            next_status_obj = buf.read_len(length)
        except Exception:
            # Best effort: log the problem and try the next message.
            TwiLogger.exception('Unable to process response: \n')
            continue
        if self.running and next_status_obj:
            self._data(next_status_obj)

    if resp.raw.closed:
        self.on_closed(resp)
def id_to_screenname(self):
    """
    Map every followed user ID to its "@screen_name".

    The mapping is cached on the instance. A non-empty cache is returned
    as is while the last lookup is at most 15 minutes old (or when no
    lookup time has been recorded). Otherwise every ID in ``self.follow``
    is looked up through ``self.api``, the lookup time is updated and the
    refreshed mapping is logged.

    Returns:
        dict: Followed user IDs mapped to "@screen_name" strings

    """
    max_age = timedelta(minutes=15)
    checked_at = datetime.now()
    # Without a recorded lookup time the age evaluates to zero.
    cache_age = checked_at - (self._id_to_screenname_time or checked_at)
    if self._id_to_screenname and cache_age <= max_age:
        return self._id_to_screenname

    # Entries are updated in place; existing keys are overwritten.
    for user_id in self.follow:
        account = self.api.get_user(user_id)
        self._id_to_screenname[user_id] = f'@{account.screen_name}'
    self._id_to_screenname_time = datetime.now()
    TwiLogger.info(self._id_to_screenname)
    return self._id_to_screenname
def on_data(self, json_data):
    """
    Defines the actions to take on data capture.

    Every message is appended to the in-memory buffer and written to the
    current output file. Messages carrying a "created_at" value are treated
    as tweets: their user objects are cached, mentions are optionally
    expanded and the tweet is optionally upserted into MongoDB. Finally the
    tweet text, if any, is logged on a single line.

    Args:
        json_data (str): String containing tweet data on JSON format

    Returns:
        bool: Always True

    """
    self._rate_limit_retry_count = 0
    os.makedirs(self.output_dir, exist_ok=True)
    file_path = os.path.join(self.output_dir, self.file_name)
    data = json.loads(json_data)

    is_tweet = bool(data.get('created_at'))
    if is_tweet:
        # Cache all user objects found in the tweet, stamped with the
        # creation time of the tweet they were seen in.
        for user in utils.collect_key_values('user', data):
            user['recorded_at'] = data['created_at']
            self.users[user['id_str']] = user
        if self.config.get('full_user_mentions', False):
            self.update_mentions(data)

    # Add tweet to MongoDB. Only actual tweets are stored: other stream
    # messages have no "id" to upsert on, and the resulting KeyError would
    # be reported below as a connection problem.
    store_in_mongo = (
        is_tweet
        and self.config.get('use_mongo', True)
        and self.mongo_collection
    )
    if store_in_mongo:
        try:
            # Work on a copy so the data written to disk stays untouched.
            mongo_data = copy.deepcopy(data)
            mongo_data = utils.timestamp_to_datetime(mongo_data)
            mongo_data = utils.stream_to_search(mongo_data)
            self.mongo_collection.replace_one(
                {'id': mongo_data['id']},
                mongo_data,
                upsert=True
            )
        except Exception:
            # Best effort: a MongoDB failure must not stop the capture.
            TwiLogger.exception(
                'Twicorder Listener: Unable to connect to MongoDB: '
            )

    self._data.append(data)
    utils.write(json.dumps(data) + '\n', file_path)

    tweet = self.get_full_text(data)
    if not tweet:
        return True
    timestamp = '{:%d %b %Y %H:%M:%S}'.format(datetime.now())
    user = data.get('user', {}).get('screen_name', '-')
    oneline_tweet = tweet.replace('\n', ' ')
    TwiLogger.info(f'{timestamp}, @{user}: {oneline_tweet}')
    return True
def stats():
    """
    Render the statistics page.

    Collects the total number of stored tweets and the number of tweets per
    known account and passes them to the "stats.html" template. Any failure
    is logged and answered with a redirect to the index page.

    Returns:
        The rendered template, or a redirect to "index" on error.

    """
    from collections import Counter
    try:
        collection = mongo.create_collection()
        # Collection.count() and Cursor.count() are deprecated and removed
        # in PyMongo 4. The unfiltered total uses the collection metadata,
        # the per-account numbers are exact counts.
        data = {
            'All Tweets': f'{collection.estimated_document_count():,}',
        }
        accounts = {
            'slpng_giants',
            'slpng_giants_be',
            'slpng_giants_bg',
            'slpng_giants_br',
            'slpng_giants_ca',
            'slpng_giants_ch',
            'slpng_giants_de',
            'slpng_giants_es',
            'slpng_giants_eu',
            'slpng_giants_fr',
            'slpng_giants_it',
            'slpng_giants_nl',
            'slpng_giants_no',
            'slpng_giants_nz',
            'slpng_giants_oz',
            'slpng_giants_se',
        }
        for account in sorted(accounts):
            tweet_count = collection.count_documents(
                {'user.screen_name': account}
            )
            data[f'@{account}'] = f'{tweet_count:,}'

        # The per-date counting below is disabled, so the counter stays
        # empty and "date_count" is rendered as an empty string.
        counter = Counter()
        # for tweet in collection.find({"user.screen_name": 'slpng_giants'}):
        #     try:
        #         d = tweet['created_at']
        #         counter[f'Date({d.year}, {d.month - 1}, {d.day})'] += 1
        #     except Exception:
        #         continue
        date_count = sorted(
            f'[ new {k}, {v} ],' for k, v in counter.items()
        )
        return render_template(
            'stats.html',
            title='Stats',
            data=data,
            date_count='\n'.join(date_count)
        )
    except Exception:
        TwiLogger.exception('TwiBrowser stats error: ')
        return redirect(url_for('index'))
def create_collection(db_name='slpng_giants', collection_name='tweets'):
    """
    Return the given collection, creating its indexes if it is new.

    If the collection already exists in the database it is returned right
    away. Otherwise a unique index on "id", a set of single field indexes
    and an English text index on "full_text" are created first.

    Args:
        db_name (str): Database name
        collection_name (str): Collection name

    Returns:
        Collection: The collection, or None if the client is not connected
            or any error occurred (errors are logged).

    """
    # Fields that get a plain single field index, in creation order.
    indexed_fields = (
        'created_at',
        'retweet_count',
        'favorite_count',
        'in_reply_to_status_id',
        'in_reply_to_user_id',
        'in_reply_to_screen_name',
        'entities.hashtags',
        'user.created_at',
        'user.screen_name',
        'user.id',
        'user.followers_count',
        'user.favourites_count',
        'user.verified',
        'user.statuses_count',
    )
    try:
        client = MongoClient()
        if not is_connected(client):
            return None
        database = client[db_name]
        already_exists = collection_name in database.list_collection_names()
        collection = database[collection_name]
        if already_exists:
            return collection
        collection.create_index('id', unique=True)
        for field in indexed_fields:
            collection.create_index(field)
        collection.create_index(
            [('full_text', TEXT)],
            default_language='english'
        )
        return collection
    except Exception:
        TwiLogger.exception('Unable to connect to MongoDB: ')
        return None
def backfill(path=None, db_name='slpng_giants', collection_name='tweets'):
    """
    Upsert tweets stored on disk into a MongoDB collection.

    All files matching "*.t*" below the save directory are listed, but only
    files located directly in a directory named "stream" are processed.
    Each line is parsed as JSON; unreadable lines are logged and skipped,
    as are entries with a "delete" value. Progress and an estimate of the
    remaining time are logged after each processed file.

    Args:
        path (str): Directory to read from. Defaults to the "output_dir"
            value of the config.
        db_name (str): Database name
        collection_name (str): Collection name

    """
    tweets = create_collection(db_name, collection_name)
    if tweets is None:
        # create_collection returns None when MongoDB is unavailable.
        # Without a collection every single file would fail below.
        TwiLogger.info('Backfill: No MongoDB collection available.')
        return
    config = Config.get()
    save_dir = os.path.expanduser(path or config['output_dir'])
    paths = glob.glob(os.path.join(save_dir, '**', '*.t*'), recursive=True)
    t0 = datetime.now()
    for idx, file_path in enumerate(paths):
        source = os.path.basename(os.path.dirname(file_path))
        if source != 'stream':
            continue
        try:
            for lidx, line in enumerate(utils.readlines(file_path)):
                try:
                    data = json.loads(line)
                except Exception:
                    TwiLogger.exception(
                        f'Backfill: Unable to read line {file_path}:{lidx + 1}'
                    )
                    continue
                if data.get('delete'):
                    continue
                if source == 'stream':
                    data = utils.stream_to_search(data)
                data = utils.timestamp_to_datetime(data)
                tweets.replace_one({'id': data['id']}, data, upsert=True)

            # Estimate the remaining time from the average time spent per
            # listed file so far; fractions of a second are cut off.
            t_delta = datetime.now() - t0
            average = t_delta / (idx + 1)
            remaining = str((len(paths) - (idx + 1)) * average).split('.')[0]
            TwiLogger.info(
                f'{idx + 1}/{len(paths)} '
                f'{remaining} '
                f'{os.sep.join(file_path.split(os.sep)[-3:])}'
            )
        except Exception:
            # Best effort: log the failing file and continue with the next.
            TwiLogger.exception(f'Backfill: Unable to read file: {file_path}')
def add(self, query):
    """
    Put a query on the queue that belongs to its end point.

    Nothing is added, and a message is logged instead, if the query is
    already waiting in that queue or is the query currently held by the
    thread registered for the end point.

    Args:
        query (BaseQuery): Query object

    """
    endpoint_queue = self.get_queue(query.endpoint)
    if query in endpoint_queue.queue:
        TwiLogger.info(f'Query with ID {query.uid} is already in the queue.')
        return

    # There may be no thread registered for this end point.
    worker = self.threads.get(query.endpoint)
    if worker:
        if worker.query == query:
            TwiLogger.info(f'Query with ID {query.uid} is already running.')
            return

    endpoint_queue.put(query)
    TwiLogger.info(query)
def run(self):
    """
    Fetches queries from the queue and executes them.

    Each query is run repeatedly until it reports being done, logging the
    query's fetch log after every run. If a run raises, the traceback is
    logged and the query is abandoned. Receiving None from the queue
    terminates the loop.

    """
    while True:
        self._query = self.queue.get()
        if self.query is None:
            TwiLogger.info(f'Terminating thread "{self.name}"')
            # The sentinel was fetched from the queue like any other item;
            # acknowledge it so that a queue.join() does not block forever.
            self.queue.task_done()
            break
        while not self.query.done:
            try:
                self.query.run()
            except Exception:
                import traceback
                TwiLogger.exception(traceback.format_exc())
                break
            TwiLogger.info(self.query.fetch_log())
            # Short pause between consecutive runs of the same query.
            time.sleep(.2)
        # Pause before picking up the next query.
        time.sleep(.5)
        self.queue.task_done()
#!/usr/bin/env python3 # -*- coding: utf-8 -*- import time from datetime import datetime from queue import Queue from threading import Thread from twicorder.utils import TwiLogger logger = TwiLogger() class RateLimitCentral: """ Class keeping track of end points and their rate limits. """ _limits = {} @classmethod def update(cls, endpoint, header): """ Update endpoint with latest rate limit information. Args: endpoint (str): Endpoint header (dict): Query response header """ limit_keys = {
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Script entry point running the TwiBrowser web app on localhost."""

from twicorder.web.browser import app
from twicorder.utils import TwiLogger


def main():
    """Run the web app and log any exception it raises."""
    try:
        app.run('localhost')
    except Exception:
        TwiLogger.exception('TwiBrowser Error: ')


if __name__ == '__main__':
    main()
def track(self):
    """
    Build the list of terms to track.

    Empty entries are dropped from the "track" value of the config. If any
    terms remain and ``self.follow_also_tracks`` is set, the values of
    ``self.id_to_screenname`` are added as well. The result is logged.

    Returns:
        list: Terms to track, or None if the config holds no terms

    """
    track_list = [t for t in self.config.get('track') or [] if t] or None
    if track_list and self.follow_also_tracks:
        track_list += self.id_to_screenname.values()
    # Interpolate the list into the message. Passing it as a second
    # positional argument does not append it to the text the way print()
    # would; every other logger call passes a single message.
    TwiLogger.info(f'Tracking: {track_list}')
    return track_list