def trending_topics_velocity(debug=False):
    logger = get_logger(debug)
    # Update trending topics for all projects that have them enabled
    project_config_reader = ProjectConfig()
    for project_config in project_config_reader.read():
        if project_config['compile_trending_topics']:
            tt = TrendingTopics(project_config['slug'])
            tt.update()

def cleanup(debug=False):
    logger = get_logger(debug)
    # Cleanup (remove old trending tweets from redis)
    project_config_reader = ProjectConfig()
    projects = []
    for project_config in project_config_reader.read():
        projects.append(project_config['slug'])
        if project_config['compile_trending_tweets']:
            tt = TrendingTweets(project_config['slug'])
            tt.cleanup()
    # cleanup tweet store
    ts = TweetStore()
    ts.cleanup(projects)

def public_data_dump_ids(debug=False):
    logger = get_logger(debug)
    if config.ENV != 'prd':
        logger.info('Data dumps are only collected in production environments.')
        return
    project_config_reader = ProjectConfig()
    for project_config in project_config_reader.read():
        if project_config['compile_data_dump_ids']:
            data_dump_ids = DataDumpIds(project_config['slug'])
            data_dump_ids.sync()
            for mode in ['has_place', 'has_coordinates']:
                data_dump_ids = DataDumpIds(project_config['slug'], mode=mode)
                data_dump_ids.sync()

def start():
    d = DockerWrapper()
    stream_container_name = app.config['STREAM_DOCKER_CONTAINER_NAME']
    status = d.container_status(stream_container_name)
    if status == 'running':
        return Response("Stream has already started.", status=400, mimetype='text/plain')
    stream_config = ProjectConfig()
    is_valid, response_invalid = stream_config.validate_streaming_config()
    if not is_valid:
        return Response(response_invalid, status=400, mimetype='text/plain')
    d.unpause_container(stream_container_name)
    status = d.container_status(stream_container_name)
    if status == 'running':
        return Response("Successfully started stream.", status=200, mimetype='text/plain')
    else:
        return Response("Starting stream was not successful.", status=400, mimetype='text/plain')

def stream_activity():
    es_activity_threshold_min = int(request.args.get('es_activity_threshold_min', 10))
    redis_counts_threshold_hours = int(request.args.get('redis_counts_threshold_hours', 2))
    # elasticsearch counts
    es_count = es.count_recent_documents(since='now-{}m'.format(es_activity_threshold_min))
    # redis counts
    e = datetime.now()
    s = e - timedelta(hours=redis_counts_threshold_hours)
    redis_s3_queue = RedisS3Queue()
    stream_config_reader = ProjectConfig()
    dates = list(redis_s3_queue.daterange(s, e, hourly=True))
    redis_count = 0
    for stream in stream_config_reader.read():
        for d in dates:
            d, h = d.split(':')
            redis_count += redis_s3_queue.get_counts(stream['slug'], d, h)
    return jsonify({'redis_count': redis_count, 'es_count': es_count})

def get_trending_tweets(project):
    args = request.get_json()
    if args is None:
        args = {}
    num_tweets = args.get('num_tweets', 10)
    min_score = args.get('min_score', 5)
    sample_from = args.get('sample_from', 100)
    query = args.get('query', '')
    pc = ProjectConfig()
    project_config = pc.get_config_by_slug(project)
    if project_config is None:
        return error_response(400, 'No project found with this slug')
    if not project_config['compile_trending_tweets']:
        return error_response(400, 'This project is configured to not collect trending tweets information.')
    tt = TrendingTweets(project, es_index_name=project_config['es_index_name'])
    resp = tt.get_trending_tweets(num_tweets, query=query, sample_from=sample_from, min_score=min_score)
    return jsonify(resp)

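# A minimal client-side sketch of calling the trending-tweets endpoint above.
# The base URL and route path ('/trending_tweets/<project>') are assumptions, since the
# Flask route decorator is not shown here; adjust to the actual routing and host.
import requests

payload = {'num_tweets': 5, 'min_score': 5, 'sample_from': 100, 'query': 'vaccine'}
resp = requests.post('http://localhost:8000/trending_tweets/my-project', json=payload)
print(resp.json())
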
def get_trending_topics(project):
    args = request.get_json()
    if args is None:
        args = {}
    num_topics = args.get('num_topics', 10)
    pc = ProjectConfig()
    project_config = pc.get_config_by_slug(project)
    if project_config is None:
        return error_response(400, 'No project found with this slug')
    if not project_config['compile_trending_topics']:
        return error_response(400, 'This project is configured to not collect trending topic information.')
    tt = TrendingTopics(project)
    try:
        resp = tt.get_trending_topics(num_topics)
    except Exception:
        return jsonify([])
    return jsonify(resp)

def es_bulk_index(debug=True):
    logger = get_logger(debug)
    es_queue = ESQueue()
    project_config = ProjectConfig()
    project_keys = es_queue.find_projects_in_queue()
    if len(project_keys) == 0:
        logger.info('No work available. Goodbye!')
        return
    predictions_by_project = {}
    es_actions = []
    for key in project_keys:
        es_queue_objs = es_queue.pop_all(key)
        if len(es_queue_objs) == 0:
            continue
        project = key.decode().split(':')[-1]
        logger.info(f'Found {len(es_queue_objs):,} tweets in queue for project {project}.')
        stream_config = project_config.get_config_by_slug(project)
        # compile actions for bulk indexing
        es_queue_objs = [json.loads(t.decode()) for t in es_queue_objs]
        actions = [{
            '_id': t['id'],
            '_type': 'tweet',
            '_source': t['processed_tweet'],
            '_index': stream_config['es_index_name']
        } for t in es_queue_objs]
        es_actions.extend(actions)
        # compile predictions to be added to prediction queue after indexing
        predictions_by_project[project] = [
            t['text_for_prediction'] for t in es_queue_objs
            if 'text_for_prediction' in t
        ]
    # bulk index
    if len(es_actions) > 0:
        success = es.bulk_actions_in_batches(es_actions, batch_size=1000)
        if not success:
            # dump data to disk
            es_queue.dump_to_disk(es_actions, 'es_bulk_indexing_errors')
            return
    # Queue up for prediction
    for project, objs_to_predict in predictions_by_project.items():
        predict_queue = PredictQueue(project)
        predict_queue.multi_push(objs_to_predict)

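# For reference, a sketch of what a wrapper like es.bulk_actions_in_batches could look like
# when built on the standard elasticsearch-py bulk helper. This only illustrates how the
# action dicts assembled above are consumed; it is not the actual implementation.
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

def bulk_index_sketch(actions, batch_size=1000):
    client = Elasticsearch()
    for i in range(0, len(actions), batch_size):
        # each action carries _index, _id, _type and the document body in _source
        bulk(client, actions[i:i + batch_size])
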
def _get_projects_stats(self, num_days=7, hourly=False):
    project_config = ProjectConfig()
    redis_s3_queue = RedisS3Queue()
    end_day = datetime.utcnow()
    start_day = end_day - timedelta(days=num_days)
    stats = ''
    dates = list(redis_s3_queue.daterange(start_day, end_day, hourly=hourly))
    now_utc = pytz.utc.localize(end_day)
    timezone_hour_delta = get_tz_difference()
    total = defaultdict(lambda: 0)
    for stream in project_config.read():
        total_by_project = defaultdict(lambda: 0)
        project = stream['es_index_name']
        project_slug = stream['slug']
        stats += "<h3>{}</h3>".format(project)
        count_types = ['tweets']
        if stream['image_storage_mode'] != 'inactive':
            count_types += ['photo', 'animated_gif']
        for count_type in count_types:
            stats += '<h4>{}</h4>'.format(count_type)
            for d in dates:
                if hourly:
                    d, h = d.split(':')
                    count = redis_s3_queue.get_counts(project_slug, d, h, media_type=count_type)
                    corrected_hour = (datetime.strptime(h, '%H') - timezone_hour_delta).strftime('%H')
                    stats += '{0} ({1}:00 - {1}:59): {2:,}<br>'.format(d, corrected_hour, count)
                else:
                    count = redis_s3_queue.get_counts(project_slug, d, media_type=count_type)
                    stats += '{}: {:,}<br>'.format(d, count)
                total[count_type] += count
                total_by_project[count_type] += count
            stats += 'Total: {:,}<br><br>'.format(total_by_project[count_type])
    total_stats = 'Total {}:'.format('today' if hourly else 'this week')
    total_stats += '<ul>'
    for count_type, count in total.items():
        total_stats += "<li>{}: {:,}</li>".format(count_type, count)
    total_stats += '</ul>'
    return stats, total_stats

def send_to_s3(debug=False):
    logger = get_logger(debug)
    s3_handler = S3Handler()
    redis_queue = RedisS3Queue()
    logger.info('Pushing tweets to S3')
    project_keys = redis_queue.find_projects_in_queue()
    project_config = ProjectConfig()
    if len(project_keys) == 0:
        logger.info('No work available. Goodbye!')
        return
    for key in project_keys:
        project = key.decode().split(':')[-1]
        logger.info('Found {:,} new tweet(s) in project {}'.format(
            redis_queue.num_elements_in_queue(key), project))
        stream_config = project_config.get_config_by_slug(project)
        now = datetime.datetime.now()
        f_name = 'tweets-{}-{}.jsonl'.format(now.strftime("%Y%m%d%H%M%S"), str(uuid.uuid4()))
        # dump data from redis into a temporary file
        tmp_file_path = os.path.join('/', 'tmp', f_name)
        with open(tmp_file_path, 'wb') as f:
            for tweets in redis_queue.pop_all_iter(key):
                f.write(b'\n'.join(tweets) + b'\n')
        # compress temporary file
        f_name_gz = f_name + '.gz'
        tmp_file_path_gz = os.path.join('/', 'tmp', f_name_gz)
        compress(tmp_file_path, tmp_file_path_gz)
        os.remove(tmp_file_path)
        # upload to S3
        s3_key = 'tweets/{}/{}/{}'.format(stream_config['es_index_name'], now.strftime("%Y-%m-%d"), f_name_gz)
        if s3_handler.upload_file(tmp_file_path_gz, s3_key):
            logger.info(f'Successfully uploaded file {s3_key} to S3')
            os.remove(tmp_file_path_gz)
        else:
            logger.error(f'ERROR: Upload of file {s3_key} to S3 not successful')

class StreamManager():
    def __init__(self, auth, listener, chunk_size=1536):
        # A larger chunk_size lowers per-read processing overhead at the cost of higher latency
        self.logger = logging.getLogger('stream')
        self.stream = Stream(auth=auth, listener=listener, tweet_mode='extended',
                             parser=tweepy.parsers.JSONParser(), chunk_size=chunk_size)
        self.stream_config = ProjectConfig()

    def start(self):
        config = self.stream_config.get_pooled_config()
        self.logger.info('Starting to track for keywords {} in languages {}'.format(config['keywords'], config['lang']))
        self.stream.filter(track=config['keywords'], languages=config['lang'], encoding='utf-8', stall_warnings=True)

    def stop(self):
        self.logger.info('Stopping stream...')
        try:
            self.stream.disconnect()
        except Exception:
            pass

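# A minimal sketch of wiring up StreamManager, assuming tweepy 3.x credentials and a
# StreamListener subclass (here called Listener) that forwards incoming tweets to the
# processing pipeline. The credential attribute names and the Listener class are
# assumptions for illustration; they are not shown above.
import tweepy

auth = tweepy.OAuthHandler(config.CONSUMER_KEY, config.CONSUMER_SECRET)
auth.set_access_token(config.OAUTH_TOKEN, config.OAUTH_TOKEN_SECRET)
listener = Listener()  # hypothetical tweepy.StreamListener subclass
stream_manager = StreamManager(auth, listener)
try:
    stream_manager.start()
except KeyboardInterrupt:
    stream_manager.stop()
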
def manage_config():
    logger = logging.getLogger('pipeline')
    pc = ProjectConfig()
    if request.method == 'GET':
        # read streaming config
        config = pc.read()
        return jsonify(config), 200
    else:
        # write streaming config
        config = request.get_json()
        # make sure new configuration is valid
        is_valid, msg = pc.is_valid(config)
        if not is_valid:
            return error_response(400, msg)
        # write everything to config
        pc.write(config)
        # Create new Elasticsearch indices if needed
        es.update_es_indices(pc.get_es_index_names(config))
        return success_response(200, 'Successfully updated configuration files. Make sure to restart stream for changes to be active.')

def es_predict(debug=True):
    logger = get_logger(debug)
    project_config_reader = ProjectConfig()
    predictions = {}
    for project_config in project_config_reader.read():
        if len(project_config['model_endpoints']) > 0:
            project = project_config['slug']
            predict_queue = PredictQueue(project)
            predict_objs = predict_queue.pop_all()
            if len(predict_objs) == 0:
                logger.info(f'Nothing to predict for project {project}')
                continue
            texts = [t['text'] for t in predict_objs]
            ids = [t['id'] for t in predict_objs]
            es_index_name = project_config['es_index_name']
            for question_tag, endpoints_obj in project_config['model_endpoints'].items():
                for endpoint_name, endpoint_info in endpoints_obj['active'].items():
                    model_type = endpoint_info['model_type']
                    run_name = endpoint_info['run_name']
                    predictor = Predict(endpoint_name, model_type)
                    preds = predictor.predict(texts)
                    for _id, _pred in zip(ids, preds):
                        if es_index_name not in predictions:
                            predictions[es_index_name] = {}
                        if _id not in predictions[es_index_name]:
                            predictions[es_index_name][_id] = {}
                        if question_tag not in predictions[es_index_name][_id]:
                            predictions[es_index_name][_id][question_tag] = {'endpoints': {}}
                        predictions[es_index_name][_id][question_tag]['endpoints'][run_name] = {
                            'label': _pred['labels'][0],
                            'probability': _pred['probabilities'][0]
                        }
                        # if present, add label vals (numeric values of labels)
                        if 'label_vals' in _pred:
                            predictions[es_index_name][_id][question_tag]['endpoints'][run_name]['label_val'] = _pred['label_vals'][0]
                        if endpoints_obj['primary'] == endpoint_name:
                            # current endpoint is primary endpoint
                            predictions[es_index_name][_id][question_tag]['primary_endpoint'] = run_name
                            predictions[es_index_name][_id][question_tag]['primary_label'] = _pred['labels'][0]
                            if 'label_vals' in _pred:
                                predictions[es_index_name][_id][question_tag]['primary_label_val'] = _pred['label_vals'][0]
    if len(predictions) > 0:
        actions = []
        for es_index_name, pred_es_index in predictions.items():
            for _id, pred_obj in pred_es_index.items():
                actions.append({
                    '_id': _id,
                    '_type': 'tweet',
                    '_op_type': 'update',
                    '_index': es_index_name,
                    '_source': {'doc': {'meta': pred_obj}}
                })
        success = es.bulk_actions_in_batches(actions)
        if not success:
            # dump data to disk
            es_queue = ESQueue()
            es_queue.dump_to_disk(actions, 'es_bulk_update_errors')

class ReverseTweetMatcher():
    """Tries to reverse match a tweet object given a set of keyword lists and languages."""

    def __init__(self, tweet=None):
        self.is_retweet = self._is_retweet(tweet)
        self.tweet = self._get_tweet(tweet)
        self.logger = logging.getLogger(__name__)
        self.stream_config_reader = ProjectConfig()
        self.relevant_text = ''
        self.matching_keywords = {}

    def get_candidates(self):
        relevant_text = self.fetch_all_relevant_text()
        config = self.stream_config_reader.read()
        if len(config) == 0:
            return []
        elif len(config) == 1:
            # only one possibility
            self._find_matching_keywords_for_project(relevant_text, config[0])
            return [config[0]['slug']]
        else:
            # try to match to configs
            return self._match_to_config(relevant_text, config)

    def fetch_all_relevant_text(self):
        """Here we pool all relevant text within the tweet to do the matching.

        From the Twitter docs: "Specifically, the text attribute of the Tweet, expanded_url
        and display_url for links and media, text for hashtags, and screen_name for user
        mentions are checked for matches."
        https://developer.twitter.com/en/docs/tweets/filter-realtime/guides/basic-stream-parameters.html
        """
        text = ''
        if 'extended_tweet' in self.tweet:
            text += self.tweet['extended_tweet']['full_text']
            text += self._fetch_user_mentions(self.tweet['extended_tweet'])
            text += self._fetch_urls(self.tweet['extended_tweet'])
        else:
            text += self.tweet['text']
            text += self._fetch_user_mentions(self.tweet)
            text += self._fetch_urls(self.tweet)
        # pool together with text from quoted tweet
        if 'quoted_status' in self.tweet:
            if 'extended_tweet' in self.tweet['quoted_status']:
                text += self.tweet['quoted_status']['extended_tweet']['full_text']
                text += self._fetch_user_mentions(self.tweet['quoted_status']['extended_tweet'])
                text += self._fetch_urls(self.tweet['quoted_status']['extended_tweet'])
            else:
                text += self.tweet['quoted_status']['text']
                text += self._fetch_user_mentions(self.tweet['quoted_status'])
                text += self._fetch_urls(self.tweet['quoted_status'])
        # store as member for debugging use
        self.relevant_text = text
        return text

    # private methods

    def _find_matching_keywords_for_project(self, relevant_text, config):
        """Find matching_keywords"""
        relevant_text = relevant_text.lower()
        keywords = [k.lower().split() for k in config['keywords']]
        matching_keywords = defaultdict(list)
        for keyword_list in keywords:
            if len(keyword_list) == 1:
                if keyword_list[0] in relevant_text:
                    matching_keywords[config['slug']].append(keyword_list[0])
            else:
                # keywords with more than one word: check if all words are contained in text
                match_result = re.findall(r'{}'.format('|'.join(keyword_list)), relevant_text)
                if set(match_result) == set(keyword_list):
                    matching_keywords[config['slug']].extend(keyword_list)
        self.matching_keywords = dict(matching_keywords)

    def _match_to_config(self, relevant_text, config):
        """Match text to config in stream"""
        relevant_text = relevant_text.lower()
        match_candidates = set()
        matching_keywords_by_project = defaultdict(list)
        for c in config:
            # find match for keywords to relevant text
            keywords = [k.lower().split() for k in c['keywords']]
            for keyword_list in keywords:
                if len(keyword_list) == 1:
                    if keyword_list[0] in relevant_text:
                        match_candidates.add(c['slug'])
                        matching_keywords_by_project[c['slug']].append(keyword_list[0])
                else:
                    # keywords with more than one word: check if all words are contained in text
                    match_result = re.findall(r'{}'.format('|'.join(keyword_list)), relevant_text)
                    if set(match_result) == set(keyword_list):
                        match_candidates.add(c['slug'])
                        matching_keywords_by_project[c['slug']].extend(keyword_list)
        # filter by language setting
        config_dict = {c['slug']: c for c in config}
        lang_tweet = self.tweet['lang']
        candidates = set()
        for c in match_candidates:
            languages = config_dict[c]['lang']
            # add as match if language matches, no language was specified or language could not be detected by Twitter ('und')
            if lang_tweet in languages or len(languages) == 0 or lang_tweet == 'und':
                self.matching_keywords[c] = matching_keywords_by_project[c]
                candidates.add(c)
        return list(candidates)

    def _fetch_urls(self, obj):
        t = []
        if 'urls' in obj['entities']:
            for u in obj['entities']['urls']:
                t.append(u['expanded_url'])
        if 'extended_entities' in obj:
            if 'media' in obj['extended_entities']:
                for m in obj['extended_entities']['media']:
                    t.append(m['expanded_url'])
        return ''.join(t)

    def _fetch_user_mentions(self, obj):
        t = []
        if 'user_mentions' in obj['entities']:
            for user_mention in obj['entities']['user_mentions']:
                t.append(user_mention['screen_name'])
        return ''.join(t)

    def _get_tweet(self, tweet):
        if self.is_retweet:
            return tweet['retweeted_status']
        else:
            return tweet

    def _is_retweet(self, tweet):
        return 'retweeted_status' in tweet

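# A minimal sketch of exercising ReverseTweetMatcher in isolation. The sample tweet dict
# below is invented for illustration and only carries the fields the matcher touches;
# get_candidates() still reads the real project configs via ProjectConfig(), so a
# configured project with matching keywords/languages must exist for it to return slugs.
sample_tweet = {
    'text': 'Measles cases are rising again this year',
    'lang': 'en',
    'entities': {'urls': [], 'user_mentions': []},
}
rtm = ReverseTweetMatcher(tweet=sample_tweet)
print(rtm.fetch_all_relevant_text())   # pooled text used for keyword matching
print(rtm.get_candidates())            # slugs of projects whose keywords/languages match
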
def main(args):
    f_path = os.path.join(args.input)
    docs = []
    num_docs = 0
    with open(f_path, 'r') as f:
        for line in f:
            doc = json.loads(line)
            try:
                docs.append({
                    'id': doc['_id'],
                    'text': process(doc['_source']['text'])
                })
            except KeyError:
                logger.warning(f'Doc {doc} is missing text/id column')
                continue
            num_docs += 1
            if num_docs % 10000 == 0:
                logger.info(f'Loaded {num_docs:,} documents...')
    if len(docs) == 0:
        logger.info('No documents loaded.')
        return
    pc = ProjectConfig()
    session = boto3.Session(profile_name='crowdbreaks-dev')
    project_config = pc.get_config_by_index_name(args.index)
    if project_config is None:
        raise ValueError(f'Project {args.index} not found in config file.')
    predictions = {}
    if len(project_config['model_endpoints']) > 0:
        project = project_config['slug']
        texts = [t['text'] for t in docs]
        ids = [t['id'] for t in docs]
        es_index_name = project_config['es_index_name']
        for question_tag, endpoints_obj in project_config['model_endpoints'].items():
            for endpoint_name, endpoint_info in endpoints_obj['active'].items():
                model_type = endpoint_info['model_type']
                run_name = endpoint_info['run_name']
                logger.info(f'Running predictions for run {run_name}')
                model = get_model(args.run_dir, run_name)
                label_mapping = get_label_mapping(args.run_dir, run_name)
                preds = predict(
                    model, label_mapping, texts,
                    legacy=(run_name == 'fasttext_v1' and args.index == 'project_vaccine_sentiment'))
                for _id, _pred in zip(ids, preds):
                    if es_index_name not in predictions:
                        predictions[es_index_name] = {}
                    if _id not in predictions[es_index_name]:
                        predictions[es_index_name][_id] = {}
                    if question_tag not in predictions[es_index_name][_id]:
                        predictions[es_index_name][_id][question_tag] = {'endpoints': {}}
                    predictions[es_index_name][_id][question_tag]['endpoints'][run_name] = {
                        'label': _pred['labels'][0],
                        'probability': _pred['probabilities'][0]
                    }
                    # if present, add label vals (numeric values of labels)
                    if 'label_vals' in _pred:
                        predictions[es_index_name][_id][question_tag]['endpoints'][run_name]['label_val'] = _pred['label_vals'][0]
                    if endpoints_obj['primary'] == endpoint_name:
                        # current endpoint is primary endpoint
                        predictions[es_index_name][_id][question_tag]['primary_endpoint'] = run_name
                        predictions[es_index_name][_id][question_tag]['primary_label'] = _pred['labels'][0]
                        if 'label_vals' in _pred:
                            predictions[es_index_name][_id][question_tag]['primary_label_val'] = _pred['label_vals'][0]
    if len(predictions) > 0:
        ts = int(time.time())
        f_out = os.path.join('cache', f'predictions_{args.index}_{ts}.jsonl')
        logger.info(f'Writing predictions to file {f_out}...')
        with open(f_out, 'a') as f:
            for es_index_name, pred_es_index in predictions.items():
                for _id, pred_obj in pred_es_index.items():
                    f.write(json.dumps({
                        '_id': _id,
                        '_type': 'tweet',
                        '_op_type': 'update',
                        '_index': es_index_name,
                        '_source': {'doc': {'meta': pred_obj}}
                    }) + '\n')
    else:
        logger.info('No predictions were made. No files written.')

def handle_tweet(tweet, send_to_es=True, use_pq=True, debug=False, store_unmatched_tweets=False):
    logger = get_task_logger(__name__)
    if debug:
        logger.setLevel(logging.DEBUG)
    # reverse match to find project
    rtm = ReverseTweetMatcher(tweet=tweet)
    candidates = rtm.get_candidates()
    tweet_id = tweet['id_str']
    # open Redis connection only once
    # redis = Redis()
    # connection = redis.get_connection()
    connection = None
    if len(candidates) == 0:
        # Could not match keywords. This might occur quite frequently, e.g. when tweets are collected across different languages/keywords
        logger.info(f'Tweet {tweet_id} could not be matched against any existing projects.')
        if store_unmatched_tweets:
            # store to separate file for later analysis
            with open(os.path.join(config.PROJECT_ROOT, 'logs', 'reverse_match_errors', f'{tweet_id}.json'), 'w') as f:
                json.dump(tweet, f)
        return
    # queue up for s3 upload and add to priority queue
    logger.info("SUCCESS: Found {} project(s) ({}) as a matching project for tweet".format(len(candidates), ', '.join(candidates)))
    redis_queue = RedisS3Queue(connection=connection)
    es_queue = ESQueue(connection=connection)
    stream_config_reader = ProjectConfig()
    for project in candidates:
        stream_config = stream_config_reader.get_config_by_slug(project)
        if stream_config['storage_mode'] == 'test_mode':
            logger.debug('Running in test mode. Not sending to S3 or ES.')
            return
        # add tracking info
        tweet['_tracking_info'] = stream_config_reader.get_tracking_info(project)
        tweet['_tracking_info']['matching_keywords'] = rtm.matching_keywords[project]
        # Queue up on Redis for subsequent upload
        redis_queue.push(json.dumps(tweet).encode(), project)
        # preprocess tweet
        pt = ProcessTweet(tweet, project_locales=stream_config['locales'])
        pt.process()
        # Possibly add tweet to trending tweets
        if stream_config['compile_trending_tweets']:
            trending_tweets = TrendingTweets(project, project_locales=stream_config['locales'], connection=connection)
            trending_tweets.process(tweet)
        # Extract trending topics
        if stream_config['compile_trending_topics']:
            trending_topics = TrendingTopics(project, project_locales=stream_config['locales'], project_keywords=stream_config['keywords'], connection=connection)
            trending_topics.process(tweet)
        if stream_config['compile_data_dump_ids'] and config.ENV == 'prd':
            data_dump_ids = DataDumpIds(project, connection=connection)
            data_dump_ids.add(tweet_id)
            if pt.has_place:
                data_dump_ids = DataDumpIds(project, mode='has_place', connection=connection)
                data_dump_ids.add(tweet_id)
            if pt.has_coordinates:
                data_dump_ids = DataDumpIds(project, mode='has_coordinates', connection=connection)
                data_dump_ids.add(tweet_id)
        if use_pq and pt.should_be_annotated():
            # add to Tweet ID queue for crowd labelling
            logger.info(f'Add tweet {tweet_id} to priority queue...')
            processed_tweet = pt.get_processed_tweet()
            tid = TweetIdQueue(stream_config['es_index_name'], priority_threshold=3, connection=connection)
            processed_tweet['text'] = pt.get_text(anonymize=True)
            tid.add_tweet(tweet_id, processed_tweet, priority=0)
        if stream_config['image_storage_mode'] != 'inactive':
            pm = ProcessMedia(tweet, project, image_storage_mode=stream_config['image_storage_mode'])
            pm.process()
        if send_to_es and stream_config['storage_mode'] in ['s3-es', 's3-es-no-retweets']:
            if rtm.is_retweet and stream_config['storage_mode'] == 's3-es-no-retweets':
                # Do not store retweets on ES
                return
            # send to ES
            processed_tweet = pt.get_processed_tweet()
            logger.debug(f'Pushing processed tweet with id {tweet_id} to ES queue')
            es_tweet_obj = {'processed_tweet': processed_tweet, 'id': tweet_id}
            if len(stream_config['model_endpoints']) > 0:
                # prepare for prediction
                es_tweet_obj['text_for_prediction'] = {'text': pt.get_text(anonymize=True), 'id': tweet_id}
            es_queue.push(json.dumps(es_tweet_obj).encode(), project)

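# A minimal sketch of how handle_tweet might be invoked from a tweepy stream listener.
# The Listener class below is an assumption for illustration (the actual listener and any
# task-queue dispatch used in production are not shown above).
import json
import tweepy

class Listener(tweepy.StreamListener):
    def on_data(self, raw_data):
        tweet = json.loads(raw_data)
        if 'id_str' in tweet:
            # hand the raw tweet object over to the processing pipeline
            handle_tweet(tweet, send_to_es=True, use_pq=True)
        return True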