def __init__(self, tweet=None):
    self.is_retweet = self._is_retweet(tweet)
    self.tweet = self._get_tweet(tweet)
    self.logger = logging.getLogger(__name__)
    self.stream_config_reader = ProjectConfig()
    self.relevant_text = ''
    self.matching_keywords = {}
def trending_topics_velocity(debug=False):
    logger = get_logger(debug)
    # update trending topics for all projects that have them enabled
    project_config = ProjectConfig()
    for stream_config in project_config.read():
        if stream_config['compile_trending_topics']:
            tt = TrendingTopics(stream_config['slug'])
            tt.update()
def cleanup(debug=False):
    logger = get_logger(debug)
    # cleanup (remove old trending tweets from Redis)
    project_config = ProjectConfig()
    projects = []
    for stream_config in project_config.read():
        projects.append(stream_config['slug'])
        if stream_config['compile_trending_tweets']:
            tt = TrendingTweets(stream_config['slug'])
            tt.cleanup()
    # cleanup tweet store
    ts = TweetStore()
    ts.cleanup(projects)
def public_data_dump_ids(debug=False):
    logger = get_logger(debug)
    if config.ENV != 'prd':
        logger.info('Data dumps are only collected in production environments.')
        return
    project_config = ProjectConfig()
    for stream_config in project_config.read():
        if stream_config['compile_data_dump_ids']:
            data_dump_ids = DataDumpIds(stream_config['slug'])
            data_dump_ids.sync()
            for mode in ['has_place', 'has_coordinates']:
                data_dump_ids = DataDumpIds(stream_config['slug'], mode=mode)
                data_dump_ids.sync()
Example #5
def start():
    d = DockerWrapper()
    stream_container_name = app.config['STREAM_DOCKER_CONTAINER_NAME']
    status = d.container_status(stream_container_name)
    if status == 'running':
        return Response("Stream has already started.", status=400, mimetype='text/plain')
    stream_config = ProjectConfig()
    is_valid, response_invalid = stream_config.validate_streaming_config()
    if not is_valid:
        return Response(response_invalid, status=400, mimetype='text/plain')
    d.unpause_container(stream_container_name)
    status = d.container_status(stream_container_name)
    if status == 'running':
        return Response("Successfully started stream.", status=200, mimetype='text/plain')
    else:
        return Response("Starting stream was not successful ", status=400, mimetype='text/plain')
Example #6
def stream_activity():
    es_activity_threshold_min = int(request.args.get('es_activity_threshold_min', 10))
    redis_counts_threshold_hours = int(request.args.get('redis_counts_threshold_hours', 2))
    # elasticsearch counts
    es_count = es.count_recent_documents(since='now-{}m'.format(es_activity_threshold_min))
    # redis counts
    e = datetime.now()
    s = e - timedelta(hours=redis_counts_threshold_hours)
    redis_s3_queue = RedisS3Queue()
    stream_config_reader = ProjectConfig()
    dates = list(redis_s3_queue.daterange(s, e, hourly=True))
    redis_count = 0
    for stream in stream_config_reader.read():
        for d in dates:
            d, h = d.split(':')
            redis_count += redis_s3_queue.get_counts(stream['slug'], d, h)
    return jsonify({'redis_count': redis_count, 'es_count': es_count})
Example #7
def get_trending_tweets(project):
    args = request.get_json()
    if args is None:
        args = {}
    num_tweets = args.get('num_tweets', 10)
    min_score = args.get('min_score', 5)
    sample_from = args.get('sample_from', 100)
    query = args.get('query', '')
    pc = ProjectConfig()
    project_config = pc.get_config_by_slug(project)
    if project_config is None:
        return error_response(400, 'No project found with this slug')
    if not project_config['compile_trending_tweets']:
        return error_response(400, 'This project is configured not to collect trending tweets.')
    tt = TrendingTweets(project, es_index_name=project_config['es_index_name'])
    resp = tt.get_trending_tweets(num_tweets, query=query, sample_from=sample_from, min_score=min_score)
    return jsonify(resp)
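For reference, a hypothetical client call for the endpoint above; the route registration is not shown in this listing, so the URL path below is only a placeholder.

import requests

resp = requests.post(
    'http://localhost:5000/trending_tweets/my-project',  # placeholder URL, actual route not shown above
    json={'num_tweets': 5, 'min_score': 5, 'query': 'vaccine'})
print(resp.json())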
Example #8
def get_trending_topics(project):
    args = request.get_json()
    if args is None:
        args = {}
    num_topics = args.get('num_topics', 10)
    pc = ProjectConfig()
    project_config = pc.get_config_by_slug(project)
    if project_config is None:
        return error_response(400, 'No project found with this slug')
    if not project_config['compile_trending_topics']:
        return error_response(400, 'This project is configured not to collect trending topics.')
    tt = TrendingTopics(project)
    try:
        resp = tt.get_trending_topics(num_topics)
    except Exception:
        return jsonify([])
    return jsonify(resp)
def es_bulk_index(debug=True):
    logger = get_logger(debug)
    es_queue = ESQueue()
    project_config = ProjectConfig()
    project_keys = es_queue.find_projects_in_queue()
    if len(project_keys) == 0:
        logger.info('No work available. Goodbye!')
        return
    predictions_by_project = {}
    es_actions = []
    for key in project_keys:
        es_queue_objs = es_queue.pop_all(key)
        if len(es_queue_objs) == 0:
            continue
        project = key.decode().split(':')[-1]
        logger.info(
            f'Found {len(es_queue_objs):,} tweets in queue for project {project}.'
        )
        stream_config = project_config.get_config_by_slug(project)
        # compile actions for bulk indexing
        es_queue_objs = [json.loads(t.decode()) for t in es_queue_objs]
        actions = [{
            '_id': t['id'],
            '_type': 'tweet',
            '_source': t['processed_tweet'],
            '_index': stream_config['es_index_name']
        } for t in es_queue_objs]
        es_actions.extend(actions)
        # compile predictions to be added to prediction queue after indexing
        predictions_by_project[project] = [
            t['text_for_prediction'] for t in es_queue_objs
            if 'text_for_prediction' in t
        ]
    # bulk index
    if len(es_actions) > 0:
        success = es.bulk_actions_in_batches(es_actions, batch_size=1000)
        if not success:
            # dump data to disk
            es_queue.dump_to_disk(es_actions, 'es_bulk_indexing_errors')
            return
        # Queue up for prediction
        for project, objs_to_predict in predictions_by_project.items():
            predict_queue = PredictQueue(project)
            predict_queue.multi_push(objs_to_predict)
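The listing does not show how es.bulk_actions_in_batches is implemented; a minimal sketch of such a batched helper, assuming the standard elasticsearch-py client, could look like this.

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

def bulk_actions_in_batches(actions, batch_size=1000):
    # assumption: a locally reachable Elasticsearch cluster; the real helper's
    # client setup and error handling are not shown in this listing
    client = Elasticsearch(['localhost:9200'])
    for i in range(0, len(actions), batch_size):
        try:
            bulk(client, actions[i:i + batch_size])
        except Exception:
            # signal failure so the caller can dump the actions to disk
            return False
    return True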
Example #10
def _get_projects_stats(self, num_days=7, hourly=False):
    project_config = ProjectConfig()
    redis_s3_queue = RedisS3Queue()
    end_day = datetime.utcnow()
    start_day = end_day - timedelta(days=num_days)
    stats = ''
    dates = list(
        redis_s3_queue.daterange(start_day, end_day, hourly=hourly))
    now_utc = pytz.utc.localize(end_day)
    timezone_hour_delta = get_tz_difference()
    total = defaultdict(lambda: 0)
    for stream in project_config.read():
        total_by_project = defaultdict(lambda: 0)
        project = stream['es_index_name']
        project_slug = stream['slug']
        stats += "<h3>{}</h3>".format(project)
        count_types = ['tweets']
        if stream['image_storage_mode'] != 'inactive':
            count_types += ['photo', 'animated_gif']
        for count_type in count_types:
            stats += '<h4>{}</h4>'.format(count_type)
            for d in dates:
                if hourly:
                    d, h = d.split(':')
                    count = redis_s3_queue.get_counts(
                        project_slug, d, h, media_type=count_type)
                    corrected_hour = (datetime.strptime(h, '%H') -
                                      timezone_hour_delta).strftime('%H')
                    stats += '{0} ({1}:00 - {1}:59): {2:,}<br>'.format(
                        d, corrected_hour, count)
                else:
                    count = redis_s3_queue.get_counts(
                        project_slug, d, media_type=count_type)
                    stats += '{}: {:,}<br>'.format(d, count)
                total[count_type] += count
                total_by_project[count_type] += count
            stats += 'Total: {:,}<br><br>'.format(
                total_by_project[count_type])
    total_stats = 'Total {}:'.format('today' if hourly else 'this week')
    total_stats += '<ul>'
    for count_type, count in total.items():
        total_stats += "<li>{}: {:,}</li>".format(count_type, count)
    total_stats += '</ul>'
    return stats, total_stats
Example #11
def send_to_s3(debug=False):
    logger = get_logger(debug)
    s3_handler = S3Handler()
    redis_queue = RedisS3Queue()
    logger.info('Pushing tweets to S3')
    project_keys = redis_queue.find_projects_in_queue()
    project_config = ProjectConfig()
    if len(project_keys) == 0:
        logger.info('No work available. Goodbye!')
        return
    for key in project_keys:
        project = key.decode().split(':')[-1]
        logger.info('Found {:,} new tweet(s) in project {}'.format(
            redis_queue.num_elements_in_queue(key), project))
        stream_config = project_config.get_config_by_slug(project)
        now = datetime.datetime.now()
        f_name = 'tweets-{}-{}.jsonl'.format(now.strftime("%Y%m%d%H%M%S"),
                                             str(uuid.uuid4()))
        # dump data from redis into a temporary file
        tmp_file_path = os.path.join('/', 'tmp', f_name)
        with open(tmp_file_path, 'wb') as f:
            for tweets in redis_queue.pop_all_iter(key):
                f.write(b'\n'.join(tweets) + b'\n')
        # compress temporary file
        f_name_gz = f_name + '.gz'
        tmp_file_path_gz = os.path.join('/', 'tmp', f_name_gz)
        compress(tmp_file_path, tmp_file_path_gz)
        os.remove(tmp_file_path)
        # upload to S3
        s3_key = 'tweets/{}/{}/{}'.format(stream_config['es_index_name'],
                                          now.strftime("%Y-%m-%d"), f_name_gz)
        if s3_handler.upload_file(tmp_file_path_gz, s3_key):
            logger.info(f'Successfully uploaded file {s3_key} to S3')
            os.remove(tmp_file_path_gz)
        else:
            logger.error(
                f'ERROR: Upload of file {s3_key} to S3 not successful')
class StreamManager():
    def __init__(self, auth, listener, chunk_size=1536):
        # Note: chunk_size trades off latency against processing efficiency (larger chunks are processed more efficiently but can add latency)
        self.logger = logging.getLogger('stream')
        self.stream = Stream(auth=auth, listener=listener, tweet_mode='extended', parser=tweepy.parsers.JSONParser(), chunk_size=chunk_size)
        self.stream_config = ProjectConfig()

    def start(self):
        config = self.stream_config.get_pooled_config()
        self.logger.info('Starting to track for keywords {} in languages {}'.format(config['keywords'], config['lang']))
        self.stream.filter(track=config['keywords'], languages=config['lang'], encoding='utf-8', stall_warnings=True)

    def stop(self):
        self.logger.info('Stopping stream...')
        try:
            self.stream.disconnect()
        except Exception:
            pass
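A rough usage sketch for StreamManager under tweepy 3.x; the credentials and the listener class below are placeholders, not part of the code above.

import tweepy

class PrintListener(tweepy.StreamListener):
    def on_data(self, data):
        # just print the raw stream payload in this sketch
        print(data)
        return True

auth = tweepy.OAuthHandler('CONSUMER_KEY', 'CONSUMER_SECRET')  # placeholder credentials
auth.set_access_token('ACCESS_TOKEN', 'ACCESS_TOKEN_SECRET')
manager = StreamManager(auth, PrintListener())
manager.start()  # blocks and filters on the pooled keywords/languages from ProjectConfig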
Example #13
def manage_config():
    logger = logging.getLogger('pipeline')
    pc = ProjectConfig()
    if request.method == 'GET':
        # read streaming config
        config = pc.read()
        return jsonify(config), 200
    else:
        # write streaming config
        config = request.get_json()
        # make sure the new configuration is valid
        is_valid, msg = pc.is_valid(config)
        if not is_valid:
            return error_response(400, msg)
        # write everything to config
        pc.write(config)
        # Create new Elasticsearch indices if needed
        es.update_es_indices(pc.get_es_index_names(config))
        return success_response(200, 'Successfully updated configuration files. Make sure to restart stream for changes to be active.')
Example #14
def es_predict(debug=True):
    logger = get_logger(debug)
    project_config = ProjectConfig()
    predictions = {}
    for stream_config in project_config.read():
        if len(stream_config['model_endpoints']) > 0:
            project = stream_config['slug']
            predict_queue = PredictQueue(project)
            predict_objs = predict_queue.pop_all()
            if len(predict_objs) == 0:
                logger.info(f'Nothing to predict for project {project}')
                continue
            texts = [t['text'] for t in predict_objs]
            ids = [t['id'] for t in predict_objs]
            es_index_name = stream_config['es_index_name']
            for question_tag, endpoints_obj in stream_config[
                    'model_endpoints'].items():
                for endpoint_name, endpoint_info in endpoints_obj[
                        'active'].items():
                    model_type = endpoint_info['model_type']
                    run_name = endpoint_info['run_name']
                    predictor = Predict(endpoint_name, model_type)
                    preds = predictor.predict(texts)
                    for _id, _pred in zip(ids, preds):
                        if es_index_name not in predictions:
                            predictions[es_index_name] = {}
                        if _id not in predictions[es_index_name]:
                            predictions[es_index_name][_id] = {}
                        if question_tag not in predictions[es_index_name][_id]:
                            predictions[es_index_name][_id][question_tag] = {
                                'endpoints': {}
                            }
                        predictions[es_index_name][_id][question_tag][
                            'endpoints'][run_name] = {
                                'label': _pred['labels'][0],
                                'probability': _pred['probabilities'][0]
                            }
                        # if present, add label vals (numeric values of labels)
                        if 'label_vals' in _pred:
                            predictions[es_index_name][_id][question_tag][
                                'endpoints'][run_name]['label_val'] = _pred[
                                    'label_vals'][0]
                        if endpoints_obj['primary'] == endpoint_name:
                            # current endpoint is primary endpoint
                            predictions[es_index_name][_id][question_tag][
                                'primary_endpoint'] = run_name
                            predictions[es_index_name][_id][question_tag][
                                'primary_label'] = _pred['labels'][0]
                            if 'label_vals' in _pred:
                                predictions[es_index_name][_id][question_tag][
                                    'primary_label_val'] = _pred['label_vals'][
                                        0]
    if len(predictions) > 0:
        actions = []
        for es_index_name, pred_es_index in predictions.items():
            for _id, pred_obj in pred_es_index.items():
                actions.append({
                    '_id': _id,
                    '_type': 'tweet',
                    '_op_type': 'update',
                    '_index': es_index_name,
                    '_source': {
                        'doc': {
                            'meta': pred_obj
                        }
                    }
                })
        success = es.bulk_actions_in_batches(actions)
        if not success:
            # dump data to disk
            es_queue = ESQueue()
            es_queue.dump_to_disk(actions, 'es_bulk_update_errors')
class ReverseTweetMatcher():
    """Tries to reverse match a tweet object given a set of keyword lists and languages."""

    def __init__(self, tweet=None):
        self.is_retweet = self._is_retweet(tweet)
        self.tweet = self._get_tweet(tweet)
        self.logger = logging.getLogger(__name__)
        self.stream_config_reader = ProjectConfig()
        self.relevant_text = ''
        self.matching_keywords = {}

    def get_candidates(self):
        relevant_text = self.fetch_all_relevant_text()
        config = self.stream_config_reader.read()
        if len(config) == 0:
            return []
        elif len(config) == 1:
            # only one possibility
            self._find_matching_keywords_for_project(relevant_text, config[0])
            return [config[0]['slug']]
        else:
            # try to match to configs
            return self._match_to_config(relevant_text, config)

    def fetch_all_relevant_text(self):
        """Here we pool all relevant text within the tweet to do the matching. From the twitter docs:
        "Specifically, the text attribute of the Tweet, expanded_url and display_url for links and media, text for hashtags, and screen_name for user mentions are checked for matches."
        https://developer.twitter.com/en/docs/tweets/filter-realtime/guides/basic-stream-parameters.html
        """
        text = ''
        if 'extended_tweet' in self.tweet:
            text += self.tweet['extended_tweet']['full_text']
            text += self._fetch_user_mentions(self.tweet['extended_tweet'])
            text += self._fetch_urls(self.tweet['extended_tweet'])
        else:
            text += self.tweet['text']
            text += self._fetch_user_mentions(self.tweet)
            text += self._fetch_urls(self.tweet)

        # pool together with text from quoted tweet
        if 'quoted_status' in self.tweet:
            if 'extended_tweet' in self.tweet['quoted_status']:
                text += self.tweet['quoted_status']['extended_tweet']['full_text']
                text += self._fetch_user_mentions(self.tweet['quoted_status']['extended_tweet'])
                text += self._fetch_urls(self.tweet['quoted_status']['extended_tweet'])
            else:
                text += self.tweet['quoted_status']['text']
                text += self._fetch_user_mentions(self.tweet['quoted_status'])
                text += self._fetch_urls(self.tweet['quoted_status'])

        # store as member for debugging use
        self.relevant_text = text
        return text

    # private methods

    def _find_matching_keywords_for_project(self, relevant_text, config):
        """Find matching_keywords"""
        relevant_text = relevant_text.lower()
        keywords = [k.lower().split() for k in config['keywords']]
        matching_keywords = defaultdict(list)
        for keyword_list in keywords:
            if len(keyword_list) == 1:
                if keyword_list[0] in relevant_text:
                    matching_keywords[config['slug']].append(keyword_list[0])
            else:
                # keywords with more than one word: Check if all words are contained in text
                match_result = re.findall(r'{}'.format('|'.join(keyword_list)), relevant_text)
                if set(match_result) == set(keyword_list):
                    matching_keywords[config['slug']].extend(keyword_list)
        self.matching_keywords = dict(matching_keywords)

    def _match_to_config(self, relevant_text, config):
        """Match text to config in stream"""
        relevant_text = relevant_text.lower()
        match_candidates = set()
        matching_keywords_by_project = defaultdict(list)
        for c in config:
            # find matches for the project's keywords in the relevant text
            keywords = [k.lower().split() for k in c['keywords']]
            for keyword_list in keywords:
                if len(keyword_list) == 1:
                    if keyword_list[0] in relevant_text:
                        match_candidates.add(c['slug'])
                        matching_keywords_by_project[c['slug']].append(keyword_list[0])
                else:
                    # keywords with more than one word: Check if all words are contained in text
                    match_result = re.findall(r'{}'.format('|'.join(keyword_list)), relevant_text)
                    if set(match_result) == set(keyword_list):
                        match_candidates.add(c['slug'])
                        matching_keywords_by_project[c['slug']].extend(keyword_list)
        # filter by language setting
        config_dict = {c['slug']: c for c in config}
        lang_tweet = self.tweet['lang']
        candidates = set()
        for c in match_candidates:
            languages = config_dict[c]['lang']
            # add as match if language matches, no language was specified or language could not be detected by Twitter ('und')
            if lang_tweet in languages or len(languages) == 0 or lang_tweet == 'und':
                self.matching_keywords[c] = matching_keywords_by_project[c]
                candidates.add(c)
        return list(candidates)

    def _fetch_urls(self, obj):
        t = []
        if 'urls' in obj['entities']:
            for u in obj['entities']['urls']:
                t.append(u['expanded_url'])

        if 'extended_entities' in obj:
            if 'media' in obj['extended_entities']:
                for m in obj['extended_entities']['media']:
                    t.append(m['expanded_url'])
        return ''.join(t)

    def _fetch_user_mentions(self, obj):
        t = []
        if 'user_mentions' in obj['entities']:
            for user_mention in obj['entities']['user_mentions']:
                t.append(user_mention['screen_name'])
        return ''.join(t)

    def _get_tweet(self, tweet):
        if self.is_retweet:
            return tweet['retweeted_status']
        else:
            return tweet


    def _is_retweet(self, tweet):
        return 'retweeted_status' in tweet
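A rough usage sketch, assuming at least one project with matching keywords exists in ProjectConfig; the tweet dict below is a hand-built minimal example, not a full Twitter payload.

tweet = {
    'text': 'New measles outbreak reported today',
    'lang': 'en',
    'entities': {'urls': [], 'user_mentions': []},
}
rtm = ReverseTweetMatcher(tweet=tweet)
matching_projects = rtm.get_candidates()  # e.g. ['measles'] if such a project is configured
print(matching_projects, rtm.matching_keywords)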
def main(args):
    f_path = args.input
    docs = []
    num_docs = 0
    with open(f_path, 'r') as f:
        for line in f:
            doc = json.loads(line)
            try:
                docs.append({
                    'id': doc['_id'],
                    'text': process(doc['_source']['text'])
                })
            except KeyError:
                logger.warning(f'Doc {doc} is missing text/id column')
                continue
            num_docs += 1
            if num_docs % 10000 == 0:
                logger.info(f'Loaded {num_docs:,} documents...')
    if len(docs) == 0:
        logger.info('No documents loaded.')
        return
    pc = ProjectConfig()
    session = boto3.Session(profile_name='crowdbreaks-dev')
    project_config = pc.get_config_by_index_name(args.index)
    if project_config is None:
        raise ValueError(f'Project {args.index} not found in config file.')
    predictions = {}
    if len(project_config['model_endpoints']) > 0:
        project = project_config['slug']
        texts = [t['text'] for t in docs]
        ids = [t['id'] for t in docs]
        es_index_name = project_config['es_index_name']
        for question_tag, endpoints_obj in project_config[
                'model_endpoints'].items():
            for endpoint_name, endpoint_info in endpoints_obj['active'].items(
            ):
                model_type = endpoint_info['model_type']
                run_name = endpoint_info['run_name']
                logger.info(f'Running predictions for run {run_name}')
                model = get_model(args.run_dir, run_name)
                label_mapping = get_label_mapping(args.run_dir, run_name)
                preds = predict(
                    model,
                    label_mapping,
                    texts,
                    legacy=(run_name == 'fasttext_v1'
                            and args.index == 'project_vaccine_sentiment'))
                for _id, _pred in zip(ids, preds):
                    if es_index_name not in predictions:
                        predictions[es_index_name] = {}
                    if _id not in predictions[es_index_name]:
                        predictions[es_index_name][_id] = {}
                    if question_tag not in predictions[es_index_name][_id]:
                        predictions[es_index_name][_id][question_tag] = {
                            'endpoints': {}
                        }
                    predictions[es_index_name][_id][question_tag]['endpoints'][
                        run_name] = {
                            'label': _pred['labels'][0],
                            'probability': _pred['probabilities'][0]
                        }
                    # if present, add label vals (numeric values of labels)
                    if 'label_vals' in _pred:
                        predictions[es_index_name][_id][question_tag][
                            'endpoints'][run_name]['label_val'] = _pred[
                                'label_vals'][0]
                    if endpoints_obj['primary'] == endpoint_name:
                        # current endpoint is primary endpoint
                        predictions[es_index_name][_id][question_tag][
                            'primary_endpoint'] = run_name
                        predictions[es_index_name][_id][question_tag][
                            'primary_label'] = _pred['labels'][0]
                        if 'label_vals' in _pred:
                            predictions[es_index_name][_id][question_tag][
                                'primary_label_val'] = _pred['label_vals'][0]

    if len(predictions) > 0:
        ts = int(time.time())
        f_out = os.path.join('cache', f'predictions_{args.index}_{ts}.jsonl')
        logger.info(f'Writing predictions to file {f_out}...')
        with open(f_out, 'a') as f:
            for es_index_name, pred_es_index in predictions.items():
                for _id, pred_obj in pred_es_index.items():
                    f.write(
                        json.dumps({
                            '_id': _id,
                            '_type': 'tweet',
                            '_op_type': 'update',
                            '_index': es_index_name,
                            '_source': {
                                'doc': {
                                    'meta': pred_obj
                                }
                            }
                        }) + '\n')
    else:
        logger.info('No predictions were made. No files written.')
def __init__(self, auth, listener, chunk_size=1536):
    # Note: chunk_size trades off latency against processing efficiency (larger chunks are processed more efficiently but can add latency)
    self.logger = logging.getLogger('stream')
    self.stream = Stream(auth=auth, listener=listener, tweet_mode='extended', parser=tweepy.parsers.JSONParser(), chunk_size=chunk_size)
    self.stream_config = ProjectConfig()
Example #18
def handle_tweet(tweet, send_to_es=True, use_pq=True, debug=False, store_unmatched_tweets=False):
    logger = get_task_logger(__name__)
    if debug:
        logger.setLevel(logging.DEBUG)
    # reverse match to find project
    rtm = ReverseTweetMatcher(tweet=tweet)
    candidates = rtm.get_candidates()
    tweet_id = tweet['id_str']
    # open Redis connection only once
    # redis = Redis()
    # connection = redis.get_connection()
    connection = None
    if len(candidates) == 0:
        # Could not match keywords. This might occur quite frequently, e.g. when tweets are collected across different languages/keywords.
        logger.info(f'Tweet {tweet_id} could not be matched against any existing projects.')
        if store_unmatched_tweets:
            # store to separate file for later analysis
            with open(os.path.join(config.PROJECT_ROOT, 'logs', 'reverse_match_errors', f'{tweet_id}.json'), 'w') as f:
                json.dump(tweet, f)
        return
    # queue up for s3 upload and add to priority queue
    logger.info("SUCCESS: Found {} project(s) ({}) as a matching project for tweet".format(len(candidates), ', '.join(candidates)))
    redis_queue = RedisS3Queue(connection=connection)
    es_queue = ESQueue(connection=connection)
    stream_config_reader = ProjectConfig()
    for project in candidates:
        stream_config = stream_config_reader.get_config_by_slug(project)
        if stream_config['storage_mode'] == 'test_mode':
            logger.debug('Running in test mode. Not sending to S3 or ES.')
            return
        # add tracking info
        tweet['_tracking_info'] = stream_config_reader.get_tracking_info(project)
        tweet['_tracking_info']['matching_keywords'] = rtm.matching_keywords[project]
        # Queue up on Redis for subsequent upload
        redis_queue.push(json.dumps(tweet).encode(), project)
        # preprocess tweet
        pt = ProcessTweet(tweet, project_locales=stream_config['locales'])
        pt.process()
        # Possibly add tweet to trending tweets
        if stream_config['compile_trending_tweets']:
            trending_tweets = TrendingTweets(project, project_locales=stream_config['locales'], connection=connection)
            trending_tweets.process(tweet)
        # Extract trending topics
        if stream_config['compile_trending_topics']:
            trending_topics = TrendingTopics(project, project_locales=stream_config['locales'], project_keywords=stream_config['keywords'], connection=connection)
            trending_topics.process(tweet)
        if stream_config['compile_data_dump_ids'] and config.ENV == 'prd':
            data_dump_ids = DataDumpIds(project, connection=connection)
            data_dump_ids.add(tweet_id)
            if pt.has_place:
                data_dump_ids = DataDumpIds(project, mode='has_place', connection=connection)
                data_dump_ids.add(tweet_id)
            if pt.has_coordinates:
                data_dump_ids = DataDumpIds(project, mode='has_coordinates', connection=connection)
                data_dump_ids.add(tweet_id)
        if use_pq and pt.should_be_annotated():
            # add to Tweet ID queue for crowd labelling
            logger.info(f'Add tweet {tweet_id} to priority queue...')
            processed_tweet = pt.get_processed_tweet()
            tid = TweetIdQueue(stream_config['es_index_name'], priority_threshold=3, connection=connection)
            processed_tweet['text'] = pt.get_text(anonymize=True)
            tid.add_tweet(tweet_id, processed_tweet, priority=0)
        if stream_config['image_storage_mode'] != 'inactive':
            pm = ProcessMedia(tweet, project, image_storage_mode=stream_config['image_storage_mode'])
            pm.process()
        if send_to_es and stream_config['storage_mode'] in ['s3-es', 's3-es-no-retweets']:
            if rtm.is_retweet and stream_config['storage_mode'] == 's3-es-no-retweets':
                # Do not store retweets on ES
                return
            # send to ES
            processed_tweet = pt.get_processed_tweet()
            logger.debug(f'Pushing processed tweet with id {tweet_id} to ES queue')
            es_tweet_obj = {'processed_tweet': processed_tweet, 'id': tweet_id}
            if len(stream_config['model_endpoints']) > 0:
                # prepare for prediction
                es_tweet_obj['text_for_prediction'] = {'text': pt.get_text(anonymize=True), 'id': tweet_id}
            es_queue.push(json.dumps(es_tweet_obj).encode(), project)
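A sketch of how a stream listener could hand raw stream data to handle_tweet; in the real pipeline this is presumably dispatched as a background task, which is not shown in this listing.

import json

def on_stream_data(raw_data, debug=False):
    tweet = json.loads(raw_data)
    if 'id_str' in tweet:  # skip limit/delete notices from the stream
        handle_tweet(tweet, send_to_es=True, use_pq=True, debug=debug)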