def trending_topics_velocity(debug=False):
    logger = get_logger(debug)
    # Update trending topics for all projects that have them enabled
    project_config = ProjectConfig()
    for project_config in project_config.read():
        if project_config['compile_trending_topics']:
            tt = TrendingTopics(project_config['slug'])
            tt.update()
def cleanup(debug=False):
    logger = get_logger(debug)
    # Cleanup (remove old trending tweets from redis)
    project_config = ProjectConfig()
    projects = []
    for project_config in project_config.read():
        projects.append(project_config['slug'])
        if project_config['compile_trending_tweets']:
            tt = TrendingTweets(project_config['slug'])
            tt.cleanup()
    # cleanup tweet store
    ts = TweetStore()
    ts.cleanup(projects)
def public_data_dump_ids(debug=False):
    logger = get_logger(debug)
    if config.ENV != 'prd':
        logger.info('Data dumps are only collected in production environments.')
        return
    project_config = ProjectConfig()
    for project_config in project_config.read():
        if project_config['compile_data_dump_ids']:
            data_dump_ids = DataDumpIds(project_config['slug'])
            data_dump_ids.sync()
            for mode in ['has_place', 'has_coordinates']:
                data_dump_ids = DataDumpIds(project_config['slug'], mode=mode)
                data_dump_ids.sync()
Example #4
def stream_activity():
    es_activity_threshold_min = int(request.args.get('es_activity_threshold_min', 10))
    redis_counts_threshold_hours = int(request.args.get('redis_counts_threshold_hours', 2))
    # elasticsearch counts
    es_count = es.count_recent_documents(since='now-{}m'.format(es_activity_threshold_min))
    # redis counts
    e = datetime.now()
    s = e - timedelta(hours=redis_counts_threshold_hours)
    redis_s3_queue = RedisS3Queue()
    stream_config_reader = ProjectConfig()
    dates = list(redis_s3_queue.daterange(s, e, hourly=True))
    redis_count = 0
    for stream in stream_config_reader.read():
        for d in dates:
            d, h = d.split(':')
            redis_count += redis_s3_queue.get_counts(stream['slug'], d, h)
    return jsonify({'redis_count': redis_count, 'es_count': es_count})
Example #5
 def _get_projects_stats(self, num_days=7, hourly=False):
     project_config = ProjectConfig()
     redis_s3_queue = RedisS3Queue()
     end_day = datetime.utcnow()
     start_day = end_day - timedelta(days=num_days)
     stats = ''
     dates = list(
         redis_s3_queue.daterange(start_day, end_day, hourly=hourly))
     now_utc = pytz.utc.localize(end_day)
     timezone_hour_delta = get_tz_difference()
     total = defaultdict(lambda: 0)
     for stream in project_config.read():
         total_by_project = defaultdict(lambda: 0)
         project = stream['es_index_name']
         project_slug = stream['slug']
         stats += "<h3>{}</h3>".format(project)
         count_types = ['tweets']
         if stream['image_storage_mode'] != 'inactive':
             count_types += ['photo', 'animated_gif']
         for count_type in count_types:
             stats += '<h4>{}</h4>'.format(count_type)
             for d in dates:
                 if hourly:
                     d, h = d.split(':')
                     count = redis_s3_queue.get_counts(
                         project_slug, d, h, media_type=count_type)
                     corrected_hour = (datetime.strptime(h, '%H') -
                                       timezone_hour_delta).strftime('%H')
                     stats += '{0} ({1}:00 - {1}:59): {2:,}<br>'.format(
                         d, corrected_hour, count)
                 else:
                     count = redis_s3_queue.get_counts(
                         project_slug, d, media_type=count_type)
                     stats += '{}: {:,}<br>'.format(d, count)
                 total[count_type] += count
                 total_by_project[count_type] += count
             stats += 'Total: {:,}<br><br>'.format(
                 total_by_project[count_type])
     total_stats = 'Total {}:'.format('today' if hourly else 'this week')
     total_stats += '<ul>'
     for count_type, count in total.items():
         total_stats += "<li>{}: {:,}</li>".format(count_type, count)
     total_stats += '</ul>'
     return stats, total_stats
Example #6
def manage_config():
    logger = logging.getLogger('pipeline')
    pc = ProjectConfig()
    if request.method == 'GET':
        # read streaming config
        config = pc.read()
        return jsonify(config), 200
    else:
        # write streaming config
        config = request.get_json()
        # make sure new configuration is valid
        is_valid, msg = pc.is_valid(config)
        if not is_valid:
            return error_response(400, msg)
        # write everything to config
        pc.write(config)
        # Create new Elasticsearch indices if needed
        es.update_es_indices(pc.get_es_index_names(config))
        return success_response(200, 'Successfully updated configuration files. Make sure to restart stream for changes to be active.')
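# Hedged sketch (not part of the original code): roughly what a single project entry
# in the streaming config handled by manage_config()/ProjectConfig might look like,
# inferred from the keys accessed elsewhere in these examples ('slug', 'keywords',
# 'lang', 'es_index_name', 'model_endpoints', 'image_storage_mode', 'compile_*' flags).
# Exact field names, allowed values and validation rules are assumptions.
example_project_config = [{
    'slug': 'project-slug',                # short identifier used as Redis/queue key
    'es_index_name': 'project_tweets',     # Elasticsearch index documents are written to
    'keywords': ['keyword', 'two words'],  # keywords matched by ReverseTweetMatcher
    'lang': ['en'],                        # empty list means "match any language"
    'image_storage_mode': 'inactive',
    'compile_trending_tweets': True,
    'compile_trending_topics': True,
    'compile_data_dump_ids': False,
    'model_endpoints': {}                  # question_tag -> {'active': {...}, 'primary': ...}
}]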
class ReverseTweetMatcher():
    """Tries to reverse match a tweet object given a set of keyword lists and languages."""

    def __init__(self, tweet=None):
        self.is_retweet = self._is_retweet(tweet)
        self.tweet = self._get_tweet(tweet)
        self.logger = logging.getLogger(__name__)
        self.stream_config_reader = ProjectConfig()
        self.relevant_text = ''
        self.matching_keywords = {}

    def get_candidates(self):
        relevant_text = self.fetch_all_relevant_text()
        config = self.stream_config_reader.read()
        if len(config) == 0:
            return []
        elif len(config) == 1:
            # only one possibility
            self._find_matching_keywords_for_project(relevant_text, config[0])
            return [config[0]['slug']]
        else:
            # try to match to configs
            return self._match_to_config(relevant_text, config)

    def fetch_all_relevant_text(self):
        """Here we pool all relevant text within the tweet to do the matching. From the twitter docs:
        "Specifically, the text attribute of the Tweet, expanded_url and display_url for links and media, text for hashtags, and screen_name for user mentions are checked for matches."
        https://developer.twitter.com/en/docs/tweets/filter-realtime/guides/basic-stream-parameters.html
        """
        text = ''
        if 'extended_tweet' in self.tweet:
            text += self.tweet['extended_tweet']['full_text']
            text += self._fetch_user_mentions(self.tweet['extended_tweet'])
            text += self._fetch_urls(self.tweet['extended_tweet'])
        else:
            text += self.tweet['text']
            text += self._fetch_user_mentions(self.tweet)
            text += self._fetch_urls(self.tweet)

        # pool together with text from quoted tweet
        if 'quoted_status' in self.tweet:
            if 'extended_tweet' in self.tweet['quoted_status']:
                text += self.tweet['quoted_status']['extended_tweet']['full_text']
                text += self._fetch_user_mentions(self.tweet['quoted_status']['extended_tweet'])
                text += self._fetch_urls(self.tweet['quoted_status']['extended_tweet'])
            else:
                text += self.tweet['quoted_status']['text']
                text += self._fetch_user_mentions(self.tweet['quoted_status'])
                text += self._fetch_urls(self.tweet['quoted_status'])

        # store as member for debugging use
        self.relevant_text = text
        return text

    # private methods

    def _find_matching_keywords_for_project(self, relevant_text, config):
        """Find matching_keywords"""
        relevant_text = relevant_text.lower()
        keywords = [k.lower().split() for k in config['keywords']]
        matching_keywords = defaultdict(list)
        for keyword_list in keywords:
            if len(keyword_list) == 1:
                if keyword_list[0] in relevant_text:
                    matching_keywords[config['slug']].append(keyword_list[0])
            else:
                # keywords with more than one word: Check if all words are contained in text
                # escape keywords so regex metacharacters in them are matched literally
                match_result = re.findall('|'.join(re.escape(k) for k in keyword_list), relevant_text)
                if set(match_result) == set(keyword_list):
                    matching_keywords[config['slug']].extend(keyword_list)
        self.matching_keywords = dict(matching_keywords)

    def _match_to_config(self, relevant_text, config):
        """Match text to config in stream"""
        relevant_text = relevant_text.lower()
        match_candidates = set()
        matching_keywords_by_project = defaultdict(list)
        for c in config:
            # else find match for keywords to relevant text
            keywords = [k.lower().split() for k in c['keywords']]
            for keyword_list in keywords:
                if len(keyword_list) == 1:
                    if keyword_list[0] in relevant_text:
                        match_candidates.add(c['slug'])
                        matching_keywords_by_project[c['slug']].append(keyword_list[0])
                else:
                    # keywords with more than one word: Check if all words are contained in text
                    match_result = re.findall('|'.join(re.escape(k) for k in keyword_list), relevant_text)
                    if set(match_result) == set(keyword_list):
                        match_candidates.add(c['slug'])
                        matching_keywords_by_project[c['slug']].extend(keyword_list)
        # filter by language setting
        config_dict = {c['slug']: c for c in config}
        lang_tweet = self.tweet['lang']
        candidates = set()
        for c in match_candidates:
            languages = config_dict[c]['lang']
            # add as match if language matches, no language was specified or language could not be detected by Twitter ('und')
            if lang_tweet in languages or len(languages) == 0 or lang_tweet == 'und':
                self.matching_keywords[c] = matching_keywords_by_project[c]
                candidates.add(c)
        return list(candidates)

    def _fetch_urls(self, obj):
        t = []
        if 'urls' in obj['entities']:
            for u in obj['entities']['urls']:
                t.append(u['expanded_url'])

        if 'extended_entities' in obj:
            if 'media' in obj['extended_entities']:
                for m in obj['extended_entities']['media']:
                    t.append(m['expanded_url'])
        return ''.join(t)

    def _fetch_user_mentions(self, obj):
        t = []
        if 'user_mentions' in obj['entities']:
            for user_mention in obj['entities']['user_mentions']:
                t.append(user_mention['screen_name'])
        return ''.join(t)

    def _get_tweet(self, tweet):
        if self.is_retweet:
            return tweet['retweeted_status']
        else:
            return tweet


    def _is_retweet(self, tweet):
        return 'retweeted_status' in tweet
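
# Minimal usage sketch (an assumption, not part of the original code): how
# ReverseTweetMatcher might be driven with a simplified tweet payload. Only the
# fields the class actually reads ('text', 'lang', 'entities', and optionally
# 'extended_tweet'/'quoted_status'/'retweeted_status') are included; ProjectConfig().read()
# is expected to return entries shaped like example_project_config above.
if __name__ == '__main__':
    sample_tweet = {
        'text': 'Just a keyword mention for testing',
        'lang': 'en',
        'entities': {'user_mentions': [], 'urls': []}
    }
    rtm = ReverseTweetMatcher(tweet=sample_tweet)
    slugs = rtm.get_candidates()           # candidate project slugs for this tweet
    print(slugs, rtm.matching_keywords)    # e.g. ['project-slug'] {'project-slug': ['keyword']}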
def es_predict(debug=True):
    logger = get_logger(debug)
    project_config = ProjectConfig()
    predictions = {}
    for project_config in project_config.read():
        if len(project_config['model_endpoints']) > 0:
            project = project_config['slug']
            predict_queue = PredictQueue(project)
            predict_objs = predict_queue.pop_all()
            if len(predict_objs) == 0:
                logger.info(f'Nothing to predict for project {project}')
                continue
            texts = [t['text'] for t in predict_objs]
            ids = [t['id'] for t in predict_objs]
            es_index_name = project_config['es_index_name']
            for question_tag, endpoints_obj in project_config[
                    'model_endpoints'].items():
                for endpoint_name, endpoint_info in endpoints_obj[
                        'active'].items():
                    model_type = endpoint_info['model_type']
                    run_name = endpoint_info['run_name']
                    predictor = Predict(endpoint_name, model_type)
                    preds = predictor.predict(texts)
                    for _id, _pred in zip(ids, preds):
                        if es_index_name not in predictions:
                            predictions[es_index_name] = {}
                        if _id not in predictions[es_index_name]:
                            predictions[es_index_name][_id] = {}
                        if question_tag not in predictions[es_index_name][_id]:
                            predictions[es_index_name][_id][question_tag] = {
                                'endpoints': {}
                            }
                        predictions[es_index_name][_id][question_tag][
                            'endpoints'][run_name] = {
                                'label': _pred['labels'][0],
                                'probability': _pred['probabilities'][0]
                            }
                        # if present, add label vals (numeric values of labels)
                        if 'label_vals' in _pred:
                            predictions[es_index_name][_id][question_tag][
                                'endpoints'][run_name]['label_val'] = _pred[
                                    'label_vals'][0]
                        if endpoints_obj['primary'] == endpoint_name:
                            # current endpoint is primary endpoint
                            predictions[es_index_name][_id][question_tag][
                                'primary_endpoint'] = run_name
                            predictions[es_index_name][_id][question_tag][
                                'primary_label'] = _pred['labels'][0]
                            if 'label_vals' in _pred:
                                predictions[es_index_name][_id][question_tag][
                                    'primary_label_val'] = _pred['label_vals'][
                                        0]
    if len(predictions) > 0:
        actions = []
        for es_index_name, pred_es_index in predictions.items():
            for _id, pred_obj in pred_es_index.items():
                actions.append({
                    '_id': _id,
                    '_type': 'tweet',
                    '_op_type': 'update',
                    '_index': es_index_name,
                    '_source': {
                        'doc': {
                            'meta': pred_obj
                        }
                    }
                })
        success = es.bulk_actions_in_batches(actions)
        if not success:
            # dump data to disk
            es_queue = ESQueue()
            es_queue.dump_to_disk(actions, 'es_bulk_update_errors')