Example #1
def _prepare_dataset_view(dataset_params,
                          clear_cache=False,
                          full_dataset=False):
    context = _dataset_params_to_context(dataset_params)
    tweet_limit = 0
    try:
        tweet_limit = int(dataset_params.get('tweet_limit', '0') or '0')
    except ValueError:
        pass

    search_context = _search_to_search_context(
        dataset_params_to_search(dataset_params),
        dataset_params,
        tweet_limit=tweet_limit,
        clear_cache=clear_cache)
    context.update(search_context)
    context['max_rows'] = app.config['MAX_TOP_ROWS_DS_STATS']
    context['sample_tweet_html'] = []
    oembed_error = False
    for tweet_id in context['sample_tweet_ids']:
        if not oembed_error:
            try:
                tweet_html = _oembed(tweet_id, clear_cache=clear_cache)
                if tweet_html:
                    context['sample_tweet_html'].append(tweet_html)
            except OembedException:
                # Skip further oEmbed attempts
                oembed_error = True
    source_dataset = DatasetDocument.get(dataset_params['source_dataset'])
    context['source_dataset'] = source_dataset
    dataset_created_at_min = None
    dataset_created_at_max = None
    if source_dataset.first_tweet_created_at:
        dataset_created_at_min = source_dataset.first_tweet_created_at
    if source_dataset.last_tweet_created_at:
        dataset_created_at_max = source_dataset.last_tweet_created_at
    context['dataset_created_at_min'] = dataset_created_at_min
    context['dataset_created_at_max'] = dataset_created_at_max

    # Previous datasets
    context['prev_datasets'] = json.loads(
        request.cookies.get('prev_datasets', '[]'))

    # Mode
    context['is_local_mode'] = _is_local_mode(request)
    # Whether a full extract
    context['full_dataset'] = full_dataset
    return context
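
A minimal sketch of how a Flask view might wire _prepare_dataset_view into a response; the route, the _load_dataset_params helper, and the template name are hypothetical and not taken from the example above.

from flask import render_template, request

@app.route('/dataset/<dataset_id>')
def dataset_view(dataset_id):
    # Hypothetical helper that builds dataset_params for the given dataset ID.
    dataset_params = _load_dataset_params(dataset_id)
    clear_cache = request.args.get('clear_cache', 'false') == 'true'
    context = _prepare_dataset_view(dataset_params, clear_cache=clear_cache)
    # Hypothetical template name; the real view may render something else.
    return render_template('dataset.html', **context)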
Example #2
def fetch_by_screen_name(screen_name, source_datasets):
    search = dataset_params_to_search(
        {
            'source_datasets': source_datasets,
            'tweet_type_original': 'true',
            'tweet_type_reply': 'true',
            'tweet_type_retweet': 'true',
            'tweet_type_quote': 'true',
            'poster_any': screen_name.lstrip('@')
        },
        skip_aggs=True)
    search.source(['tweet'])

    for hit in search.scan():
        yield json.loads(hit.tweet)
Example #3
def fetch_by_source_screen_name(screen_name, source_datasets):
    search = dataset_params_to_search(
        {
            'source_datasets': source_datasets,
            'tweet_type_retweet': 'true',
            'tweet_type_quote': 'true',
            'source_poster_any': screen_name.lstrip('@')
        },
        skip_aggs=True)
    search.source(['tweet'])

    for hit in search.scan():
        tweet = json.loads(hit.tweet)
        if 'retweeted_status' in tweet:
            yield tweet['retweeted_status']
        elif 'quoted_status' in tweet:
            yield tweet['quoted_status']
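
A short usage sketch for the two fetch helpers above, assuming the datasets are already indexed in Elasticsearch; the dataset IDs and screen name are placeholders, and the fields accessed are standard Twitter API v1.1 tweet JSON fields.

# Sketch only: stream an account's own tweets, then the tweets it was
# retweeted or quoted in, from two placeholder datasets.
source_datasets = ['dataset-1', 'dataset-2']

for tweet in fetch_by_screen_name('@example_user', source_datasets):
    print(tweet['id_str'], tweet.get('full_text') or tweet.get('text'))

for original in fetch_by_source_screen_name('@example_user', source_datasets):
    # Each yielded dict is the retweeted or quoted tweet itself.
    print(original['id_str'])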
Example #4
def get_tweet_count(dataset_params):
    '''Retrieve the total count of tweets in the dataset from the ES index, using the dataset ID provided as a field in dataset_params.'''
    search = dataset_params_to_search(dataset_params, skip_aggs=True)
    search_response = search.execute()
    return search_response.hits.total.value
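
A minimal calling sketch for get_tweet_count; the exact keys expected in dataset_params are an assumption modeled on the other examples (a list of source dataset IDs plus the tweet-type flags), and the dataset ID is a placeholder.

# Sketch only: count every tweet type in one placeholder dataset.
dataset_params = {
    'source_datasets': ['dataset-1'],
    'tweet_type_original': 'true',
    'tweet_type_reply': 'true',
    'tweet_type_retweet': 'true',
    'tweet_type_quote': 'true',
}
print('{:,d} tweets'.format(get_tweet_count(dataset_params)))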
Example #5
def generate_tasks(self,
                   task_defs,
                   dataset_params,
                   total_tweets,
                   dataset_path,
                   generate_update_increment=None,
                   zip_bytes_threshold=1000000000):
    generate_update_increment = generate_update_increment or 10000
    tasks = []
    task_args = [self, total_tweets, dataset_path, generate_update_increment]
    for task_name, task_kwargs in task_defs.items():
        if task_name in task_class_map:
            tasks.append(task_class_map[task_name](*task_args, **task_kwargs))
    search = dataset_params_to_search(dataset_params)
    source = set()

    for task in tasks:
        # Delete existing files
        if task.file_filter:
            # Unzipped files
            for filename in fnmatch.filter(os.listdir(dataset_path),
                                           task.file_filter):
                os.remove(os.path.join(dataset_path, filename))
            # Zipped files
            for filename in fnmatch.filter(os.listdir(dataset_path),
                                           '{}.zip'.format(task.file_filter)):
                os.remove(os.path.join(dataset_path, filename))

        task.on_start()
        source.update(task.source)

    if source:
        search.source(list(source))

    tweet_count = 0
    for tweet_count, hit in enumerate(search.scan()):
        # This is to support limiting the number of tweets
        if tweet_count + 1 > total_tweets:
            break

        for task in tasks:
            task.on_hit(hit, tweet_count)

        if (tweet_count + 1) % generate_update_increment == 0:
            self.update_state(state='PROGRESS',
                              meta={
                                  'current': tweet_count + 1,
                                  'total': total_tweets,
                                  'status': '{:,d} of {:,d} tweet ids'.format(
                                      tweet_count + 1, total_tweets)
                              })

    for task in tasks:
        task.on_end()

        # Zip files
        z = None
        zip_filepath = None
        file_count = 1
        for filename in sorted(
                fnmatch.filter(os.listdir(dataset_path), task.file_filter)):
            if z is None or os.path.getsize(
                    zip_filepath) > zip_bytes_threshold:
                if z:
                    z.close()
                zip_filepath = os.path.join(
                    dataset_path, '{}.zip'.format(
                        task.file_filter.replace('*',
                                                 str(file_count).zfill(3))))
                z = zipfile.ZipFile(zip_filepath,
                                    'w',
                                    compression=zipfile.ZIP_DEFLATED)
                file_count += 1
            filepath = os.path.join(dataset_path, filename)
            z.write(filepath, arcname=filename)
            os.remove(os.path.join(dataset_path, filename))
        if z:
            z.close()

    generate_task_filepath = os.path.join(dataset_path, 'generate_tasks.json')
    if os.path.exists(generate_task_filepath):
        os.remove(generate_task_filepath)
    # Notify user if email provided
    if task_defs.get('requester_email'):
        send_email(email_address=task_defs['requester_email'],
                   dataset_name=dataset_params['dataset_name'],
                   url_for_extract=task_defs['dataset_url'])

    return {
        'current': tweet_count + 1,
        'total': total_tweets,
        'status': 'Completed.'
    }
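
A hedged sketch of kicking off an extract with generate_tasks; the self.update_state call above suggests a bound Celery task, but the task names in task_defs, the .delay registration, and the dataset path are all assumptions rather than documented usage. The 'requester_email' and 'dataset_url' keys, by contrast, are read directly by generate_tasks to send the completion email.

# Sketch only: the task names below are placeholders and must match keys in
# task_class_map; dataset_params and total_tweets come from the caller.
task_defs = {
    'tweet_ids': {},
    'tweet_json': {},
    'requester_email': 'user@example.com',
    'dataset_url': 'https://example.com/datasets/abc123',
}
result = generate_tasks.delay(task_defs,
                              dataset_params,
                              total_tweets=10000,
                              dataset_path='/tweetset_data/abc123')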