Example #1
def schedule_username(username, site, group_id, total, tracker_id, test=False):
    '''
    Queue a job to fetch results for the specified username from the specified
    site.

    Keyword arguments:
    test -- if True, don't archive results; update the site with the result instead (default: False)
    '''

    kwargs = {
        'username': username,
        'site_id': site.id,
        'group_id': group_id,
        'total': total,
        'tracker_id': tracker_id,
        'test': test
    }

    job = _scrape_queue.enqueue_call(func=worker.scrape.check_username,
                                     kwargs=kwargs,
                                     timeout=_redis_worker['username_timeout'])

    description = 'Checking {} for user "{}"'.format(site.name, username)

    worker.init_job(job=job, description=description)

    return job.id
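
These snippets share module-level state that they reference but never define: RQ queues bound to a Redis connection, a _redis_worker dict of per-task timeouts, and a worker package supplying the task functions and init_job. A minimal sketch of that assumed setup (only the names come from the snippets; the connection details and timeout values are hypothetical):

import redis
import rq

import worker  # assumed package exposing worker.scrape, worker.archive,
               # worker.index, worker.sleep, and worker.init_job

# One queue per kind of work, matching the names used in the examples.
_redis_conn = redis.Redis(host='localhost', port=6379)
_scrape_queue = rq.Queue('scrape', connection=_redis_conn)
_archive_queue = rq.Queue('archive', connection=_redis_conn)
_index_queue = rq.Queue('index', connection=_redis_conn)

# Per-task timeouts in seconds; these values are illustrative.
_redis_worker = {
    'username_timeout': 600,
    'profile_timeout': 300,
    'archive_timeout': 1200,
    'solr_timeout': 300,
    'avatar_timeout': 300,
    'posts_timeout': 900,
    'relations_timeout': 900,
}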
Example #2
def schedule_profile_id(site, upstream_id, profile_id=None, stub=False):
    """ Queue a job to fetch the specified profile from a social media site. """

    job = _scrape_queue.enqueue_call(func=worker.scrape.scrape_profile_by_id,
                                     args=(site, [upstream_id], stub),
                                     timeout=_redis_worker['profile_timeout'])

    description = 'Scraping bio for "{}" on {}'.format(upstream_id, site)
    worker.init_job(job=job, description=description, profile_id=profile_id)
Example #3
def schedule_archive(username, group_id, tracker_id):
    ''' Queue a job to archive results for the given username, group, and tracker. '''

    job = _archive_queue.enqueue_call(func=worker.archive.create_archive,
                                      args=[username, group_id, tracker_id],
                                      timeout=_redis_worker['archive_timeout'])

    description = 'Archiving results for username "{}"'.format(username)

    worker.init_job(job=job, description=description)
Example #4
def schedule_sleep_indeterminate(period):
    """ Schedule an indeterminate sleep task (useful for testing). """

    description = 'Indeterminate sleep for {} seconds'.format(period)

    job = _scrape_queue.enqueue(worker.sleep.sleep_indeterminate,
                                period,
                                timeout=period + 1)

    worker.init_job(job, description)
Example #5
def schedule_sleep_exception(period):
    """ Schedule a sleep task that raises an exception (useful for testing). """

    description = 'Exception sleep for {} seconds'.format(period)

    job = _scrape_queue.enqueue(worker.sleep.sleep_exception,
                                period,
                                timeout=period + 1)

    worker.init_job(job, description)
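
Examples 4 and 5 use RQ's shorthand enqueue, which takes the function and its positional arguments directly, while the other examples use the explicit enqueue_call form. Assuming an RQ version whose enqueue still accepts a bare timeout option (newer releases renamed it job_timeout), the two are equivalent:

# Equivalent to the enqueue call in Example 5, spelled out explicitly.
job = _scrape_queue.enqueue_call(func=worker.sleep.sleep_exception,
                                 args=(period,),
                                 timeout=period + 1)

The extra second on the timeout gives the task a chance to finish sleeping and return (or raise) normally before RQ kills the job.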
Example #6
def schedule_index_posts(post_ids):
    """ Queue a job to index the specified posts. """

    job = _index_queue.enqueue_call(func=worker.index.index_posts,
                                    kwargs={'post_ids': post_ids},
                                    timeout=_redis_worker['solr_timeout'])

    description = 'Indexing {} posts'.format(len(post_ids))

    worker.init_job(job=job, description=description)
Example #7
def schedule_index_profile(profile):
    """ Queue a job to index the specified profile. """

    job = _index_queue.enqueue_call(func=worker.index.index_profile,
                                    args=[profile.id],
                                    timeout=_redis_worker['solr_timeout'])

    description = 'Indexing profile "{}" on {}' \
                  .format(profile.username, profile.site_name())

    worker.init_job(job=job, description=description)
Example #8
def schedule_delete_profile_from_index(profile_id):
    """ Queue a job to index the specified profile. """

    job = _index_queue.enqueue_call(func=worker.index.delete_profile,
                                    args=[profile_id],
                                    timeout=_redis_worker['solr_timeout'])

    description = 'Deleting profile "{}" from index'.format(profile_id)

    worker.init_job(job=job, description=description)
Example #9
def schedule_avatar(profile, avatar_url):
    """ Queue a job to fetch an avatar image for the specified profile. """

    job = _scrape_queue.enqueue_call(func=worker.scrape.scrape_avatar,
                                     args=(profile.id, profile.site,
                                           avatar_url),
                                     timeout=_redis_worker['avatar_timeout'])

    description = 'Getting avatar image for "{}" on {}' \
                  .format(profile.username, profile.site_name())

    worker.init_job(job=job, description=description, profile_id=profile.id)
Example #10
def schedule_delete_profile_from_index(profile):
    """ 
    Queue a job to delete the specified profile from the index.
    """

    job = _index_queue.enqueue_call(func=worker.index.delete_profile,
                                    args=[profile.id],
                                    timeout=_redis_worker['solr_timeout'])

    description = 'Deleting profile "{}" on {} from index' \
                  .format(profile.username, profile.site_name())

    worker.init_job(job=job, description=description)
Example #11
def schedule_site_test(site, tracker_id):
    '''
    Queue a job to test a site.

    Arguments:
    site -- the site to test.
    tracker_id -- the unique tracker ID for the job.
    '''

    job = _scrape_queue.enqueue_call(func=worker.scrape.test_site,
                                     args=[site.id, tracker_id],
                                     timeout=30)

    description = 'Testing site "{}"'.format(site.name)

    worker.init_job(job=job, description=description)

    return job.id
Example #12
def schedule_posts(profile, recent=True):
    """ Queue a job to get posts for the specified profile. """

    scrapers = {
        'instagram': worker.scrape.scrape_instagram_posts,
        'twitter': worker.scrape.scrape_twitter_posts,
    }

    description = 'Getting posts for "{}" on {}' \
                  .format(profile.username, profile.site_name())
    type_ = 'posts'

    job = _scrape_queue.enqueue_call(func=scrapers[profile.site],
                                     args=(profile.id, recent),
                                     timeout=_redis_worker['posts_timeout'])
    worker.init_job(job=job,
                    description=description,
                    profile_id=profile.id,
                    type_=type_)
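
A hypothetical call, assuming profile.site is 'instagram' or 'twitter'; any other value raises KeyError from the scrapers lookup before anything is enqueued:

schedule_posts(profile, recent=False)  # recent=False presumably fetches the full post history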
Example #13
def schedule_relations(profile):
    """ Queue a job to get relations for the specified profile. """

    scrapers = {
        'instagram': worker.scrape.scrape_instagram_relations,
        'twitter': worker.scrape.scrape_twitter_relations,
    }

    description = 'Getting friends & followers for "{}" on {}' \
                  .format(profile.username, profile.site_name())
    type_ = 'relations'

    job = _scrape_queue.enqueue_call(
        func=scrapers[profile.site],
        args=[profile.id],
        timeout=_redis_worker['relations_timeout'])
    worker.init_job(job=job,
                    description=description,
                    profile_id=profile.id,
                    type_=type_)
Example #14
def schedule_profiles(profiles, stub=False):
    """
    Queue jobs to fetch a list of profiles from a social media site.

    Profile scraping jobs are chunked according to the maximum API request
    size per site:

    Twitter:
        Supports 100 users per lookup:
        https://dev.twitter.com/rest/reference/get/users/lookup

    Instagram:
        Supports 1 user per lookup:
        https://instagram.com/developer/endpoints/users/#get_users_search

    Parameters:

        'profiles' (list) - A list of profile dictionaries

            Each dictionary specifies a profile username or upstream ID and
            the social media site name ("twitter", "instagram").

            Example:

                profiles = [
                    {
                        'username': '******',
                        'site': 'twitter',
                    },
                    {
                        'upstream_id': '343432',
                        'site': 'instagram',
                    },
                    ...
                ]

        'stub' (bool) - whether or not to import the profile as a stub
    """

    # Aggregate profiles by site and API request type (username or ID)
    site_profiles = {}
    for profile in profiles:
        if profile['site'] not in site_profiles:
            site_profiles[profile['site']] = {
                'username': [],
                'upstream_id': []
            }
        if 'upstream_id' in profile:
            site_profiles[profile['site']]['upstream_id'].append(profile)
        else:
            site_profiles[profile['site']]['username'].append(profile)

    # Spawn scraping jobs
    for site, type_profiles in site_profiles.items():
        if site == 'twitter':
            chunk_size = 100
        else:
            chunk_size = 1
        # Split jobs by API request type: username or upstream ID
        for type_, t_profiles in type_profiles.items():
            # Chunk by API request size
            for i in range(0, len(t_profiles), chunk_size):
                chunk = t_profiles[i:i + chunk_size]
                if type_ == 'upstream_id':
                    ids = [p['upstream_id'] for p in chunk]
                    labels = _create_labels_dict(profiles=chunk,
                                                 type_='upstream_id')
                    job = _scrape_queue.enqueue_call(
                        func=worker.scrape.scrape_profile_by_id,
                        args=(site, ids, stub, labels),
                        timeout=_redis_worker['profile_timeout'])
                else:
                    usernames = [p['username'] for p in chunk]
                    labels = _create_labels_dict(profiles=chunk,
                                                 type_='username')
                    job = _scrape_queue.enqueue_call(
                        func=worker.scrape.scrape_profile,
                        args=(site, usernames, stub, labels),
                        timeout=_redis_worker['profile_timeout'])

                description = ('Scraping bios for {} {} profiles'.format(
                    len(chunk), site))

                worker.init_job(job=job, description=description)
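
A hypothetical call illustrating the chunking described in the docstring: the two Twitter username profiles are batched into a single job (up to 100 per lookup), while the Instagram profile is scheduled on its own (1 per lookup). The usernames are made up, and the helper _create_labels_dict is assumed to be defined elsewhere in the module:

schedule_profiles(
    profiles=[
        {'username': 'alice', 'site': 'twitter'},
        {'username': 'bob', 'site': 'twitter'},
        {'upstream_id': '343432', 'site': 'instagram'},
    ],
    stub=True,
)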