Example #1
def scrape_instagram_account_by_id(upstream_id, stub=False):
    """ Scrape instagram bio data for upstream ID and update a profile. """

    db_session = worker.get_session()
    proxies = _get_proxies(db_session)

    # Instagram API request.
    api_url = 'https://api.instagram.com/v1/users/{}'.format(upstream_id)

    response = requests.get(
        api_url,
        proxies=proxies,
        verify=False
    )

    response.raise_for_status()
    data = response.json()['data']

    # Create the profile.
    profile = Profile('instagram', upstream_id, data['username'])
    db_session.add(profile)

    try:
        db_session.commit()
    except IntegrityError:
        # Already exists: use the existing profile.
        db_session.rollback()
        profile = db_session.query(Profile) \
                            .filter(Profile.site == 'instagram') \
                            .filter(Profile.upstream_id == upstream_id) \
                            .one()

    # Update profile
    profile.last_update = datetime.now()
    profile.description = data['bio']
    profile.follower_count = int(data['counts']['followed_by'])
    profile.friend_count = int(data['counts']['follows'])
    profile.homepage = data['website']
    profile.name = data['full_name']
    profile.post_count = int(data['counts']['media'])
    profile.is_stub = stub
    db_session.commit()

    # Schedule followup jobs.
    # Index all profiles, including stubs.
    app.queue.schedule_index_profile(profile)
    if not stub:
        app.queue.schedule_avatar(profile, data['profile_picture'])
        app.queue.schedule_posts(profile, recent=True)
        app.queue.schedule_relations(profile)

    return profile.as_dict()
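
All four examples rely on the same optimistic upsert idiom: insert the new row first and, if a unique-constraint violation fires, roll back and load the existing row. Below is a distilled sketch of that pattern, assuming the same SQLAlchemy Profile model and session used above; the helper name get_or_create_profile is illustrative, not part of the project.

from sqlalchemy.exc import IntegrityError

def get_or_create_profile(db_session, site, upstream_id, username):
    """ Insert a profile; on a unique-constraint clash, load the existing row. """
    profile = Profile(site, upstream_id, username)
    db_session.add(profile)
    try:
        db_session.commit()
    except IntegrityError:
        # Another worker (or an earlier run) already created this profile.
        db_session.rollback()
        profile = db_session.query(Profile) \
                            .filter(Profile.site == site) \
                            .filter(Profile.upstream_id == upstream_id) \
                            .one()
    return profile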
Example #2
def scrape_twitter_account_by_id(upstream_ids, stub=False, labels=None):
    """
    Scrape twitter bio data for upstream IDs and create (or update) profiles.
    Accepts twitter IDs rather than usernames.

    Keyword arguments:
    stub -- add the profile in stub mode (default False)
    labels -- dictionary of labels keyed by upstream ID (default None)
    """
    if labels is None:
        # Avoid a mutable default argument.
        labels = {}

    if len(upstream_ids) > 100:
        raise ScrapeException('Twitter API max is 100 user IDs per request.')

    db_session = worker.get_session()
    profiles = []

    # Request from Twitter API.
    api_url = 'https://api.twitter.com/1.1/users/lookup.json'
    payload = {'user_id': ','.join(upstream_ids)}
    response = requests.post(
        api_url,
        data=payload,
        proxies=_get_proxies(db_session),
        verify=False,
        headers=TWITTER_HEADERS
    )
    response.raise_for_status()

    # Update the profile.
    for profile_json in response.json():
        profile = Profile(
            'twitter',
            profile_json['id_str'],
            profile_json['screen_name']
        )
        profile.is_stub = stub
        profile.private = profile_json['protected']
        db_session.add(profile)

        try:
            db_session.commit()
        except IntegrityError:
            # Already exists: use the existing profile.
            db_session.rollback()
            profile = db_session.query(Profile) \
                                .filter(Profile.site == 'twitter') \
                                .filter(Profile.upstream_id ==
                                        profile_json['id_str']) \
                                .one()
            # Profiles already in the system are either not stubs or are
            # being updated to full profiles.
            profile.is_stub = False

        _twitter_populate_profile(profile_json, profile)

        if profile.upstream_id in labels:
            _label_profile(db_session, profile, labels[profile.upstream_id])

        profile.last_update = datetime.now()
        db_session.commit()
        profiles.append(profile.as_dict())

        # Schedule followup jobs.
        app.queue.schedule_index_profile(profile)
        if not stub:
            app.queue.schedule_avatar(
                profile, profile_json['profile_image_url_https']
            )
            # Only get tweets and relations for unprotected profiles
            if not profile.private:
                app.queue.schedule_posts(profile, recent=True)
                app.queue.schedule_relations(profile)

    return profiles
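
The lookup endpoint caps each request at 100 user IDs, so callers with larger lists must batch the IDs themselves. A minimal sketch of that batching, assuming user_ids is a list of Twitter ID strings; the chunked helper is illustrative:

def chunked(items, size=100):
    """ Yield successive slices of at most `size` items. """
    for start in range(0, len(items), size):
        yield items[start:start + size]

profiles = []
for batch in chunked(user_ids, 100):
    profiles.extend(scrape_twitter_account_by_id(batch, stub=True))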
Example #3
def scrape_twitter_account(usernames, stub=False, labels=None):
    """
    Scrape twitter bio data for a list of usernames and create (or update)
    their profiles.

    Keyword arguments:
    stub -- add the profile in stub mode (default False)
    labels -- dictionary of labels keyed by lowercased username (default None)
    """
    if labels is None:
        # Avoid testing membership against None below.
        labels = {}

    if len(usernames) > 100:
        raise ScrapeException('Twitter API max is 100 usernames per request.')

    profiles = []
    # Request from Twitter API.
    db_session = worker.get_session()

    api_url = 'https://api.twitter.com/1.1/users/lookup.json'
    payload = {'screen_name': ','.join(usernames)}
    response = requests.post(
        api_url,
        data=payload,
        proxies=_get_proxies(db_session),
        verify=False,
        headers=TWITTER_HEADERS
    )
    response.raise_for_status()

    # Get Twitter ID and upsert the profile.
    for profile_json in response.json():
        user_id = profile_json['id_str']
        profile = Profile('twitter', user_id, profile_json['screen_name'])
        profile.is_stub = stub
        profile.private = profile_json['protected']
        db_session.add(profile)

        try:
            db_session.commit()
        except IntegrityError:
            # Already exists: use the existing profile.
            db_session.rollback()
            profile = db_session.query(Profile) \
                                .filter(Profile.site == 'twitter') \
                                .filter(Profile.upstream_id == user_id) \
                                .one()
            # Profiles already in the system are either not stubs or are
            # being updated to full profiles.
            profile.is_stub = False

        _twitter_populate_profile(profile_json, profile)

        if profile.username.lower() in labels:
            _label_profile(db_session, profile,
                           labels[profile.username.lower()])

        profile.last_update = datetime.now()
        db_session.commit()
        profiles.append(profile.as_dict())

        # Schedule followup jobs.
        app.queue.schedule_index_profile(profile)

        if not stub:
            app.queue.schedule_avatar(
                profile, profile_json['profile_image_url_https']
            )

            # Only get tweets and relations for unprotected profiles
            if not profile.private:
                app.queue.schedule_posts(profile, recent=True)
                app.queue.schedule_relations(profile)

    return profiles
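
The membership test above uses profile.username.lower(), so the labels dictionary must be keyed by lowercased username. An illustrative call follows; the usernames are made up, and the exact shape of each label value depends on _label_profile, which is not shown here.

labels = {
    'somereporter': ['journalist'],
    'some_agency': ['government'],
}
profiles = scrape_twitter_account(
    ['SomeReporter', 'Some_Agency'],
    labels=labels
)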
Example #4
def scrape_instagram_account(username, stub=False):
    """ Scrape instagram bio data and create (or update) a profile. """
    # Getting a user ID is more difficult than it ought to be: you need to
    # search for the username and iterate through the search results to find
    # an exact match.
    db_session = worker.get_session()
    proxies = _get_proxies(db_session)

    api_url = 'https://api.instagram.com/v1/users/search'
    params = {'q': username}

    response = requests.get(
        api_url,
        params=params,
        proxies=proxies,
        verify=False
    )

    response.raise_for_status()
    search_results = response.json()
    username_lower = username.lower()
    user_id = None

    for user_result in search_results['data']:
        if user_result['username'].lower() == username_lower:
            user_id = user_result['id']
            break

    if user_id is None:
        raise ScrapeException('Can\'t find Instagram user named {}.'
                              .format(username))

    # Now make another request to get this user's profile data.
    api_url = 'https://api.instagram.com/v1/users/{}'.format(user_id)

    response = requests.get(
        api_url,
        proxies=proxies,
        verify=False
    )

    response.raise_for_status()
    data = response.json()['data']
    profile = Profile('instagram', user_id, data['username'])
    db_session.add(profile)

    try:
        db_session.commit()
    except IntegrityError:
        # Already exists: use the existing profile.
        db_session.rollback()
        profile = db_session.query(Profile) \
                            .filter(Profile.site == 'instagram') \
                            .filter(Profile.upstream_id == user_id) \
                            .one()

    profile.last_update = datetime.now()
    profile.description = data['bio']
    profile.follower_count = int(data['counts']['followed_by'])
    profile.friend_count = int(data['counts']['follows'])
    profile.homepage = data['website']
    profile.name = data['full_name']
    profile.post_count = int(data['counts']['media'])
    profile.is_stub = stub
    db_session.commit()

    # Schedule followup jobs.
    # Index all profiles, including stubs.
    app.queue.schedule_index_profile(profile)
    if not stub:
        app.queue.schedule_avatar(profile, data['profile_picture'])
        app.queue.schedule_posts(profile, recent=True)
        app.queue.schedule_relations(profile)

    return profile.as_dict()
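
The username search can legitimately come up empty, so callers should expect the ScrapeException raised above rather than treating every failure as a transport error. A minimal handling sketch; the logger setup is illustrative:

import logging

log = logging.getLogger(__name__)

try:
    profile = scrape_instagram_account('some_username')
except ScrapeException as error:
    # No exact username match in the Instagram search results.
    log.warning('Instagram scrape failed: %s', error)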