示例#1
0
def scrape_instagram_account_by_id(upstream_id, stub=False):
    """
    Scrape Instagram bio data for an upstream ID and create/update a profile.

    Requests ``/v1/users/<upstream_id>`` from the Instagram API through the
    configured proxies, inserts a ``Profile`` for it (falling back to the
    existing row when the insert raises ``IntegrityError``), copies the bio
    fields onto the profile and schedules follow-up jobs.

    Arguments:
        upstream_id: Instagram's identifier for the account.
        stub: Value stored in ``profile.is_stub``. Stub profiles are indexed
            but get no avatar, posts or relations jobs.

    Returns the result of ``profile.as_dict()``.

    Raises whatever ``response.raise_for_status()`` raises when the API
    answers with an HTTP error status.
    """

    db_session = worker.get_session()
    proxies = _get_proxies(db_session)

    # Instagram API request.
    api_url = 'https://api.instagram.com/v1/users/{}'.format(upstream_id)

    # NOTE(review): verify=False turns off TLS certificate verification;
    # presumably needed for the configured proxies -- confirm.
    response = requests.get(
        api_url,
        proxies=proxies,
        verify=False
    )

    response.raise_for_status()

    # The user record is the dict stored under 'data'. It must not be
    # re-read as ``response.json()[0]`` (the payload is not a list), and the
    # account name lives under 'username', not 'screen_name'.
    data = response.json()['data']

    # Create the profile; a unique-constraint failure means it already exists.
    profile = Profile('instagram', upstream_id, data['username'])
    db_session.add(profile)

    try:
        db_session.commit()
    except IntegrityError:
        # Already exists: use the existing profile.
        db_session.rollback()
        profile = db_session.query(Profile) \
                            .filter(Profile.site == 'instagram') \
                            .filter(Profile.upstream_id == upstream_id) \
                            .one()

    # Update profile
    profile.last_update = datetime.now()
    profile.description = data['bio']
    profile.follower_count = int(data['counts']['followed_by'])
    profile.friend_count = int(data['counts']['follows'])
    profile.homepage = data['website']
    profile.name = data['full_name']
    profile.post_count = int(data['counts']['media'])
    profile.is_stub = stub
    db_session.commit()

    # Schedule followup jobs.
    app.queue.schedule_index_profile(profile) # index all profiles, inc stubs
    if not stub:
        app.queue.schedule_avatar(profile, data['profile_picture'])
        app.queue.schedule_posts(profile, recent=True)
        app.queue.schedule_relations(profile)

    return profile.as_dict()
示例#2
0
def _store_instagram_relations(db, profile, relation_name, url, proxies,
                               known_ids, max_results, progress_offset=0):
    """
    Page through one Instagram relations endpoint and attach new profiles.

    Arguments:
        db: Database session used for inserts, lookups and commits.
        profile: The ``Profile`` whose relations are being collected.
        relation_name: Name of the collection attribute on ``profile`` to
            append to (``'friends'`` or ``'followers'``).
        url: Relations endpoint to request.
        proxies: Proxy configuration handed to ``requests.get``.
        known_ids: Upstream IDs already related to ``profile``; users with
            these IDs are skipped.
        max_results: Stop after this many relations have been stored.
        progress_offset: Added to the running count when reporting job
            progress, so a second pass continues where the first stopped.

    Returns the number of relations stored by this call.
    """
    # A set gives constant-time membership tests. IDs stored during this run
    # are added too, so a user repeated on a later page is not attached twice.
    seen_ids = set(known_ids)
    params = {}
    stored = 0

    while stored < max_results:
        response = requests.get(
            url,
            params=params,
            proxies=proxies,
            verify=False
        )
        response.raise_for_status()
        payload = response.json()  # parse the body once per page
        pagination = payload['pagination']

        for user in payload['data']:
            # Only store relations that are not already in db.
            if user['id'] in seen_ids:
                continue

            related_profile = Profile(
                'instagram',
                user['id'],
                user['username'],
                is_stub=True
            )
            db.add(related_profile)

            try:
                db.commit()
            except IntegrityError:
                # Profile already exists: attach the stored row instead.
                db.rollback()
                related_profile = db \
                        .query(Profile) \
                        .filter(Profile.site == 'instagram') \
                        .filter(Profile.upstream_id == user['id']) \
                        .one()

            related_profile.name = user['full_name']
            # Re-read the collection from the profile on every pass rather
            # than holding a reference across commits and rollbacks.
            getattr(profile, relation_name).append(related_profile)
            # Commit straight away: otherwise a rollback triggered by the
            # next insert would discard this relation, and the final one
            # would never be persisted at all.
            db.commit()

            seen_ids.add(user['id'])
            stored += 1
            worker.update_job(current=progress_offset + stored)

            if stored == max_results:
                break

        # If there are more results, set the cursor parameter, otherwise finish
        if 'next_cursor' in pagination:
            params['cursor'] = pagination['next_cursor']
        else:
            break # No more results

    return stored


def scrape_instagram_relations(id_):
    """
    Fetch friends and followers for the Instagram user identified by `id_`.

    At most ``max_relations_instagram`` (a configuration value) friends and
    the same number of followers are stored. Users already related to the
    profile in the database are skipped. When done, a ``profile_relations``
    message carrying the profile ID is published on redis.

    Raises:
        ScrapeException: ``max_relations_instagram`` is not an integer.
        ValueError: no profile exists with ``id_``.
    """
    redis = worker.get_redis()
    db = worker.get_session()
    profile = db.query(Profile).filter(Profile.id == id_).first()
    proxies = _get_proxies(db)
    max_results = get_config(db, 'max_relations_instagram', required=True).value

    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        # Only conversion failures are reported as a configuration problem.
        raise ScrapeException(
            'Value of max_relations_instagram must be an integer'
        )

    if profile is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    # Get friends currently stored in db for this profile.
    friends_query = \
        db.query(Profile.upstream_id) \
            .join(
                profile_join_self,
                (profile_join_self.c.friend_id == Profile.id)
            ) \
            .filter(profile_join_self.c.follower_id == id_)
    current_friends_ids = [friend.upstream_id for friend in friends_query]

    # Get followers currently stored in db for this profile.
    followers_query = \
        db.query(Profile.upstream_id) \
            .join(
                profile_join_self,
                (profile_join_self.c.follower_id == Profile.id)
            ) \
            .filter(profile_join_self.c.friend_id == id_)
    current_followers_ids = [follower.upstream_id for follower in followers_query]

    # Progress is reported against both passes combined.
    worker.start_job(total=max_results * 2)

    # Get friends.
    friends_url = 'https://api.instagram.com/v1/users/{}/follows' \
                  .format(profile.upstream_id)
    friends_results = _store_instagram_relations(
        db, profile, 'friends', friends_url, proxies,
        current_friends_ids, max_results
    )

    # Get followers; progress continues from the friends count.
    followers_url = 'https://api.instagram.com/v1/users/{}/followed-by' \
                    .format(profile.upstream_id)
    _store_instagram_relations(
        db, profile, 'followers', followers_url, proxies,
        current_followers_ids, max_results,
        progress_offset=friends_results
    )

    worker.finish_job()
    redis.publish('profile_relations', json.dumps({'id': id_}))
示例#3
0
def scrape_instagram_relations(id_):
    '''
    Fetch friends and followers for the Instagram user identified by `id_`.

    The number of friends and followers to fetch is read from the
    ``max_relations_instagram`` configuration value. Users already related
    to the profile in the database are skipped. A ``profile_relations``
    message carrying the profile ID is published on redis at the end.

    Raises:
        ScrapeException: ``max_relations_instagram`` is not an integer.
        ValueError: no profile exists with ``id_``.
    '''
    redis = worker.get_redis()
    db = worker.get_session()
    profile = db.query(Profile).filter(Profile.id == id_).first()
    proxies = _get_proxies(db)
    max_results = get_config(db, 'max_relations_instagram', required=True).value

    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        # Only conversion failures are reported as a configuration problem.
        raise ScrapeException(
            'Value of max_relations_instagram must be an integer'
        )

    if profile is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    def store_relations(url, relation_name, known_ids, progress_offset):
        '''
        Page through `url` and append unseen users to the collection named
        `relation_name` on the profile. Returns how many were stored.
        '''
        # Set membership is O(1). IDs stored in this run are added as well so
        # a user repeated on a later page is not attached a second time.
        seen_ids = set(known_ids)
        params = {}
        stored = 0

        while stored < max_results:
            response = requests.get(
                url,
                params=params,
                proxies=proxies,
                verify=False
            )
            response.raise_for_status()
            payload = response.json()  # parse the body once per page
            pagination = payload['pagination']

            for user in payload['data']:
                # Only store relations that are not already in db.
                if user['id'] in seen_ids:
                    continue

                related_profile = Profile(
                    'instagram',
                    user['id'],
                    user['username'],
                    is_stub=True
                )
                db.add(related_profile)

                try:
                    db.commit()
                except IntegrityError:
                    # Profile already exists: attach the stored row instead.
                    db.rollback()
                    related_profile = db \
                            .query(Profile) \
                            .filter(Profile.site == 'instagram') \
                            .filter(Profile.upstream_id == user['id']) \
                            .one()

                related_profile.name = user['full_name']
                # Look the collection up afresh on every pass instead of
                # keeping a reference across commits and rollbacks.
                getattr(profile, relation_name).append(related_profile)
                # Persist now: a rollback caused by the next insert would
                # otherwise throw this relation away, and the last relation
                # of the run would never be committed.
                db.commit()

                seen_ids.add(user['id'])
                stored += 1
                worker.update_job(current=progress_offset + stored)

                if stored == max_results:
                    break

            # If there are more results, set the cursor parameter, otherwise
            # finish.
            if 'next_cursor' in pagination:
                params['cursor'] = pagination['next_cursor']
            else:
                break # No more results

        return stored

    # Get friends currently stored in db for this profile.
    friends_query = \
        db.query(Profile.upstream_id) \
            .join(
                profile_join_self,
                (profile_join_self.c.friend_id == Profile.id)
            ) \
            .filter(profile_join_self.c.follower_id == id_)
    current_friends_ids = [friend.upstream_id for friend in friends_query]

    # Get followers currently stored in db for this profile.
    followers_query = \
        db.query(Profile.upstream_id) \
            .join(
                profile_join_self,
                (profile_join_self.c.follower_id == Profile.id)
            ) \
            .filter(profile_join_self.c.friend_id == id_)
    current_followers_ids = [follower.upstream_id for follower in followers_query]

    # Progress is reported against both passes combined.
    worker.start_job(total=max_results * 2)

    # Get friends.
    friends_url = 'https://api.instagram.com/v1/users/{}/follows' \
                  .format(profile.upstream_id)
    friends_results = store_relations(
        friends_url, 'friends', current_friends_ids, 0
    )

    # Get followers; progress continues from the friends count.
    followers_url = 'https://api.instagram.com/v1/users/{}/followed-by' \
                    .format(profile.upstream_id)
    store_relations(
        followers_url, 'followers', current_followers_ids, friends_results
    )

    worker.finish_job()
    redis.publish('profile_relations', json.dumps({'id': id_}))
示例#4
0
def scrape_instagram_account(username, stub=False):
    """
    Look up an Instagram user by name and create (or update) its profile.

    The user ID is resolved by running a search for `username` and taking
    the first result whose username matches case-insensitively; a second
    request then loads that user's bio data. `stub` is stored on the
    profile, and stub profiles are only scheduled for indexing.

    Returns ``profile.as_dict()``. Raises ``ScrapeException`` when the
    search yields no exact match.
    """
    session = worker.get_session()
    proxy_config = _get_proxies(session)

    # Step 1: search by name to resolve the upstream user ID.
    search_response = requests.get(
        'https://api.instagram.com/v1/users/search',
        params={'q': username},
        proxies=proxy_config,
        verify=False
    )
    search_response.raise_for_status()

    candidates = search_response.json()
    wanted = username.lower()
    user_id = next(
        (
            candidate['id']
            for candidate in candidates['data']
            if candidate['username'].lower() == wanted
        ),
        None
    )

    if user_id is None:
        message = "Can't find Instagram user named {}.".format(username)
        raise ScrapeException(message)

    # Step 2: fetch the profile data for that user ID.
    profile_response = requests.get(
        'https://api.instagram.com/v1/users/{}'.format(user_id),
        proxies=proxy_config,
        verify=False
    )
    profile_response.raise_for_status()
    data = profile_response.json()['data']

    # Step 3: insert the profile, or fall back to the row that already exists.
    profile = Profile('instagram', user_id, data['username'])
    session.add(profile)

    try:
        session.commit()
    except IntegrityError:
        session.rollback()
        existing = session.query(Profile)
        existing = existing.filter(Profile.site == 'instagram')
        existing = existing.filter(Profile.upstream_id == user_id)
        profile = existing.one()

    # Step 4: copy the scraped fields onto the profile.
    profile.last_update = datetime.now()
    profile.description = data['bio']
    counts = data['counts']
    profile.follower_count = int(counts['followed_by'])
    profile.friend_count = int(counts['follows'])
    profile.homepage = data['website']
    profile.name = data['full_name']
    profile.post_count = int(counts['media'])
    profile.is_stub = stub
    session.commit()

    # Step 5: schedule followup jobs. Every profile is indexed, stubs
    # included; only full profiles get avatar, posts and relations jobs.
    app.queue.schedule_index_profile(profile)
    if stub:
        return profile.as_dict()

    app.queue.schedule_avatar(profile, data['profile_picture'])
    app.queue.schedule_posts(profile, recent=True)
    app.queue.schedule_relations(profile)

    return profile.as_dict()
示例#5
0
def scrape_instagram_account(username):
    """
    Scrape Instagram bio data for `username` and create or update a profile.

    A search request resolves the user ID (first result whose username
    equals `username` ignoring case), then a second request fetches the
    user's data. The profile is marked as a non-stub and avatar, index,
    posts and relations jobs are scheduled for it.

    Returns ``profile.as_dict()``. Raises ``ScrapeException`` if no search
    result matches the username exactly.
    """
    db_session = worker.get_session()
    proxies = _get_proxies(db_session)

    # Resolve the user ID via the search endpoint.
    query = {'q': username}
    search = requests.get(
        'https://api.instagram.com/v1/users/search',
        params=query,
        proxies=proxies,
        verify=False
    )
    search.raise_for_status()

    results = search.json()
    target = username.lower()
    user_id = None

    for entry in results['data']:
        if entry['username'].lower() != target:
            continue
        user_id = entry['id']
        break

    if user_id is None:
        raise ScrapeException(
            "Can't find Instagram user named {}.".format(username)
        )

    # Fetch the profile data for the resolved user ID.
    detail = requests.get(
        'https://api.instagram.com/v1/users/{}'.format(user_id),
        proxies=proxies,
        verify=False
    )
    detail.raise_for_status()
    data = detail.json()['data']

    # Insert a new profile; on a constraint violation load the existing one.
    profile = Profile('instagram', user_id, data['username'])
    db_session.add(profile)

    try:
        db_session.commit()
    except IntegrityError:
        db_session.rollback()
        profile = (
            db_session.query(Profile)
            .filter(Profile.site == 'instagram')
            .filter(Profile.upstream_id == user_id)
            .one()
        )

    # Copy the scraped fields onto the profile.
    profile.description = data['bio']
    counts = data['counts']
    profile.follower_count = int(counts['followed_by'])
    profile.friend_count = int(counts['follows'])
    profile.homepage = data['website']
    profile.name = data['full_name']
    profile.post_count = int(counts['media'])
    profile.is_stub = False
    db_session.commit()

    # Schedule followup jobs.
    queue = app.queue
    queue.schedule_avatar(profile, data['profile_picture'])
    queue.schedule_index_profile(profile)
    queue.schedule_posts(profile, recent=True)
    queue.schedule_relations(profile)

    return profile.as_dict()