Пример #1
0
def scrape_instagram_account_by_id(upstream_id, stub=False):
    """
    Scrape instagram bio data for upstream ID and update a profile.

    Requests ``/v1/users/<upstream_id>`` through the configured proxies,
    inserts a ``Profile`` for the account (or loads the existing one when the
    insert raises ``IntegrityError``), copies the bio fields onto it, commits,
    and schedules follow-up jobs.

    :param upstream_id: Instagram's identifier for the account.
    :param stub: stored as ``profile.is_stub``. When true, only the index job
        is scheduled; avatar, posts and relations jobs are skipped.
    :returns: ``profile.as_dict()``.
    :raises: whatever ``response.raise_for_status()`` raises for an HTTP error
        status, and ``KeyError`` if an expected field is missing from the
        response.
    """

    db_session = worker.get_session()
    proxies = _get_proxies(db_session)

    # Instagram API request.
    api_url = 'https://api.instagram.com/v1/users/{}'.format(upstream_id)

    # NOTE(review): verify=False turns off TLS certificate verification;
    # presumably required by the proxies in use -- confirm before changing.
    response = requests.get(
        api_url,
        proxies=proxies,
        verify=False
    )

    response.raise_for_status()

    # The user record is nested under the top-level 'data' key. Parse it once
    # and keep it: the fields read below ('bio', 'counts', 'website', ...)
    # all live on this object.
    data = response.json()['data']

    # Create the profile, keyed on the account's username.
    profile = Profile('instagram', upstream_id, data['username'])
    db_session.add(profile)

    try:
        db_session.commit()
    except IntegrityError:
        # Already exists: use the existing profile.
        db_session.rollback()
        profile = db_session.query(Profile) \
                            .filter(Profile.site == 'instagram') \
                            .filter(Profile.upstream_id == upstream_id) \
                            .one()

    # Update profile
    profile.last_update = datetime.now()
    profile.description = data['bio']
    profile.follower_count = int(data['counts']['followed_by'])
    profile.friend_count = int(data['counts']['follows'])
    profile.homepage = data['website']
    profile.name = data['full_name']
    profile.post_count = int(data['counts']['media'])
    profile.is_stub = stub
    db_session.commit()

    # Schedule followup jobs.
    app.queue.schedule_index_profile(profile) # index all profiles, inc stubs
    if not stub:
        app.queue.schedule_avatar(profile, data['profile_picture'])
        app.queue.schedule_posts(profile, recent=True)
        app.queue.schedule_relations(profile)

    return profile.as_dict()
Пример #2
0
    def _create_sample_profiles(self, config):
        '''
        Populate the database with a few sample Twitter profiles.

        Builds two detailed profiles (Maurice Moss and Jen Barber) with
        username history and an avatar read from the ``sample-data``
        directory next to this file. Moss also gets two posts, one of which
        carries a text attachment. Jen plus three minimal profiles are then
        registered as followers, and the session is committed once at the
        end.

        ``config`` is accepted but not read by this method.
        '''

        session = app.database.get_session(self._db)
        sample_dir = os.path.join(os.path.dirname(__file__), 'sample-data')

        # ---- Maurice Moss ----
        moss_current_name = ProfileUsername('maurice.moss',
                                            start_date='2014-04-01')
        moss_twitter = Profile(site='twitter',
                               upstream_id='12345',
                               username=moss_current_name)

        # Earlier handles, most recent first: (name, start, end).
        moss_former_names = (
            ('maurice', '2013-06-01', '2014-03-31'),
            ('maurie', '2013-02-15', '2013-05-30'),
        )

        for handle, began, ended in moss_former_names:
            moss_twitter.usernames.append(
                ProfileUsername(handle, start_date=began, end_date=ended))

        # This post is never appended explicitly; presumably passing
        # ``author`` is enough to link it to the profile -- confirm against
        # the Post model.
        Post(author=moss_twitter,
             content='Going to the grocery store.',
             upstream_id='1234',
             upstream_created='2015-02-04 12:34:50')

        band_post = Post(author=moss_twitter,
                         content='Love this band!.',
                         upstream_id='2345',
                         upstream_created='2015-03-01')

        hello_file = File(name='helloworld.txt',
                          mime='text/plain',
                          content='Hello world!\n\n'.encode('utf8'))
        band_post.attachments.append(hello_file)
        moss_twitter.posts.append(band_post)

        moss_avatar_path = os.path.join(sample_dir, 'moss.jpg')

        with open(moss_avatar_path, 'rb') as avatar_file:
            moss_avatar = Avatar(url='http://foobar.com/moss-avatar.jpg',
                                 mime='image/jpeg',
                                 image=avatar_file.read())
            moss_twitter.avatars.append(moss_avatar)

        moss_twitter.description = "I do IT at Reynholm Industries."
        moss_twitter.post_count = 1205
        moss_twitter.friend_count = 1
        moss_twitter.follower_count = 3
        moss_twitter.join_date = dateutil.parser.parse('2013-06-01')
        moss_twitter.join_date_is_exact = False

        session.add(moss_twitter)

        # ---- Jen Barber ----
        jen_current_name = ProfileUsername('jen.barber',
                                           start_date='2013-11-12')
        jen_twitter = Profile(site='twitter',
                              upstream_id='23456',
                              username=jen_current_name)

        jen_former_names = (
            ('jenb', '2013-06-14', '2013-11-12'),
            ('jenny', '2013-03-15', '2013-06-14'),
        )

        for handle, began, ended in jen_former_names:
            jen_twitter.usernames.append(
                ProfileUsername(handle, start_date=began, end_date=ended))

        jen_avatar_path = os.path.join(sample_dir, 'jen.jpg')

        with open(jen_avatar_path, 'rb') as avatar_file:
            jen_avatar = Avatar(url='http://foobar.com/jen-avatar.jpg',
                                mime='image/jpeg',
                                image=avatar_file.read())
            jen_twitter.avatars.append(jen_avatar)

        jen_twitter.description = "Relationship Manager for the IT department."
        jen_twitter.post_count = 1543
        jen_twitter.friend_count = 1
        jen_twitter.follower_count = 1
        jen_twitter.join_date = dateutil.parser.parse('2013-03-15')
        jen_twitter.join_date_is_exact = True

        moss_twitter.followers.append(jen_twitter)

        session.add(jen_twitter)

        # ---- A couple of randos: (followed profile, rando's upstream ID) ----
        rando_followers = (
            (moss_twitter, '345678'),
            (moss_twitter, '456789'),
            (jen_twitter, '567890'),
        )

        for followed, rando_id in rando_followers:
            followed.followers.append(
                Profile(site='twitter',
                        upstream_id=rando_id,
                        username='******'))

        session.commit()
Пример #3
0
    def _create_sample_profiles(self, config):
        '''
        Seed the database with sample Twitter profiles.

        Two full profiles are created (Maurice Moss and Jen Barber), each
        with a current username, two historical usernames and an avatar
        loaded from the ``sample-data`` directory beside this file. Moss also
        receives two posts, one with a plain-text attachment. Jen and three
        bare profiles are added as followers, and everything is committed in
        a single ``commit()`` at the end.

        The ``config`` argument is not used here.
        '''

        db = app.database.get_session(self._db)
        here = os.path.dirname(__file__)
        sample_dir = os.path.join(here, 'sample-data')

        def load_sample(filename):
            ''' Return the raw bytes of a file in the sample directory. '''
            with open(os.path.join(sample_dir, filename), 'rb') as handle:
                return handle.read()

        # Maurice Moss
        moss_name_now = ProfileUsername('maurice.moss', start_date='2014-04-01')
        moss = Profile(site='twitter', upstream_id='12345', username=moss_name_now)

        moss_name_2013b = ProfileUsername(
            'maurice', start_date='2013-06-01', end_date='2014-03-31'
        )
        moss.usernames.append(moss_name_2013b)

        moss_name_2013a = ProfileUsername(
            'maurie', start_date='2013-02-15', end_date='2013-05-30'
        )
        moss.usernames.append(moss_name_2013a)

        # Constructed only for its ``author`` link; it is never appended to
        # ``moss.posts`` directly. (Presumably the relationship handles the
        # association -- verify against the Post model.)
        Post(
            author=moss,
            content='Going to the grocery store.',
            upstream_id='1234',
            upstream_created='2015-02-04 12:34:50'
        )

        fan_post = Post(
            author=moss,
            content='Love this band!.',
            upstream_id='2345',
            upstream_created='2015-03-01'
        )
        text_attachment = File(
            name='helloworld.txt',
            mime='text/plain',
            content='Hello world!\n\n'.encode('utf8')
        )
        fan_post.attachments.append(text_attachment)
        moss.posts.append(fan_post)

        moss_image = load_sample('moss.jpg')
        moss.avatars.append(Avatar(
            url='http://foobar.com/moss-avatar.jpg',
            mime='image/jpeg',
            image=moss_image
        ))

        moss.description = "I do IT at Reynholm Industries."
        moss.post_count = 1205
        moss.friend_count = 1
        moss.follower_count = 3
        moss.join_date = dateutil.parser.parse('2013-06-01')
        moss.join_date_is_exact = False

        db.add(moss)

        # Jen Barber
        jen_name_now = ProfileUsername('jen.barber', start_date='2013-11-12')
        jen = Profile(site='twitter', upstream_id='23456', username=jen_name_now)

        jen_name_2013b = ProfileUsername(
            'jenb', start_date='2013-06-14', end_date='2013-11-12'
        )
        jen.usernames.append(jen_name_2013b)

        jen_name_2013a = ProfileUsername(
            'jenny', start_date='2013-03-15', end_date='2013-06-14'
        )
        jen.usernames.append(jen_name_2013a)

        jen_image = load_sample('jen.jpg')
        jen.avatars.append(Avatar(
            url='http://foobar.com/jen-avatar.jpg',
            mime='image/jpeg',
            image=jen_image
        ))

        jen.description = "Relationship Manager for the IT department."
        jen.post_count = 1543
        jen.friend_count = 1
        jen.follower_count = 1
        jen.join_date = dateutil.parser.parse('2013-03-15')
        jen.join_date_is_exact = True

        moss.followers.append(jen)

        db.add(jen)

        # A couple of randos.
        first_rando = Profile(site='twitter', upstream_id='345678', username='******')
        moss.followers.append(first_rando)

        second_rando = Profile(site='twitter', upstream_id='456789', username='******')
        moss.followers.append(second_rando)

        third_rando = Profile(site='twitter', upstream_id='567890', username='******')
        jen.followers.append(third_rando)

        db.commit()
Пример #4
0
def scrape_instagram_account(username, stub=False):
    """
    Look up an Instagram user by name and create or refresh its profile.

    The user ID is resolved by calling the search endpoint and taking the
    first result whose username equals ``username`` case-insensitively. A
    second request then fetches that user's data, which is written onto a
    new ``Profile`` or onto the existing one if the insert raises
    ``IntegrityError``.

    ``stub`` is stored as ``profile.is_stub``; when it is true only the index
    job is scheduled. Raises ``ScrapeException`` if no search result matches.
    Returns ``profile.as_dict()``.
    """
    session = worker.get_session()
    proxy_config = _get_proxies(session)

    # Step 1: resolve the username to an upstream user ID via search.
    search_response = requests.get(
        'https://api.instagram.com/v1/users/search',
        params={'q': username},
        proxies=proxy_config,
        verify=False
    )
    search_response.raise_for_status()

    payload = search_response.json()
    wanted = username.lower()

    user_id = next(
        (
            hit['id']
            for hit in payload['data']
            if hit['username'].lower() == wanted
        ),
        None
    )

    if user_id is None:
        message = "Can't find Instagram user named {}.".format(username)
        raise ScrapeException(message)

    # Step 2: fetch the profile data for that user ID.
    profile_response = requests.get(
        'https://api.instagram.com/v1/users/{}'.format(user_id),
        proxies=proxy_config,
        verify=False
    )
    profile_response.raise_for_status()
    bio = profile_response.json()['data']

    profile = Profile('instagram', user_id, bio['username'])
    session.add(profile)

    try:
        session.commit()
    except IntegrityError:
        # The insert failed because the profile is already stored: discard
        # the pending insert and load the stored row instead.
        session.rollback()
        existing = (
            session.query(Profile)
            .filter(Profile.site == 'instagram')
            .filter(Profile.upstream_id == user_id)
        )
        profile = existing.one()

    # Step 3: copy the scraped fields onto the profile.
    profile.last_update = datetime.now()
    profile.description = bio['bio']
    counts = bio['counts']
    profile.follower_count = int(counts['followed_by'])
    profile.friend_count = int(counts['follows'])
    profile.homepage = bio['website']
    profile.name = bio['full_name']
    profile.post_count = int(counts['media'])
    profile.is_stub = stub
    session.commit()

    # Step 4: queue follow-up work. Every profile is indexed, stubs included;
    # the remaining jobs only run for full profiles.
    queue = app.queue
    queue.schedule_index_profile(profile)

    if not stub:
        queue.schedule_avatar(profile, bio['profile_picture'])
        queue.schedule_posts(profile, recent=True)
        queue.schedule_relations(profile)

    return profile.as_dict()
Пример #5
0
def scrape_instagram_account(username):
    '''
    Fetch an Instagram user's bio data and store it on a profile.

    The username is first resolved to a user ID through the search endpoint
    (first case-insensitive exact match), then the user's data is requested
    by ID. The data is written to a newly inserted ``Profile``, or to the
    existing one when the insert raises ``IntegrityError``. The profile is
    always marked as not being a stub, and avatar, index, posts and
    relations jobs are scheduled.

    Raises ``ScrapeException`` if the search yields no matching username.
    Returns ``profile.as_dict()``.
    '''

    db_session = worker.get_session()
    proxies = _get_proxies(db_session)

    def api_get(url, **request_args):
        ''' GET ``url`` via the proxies and return the decoded JSON body. '''
        reply = requests.get(
            url,
            proxies=proxies,
            verify=False,
            **request_args
        )
        reply.raise_for_status()
        return reply.json()

    # There is no direct username lookup used here: search for the name, then
    # walk the results until one matches exactly (ignoring case).
    found = api_get(
        'https://api.instagram.com/v1/users/search',
        params={'q': username}
    )
    target_name = username.lower()
    user_id = None

    for entry in found['data']:
        if entry['username'].lower() != target_name:
            continue
        user_id = entry['id']
        break

    if user_id is None:
        raise ScrapeException(
            "Can't find Instagram user named {}.".format(username)
        )

    # Second request: this user's profile data.
    account = api_get(
        'https://api.instagram.com/v1/users/{}'.format(user_id)
    )['data']

    profile = Profile('instagram', user_id, account['username'])
    db_session.add(profile)

    try:
        db_session.commit()
    except IntegrityError:
        # A profile for this user is already stored; fall back to it.
        db_session.rollback()
        lookup = db_session.query(Profile)
        lookup = lookup.filter(Profile.site == 'instagram')
        lookup = lookup.filter(Profile.upstream_id == user_id)
        profile = lookup.one()

    profile.description = account['bio']
    tallies = account['counts']
    profile.follower_count = int(tallies['followed_by'])
    profile.friend_count = int(tallies['follows'])
    profile.homepage = account['website']
    profile.name = account['full_name']
    profile.post_count = int(tallies['media'])
    profile.is_stub = False
    db_session.commit()

    # Schedule followup jobs.
    job_queue = app.queue
    job_queue.schedule_avatar(profile, account['profile_picture'])
    job_queue.schedule_index_profile(profile)
    job_queue.schedule_posts(profile, recent=True)
    job_queue.schedule_relations(profile)

    return profile.as_dict()