def scrape_instagram_account_by_id(upstream_id, stub=False):
    """ Scrape Instagram bio data for an upstream ID and update a profile. """

    db_session = worker.get_session()
    proxies = _get_proxies(db_session)

    # Instagram API request.
    api_url = 'https://api.instagram.com/v1/users/{}'.format(upstream_id)

    response = requests.get(
        api_url,
        proxies=proxies,
        verify=False
    )
    response.raise_for_status()
    data = response.json()['data']

    # Upsert the profile. (Instagram user objects expose the handle as
    # 'username', not Twitter's 'screen_name'.)
    profile = Profile('instagram', upstream_id, data['username'])
    db_session.add(profile)

    try:
        db_session.commit()
    except IntegrityError:
        # Already exists: use the existing profile.
        db_session.rollback()
        profile = db_session.query(Profile) \
                            .filter(Profile.site == 'instagram') \
                            .filter(Profile.upstream_id == upstream_id) \
                            .one()

    # Update the profile.
    profile.last_update = datetime.now()
    profile.description = data['bio']
    profile.follower_count = int(data['counts']['followed_by'])
    profile.friend_count = int(data['counts']['follows'])
    profile.homepage = data['website']
    profile.name = data['full_name']
    profile.post_count = int(data['counts']['media'])
    profile.is_stub = stub
    db_session.commit()

    # Schedule followup jobs.
    app.queue.schedule_index_profile(profile)  # Index all profiles, including stubs.

    if not stub:
        app.queue.schedule_avatar(profile, data['profile_picture'])
        app.queue.schedule_posts(profile, recent=True)
        app.queue.schedule_relations(profile)

    return profile.as_dict()

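# Every scraper in this module calls a `_get_proxies` helper that returns a
# dict in the format `requests` expects. A minimal sketch of what such a
# helper might look like, assuming proxies live in a `Proxy` model (the model
# name and columns are hypothetical; the real helper may read configuration
# instead):
def _get_proxies_sketch(db_session):
    """ Return a random proxy from the database as a requests-style dict. """
    from sqlalchemy import func  # Local import to keep the sketch self-contained.

    proxy = db_session.query(Proxy).order_by(func.random()).first()

    if proxy is None:
        return None  # No proxy configured: requests will connect directly.

    url = '{}://{}:{}'.format(proxy.protocol, proxy.host, proxy.port)
    return {'http': url, 'https': url}
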
def scrape_twitter_account(username):
    """
    Scrape Twitter bio data and create (or update) a profile.

    TODO: The API call used here supports up to 100 usernames at a time.
    We could easily modify this function to populate many profiles at once.
    """

    # Request from Twitter API.
    db_session = worker.get_session()

    api_url = 'https://api.twitter.com/1.1/users/lookup.json'
    params = {'screen_name': username}

    response = requests.get(
        api_url,
        params=params,
        proxies=_get_proxies(db_session),
        verify=False
    )
    response.raise_for_status()

    # Get the Twitter ID and upsert the profile.
    data = response.json()[0]  # TODO: Only supports getting one profile right now.
    user_id = data['id_str']
    profile = Profile('twitter', user_id, data['screen_name'])
    db_session.add(profile)

    try:
        db_session.commit()
    except IntegrityError:
        # Already exists: use the existing profile.
        db_session.rollback()
        profile = db_session.query(Profile) \
                            .filter(Profile.site == 'twitter') \
                            .filter(Profile.upstream_id == user_id) \
                            .one()

    _twitter_populate_profile(data, profile)
    profile.is_stub = False
    db_session.commit()

    # Schedule followup jobs.
    app.queue.schedule_avatar(profile, data['profile_image_url_https'])
    app.queue.schedule_index_profile(profile)
    app.queue.schedule_posts(profile, recent=True)
    app.queue.schedule_relations(profile)

    return profile.as_dict()

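# `_twitter_populate_profile` is defined elsewhere in this module. A minimal
# sketch of what it likely does, mapping standard Twitter v1.1 user-object
# fields onto the same Profile columns the Instagram scrapers set (the exact
# field set in the real helper may differ):
def _twitter_populate_profile_sketch(data, profile):
    """ Copy bio fields from a Twitter API user object onto a profile. """
    profile.description = data['description']
    profile.follower_count = int(data['followers_count'])
    profile.friend_count = int(data['friends_count'])
    profile.homepage = data['url']
    profile.name = data['name']
    profile.post_count = int(data['statuses_count'])
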
def scrape_twitter_account_by_id(upstream_ids, stub=False, labels=None):
    """
    Scrape Twitter bio data for a list of upstream IDs and create (or update)
    the corresponding profiles. Accepts Twitter IDs rather than usernames.
    """

    if len(upstream_ids) > 100:
        raise ScrapeException('Twitter API max is 100 user IDs per request.')

    if labels is None:
        labels = {}

    db_session = worker.get_session()
    profiles = []

    # Request from Twitter API.
    api_url = 'https://api.twitter.com/1.1/users/lookup.json'
    payload = {'user_id': ','.join(upstream_ids)}

    response = requests.post(
        api_url,
        data=payload,
        proxies=_get_proxies(db_session),
        verify=False,
        headers=TWITTER_HEADERS
    )
    response.raise_for_status()

    # Update the profiles.
    for profile_json in response.json():
        profile = Profile(
            'twitter',
            profile_json['id_str'],
            profile_json['screen_name']
        )
        profile.is_stub = stub
        profile.private = profile_json['protected']
        db_session.add(profile)

        try:
            db_session.commit()
        except IntegrityError:
            # Already exists: use the existing profile.
            db_session.rollback()
            profile = db_session.query(Profile) \
                                .filter(Profile.site == 'twitter') \
                                .filter(Profile.upstream_id == profile_json['id_str']) \
                                .one()
            # Profiles already in the system are either not stubs or are
            # being updated to full profiles.
            profile.is_stub = False

        _twitter_populate_profile(profile_json, profile)

        if profile.upstream_id in labels:
            _label_profile(db_session, profile, labels[profile.upstream_id])

        profile.last_update = datetime.now()
        db_session.commit()
        profiles.append(profile.as_dict())

        # Schedule followup jobs.
        app.queue.schedule_index_profile(profile)

        if not stub:
            app.queue.schedule_avatar(
                profile,
                profile_json['profile_image_url_https']
            )
            # Only get tweets and relations for unprotected profiles.
            if not profile.private:
                app.queue.schedule_posts(profile, recent=True)
                app.queue.schedule_relations(profile)

    return profiles

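# Example call for the batch-by-ID scraper. The label format is an
# assumption: `_label_profile` appears to accept a list of label names keyed
# by upstream ID, but the real format may differ.
#
#     scrape_twitter_account_by_id(
#         ['12345', '67890'],
#         stub=True,
#         labels={'12345': ['journalist', 'verified-source']}
#     )
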
def scrape_twitter_account(usernames, stub=False, labels=None):
    """
    Scrape Twitter bio data and create (or update) profiles for a list of
    usernames.

    Keyword arguments:
    stub -- add the profiles in stub mode (default False)
    labels -- dictionary of labels keyed by lowercased username (default None)
    """

    if len(usernames) > 100:
        raise ScrapeException('Twitter API max is 100 usernames per request.')

    if labels is None:
        labels = {}

    profiles = []

    # Request from Twitter API.
    db_session = worker.get_session()

    api_url = 'https://api.twitter.com/1.1/users/lookup.json'
    payload = {'screen_name': ','.join(usernames)}

    response = requests.post(
        api_url,
        data=payload,
        proxies=_get_proxies(db_session),
        verify=False,
        headers=TWITTER_HEADERS
    )
    response.raise_for_status()

    # Get each Twitter ID and upsert the profile.
    for profile_json in response.json():
        user_id = profile_json['id_str']
        profile = Profile('twitter', user_id, profile_json['screen_name'])
        profile.is_stub = stub
        profile.private = profile_json['protected']
        db_session.add(profile)

        try:
            db_session.commit()
        except IntegrityError:
            # Already exists: use the existing profile.
            db_session.rollback()
            profile = db_session.query(Profile) \
                                .filter(Profile.site == 'twitter') \
                                .filter(Profile.upstream_id == user_id) \
                                .one()
            # Profiles already in the system are either not stubs or are
            # being updated to full profiles.
            profile.is_stub = False

        _twitter_populate_profile(profile_json, profile)

        if profile.username.lower() in labels:
            _label_profile(db_session, profile,
                           labels[profile.username.lower()])

        profile.last_update = datetime.now()
        db_session.commit()
        profiles.append(profile.as_dict())

        # Schedule followup jobs.
        app.queue.schedule_index_profile(profile)

        if not stub:
            app.queue.schedule_avatar(
                profile,
                profile_json['profile_image_url_https']
            )
            # Only get tweets and relations for unprotected profiles.
            if not profile.private:
                app.queue.schedule_posts(profile, recent=True)
                app.queue.schedule_relations(profile)

    return profiles

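# Example call. Note that, unlike scrape_twitter_account_by_id() above, this
# function expects `labels` to be keyed by lowercased username rather than by
# upstream ID (the label-list format itself is an assumption, as before):
#
#     scrape_twitter_account(
#         ['JaneDoe', 'johndoe'],
#         labels={'janedoe': ['reporter']}
#     )
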
def scrape_instagram_account(username, stub=False):
    """ Scrape Instagram bio data and create (or update) a profile. """

    # Getting a user ID is more difficult than it ought to be: you need to
    # search for the username and iterate through the search results to find
    # an exact match.
    db_session = worker.get_session()
    proxies = _get_proxies(db_session)

    api_url = 'https://api.instagram.com/v1/users/search'
    params = {'q': username}

    response = requests.get(
        api_url,
        params=params,
        proxies=proxies,
        verify=False
    )
    response.raise_for_status()
    search_results = response.json()

    username_lower = username.lower()
    user_id = None

    for user_result in search_results['data']:
        if user_result['username'].lower() == username_lower:
            user_id = user_result['id']
            break

    if user_id is None:
        raise ScrapeException('Can\'t find Instagram user named {}.'
                              .format(username))

    # Now make another request to get this user's profile data.
    api_url = 'https://api.instagram.com/v1/users/{}'.format(user_id)

    response = requests.get(
        api_url,
        proxies=proxies,
        verify=False
    )
    response.raise_for_status()
    data = response.json()['data']

    profile = Profile('instagram', user_id, data['username'])
    db_session.add(profile)

    try:
        db_session.commit()
    except IntegrityError:
        # Already exists: use the existing profile.
        db_session.rollback()
        profile = db_session.query(Profile) \
                            .filter(Profile.site == 'instagram') \
                            .filter(Profile.upstream_id == user_id) \
                            .one()

    profile.last_update = datetime.now()
    profile.description = data['bio']
    profile.follower_count = int(data['counts']['followed_by'])
    profile.friend_count = int(data['counts']['follows'])
    profile.homepage = data['website']
    profile.name = data['full_name']
    profile.post_count = int(data['counts']['media'])
    profile.is_stub = stub
    db_session.commit()

    # Schedule followup jobs.
    app.queue.schedule_index_profile(profile)  # Index all profiles, including stubs.

    if not stub:
        app.queue.schedule_avatar(profile, data['profile_picture'])
        app.queue.schedule_posts(profile, recent=True)
        app.queue.schedule_relations(profile)

    return profile.as_dict()

def scrape_instagram_account(username):
    """ Scrape Instagram bio data and create (or update) a profile. """

    # Getting a user ID is more difficult than it ought to be: you need to
    # search for the username and iterate through the search results to find
    # an exact match.
    db_session = worker.get_session()
    proxies = _get_proxies(db_session)

    api_url = 'https://api.instagram.com/v1/users/search'
    params = {'q': username}

    response = requests.get(
        api_url,
        params=params,
        proxies=proxies,
        verify=False
    )
    response.raise_for_status()
    search_results = response.json()

    username_lower = username.lower()
    user_id = None

    for user_result in search_results['data']:
        if user_result['username'].lower() == username_lower:
            user_id = user_result['id']
            break

    if user_id is None:
        raise ScrapeException('Can\'t find Instagram user named {}.'
                              .format(username))

    # Now make another request to get this user's profile data.
    api_url = 'https://api.instagram.com/v1/users/{}'.format(user_id)

    response = requests.get(
        api_url,
        proxies=proxies,
        verify=False
    )
    response.raise_for_status()
    data = response.json()['data']

    profile = Profile('instagram', user_id, data['username'])
    db_session.add(profile)

    try:
        db_session.commit()
    except IntegrityError:
        # Already exists: use the existing profile.
        db_session.rollback()
        profile = db_session.query(Profile) \
                            .filter(Profile.site == 'instagram') \
                            .filter(Profile.upstream_id == user_id) \
                            .one()

    profile.description = data['bio']
    profile.follower_count = int(data['counts']['followed_by'])
    profile.friend_count = int(data['counts']['follows'])
    profile.homepage = data['website']
    profile.name = data['full_name']
    profile.post_count = int(data['counts']['media'])
    profile.is_stub = False
    db_session.commit()

    # Schedule followup jobs.
    app.queue.schedule_avatar(profile, data['profile_picture'])
    app.queue.schedule_index_profile(profile)
    app.queue.schedule_posts(profile, recent=True)
    app.queue.schedule_relations(profile)

    return profile.as_dict()

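# Every scraper above repeats the same commit/rollback upsert: add a new
# Profile and, on IntegrityError, fall back to the existing row. A minimal
# sketch of a helper that could consolidate the pattern (hypothetical; not
# part of the current module):
def _upsert_profile_sketch(db_session, site, upstream_id, username):
    """ Insert a profile, or return the existing one if it already exists. """
    profile = Profile(site, upstream_id, username)
    db_session.add(profile)

    try:
        db_session.commit()
    except IntegrityError:
        # Already exists: use the existing profile.
        db_session.rollback()
        profile = db_session.query(Profile) \
                            .filter(Profile.site == site) \
                            .filter(Profile.upstream_id == upstream_id) \
                            .one()

    return profile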