def scrape_instagram_account_by_id(upstream_id, stub=False):
    """
    Scrape Instagram bio data for an upstream ID and create or update a profile.

    Requests ``/v1/users/{upstream_id}`` from the Instagram API, inserts a
    profile row for the user (falling back to the already stored row when the
    insert raises ``IntegrityError``), copies the bio fields onto the profile
    and schedules the follow-up jobs.

    :param upstream_id: Instagram's ID for the user. It is interpolated into
        the API URL and stored as the profile's upstream ID.
    :param stub: value stored in ``profile.is_stub``. When true, only the
        index job is scheduled; avatar, posts and relations jobs are skipped.
    :returns: the result of ``profile.as_dict()``.
    :raises requests.HTTPError: raised by ``raise_for_status()`` when the API
        answers with an error status.
    """
    db_session = worker.get_session()
    proxies = _get_proxies(db_session)

    # Instagram API request.
    # NOTE(review): verify=False turns off TLS certificate verification.
    # Presumably deliberate because traffic goes through proxies -- confirm.
    api_url = 'https://api.instagram.com/v1/users/{}'.format(upstream_id)
    response = requests.get(
        api_url,
        proxies=proxies,
        verify=False
    )
    response.raise_for_status()

    # The user record sits under the 'data' key of the response envelope and
    # exposes the account name as 'username' (same shape the username-based
    # scraper in this file reads from this endpoint).
    data = response.json()['data']

    profile = Profile('instagram', upstream_id, data['username'])
    db_session.add(profile)

    try:
        db_session.commit()
    except IntegrityError:
        # Already exists: use the existing profile.
        db_session.rollback()
        profile = db_session.query(Profile) \
                            .filter(Profile.site == 'instagram') \
                            .filter(Profile.upstream_id == upstream_id) \
                            .one()

    # Update profile.
    profile.last_update = datetime.now()
    profile.description = data['bio']
    profile.follower_count = int(data['counts']['followed_by'])
    profile.friend_count = int(data['counts']['follows'])
    profile.homepage = data['website']
    profile.name = data['full_name']
    profile.post_count = int(data['counts']['media'])
    profile.is_stub = stub
    db_session.commit()

    # Schedule followup jobs.
    app.queue.schedule_index_profile(profile)  # index all profiles, inc stubs

    if not stub:
        app.queue.schedule_avatar(profile, data['profile_picture'])
        app.queue.schedule_posts(profile, recent=True)
        app.queue.schedule_relations(profile)

    return profile.as_dict()
def _store_instagram_relations(db, profile, url, proxies, known_ids,
                               relation_attr, max_results, progress_offset=0):
    """
    Page through one Instagram relations endpoint and attach new stub profiles.

    :param db: database session used to insert/look up related profiles.
    :param profile: the profile whose relations are being collected.
    :param url: Instagram API URL to page through (``follows`` or
        ``followed-by``).
    :param proxies: proxies mapping passed to ``requests.get``.
    :param known_ids: set of upstream IDs to skip. IDs stored by this call
        are added to it so a user repeated across pages is attached once.
    :param relation_attr: name of the collection attribute on ``profile`` to
        append to (``'friends'`` or ``'followers'``).
    :param max_results: stop after this many relations have been stored.
    :param progress_offset: added to the running count when reporting job
        progress through ``worker.update_job``.
    :returns: the number of relations stored by this call.
    :raises requests.HTTPError: raised by ``raise_for_status()`` when the API
        answers with an error status.
    """
    stored = 0
    params = {}

    while stored < max_results:
        # NOTE(review): verify=False turns off TLS certificate verification.
        # Presumably deliberate because traffic goes through proxies -- confirm.
        response = requests.get(
            url,
            params=params,
            proxies=proxies,
            verify=False
        )
        response.raise_for_status()

        # Decode the body once and reuse it for both keys.
        payload = response.json()
        pagination = payload['pagination']

        for user in payload['data']:
            # Only store relations that are not already known.
            if user['id'] in known_ids:
                continue

            related_profile = Profile(
                'instagram',
                user['id'],
                user['username'],
                is_stub=True
            )
            db.add(related_profile)

            try:
                db.commit()
            except IntegrityError:
                # Already exists: use the existing profile.
                db.rollback()
                related_profile = db \
                    .query(Profile) \
                    .filter(Profile.site == 'instagram') \
                    .filter(Profile.upstream_id == user['id']) \
                    .one()

            related_profile.name = user['full_name']
            # Look the collection up on every iteration rather than caching
            # it: NOTE(review) assuming SQLAlchemy session semantics, a
            # commit/rollback expires `profile`, so a cached list could be
            # stale.
            getattr(profile, relation_attr).append(related_profile)
            # Persist the relation right away. Left pending, it would be
            # discarded by the next iteration's rollback and the final one
            # would never be committed at all.
            db.commit()

            known_ids.add(user['id'])
            stored += 1
            worker.update_job(current=progress_offset + stored)

            if stored == max_results:
                break

        # If there are more results, set the cursor parameter, otherwise
        # finish.
        if 'next_cursor' not in pagination:
            break  # No more results

        params['cursor'] = pagination['next_cursor']

    return stored


def scrape_instagram_relations(id_):
    """
    Fetch friends and followers for the Instagram user identified by `id_`.

    The number of friends and followers to fetch is read from the
    ``max_relations_instagram`` configuration value; up to that many friends
    and up to that many followers are stored. Relations already linked to the
    profile in the database are skipped. When finished, a message carrying
    the profile ID is published on the ``profile_relations`` redis channel.

    :param id_: primary key (``Profile.id``) of the profile to scrape.
    :raises ScrapeException: if ``max_relations_instagram`` is not an integer.
    :raises ValueError: if no profile exists with the given ID.
    :raises requests.HTTPError: if an API request returns an error status.
    """
    redis = worker.get_redis()
    db = worker.get_session()
    profile = db.query(Profile).filter(Profile.id == id_).first()
    proxies = _get_proxies(db)

    max_results = get_config(db, 'max_relations_instagram', required=True).value

    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        raise ScrapeException(
            'Value of max_relations_instagram must be an integer'
        )

    if profile is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    # Get friends currently stored in db for this profile.
    friends_query = \
        db.query(Profile.upstream_id) \
          .join(
              profile_join_self,
              (profile_join_self.c.friend_id == Profile.id)
          ) \
          .filter(profile_join_self.c.follower_id == id_)
    current_friends_ids = {friend.upstream_id for friend in friends_query}

    # Get followers currently stored in db for this profile.
    followers_query = \
        db.query(Profile.upstream_id) \
          .join(
              profile_join_self,
              (profile_join_self.c.follower_id == Profile.id)
          ) \
          .filter(profile_join_self.c.friend_id == id_)
    current_followers_ids = {
        follower.upstream_id for follower in followers_query
    }

    friends_url = 'https://api.instagram.com/v1/users/{}/follows' \
                  .format(profile.upstream_id)
    followers_url = 'https://api.instagram.com/v1/users/{}/followed-by' \
                    .format(profile.upstream_id)

    # Progress is reported over both phases: friends first, then followers.
    worker.start_job(total=max_results * 2)

    friends_results = _store_instagram_relations(
        db, profile, friends_url, proxies, current_friends_ids,
        'friends', max_results
    )

    _store_instagram_relations(
        db, profile, followers_url, proxies, current_followers_ids,
        'followers', max_results, progress_offset=friends_results
    )

    worker.finish_job()
    redis.publish('profile_relations', json.dumps({'id': id_}))
def scrape_instagram_account(username, stub=False):
    """
    Scrape Instagram bio data and create (or update) a profile.

    The username is first resolved to a user ID through the search endpoint
    (case-insensitive exact match on the username), then the user's record is
    fetched, stored on the profile and the follow-up jobs are scheduled.

    :param username: Instagram username to look up.
    :param stub: value stored in ``profile.is_stub``. When true, only the
        index job is scheduled; avatar, posts and relations jobs are skipped.
    :returns: the result of ``profile.as_dict()``.
    :raises ScrapeException: if no search result matches ``username`` exactly.
    :raises requests.HTTPError: raised by ``raise_for_status()`` when either
        API request answers with an error status.
    """
    # Getting a user ID is more difficult than it ought to be: you need to
    # search for the username and iterate through the search results to
    # find an exact match.
    db_session = worker.get_session()
    proxies = _get_proxies(db_session)

    # NOTE(review): verify=False turns off TLS certificate verification on
    # both requests below. Presumably deliberate because traffic goes through
    # proxies -- confirm.
    api_url = 'https://api.instagram.com/v1/users/search'
    params = {'q': username}

    response = requests.get(
        api_url,
        params=params,
        proxies=proxies,
        verify=False
    )
    response.raise_for_status()
    search_results = response.json()

    username_lower = username.lower()
    user_id = next(
        (
            user_result['id']
            for user_result in search_results['data']
            if user_result['username'].lower() == username_lower
        ),
        None
    )

    if user_id is None:
        raise ScrapeException('Can\'t find Instagram user named {}.'
                              .format(username))

    # Now make another request to get this user's profile data.
    api_url = 'https://api.instagram.com/v1/users/{}'.format(user_id)

    response = requests.get(
        api_url,
        proxies=proxies,
        verify=False
    )
    response.raise_for_status()
    data = response.json()['data']

    profile = Profile('instagram', user_id, data['username'])
    db_session.add(profile)

    try:
        db_session.commit()
    except IntegrityError:
        # Already exists: use the existing profile.
        db_session.rollback()
        profile = db_session.query(Profile) \
                            .filter(Profile.site == 'instagram') \
                            .filter(Profile.upstream_id == user_id) \
                            .one()

    profile.last_update = datetime.now()
    profile.description = data['bio']
    profile.follower_count = int(data['counts']['followed_by'])
    profile.friend_count = int(data['counts']['follows'])
    profile.homepage = data['website']
    profile.name = data['full_name']
    profile.post_count = int(data['counts']['media'])
    profile.is_stub = stub
    db_session.commit()

    # Schedule followup jobs.
    app.queue.schedule_index_profile(profile)  # index all profiles, inc stubs

    if not stub:
        app.queue.schedule_avatar(profile, data['profile_picture'])
        app.queue.schedule_posts(profile, recent=True)
        app.queue.schedule_relations(profile)

    return profile.as_dict()