import time

import numpy as np
import requests
from selenium import webdriver

# NOTE: helpers used below (setup_mongo_client, add_posts_captions,
# selenium_instagram_login, load_json_from_html, insert_data, get_page_info,
# find_next_influencer, write_text, load_last_line, insert_edge, add_new_line,
# write_json, add_hashtag_posts_likes) are defined elsewhere in this project.


def create_caption_dict():
    """ Create a dictionary with influencer ids, list of posts, and list of
    captions.

    Args:
        None
    Output:
        captions (dict): Dictionary with influencer id, list of posts, and
            list of captions.
    """
    client, collection = setup_mongo_client('instascrape', 'test')
    captions = {}
    cursor = collection.find({})
    for x in cursor:
        user_id = x['node']['owner']['id']
        if user_id in captions:
            add_posts_captions(captions, user_id, x)
        else:
            captions[user_id] = {'posts': [], 'caption': []}
            add_posts_captions(captions, user_id, x)
    # Remove profiles that have been deleted since initial scraping
    del captions['4018066784']
    client.close()
    return captions
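
# create_caption_dict() delegates per-post bookkeeping to add_posts_captions,
# which is defined elsewhere in this module. A minimal, hypothetical sketch of
# what that helper plausibly does (the name _add_posts_captions_sketch and the
# assumption that captions live under node['edge_media_to_caption'] are mine,
# not the module's):
def _add_posts_captions_sketch(captions, user_id, x):
    """Append one post's shortcode and caption text for user_id."""
    captions[user_id]['posts'].append(str(x['node']['shortcode']))
    edges = x['node']['edge_media_to_caption']['edges']
    caption_text = edges[0]['node']['text'] if edges else ''
    captions[user_id]['caption'].append(caption_text)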
def followscrape(num_requests):
    """ Scrape Instagram followers

    Args:
        num_requests (int): Number of influencers to be scraped
    Output:
        None
    """
    client, collection = setup_mongo_client('instascrape', 'followers')
    driver = webdriver.Firefox()
    selenium_instagram_login(driver, 'instagram_credentials.json')

    init_url_search = ("https://www.instagram.com/graphql/query/"
                       "?query_id=17851374694183129"
                       "&variables={{%22id%22:%22{}%22,%22first%22:20}}")
    base_url_search = ("https://www.instagram.com/graphql/query/"
                       "?query_id=17851374694183129"
                       "&variables={{%22id%22:%22{}%22,%22first%22:500,"
                       "%22after%22:%22{}%22}}")

    for i in range(num_requests):
        influencer_id = find_next_influencer('data/ordered_influencers.txt',
                                             'data/scraped_influencers.txt')
        # Initial search for followers
        driver.get(init_url_search.format(influencer_id))
        data_as_json = load_json_from_html(driver)
        insert_data(data_as_json, influencer_id, collection)
        page_info = get_page_info(data_as_json)
        page_counter = 1
        print "Finished scraping {} pages for influencer {}".format(
            page_counter, influencer_id)

        # Keep searching while followers still exist
        while page_info['has_next_page']:
            driver.get(base_url_search.format(influencer_id,
                                              str(page_info['end_cursor'])))
            data_as_json = load_json_from_html(driver)
            insert_data(data_as_json, influencer_id, collection)
            page_info = get_page_info(data_as_json)
            page_counter += 1
            print "Finished scraping {} pages for influencer {}".format(
                page_counter, influencer_id)
            time.sleep(np.random.uniform(7, 10))

        write_text(influencer_id, 'data/scraped_influencers.txt')
        time.sleep(np.random.uniform(7, 10))
        print "Finished scraping {} influencers of {}".format(
            i + 1, num_requests)

    client.close()
    print "\n Finished scraping {} influencers' followers".format(num_requests)
    return None
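
# followscrape() assumes a get_page_info helper defined elsewhere in this
# module. For the follower query above, Instagram's GraphQL response nests
# pagination under data -> user -> edge_followed_by -> page_info. A
# hypothetical sketch of that lookup (the name and the exact nesting are my
# assumptions, not confirmed by this module):
def _get_follower_page_info_sketch(data_as_json):
    """Return the page_info dict: {'has_next_page': ..., 'end_cursor': ...}."""
    return data_as_json['data']['user']['edge_followed_by']['page_info']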
def instascrape(page_info_filepath, num_requests):
    """ Scrape Instagram hashtag search

    Args:
        page_info_filepath (str): Filepath to text file with page_info dicts
        num_requests (int): Number of pages to be scraped
    Action:
        Saves influencer node information to pymongo database
    Output:
        None
    """
    client, collection = setup_mongo_client('instascrape', 'test')
    page_info = load_last_line(page_info_filepath)
    base_url_search = ("https://www.instagram.com/graphql/query/"
                       "?query_id=17875800862117404"
                       "&variables={{%22tag_name%22:%22womenwhoclimb%22,"
                       "%22first%22:{},%22after%22:%22{}%22}}")

    for i in range(num_requests):
        if page_info['has_next_page']:
            response = requests.get(
                base_url_search.format('12', str(page_info['end_cursor'])))
            if response.status_code == 200:
                insert_edge(response, collection)
                page_info = get_page_info(response)
                add_new_line(page_info, page_info_filepath)
            else:
                print "Status Code = " + str(response.status_code)
                return None
            time.sleep(np.random.uniform(15, 45))
            print "Finished scraping {} pages of {}".format(i + 1, num_requests)

    client.close()
    print "\n Finished scraping {} pages of 12 influencers each".format(
        num_requests)
    return None
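
# instascrape() stores each scraped post node via insert_edge, defined
# elsewhere in this module. For the hashtag query above, post nodes sit under
# data -> hashtag -> edge_hashtag_to_media -> edges. A hypothetical sketch
# (the helper name and the JSON nesting are my assumptions):
def _insert_edge_sketch(response, collection):
    """Insert every post node from one hashtag page into the collection."""
    edges = response.json()['data']['hashtag']['edge_hashtag_to_media']['edges']
    if edges:
        collection.insert_many(edges)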
def create_influencer_dict(filepath_json, return_dict=False):
    """ Create a dictionary with influencer id and posts.

    Args:
        filepath_json (str): Filepath where influencer dictionary will be
            saved as json.
        return_dict (bool): If True, also return the influencer dictionary.
    Action:
        Saves influencer dictionary to filepath.
        influencers (dict)
            Keys: id
            Values (dict):
                Keys: 'posts' (list), 'followers' (list)
    Output:
        influencers (dict) if return_dict is True, otherwise None
    """
    client, collection = setup_mongo_client('instascrape', 'test')

    # Retrieve shortcodes and ids from influencers in MongoDB
    shortcodes_ids = []
    cursor = collection.find({})
    for x in cursor:
        shortcodes_ids.append(
            (str(x['node']['shortcode']), str(x['node']['owner']['id'])))

    # Create influencers dictionary
    influencers = {}
    for sc_id_tuple in shortcodes_ids:
        if sc_id_tuple[1] in influencers:
            influencers[sc_id_tuple[1]]['posts'].append(sc_id_tuple[0])
        else:
            influencers[sc_id_tuple[1]] = {'posts': [sc_id_tuple[0]]}

    # Remove profiles that have been deleted since initial scraping
    del influencers['4018066784']

    write_json(influencers, filepath_json)
    client.close()
    if return_dict:
        return influencers
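
# Example use of create_influencer_dict: build the id -> posts mapping once
# and keep it in memory for downstream work. The output path below is
# illustrative only, not a path this project necessarily uses.
#
#     influencers = create_influencer_dict('data/influencers.json',
#                                          return_dict=True)
#     print "Collected posts for {} influencers".format(len(influencers))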
def create_hashtag_likes_dict(filepath_json, return_dict=False):
    """ Create a dictionary with influencer id, hashtags, posts, and number of
    likes for each post.

    Args:
        filepath_json (str): Filepath where dictionary will be saved as json.
        return_dict (bool): If True, also return the influencer dictionary.
    Action:
        Saves influencer dictionary to filepath.
        influencers (dict)
            Keys: id
            Values (dict):
                Keys: 'hashtags' (list), 'posts' (list), 'likes' (list)
    Output:
        influencers (dict) if return_dict is True, otherwise None
    """
    client, collection = setup_mongo_client('instascrape', 'test')
    influencers = {}
    cursor = collection.find({})
    for x in cursor:
        user_id = x['node']['owner']['id']
        if user_id in influencers:
            add_hashtag_posts_likes(influencers, user_id, x)
        else:
            influencers[user_id] = {'hashtags': [], 'posts': [], 'likes': []}
            add_hashtag_posts_likes(influencers, user_id, x)

    # Remove profiles that have been deleted since initial scraping
    del influencers['4018066784']

    client.close()
    write_json(influencers, filepath_json)
    if return_dict:
        return influencers
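
# create_hashtag_likes_dict() leans on add_hashtag_posts_likes, defined
# elsewhere in this module. A rough, hypothetical sketch, assuming hashtags
# are parsed out of the caption text and like counts live under
# node['edge_liked_by']['count'] (both details are my assumptions):
def _add_hashtag_posts_likes_sketch(influencers, user_id, x):
    """Append one post's hashtags, shortcode, and like count for user_id."""
    edges = x['node']['edge_media_to_caption']['edges']
    caption_text = edges[0]['node']['text'] if edges else ''
    hashtags = [word for word in caption_text.split() if word.startswith('#')]
    influencers[user_id]['hashtags'].append(hashtags)
    influencers[user_id]['posts'].append(str(x['node']['shortcode']))
    influencers[user_id]['likes'].append(x['node']['edge_liked_by']['count'])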