Пример #1
0
def create_caption_dict():
    """
    Create a dictionary with influencer ids, list of posts, and list of captions.

    Every document returned by ``collection.find({})`` is grouped under the
    id of its owner (``x['node']['owner']['id']``). The collection comes from
    ``setup_mongo_client('instascrape', 'test')``.

    Args: None

    Output:
        captions (dict): Dictionary keyed by influencer id. Each value starts
            as ``{'posts': [], 'caption': []}`` and is then handed to
            add_posts_captions together with the document (presumably to
            append the post and its caption; confirm against that helper).
    """

    client, collection = setup_mongo_client('instascrape', 'test')

    captions = {}
    try:
        for x in collection.find({}):
            user_id = x['node']['owner']['id']
            # Create the empty entry only the first time an owner is seen
            captions.setdefault(user_id, {'posts': [], 'caption': []})
            add_posts_captions(captions, user_id, x)
    finally:
        # Release the connection even if a document cannot be processed
        client.close()

    # Remove profiles that have been deleted since initial scraping.
    # pop() with a default does not raise when the profile is not present.
    captions.pop('4018066784', None)

    return captions
def followscrape(num_requests):
    """
    Scrape instagram followers

    For each request the next influencer id is taken from
    find_next_influencer, the follower query is paged through in a Firefox
    webdriver until ``page_info['has_next_page']`` is false, every page is
    passed to insert_data, and the finished id is handed to write_text with
    'data/scraped_influencers.txt'.

    The webdriver is quit and the mongo client closed on exit, including when
    an exception interrupts the scrape.

    Args:
        num_requests (int): Number of influencers to be scraped

    Output: None
    """

    # First page asks for 20 followers; later pages ask for 500 after a cursor
    init_url_search = "https://www.instagram.com/graphql/query/?query_id=17851374694183129&variables={{%22id%22:%22{}%22,%22first%22:20}}"
    base_url_search = "https://www.instagram.com/graphql/query/?query_id=17851374694183129&variables={{%22id%22:%22{}%22,%22first%22:500,%22after%22:%22{}%22}}"

    client, collection = setup_mongo_client('instascrape', 'followers')
    try:
        driver = webdriver.Firefox()
        try:
            selenium_instagram_login(driver, 'instagram_credentials.json')

            for i in range(num_requests):
                influencer_id = find_next_influencer(
                    'data/ordered_influencers.txt',
                    'data/scraped_influencers.txt')

                # Initial search for followers
                page_counter = 1
                page_info = _scrape_follower_page(
                    driver, init_url_search.format(influencer_id),
                    influencer_id, collection, page_counter)

                # Keep searching while followers still exist
                while page_info['has_next_page']:
                    page_counter += 1
                    next_url = base_url_search.format(
                        influencer_id, str(page_info['end_cursor']))
                    page_info = _scrape_follower_page(
                        driver, next_url, influencer_id, collection,
                        page_counter)
                    # Random pause between consecutive page requests
                    time.sleep(np.random.uniform(7, 10))

                write_text(influencer_id, 'data/scraped_influencers.txt')
                time.sleep(np.random.uniform(7, 10))
                print("Finished scraping {} influencers of {}".format(
                    i + 1, num_requests))
        finally:
            # Always shut the browser down so no Firefox process is left over
            driver.quit()
    finally:
        client.close()

    print("\n Finished scraping {} influencers' followers".format(num_requests))
    return None


def _scrape_follower_page(driver, url, influencer_id, collection, page_number):
    """
    Load one follower page, store it, and report progress.

    Args:
        driver: Webdriver used to open the url
        url (str): Fully formatted query url for this page
        influencer_id: Id of the influencer whose followers are scraped
        collection: Collection passed through to insert_data
        page_number (int): Running page count, used only in the progress line

    Output:
        page_info: Result of get_page_info for the loaded page; the caller
            reads its 'has_next_page' and 'end_cursor' entries.
    """
    driver.get(url)
    data_as_json = load_json_from_html(driver)
    insert_data(data_as_json, influencer_id, collection)
    page_info = get_page_info(data_as_json)
    print("Finished scraping {} pages for influencer {}".format(
        page_number, influencer_id))
    return page_info
def instascrape(page_info_filepath, num_requests):
    """
    Scrape instagram hashtag search

    Starts from the page_info loaded by load_last_line and requests up to
    num_requests pages of 12 results each. After every successful page the
    new page_info is passed to add_new_line with page_info_filepath.

    Scraping stops early when page_info reports no next page, or when a
    response has a status code other than 200 (the code is printed and the
    function returns). The mongo client is closed on every exit path.

    Args:
        page_info_filepath (str): Filepath to text file with page_info dicts
        num_requests (int): Number of pages to be scraped

    Action: saves influencer node information to pymongo database

    Output: None
    """

    base_url_search = "https://www.instagram.com/graphql/query/?query_id=17875800862117404&variables={{%22tag_name%22:%22womenwhoclimb%22,%22first%22:{},%22after%22:%22{}%22}}"

    client, collection = setup_mongo_client('instascrape', 'test')

    pages_scraped = 0
    try:
        page_info = load_last_line(page_info_filepath)

        for _ in range(num_requests):
            if not page_info['has_next_page']:
                # Nothing left to request: stop instead of sleeping through
                # the remaining iterations and logging pages never fetched
                break

            response = requests.get(
                base_url_search.format('12', str(page_info['end_cursor'])))

            if response.status_code != 200:
                print("Status Code = " + str(response.status_code))
                return None

            insert_edge(response, collection)
            page_info = get_page_info(response)
            add_new_line(page_info, page_info_filepath)
            pages_scraped += 1

            # Random pause between requests
            time.sleep(np.random.uniform(15, 45))
            print("Finished scraping {} pages of {}".format(
                pages_scraped, num_requests))
    finally:
        # Runs on the early return above as well as on errors
        client.close()

    # Report the pages actually scraped, which may be fewer than requested
    print("\n Finished scraping {} pages of 12 influencers each".format(
        pages_scraped))
    return None
Пример #4
0
def create_influencer_dict(filepath_json, return_dict=False):
    """
    Create a dictionary with influencer id and posts.

    Every document returned by ``collection.find({})`` contributes its
    shortcode (``x['node']['shortcode']``) to the 'posts' list of its owner
    (``x['node']['owner']['id']``). Both values are converted with str().

    Args:
        filepath_json (str): Filepath where influencer dictionary will be saved as json.
        return_dict (bool): If True, the dictionary is also returned.
            Defaults to False.

    Action: Saves influencer dictionary to filepath.
        influencers (dict)
            Keys: id
            Values (dict):
                Keys: 'posts' (list)

    Output:
        influencers (dict) when return_dict is True, otherwise None.
    """

    client, collection = setup_mongo_client('instascrape', 'test')

    # Group shortcodes by owner id straight from the cursor
    influencers = {}
    try:
        for x in collection.find({}):
            node = x['node']
            shortcode = str(node['shortcode'])
            owner_id = str(node['owner']['id'])
            influencers.setdefault(owner_id, {'posts': []})['posts'].append(
                shortcode)
    finally:
        # The database is not needed after the cursor has been consumed
        client.close()

    # Remove profiles that have been deleted since initial scraping.
    # pop() with a default does not raise when the profile is not present.
    influencers.pop('4018066784', None)

    write_json(influencers, filepath_json)

    if return_dict:
        return influencers
    return None
Пример #5
0
def create_hashtag_likes_dict(filepath_json, return_dict=False):
    """
    Create a dictionary with influencer id, hashtags, posts, and
    number of likes for each post.

    Every document returned by ``collection.find({})`` is grouped under the
    id of its owner (``x['node']['owner']['id']``) and handed to
    add_hashtag_posts_likes (presumably to append to the three lists;
    confirm against that helper).

    Args:
        filepath_json (str): Filepath where dictionary will be saved as json.
        return_dict (bool): If True, the dictionary is also returned.
            Defaults to False.

    Action: Saves influencer dictionary to filepath.
        influencers (dict)
            Keys: id
            Values (dict):
                Keys: 'hashtags' (list), 'posts' (list), 'likes' (list)

    Output:
        influencers (dict) when return_dict is True, otherwise None.
    """

    client, collection = setup_mongo_client('instascrape', 'test')

    influencers = {}
    try:
        for x in collection.find({}):
            user_id = x['node']['owner']['id']
            # Create the empty entry only the first time an owner is seen
            influencers.setdefault(
                user_id, {'hashtags': [], 'posts': [], 'likes': []})
            add_hashtag_posts_likes(influencers, user_id, x)
    finally:
        # Release the connection even if a document cannot be processed
        client.close()

    # Remove profiles that have been deleted since initial scraping.
    # pop() with a default does not raise when the profile is not present.
    influencers.pop('4018066784', None)

    write_json(influencers, filepath_json)

    if return_dict:
        return influencers
    return None