Example #1
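These snippets are excerpts from a larger Facebook Graph API scraper. They assume module-level imports (csv, datetime, json) along with shared helpers (request_until_succeed, getGroupSearchUrl, open_csv_w, processFacebookPageFeedStatus, processFacebookComment) and globals (query_title, query_space) defined elsewhere in the project; hedged sketches of the missing helpers appear alongside the examples below.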
def scrapePageSearch(access_token):
    all_groups = []
    with open('data/{}_groups.txt'.format(query_title), 'w') as file:
        has_next_page = True
        scrape_starttime = datetime.datetime.now()

        print("Scraping groups relevant to {}: {}\n".format(
            query_space, scrape_starttime))
        url = getGroupSearchUrl()

        while has_next_page and url is not None:
            raw = json.loads(request_until_succeed(url))
            groups = [{
                "id": group['id'],
                "name": group['name']
            } for group in raw['data']]
            all_groups += groups

            # if there is no next page, we're done.
            if 'paging' in raw and 'next' in raw['paging']:
                url = raw['paging']['next']
            else:
                has_next_page = False

        file.write(json.dumps({"data": all_groups}))
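Every example leans on a request_until_succeed helper that is not shown in these excerpts. A minimal sketch of what it might look like, assuming it retries the HTTP GET with a fixed back-off and returns the response body as a string (the retry cap and 5-second delay are assumptions, not necessarily the original's values):

import time
import urllib.request

def request_until_succeed(url, max_attempts=5):
    # Hypothetical helper: retry until the Graph API responds, returning
    # the body as a string, or None after max_attempts failures.
    # (max_attempts is an assumption; the original may retry forever.)
    for attempt in range(max_attempts):
        try:
            with urllib.request.urlopen(url) as response:
                if response.getcode() == 200:
                    return response.read().decode('utf-8')
        except Exception as e:
            print('Error for URL %s: %s' % (url, e))
            time.sleep(5)  # back off before retrying
    return None

Returning None on persistent failure is consistent with the None checks in Examples #3 and #6.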
Example #2
def getFacebookPageFeedData(page_id, access_token, num_statuses):

    # Construct the URL string; see http://stackoverflow.com/a/37239851 for
    # Reactions parameters
    base = "https://graph.facebook.com/v2.9"
    node = "/%s/posts" % page_id
    fields = "/?fields=message,link,created_time,type,name,id," + \
            "comments.limit(0).summary(true),shares,reactions" + \
            ".limit(0).summary(true)"
    parameters = "&limit=%s&access_token=%s" % (num_statuses, access_token)
    url = base + node + fields + parameters

    # retrieve data
    data = json.loads(request_until_succeed(url))

    return data
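For illustration, a hypothetical invocation (the page name and token are placeholders, not real credentials):

access_token = 'APP_ID|APP_SECRET'  # placeholder, not a real token
feed = getFacebookPageFeedData('nytimes', access_token, 100)
for status in feed['data']:
    print(status['id'], status.get('message', '')[:80])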
Example #3
def getFacebookCommentFeedData(status_id, access_token, num_comments):

    # Construct the URL string
    base = "https://graph.facebook.com/v2.9"
    node = "/%s/comments" % status_id
    fields = "?fields=id,message,like_count,created_time,comments,from,attachment"
    parameters = "&order=chronological&limit=%s&access_token=%s" % \
            (num_comments, access_token)
    url = base + node + fields + parameters

    # retrieve data
    data = request_until_succeed(url)
    if data is None:
        return None
    else:
        return json.loads(data)
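As with the page feed, a hypothetical invocation (the status ID below is a placeholder):

# Hypothetical usage: fetch up to 100 top-level comments on one status.
comments = getFacebookCommentFeedData('PAGEID_POSTID', access_token, 100)
if comments is not None:
    print('%s comments on the first page' % len(comments['data']))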
Example #4
def scrapeFacebookPageFeedStatus(group_id, access_token):
    with open_csv_w('../output/%s_facebook_statuses.csv' % group_id) as file:
        w = csv.writer(file)
        w.writerow([
            "status_id", "status_message", "status_author", "link_name",
            "status_type", "status_link", "status_published", "num_reactions",
            "num_comments", "num_shares", "num_likes", "num_loves", "num_wows",
            "num_hahas", "num_sads", "num_angrys"
        ])

        has_next_page = True
        num_processed = 0  # keep a count on how many we've processed
        scrape_starttime = datetime.datetime.now()

        print("Scraping %s Facebook Page: %s\n" % \
                (group_id, scrape_starttime))

        statuses = getFacebookPageFeedData(group_id, access_token, 100)

        while has_next_page:
            for status in statuses['data']:

                # Ensure it is a status with the expected metadata
                if 'reactions' in status:
                    w.writerow(processFacebookPageFeedStatus(status, \
                                                            access_token))

                # output progress occasionally to make sure code is not
                # stalling
                num_processed += 1
                if num_processed % 100 == 0:
                    print("%s Statuses Processed: %s") % (
                        num_processed, datetime.datetime.now())

            # if there is no next page, we're done.
            if 'paging' in statuses and 'next' in statuses['paging']:
                statuses = json.loads(request_until_succeed(
                        statuses['paging']['next']))
            else:
                has_next_page = False


        print("\nDone!\n%s Statuses Processed in %s") % \
                (num_processed, datetime.datetime.now() - scrape_starttime)
Example #5
def getReactionsForStatus(status_id, access_token):

    # See http://stackoverflow.com/a/37239851 for Reactions parameters;
    # Reactions are only accessible at a single-post endpoint

    base = "https://graph.facebook.com/v2.9"
    node = "/%s" % status_id
    reactions = "/?fields=" \
            "reactions.type(LIKE).limit(0).summary(total_count).as(like)" \
            ",reactions.type(LOVE).limit(0).summary(total_count).as(love)" \
            ",reactions.type(WOW).limit(0).summary(total_count).as(wow)" \
            ",reactions.type(HAHA).limit(0).summary(total_count).as(haha)" \
            ",reactions.type(SAD).limit(0).summary(total_count).as(sad)" \
            ",reactions.type(ANGRY).limit(0).summary(total_count).as(angry)"
    parameters = "&access_token=%s" % access_token
    url = base + node + reactions + parameters

    # retrieve data
    data = json.loads(request_until_succeed(url))

    return data
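scrapeFacebookPageFeedStatus in Example #4 also depends on processFacebookPageFeedStatus, which is not included in these excerpts. A plausible reconstruction, assuming it flattens one status dict into the sixteen CSV columns written above and pulls per-reaction counts from getReactionsForStatus:

def processFacebookPageFeedStatus(status, access_token):
    # Hypothetical reconstruction, not the original implementation.
    status_id = status['id']
    status_message = status.get('message', '')
    # 'from' is only present if requested in the fields parameter.
    status_author = status.get('from', {}).get('name', '')
    link_name = status.get('name', '')
    status_type = status.get('type', '')
    status_link = status.get('link', '')
    status_published = status.get('created_time', '')

    num_reactions = status['reactions']['summary']['total_count']
    num_comments = status.get('comments', {}).get(
        'summary', {}).get('total_count', 0)
    num_shares = status.get('shares', {}).get('count', 0)

    # Per-reaction counts come from the single-post endpoint above.
    reactions = getReactionsForStatus(status_id, access_token)
    counts = [reactions.get(k, {}).get('summary', {}).get('total_count', 0)
              for k in ('like', 'love', 'wow', 'haha', 'sad', 'angry')]

    return [status_id, status_message, status_author, link_name,
            status_type, status_link, status_published, num_reactions,
            num_comments, num_shares] + counts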
Example #6
def scrapeFacebookPageFeedComments(file_id, access_token):
    with open_csv_w('../output/%s_facebook_comments.csv' % file_id) as file:
        w = csv.writer(file)
        w.writerow([
            "comment_id", "status_id", "parent_id", "comment_message",
            "comment_author", "comment_published", "comment_likes"
        ])

        num_processed = 0  # keep a count on how many we've processed
        scrape_starttime = datetime.datetime.now()

        print("Scraping %s Comments From Posts: %s\n" % \
                (file_id, scrape_starttime))

        # read back the statuses CSV produced by the page-feed scraper
        with open('../output/%s_facebook_statuses.csv' % file_id,
                  'r') as csvfile:
            reader = csv.DictReader(csvfile)

            for status in reader:
                has_next_page = True

                comments = getFacebookCommentFeedData(status['status_id'],
                                                      access_token, 100)

                while has_next_page and comments is not None:
                    for comment in comments['data']:
                        w.writerow(
                            processFacebookComment(comment,
                                                   status['status_id']))

                        if 'comments' in comment:
                            has_next_subpage = True

                            subcomments = getFacebookCommentFeedData(
                                comment['id'], access_token, 100)

                            while has_next_subpage and subcomments is not None:
                                for subcomment in subcomments['data']:
                                    w.writerow(
                                        processFacebookComment(
                                            subcomment, status['status_id'],
                                            comment['id']))

                                    num_processed += 1
                                    if num_processed % 1000 == 0:
                                        print("%s Comments Processed: %s" % \
                                                (num_processed,
                                                    datetime.datetime.now()))

                                if 'paging' in subcomments:
                                    if 'next' in subcomments['paging']:
                                        subcomments = json.loads(
                                            request_until_succeed(
                                                subcomments['paging']['next']))
                                    else:
                                        has_next_subpage = False
                                else:
                                    has_next_subpage = False

                        # output progress occasionally to make sure code is not
                        # stalling
                        num_processed += 1
                        if num_processed % 1000 == 0:
                            print("%s Comments Processed: %s" % \
                                    (num_processed, datetime.datetime.now()))

                    if 'paging' in comments:
                        if 'next' in comments['paging']:
                            comments = json.loads(
                                request_until_succeed(
                                    comments['paging']['next']))
                        else:
                            has_next_page = False
                    else:
                        has_next_page = False


        print("\nDone!\n%s Comments Processed in %s" % \
                (num_processed, datetime.datetime.now() - scrape_starttime))
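Two more shared helpers are referenced above but not excerpted: open_csv_w and processFacebookComment. Hypothetical reconstructions, under the same caveats as the sketches above:

import io
import sys

def open_csv_w(path):
    # Hypothetical helper: open a CSV file for writing on both Python 2
    # (binary mode) and Python 3 (text mode, newline='' for the csv module).
    if sys.version_info[0] >= 3:
        return io.open(path, 'w', newline='', encoding='utf-8')
    return open(path, 'wb')

def processFacebookComment(comment, status_id, parent_id=''):
    # Hypothetical reconstruction: flatten one comment dict into the seven
    # CSV columns written by scrapeFacebookPageFeedComments above.
    return [comment['id'], status_id, parent_id,
            comment.get('message', ''),
            comment.get('from', {}).get('name', ''),
            comment.get('created_time', ''),
            comment.get('like_count', 0)]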