def scrapePageSearch(access_token):
    """Scrape Facebook groups matching the module-level search query.

    Follows the Graph API paging links until exhausted and dumps every
    group's id/name to data/<query_title>_groups.txt as a JSON object.

    Relies on module-level ``query_title`` and ``query_space`` and the
    ``getGroupSearchUrl`` / ``request_until_succeed`` helpers defined
    elsewhere in this file.

    Args:
        access_token: Facebook Graph API token (consumed indirectly via
            getGroupSearchUrl and the paging URLs it yields).
    """
    all_groups = []
    with open('data/{}_groups.txt'.format(query_title), 'w') as file:
        has_next_page = True
        num_processed = 0
        scrape_starttime = datetime.datetime.now()
        print("Scraping groups relevant to {}: {}\n".format(
            query_space, scrape_starttime))
        url = getGroupSearchUrl()
        while has_next_page and url is not None:
            raw = dict(json.loads(request_until_succeed(url)))
            groups = [{
                "id": group['id'],
                "name": group['name']
            } for group in raw['data']]
            all_groups += groups
            # BUG FIX: the Graph API can return a 'paging' object without a
            # 'next' link on the final page; the original indexed
            # raw['paging']['next'] unconditionally and raised KeyError there.
            if 'paging' in raw and 'next' in raw['paging']:
                url = raw['paging']['next']
            else:
                has_next_page = False
        file.write(json.dumps({"data": all_groups}))
def getFacebookPageFeedData(page_id, access_token, num_statuses):
    """Fetch one page of a Facebook page's posts feed as a parsed dict.

    Each post carries its message/link metadata plus zero-limit summary
    counts for comments, shares, and reactions; see
    http://stackoverflow.com/a/37239851 for the Reactions parameters.

    Args:
        page_id: Facebook page whose /posts edge is queried.
        access_token: Graph API access token.
        num_statuses: page size passed as the API `limit` parameter.
    """
    endpoint = "https://graph.facebook.com/v2.9/%s/posts" % page_id
    field_names = (
        "message", "link", "created_time", "type", "name", "id",
        "comments.limit(0).summary(true)", "shares",
        "reactions.limit(0).summary(true)",
    )
    url = "%s/?fields=%s&limit=%s&access_token=%s" % (
        endpoint, ",".join(field_names), num_statuses, access_token)

    # Retrieve and decode the JSON payload.
    return json.loads(request_until_succeed(url))
def getFacebookCommentFeedData(status_id, access_token, num_comments):
    """Fetch one page of the comment feed for a single status.

    Args:
        status_id: status whose /comments edge is queried.
        access_token: Graph API access token.
        num_comments: page size passed as the API `limit` parameter.

    Returns:
        The decoded JSON dict, or None when the underlying request helper
        reported failure.
    """
    url = (
        "https://graph.facebook.com/v2.9/%s/comments"
        "?fields=id,message,like_count,created_time,comments,from,attachment"
        "&order=chronological&limit=%s&access_token=%s"
        % (status_id, num_comments, access_token))

    raw = request_until_succeed(url)
    return json.loads(raw) if raw is not None else None
def scrapeFacebookPageFeedStatus(group_id, access_token):
    """Scrape every status of a Facebook page/group feed into a CSV.

    Walks the paged /posts feed via getFacebookPageFeedData and writes one
    row per status (via processFacebookPageFeedStatus) to
    ../output/<group_id>_facebook_statuses.csv.

    Args:
        group_id: page or group whose feed is scraped (also names the CSV).
        access_token: Graph API access token.
    """
    with open_csv_w('../output/%s_facebook_statuses.csv' % group_id) as file:
        w = csv.writer(file)
        w.writerow([
            "status_id", "status_message", "status_author", "link_name",
            "status_type", "status_link", "status_published",
            "num_reactions", "num_comments", "num_shares", "num_likes",
            "num_loves", "num_wows", "num_hahas", "num_sads", "num_angrys"
        ])
        has_next_page = True
        num_processed = 0  # keep a count on how many we've processed
        scrape_starttime = datetime.datetime.now()
        print("Scraping %s Facebook Page: %s\n" %
              (group_id, scrape_starttime))
        statuses = getFacebookPageFeedData(group_id, access_token, 100)
        while has_next_page:
            for status in statuses['data']:
                # Ensure it is a status with the expected metadata
                if 'reactions' in status:
                    w.writerow(processFacebookPageFeedStatus(status,
                                                             access_token))
                # output progress occasionally to make sure code is not
                # stalling
                num_processed += 1
                if num_processed % 100 == 0:
                    # BUG FIX: the original wrote print("...") % (args),
                    # applying % to print()'s None return -> TypeError.
                    print("%s Statuses Processed: %s" %
                          (num_processed, datetime.datetime.now()))
            # if there is no next page, we're done.  Guard 'next' too: the
            # final page may carry a 'paging' object without a 'next' link.
            if 'paging' in statuses and 'next' in statuses['paging']:
                statuses = json.loads(request_until_succeed(
                    statuses['paging']['next']))
            else:
                has_next_page = False
        # BUG FIX: same print()/% misplacement as above.
        print("\nDone!\n%s Statuses Processed in %s" %
              (num_processed, datetime.datetime.now() - scrape_starttime))
def getReactionsForStatus(status_id, access_token):
    """Fetch per-type reaction totals for one status as a parsed dict.

    Reactions are only accessible at the single-post endpoint; see
    http://stackoverflow.com/a/37239851 for the Reactions parameters.

    Args:
        status_id: status whose reaction summaries are queried.
        access_token: Graph API access token.
    """
    # One zero-limit summary field per reaction type, aliased to its
    # lowercase name, e.g. reactions.type(LIKE)...as(like).
    reaction_names = ("like", "love", "wow", "haha", "sad", "angry")
    fields = ",".join(
        "reactions.type(%s).limit(0).summary(total_count).as(%s)"
        % (name.upper(), name)
        for name in reaction_names)

    url = ("https://graph.facebook.com/v2.9/%s/?fields=%s&access_token=%s"
           % (status_id, fields, access_token))

    # Retrieve and decode the JSON payload.
    return json.loads(request_until_succeed(url))
def scrapeFacebookPageFeedComments(page_id, access_token):
    """Scrape all comments (plus one level of replies) for scraped statuses.

    Reads status IDs from ../output/<page_id>_facebook_statuses.csv — which
    scrapeFacebookPageFeedStatus must have produced first — and writes one
    row per comment/sub-comment (via processFacebookComment) to
    ../output/<page_id>_facebook_comments.csv.

    Args:
        page_id: page/group whose statuses' comments are scraped (also
            names both CSV files).
        access_token: Graph API access token.
    """
    with open_csv_w('../output/%s_facebook_comments.csv' % page_id) as file:
        w = csv.writer(file)
        w.writerow([
            "comment_id", "status_id", "parent_id", "comment_message",
            "comment_author", "comment_published", "comment_likes"
        ])
        num_processed = 0  # keep a count on how many we've processed
        scrape_starttime = datetime.datetime.now()
        print("Scraping %s Comments From Posts: %s\n" %
              (page_id, scrape_starttime))

        # BUG FIX: the original referenced an undefined `file_id` (the
        # parameter is `page_id`) and opened the statuses file for READING
        # through the open_csv_w write-helper with a 'rb' mode argument;
        # use a plain read-mode open instead.
        with open('../output/%s_facebook_statuses.csv' % page_id) as csvfile:
            reader = csv.DictReader(csvfile)
            for status in reader:
                has_next_page = True
                comments = getFacebookCommentFeedData(
                    status['status_id'], access_token, 100)
                while has_next_page and comments is not None:
                    for comment in comments['data']:
                        w.writerow(processFacebookComment(
                            comment, status['status_id']))
                        # Comments that have replies carry a nested
                        # 'comments' key; walk that one level of replies.
                        if 'comments' in comment:
                            has_next_subpage = True
                            subcomments = getFacebookCommentFeedData(
                                comment['id'], access_token, 100)
                            while has_next_subpage:
                                for subcomment in subcomments['data']:
                                    w.writerow(processFacebookComment(
                                        subcomment, status['status_id'],
                                        comment['id']))
                                    num_processed += 1
                                    if num_processed % 1000 == 0:
                                        print("%s Comments Processed: %s" %
                                              (num_processed,
                                               datetime.datetime.now()))
                                if ('paging' in subcomments and
                                        'next' in subcomments['paging']):
                                    subcomments = json.loads(
                                        request_until_succeed(
                                            subcomments['paging']['next']))
                                else:
                                    has_next_subpage = False
                        # output progress occasionally to make sure code is
                        # not stalling
                        num_processed += 1
                        if num_processed % 1000 == 0:
                            print("%s Comments Processed: %s" %
                                  (num_processed, datetime.datetime.now()))
                    if 'paging' in comments and 'next' in comments['paging']:
                        comments = json.loads(request_until_succeed(
                            comments['paging']['next']))
                    else:
                        has_next_page = False
    print("\nDone!\n%s Comments Processed in %s" %
          (num_processed, datetime.datetime.now() - scrape_starttime))