def check_rate_limit(self, store=True, pause=True):
    # ToDo: Ugly processing !!!
    # Check the response headers of an API call to know if the rate is limited
    base = 'https://graph.facebook.com/v2.8'
    pageid = '270994524621'
    access = '&access_token={}'.format(self.access_token)
    node = '/{}/posts'.format(pageid)
    fields = '?fields=id,created_time'
    url = base + node + fields + access
    r = requests.get(url)
    timestamp = utc_now()
    rate_limit = r.headers.get('x-app-usage')
    while rate_limit:
        doc = {
            'type': 'fb_rate_limit',
            'status': r.status_code,
            'timestamp': timestamp,
            'rate_limit': rate_limit
        }
        if store:
            save_log(doc)
        print 'Rate is limited! Pausing for 60 sec'
        print doc
        time.sleep(60)
        r = requests.get(url)
        timestamp = utc_now()
        rate_limit = r.headers.get('x-app-usage')
    return
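# The 'x-app-usage' header (when present) carries a JSON usage summary such as
# '{"call_count":28,"total_time":25,"total_cputime":25}'. A minimal sketch of how
# that raw string could be parsed before deciding to pause; the 80% threshold and
# the helper names are assumptions for illustration, not part of this module.
def parse_app_usage(raw_header):
    """Return the x-app-usage header as a dict, or None when absent or malformed."""
    import json
    if not raw_header:
        return None
    try:
        return json.loads(raw_header)
    except ValueError:
        return None


def usage_is_critical(usage, threshold=80):
    """True when any usage counter reported by Facebook reaches the assumed threshold."""
    return bool(usage) and max(usage.values()) >= threshold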
def get_all_comments(self, id, limit=500):
    if is_blacklisted(id):
        print '\nComment blacklisted: {}'.format(id)
        return [{'blacklisted': id}]
    # ToDo: get likes for comments
    # ToDo: get_object doesn't retrieve enough fields.
    # ToDo: Try to correct malformed comment ids.
    #       Ex. '103536906359022:898483933530978:10103399554941441_898491433530228'
    fields = ('id,created_time,comments.limit(%d){attachment,comment_count,from,id,'
              'like_count,message,likes{pic,link,name,username,'
              'picture{url,is_silhouette}},created_time}') % limit
    try:
        comments = self.graph.get_object(id, fields=fields, date_format='U')
    except Exception:
        doc = {'id': id, 'timestamp': utc_now(), 'type': 'message'}
        save_blacklist_doc(doc=doc)
        print '\nBlacklisted: {}'.format(id)
        # Update the blacklist in the FacebookScraping instance
        self.blacklist = get_blacklist()
        return [{'blacklisted': id}]
    allcomments = []
    while True:
        try:
            for comment in comments['comments']['data']:
                allcomments.append(comment)
                if comment['comment_count'] > 0:
                    # Get sub comments recursively
                    sub_comment = self.get_all_comments(comment['id'])
                    allcomments = allcomments + sub_comment
            # When there are no more pages (['paging']['next']), the KeyError below
            # breaks out of the loop.
            comments = requests.get(
                comments['comments']['paging']['next']).json()
        except KeyError:
            break
    return allcomments
def __init__(self, pageidlist=None, access_token=None, since=None, until=None):
    """
    :param pageidlist: [str, str, ...]
    :param access_token: Graph API token; defaults to the app token FB_APP_ID|FB_APP_SECRET
    :param since: (yyyy, m, d): local date
    :param until: (yyyy, m, d): local date
    """
    MON = MongoDb(collection='facebook')
    if not access_token:
        self.access_token = FB_APP_ID + "|" + FB_APP_SECRET
    else:
        self.access_token = access_token
    if not pageidlist:
        self.pageidlist = PAGE_LIST
    else:
        self.pageidlist = pageidlist
    # Convert local date (yyyy, mm, dd) into a UTC datetime
    if not since:
        self.since = yyyymmdd_to_datetime(2000, 1, 1, 0, 0, 0)
    else:
        self.since = yyyymmdd_to_datetime(since[0], since[1], since[2], 0, 0, 0)
    if not until:
        self.until = utc_now()
    else:
        self.until = yyyymmdd_to_datetime(until[0], until[1], until[2], 23, 59, 59)
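# A minimal usage sketch for the constructor above, assuming the class is named
# FacebookScraping (the name used in get_all_comments) and that PAGE_LIST,
# FB_APP_ID and FB_APP_SECRET come from the module configuration:
#
#     scraper = FacebookScraping(pageidlist=['37823307325'],
#                                since=(2017, 1, 1),
#                                until=(2017, 1, 31))
#     # since/until are local (yyyy, m, d) tuples converted to UTC datetimes internally.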
def get_posts_from_page(self, pageid, since=0, until=None, limit=0, filtr=None, projection=None):
    """
    :param pageid: page id (currently not applied to the query)
    :param since: start of the created_time window (Unix timestamp)
    :param until: end of the created_time window; defaults to now
    :param limit: maximum number of posts to return (0 = no limit)
    :param filtr: MongoDB query document
    :param projection: MongoDB projection document
    :return: pymongo cursor over the matching posts
    """
    # Avoid mutable and call-time defaults in the signature
    if until is None:
        until = utc_now()
    if filtr is None:
        filtr = {}
    # filtr['created_time'] = {'$gte': since, '$lt': until}
    posts = self.collection.find(filtr, projection).limit(limit)
    return posts
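# A hedged usage sketch of the commented-out created_time filter above, assuming
# created_time is stored as a Unix timestamp (posts are fetched with date_format='U'),
# that the page id lives in the stored post under profile.id, and that fb_db is an
# instance of this class:
#
#     since_ts = datetime_to_timestamp(yyyymmdd_to_datetime(2017, 1, 1, 0, 0, 0))
#     until_ts = datetime_to_timestamp(yyyymmdd_to_datetime(2017, 1, 31, 23, 59, 59))
#     posts = fb_db.get_posts_from_page(
#         '270994524621',
#         filtr={'profile.id': '270994524621',
#                'created_time': {'$gte': since_ts, '$lt': until_ts}},
#         projection={'message': 1, 'created_time': 1})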
def __init__(self, pageid, since, till):
    self.pageid = pageid
    if not since:
        self.since = 0
    else:
        self.since = since
    if not till:
        self.till = utc_now()
    else:
        self.till = till
    self.all_posts = None
    # Create the per-day document template (one counter set per day of the month)
    day_stat_template = {
        'reactions': {
            'like': 0,
            'love': 0,
            'haha': 0,
            'wow': 0,
            'sad': 0,
            'angry': 0
        },
        'comments': {
            'nr': 0,
            'likes': 0
        }
    }
    # Deep-copy the template per day so the buckets do not share the same nested
    # dicts (requires 'import copy' at module level); keys are day numbers as strings.
    day_stat = {str(i): copy.deepcopy(day_stat_template) for i in range(1, 32)}
    self.doc = {'id': pageid, 'date': 0, 'day_stats': day_stat}
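# A minimal sketch of how the per-day buckets above might be filled from one post,
# assuming posts carry the created_time_dt, comments and reactions fields added in
# get_all_posts and that each reaction dict has a 'type' such as 'LIKE' (assumptions
# for illustration only):
def add_post_to_day_stats(doc, post):
    day = str(post['created_time_dt'].day)        # bucket key: day of month as a string
    bucket = doc['day_stats'][day]
    bucket['comments']['nr'] += len(post.get('comments', []))
    for reaction in post.get('reactions', []):
        rtype = reaction.get('type', '').lower()  # e.g. 'LIKE' -> 'like'
        if rtype in bucket['reactions']:
            bucket['reactions'][rtype] += 1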
    bulkdays = 3
    lst = 'POLITICS'
elif list_nr == 2:
    page_ids = pageids_news
    bulkdays = 1
    lst = 'NEWS'
elif list_nr > 1000:
    # A number above 1000 is treated as a page id entered directly
    page_ids = ['{}'.format(list_nr)]
    bulkdays = int(raw_input('Enter bulkdays: '))
    lst = list_nr
else:
    page_ids = ['37823307325']
    bulkdays = 2
    lst = 'TEST'

i = 0
print '\n{} Start scraping list: {}. Attempt: {}\n'.format(utc_now(), lst, i)
while i < 10:
    try:
        result = ENG.run_scraping(pageidlist=page_ids, resume=resume, bulkdays=bulkdays)
        if result:
            print '{} End scraping list {}'.format(utc_now(), lst)
            break
    except:
        e = sys.exc_info()[0]
        print e
        i += 1
        time.sleep(600)

# ENG.run_scraping(pageidlist=['446368145415026'], resume=False, bulkdays=30)
def get_all_posts(self, page_id, fields='all', since=None, until=None, limit=100):
    """
    Gets all posts of a page, group or user.

    :param page_id: string
        The unique id of the page, group or user.
    :param fields: comma separated string, 'all' or None
        A description of all fields can be found at:
        https://developers.facebook.com/docs/graph-api/reference/v2.8/post/
        Can be:
        - Comma separated string with all fields that need to be retrieved.
        - 'all': comma separated string with default fields.
        - None: Facebook default fields.
    :param since: lower bound passed to the Graph API 'since' parameter
    :param until: upper bound passed to the Graph API 'until' parameter
    :param limit: currently unused; the API pages in chunks of 25 posts
    :return: list of post dicts, oldest first
    """
    if fields == 'all':
        # For a list of fields, see:
        # https://developers.facebook.com/docs/graph-api/reference/v2.8/post/
        fields = ('id, name, created_time, from, to, type, status_type, message, '
                  'link, picture, story, shares')  # , likes, reactions
    # ToDo: get_connections vs get_object. How to use limit
    chunk = self.graph.get_connections(page_id, connection_name='posts',
                                       fields=fields, date_format='U',
                                       since=since, until=until)
    # Add data to each post
    posts = []
    profile = self.get_page_profile(page_id)
    while True:
        # Walk through all chunks of 25 posts for the page
        for i, post in enumerate(chunk['data']):
            print '{}/25\t Get data from "{}" for post: {} \t'.format(
                i, profile['name'], post['id']),
            post_id = post['id']
            post['profile'] = profile
            post['update_ts'] = datetime_to_timestamp(utc_now())
            post['update_dt'] = datetime.datetime.now()  # ToDo: add tz=pytz.utc ???
            post['comments'] = self.get_all_comments(post_id)
            post['reactions'] = self.get_all_reactions(post_id)
            post['created_time_dt'] = timestamp_to_datetime(
                post['created_time'])  # local timezone
            posts.append(post)
            print 'comments:{}, \treactions:{}'.format(
                len(post['comments']), len(post['reactions']))
        # Attempt to request the next page of data, if it exists.
        try:
            chunk = requests.get(chunk['paging']['next']).json()
        except KeyError:
            # No more pages (['paging']['next']): break from the loop.
            break
    # The posts arrive in descending order [NEW, ..., OLD]. That is not ideal: when
    # saving, NEW would be stored first, so after a crash the OLD posts would be
    # missing and a resume (starting again from NEW) would never fetch them.
    # Reversing the order avoids having to roll back the inserted posts after a crash.
    posts.reverse()
    return posts
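# A hedged end-to-end sketch: fetch the posts of one page and hand them to a writer,
# assuming the class is FacebookScraping and that save_post is a hypothetical helper
# standing in for the project's actual MongoDB insert:
#
#     fb = FacebookScraping(pageidlist=['37823307325'])
#     posts = fb.get_all_posts('37823307325',
#                              since=datetime_to_timestamp(fb.since),
#                              until=datetime_to_timestamp(fb.until))
#     for post in posts:        # oldest first, thanks to posts.reverse() above
#         save_post(post)       # hypothetical; replace with the real persistence call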