def fbConvert_created_date_dt_ToDate(since=None, until=None):
    '''
    Convert the 'created_time' timestamp of each post to a datetime and store it
    in 'created_time_dt'.
    :param since:
    :param until:
    :return: {'matched': int, 'modified': int}
    '''
    # Get documents where created_time_dt is not of type: date
    # ToDo: {'created_time_dt': {'$ne': {'$type': 'date'}}} doesn't work.
    q = {'created_time_dt': {'$ne': {'$type': 'date'}}}
    # Until the above query is repaired, select all posts and keep track with a flag in the document
    q = {'flag': {'$ne': 1}}
    p = {'created_time_dt': 1, 'created_time': 1}
    cursor = fbGetPosts(q, p, since=since, until=until)

    # Update/create the created_time_dt field so it gets the date type
    matched = 0
    modified = 0
    for i, post in enumerate(cursor):
        if i % 500 == 0:
            print i,
        q = {'_id': post['_id']}
        # Store the datetime object itself; pymongo saves it as a BSON date.
        # Calling .isoformat() here would store a string instead of a date.
        dt = timestamp_to_datetime(post['created_time'])
        u = {
            '$set': {
                'created_time_dt': dt,
                'flag': 1  # flag field to keep track of what was updated already
            }
        }
        result = fbUpdatePost(query=q, update=u)
        matched += 1
        modified += 1
    return {'matched': matched, 'modified': modified}
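
# A minimal sketch of the $type query that the ToDo above refers to. With $not the
# type test is applied to the field value itself, whereas {'$ne': {'$type': 'date'}}
# compares the field against the literal sub-document {'$type': 'date'} and therefore
# matches almost everything. The collection handle `fb_collection` is hypothetical;
# the string alias 'date' needs MongoDB 3.2+ (use the numeric code 9 on older servers).
def _sketch_find_posts_without_date_type(fb_collection):
    q = {'created_time_dt': {'$not': {'$type': 'date'}}}  # also matches docs missing the field
    p = {'created_time_dt': 1, 'created_time': 1}
    return fb_collection.find(q, p)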
def run_scraping(self, pageidlist, resume=True, bulkdays=45):
    '''
    :param pageidlist: list of page ids to scrape; falls back to self.pageidlist when empty
    :param resume: bool: continue scraping from the latest created_time among the posts already saved for the page
    :param bulkdays: int: max number of days of posts to scrape from a page in one iteration
    :return: True
    '''
    bulk = datetime.timedelta(bulkdays)
    access_token = self.access_token
    if not pageidlist:
        pageidlist = self.pageidlist
    FS = FacebookScraping(access_token)
    MON = MongoDb(collection='facebook')
    for pageid in pageidlist:
        if resume:
            # Get the last created_time for the page
            lastpost = MON.get_last_post_for_page(pageid)  # ToDo: useless to retrieve everything; use a projection
            if lastpost:
                since = lastpost['created_time']
                since = timestamp_to_datetime(since)  # ToDo: converted to datetime here and back to a timestamp below
            else:
                since = self.since
            print 'last post: ----------------->', since
            until = self.until
        else:
            since = self.since
            until = self.until
        print since, until

        s = since
        while until > s:
            u = s + bulk
            print 'Scraping page:{}, since:{}, until:{}'.format(pageid, s, u)
            FS.check_rate_limit()
            posts = FS.get_all_posts(pageid,
                                     since=datetime_to_timestamp(s),
                                     until=datetime_to_timestamp(u))
            if posts:
                # Store the documents
                results = MON.upsert_fb_documents(documents=posts)
                print datetime.datetime.now(), 'Documents inserted: {}, selected: {}, updated or replaced: {}, upserted: {}'.format(
                    results.inserted_count, results.matched_count,
                    results.modified_count, results.upserted_count)
            s += bulk
    return True
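
# timestamp_to_datetime / datetime_to_timestamp are used throughout run_scraping and
# the scraper below. A minimal sketch of the assumed behaviour (Unix epoch seconds,
# local timezone, matching date_format='U' in the Graph API calls); the actual helpers
# are defined elsewhere in the repo and may differ:
import datetime
import time

def _sketch_timestamp_to_datetime(ts):
    # Unix epoch seconds -> naive local datetime
    return datetime.datetime.fromtimestamp(float(ts))

def _sketch_datetime_to_timestamp(dt):
    # naive local datetime -> Unix epoch seconds
    return int(time.mktime(dt.timetuple()))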
def get_all_posts(self, page_id, fields='all', since=None, until=None, limit=100):
    """
    Gets all posts on a page, group or user.
    :param page_id: string
        The unique id of the page, group or user
    :param fields: comma separated string, 'all' or None
        A description of all fields can be found at:
        https://developers.facebook.com/docs/graph-api/reference/v2.8/post/
        Can be:
        - Comma separated string: all fields that need to be retrieved.
        - 'all': comma separated string with default fields
        - None: facebook default fields
    :param since: Unix timestamp; only return posts created after this time
    :param until: Unix timestamp; only return posts created before this time
    :param limit: int: intended per-request limit (currently not applied, see ToDo below)
    :return: list of dicts, one per post
    """
    if fields == 'all':
        # For a list of fields, see:
        # https://developers.facebook.com/docs/graph-api/reference/v2.8/post/
        fields = 'id, name, created_time, from, to, type, status_type, message, link, picture, story, shares'
        # , likes, reactions
    # ToDo: get_connections vs get_objects. How to use limit
    chunk = self.graph.get_connections(page_id,
                                       connection_name='posts',
                                       fields=fields,
                                       date_format='U',
                                       since=since,
                                       until=until)

    # Add data to each post
    posts = []
    profile = self.get_page_profile(page_id)
    while True:
        # Get all chunks of 25 posts for the page
        for i, post in enumerate(chunk['data']):
            print '{}/25\t Get data from "{}" for post: {} \t'.format(
                i, profile['name'], post['id']),
            post_id = post['id']
            post['profile'] = profile
            post['update_ts'] = datetime_to_timestamp(utc_now())
            post['update_dt'] = datetime.datetime.now()  # ToDo: add tz=pytz.utc ???
            post['comments'] = self.get_all_comments(post_id)
            post['reactions'] = self.get_all_reactions(post_id)
            post['created_time_dt'] = timestamp_to_datetime(post['created_time'])  # local timezone
            posts.append(post)
            print 'comments:{}, \treactions:{}'.format(
                len(post['comments']), len(post['reactions']))

        # Attempt to request the next page of data, if it exists.
        try:
            chunk = requests.get(chunk['paging']['next']).json()
        except KeyError:
            # No more pages (['paging']['next']): break from the loop.
            break

    # The posts arrive in descending order [NEW, ..., OLD]. This is not ideal:
    # NEW would be saved first, and after a crash OLD would be missing while a
    # resumed scrape starts again from NEW. Reversing the order avoids having to
    # roll back the inserted posts after a crash.
    posts.reverse()
    return posts
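
# A small usage sketch of the scraper above, mirroring how run_scraping drives it.
# The access token and page id are placeholders and the 30-day window is only an
# example; since/until are passed as Unix timestamps, as get_all_posts expects.
if __name__ == '__main__':
    token = '<your-page-access-token>'
    fs = FacebookScraping(token)
    until_dt = datetime.datetime.now()
    since_dt = until_dt - datetime.timedelta(days=30)
    recent_posts = fs.get_all_posts('<page-id>',
                                    since=datetime_to_timestamp(since_dt),
                                    until=datetime_to_timestamp(until_dt))
    print 'Fetched {} posts'.format(len(recent_posts))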