Example #1
def fbConvert_created_date_dt_ToDate(since=None, until=None):
    '''
    Convert the Unix timestamp in 'created_time' to a datetime and store it in 'created_time_dt'.
    :param since: lower bound, passed through to fbGetPosts
    :param until: upper bound, passed through to fbGetPosts
    :return: {'matched': int, 'modified': int}
    '''
    # Get documents where created_time_dt is not of type date.
    # ToDo: q = {'created_time_dt': {'$ne': {'$type': 'date'}}} doesn't work ($ne cannot wrap $type).
    # Until that query is repaired, select everything and track progress with a flag in the document.
    q = {'flag': {'$ne': 1}}
    p = {'created_time_dt': 1, 'created_time': 1}
    cursor = fbGetPosts(q, p, since=since, until=until)
    # Update/create the created_time_dt field to have date type
    matched = 0
    modified = 0
    for i, post in enumerate(cursor):
        if i % 500 == 0: print i,  # progress indicator every 500 documents
        q = {'_id': post['_id']}
        dt = timestamp_to_datetime(post['created_time'])  # keep as datetime so MongoDB stores a BSON date, not a string
        u = {
            '$set': {
                'created_time_dt': dt,
                'flag': 1
            }
        }  # Flag field to keep track of what was updated already
        result = fbUpdatePost(query=q, update=u)
        matched += 1   # counted optimistically; result's own counts are not inspected
        modified += 1
    return {'matched': matched, 'modified': modified}
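
The flag workaround above is needed because `$ne` cannot negate an operator expression like `{'$type': 'date'}`; MongoDB's `$not` can. A minimal sketch of the repaired selection query, assuming pymongo and hypothetical database/collection names:

from pymongo import MongoClient

client = MongoClient()  # assumes a MongoDB instance on localhost
coll = client['facebook_db']['facebook']  # hypothetical database/collection names

# $not negates the operator expression, matching documents where
# created_time_dt is missing or not stored as a BSON date.
q = {'created_time_dt': {'$not': {'$type': 'date'}}}
p = {'created_time': 1}
for post in coll.find(q, p):
    pass  # convert created_time and $set created_time_dt, as in the function above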
Example #2
    def run_scraping(self, pageidlist, resume=True, bulkdays=45):
        '''
        Scrape posts for each page in pageidlist, in windows of at most bulkdays days.
        :param pageidlist: list of page ids to scrape; falls back to self.pageidlist when empty.
        :param resume: bool: continue scraping from the latest created_time among the saved posts for the page.
        :param bulkdays: int: max days of posts to scrape from a page in one iteration.
        :return: True
        '''
        bulk = datetime.timedelta(days=bulkdays)

        access_token = self.access_token
        if not pageidlist: pageidlist = self.pageidlist

        FS = FacebookScraping(access_token)
        MON = MongoDb(collection='facebook')
        for pageid in pageidlist:
            if resume:
                # get the last created_time for page
                lastpost = MON.get_last_post_for_page(pageid) #ToDo: Useless to retrieve everything. Use projection
                if lastpost:
                    since = lastpost['created_time']
                    since = timestamp_to_datetime(since) #ToDo: convert time to datetime and later again to timestamp
                else:
                    since = self.since
                print 'last post: ----------------->', since
                until = self.until
            else:
                since = self.since
                until = self.until
            print since, until
            s = since
            while s < until:
                u = min(s + bulk, until)  # clamp the final window so we never request past `until`
                print 'Scraping page:{}, since:{}, until:{}'.format(pageid, s, u)
                FS.check_rate_limit()
                posts = FS.get_all_posts(pageid, since=datetime_to_timestamp(s), until=datetime_to_timestamp(u))
                if posts:
                    # store docs
                    results = MON.upsert_fb_documents(documents=posts)
                    print datetime.datetime.now(), 'Documents inserted: {}, selected: {}, updated or replaced: {}, upserted: {}'.format(
                        results.inserted_count, results.matched_count, results.modified_count, results.upserted_count)
                s += bulk
        return True
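
The inner while-loop above is a simple date-windowing scheme. Pulled out on its own it looks like the sketch below (a standalone illustration, not code from the scraper):

import datetime

def date_windows(since, until, bulkdays=45):
    '''Yield (start, end) pairs tiling [since, until) in steps of at most bulkdays days.'''
    bulk = datetime.timedelta(days=bulkdays)
    s = since
    while s < until:
        yield s, min(s + bulk, until)
        s += bulk

# Example: a 100-day range split into 45 + 45 + 10 days.
start = datetime.datetime(2017, 1, 1)
for lo, hi in date_windows(start, start + datetime.timedelta(days=100)):
    print lo, '->', hi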
Example #3
    def get_all_posts(self,
                      page_id,
                      fields='all',
                      since=None,
                      until=None,
                      limit=100):
        """
        Gets all posts on a page, group or user
        :param page_id: string
        The unique id of the page, group or user
        :param fields: comma separated string, 'all', None
        A description of all fields can be found at: https://developers.facebook.com/docs/graph-api/reference/v2.8/post/
        Can be:
            - Comma separated string: with all fields that need to be retrieved.
            - 'all': comma separated string with default fields
            - None: facebook default fields

        :return: dict
        """
        if fields == 'all':
            # For a list of fields, see:
            # https://developers.facebook.com/docs/graph-api/reference/v2.8/post/
            fields = 'id,name,created_time,from,to,type,status_type,message,link,picture,story,shares'  # , likes,reactions'
            # ToDo: get_connections vs Get_objects. How to use limit

        chunk = self.graph.get_connections(page_id,
                                           connection_name='posts',
                                           fields=fields,
                                           date_format='U',
                                           since=since,
                                           until=until)
        # Add data to each post
        posts = []
        profile = self.get_page_profile(page_id)
        while True:  # get all chunks of 25 posts (the Graph API default page size) for a page
            for i, post in enumerate(chunk['data']):
                print '{}/25\t Get data from "{}" for post: {}  \t'.format(
                    i, profile['name'], post['id']),
                post_id = post['id']
                post['profile'] = profile
                post['update_ts'] = datetime_to_timestamp(utc_now())
                post['update_dt'] = datetime.datetime.now()  # ToDo: add tz=pytz.utc ???
                post['comments'] = self.get_all_comments(post_id)
                post['reactions'] = self.get_all_reactions(post_id)
                post['created_time_dt'] = timestamp_to_datetime(
                    post['created_time'])  # local timezone
                posts.append(post)
                print 'comments:{}, \treactions:{}'.format(
                    len(post['comments']), len(post['reactions']))
            # Attempt to make a request to the next page of data, if it exists.
            try:
                chunk = requests.get(chunk['paging']['next']).json()
            except KeyError:  # no more pages ('paging'/'next' missing): break out of the loop
                break
        # The posts arrive sorted in descending order, [NEW, ..., OLD], which is not ideal:
        # saved in that order, NEW hits the database first, so after a crash OLD is missing,
        # and a resume that starts from NEW would leave that gap permanently.
        # Reversing to [OLD, ..., NEW] avoids having to roll back inserted posts after a crash.
        posts.reverse()
        return posts
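
The try/except around chunk['paging']['next'] is the usual cursor-based pagination pattern for the Graph API. A minimal generator version of the same idea (hypothetical first_page argument, standard requests API):

import requests

def iter_graph_pages(first_page):
    '''Yield every item of a paged Graph API response, following paging->next links.'''
    chunk = first_page
    while True:
        for item in chunk['data']:
            yield item
        try:
            chunk = requests.get(chunk['paging']['next']).json()
        except KeyError:  # 'paging' or 'next' absent: last page reached
            return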