Example #1
    def getComments(self, postid, limit=100, **kwargs):

        if not isinstance(postid, str):
            postid = str(postid)

        # Cap the requested limit at the scraper's maximum.
        if limit > 7500:
            limit = 7500

        url = self._get_api_url() + '/{}/comments'.format(postid)
        url = self._prepare_url(
            url +
            '?fields={}&limit={}'.format(','.join(COMMENT_FIELDS), limit))
        has_next_page = True
        num_processed = 0
        after = ''
        comments = []

        while has_next_page:
            # Append the paging cursor once we have one.
            after = '' if after == '' else "&after={}".format(after)
            base_url = url + after
            response = self._request_until_succeed(base_url)

            trans_comments = translate().process_comments(response['data'])
            comments += trans_comments
            num_processed += len(trans_comments)

            # If there is no next page, we're done.
            if 'paging' in response:
                after = response['paging']['cursors']['after']
            else:
                has_next_page = False

            if num_processed >= limit:
                return comments[:limit]

        return comments
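
A minimal usage sketch for Example #1, assuming a hypothetical GraphScraper class that holds the access token and provides the _get_api_url / _prepare_url / _request_until_succeed helpers the method relies on:

    # GraphScraper and its constructor are assumptions; only getComments
    # itself comes from the example above.
    scraper = GraphScraper(access_token='...')
    comments = scraper.getComments('1234567890', limit=200)
    print(len(comments))  # at most 200 translated comment dicts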
Example #2
    def getvideos(self, q, n=10, nextPageToken=None):

        flag = True
        self.videos = {}
        self.channels = {}
        video_counter = 0

        # Request at most n results per page, bounded by the default page
        # size. The original assigned to a local DEFAULT_VIDEO_SIZE only in
        # one branch, which raised NameError whenever n was large enough.
        page_size = min(n, self.DEFAULT_VIDEO_SIZE)

        while flag:

            if nextPageToken:
                search_response = self.youtube.search().list(
                    q=q, part='id,snippet', maxResults=page_size,
                    type='video', pageToken=nextPageToken).execute()
            else:
                search_response = self.youtube.search().list(
                    q=q, part='id,snippet', maxResults=page_size,
                    type='video').execute()

            nextPageToken = search_response.get('nextPageToken')

            if not nextPageToken:
                flag = False

            # Use a fresh batch per page so earlier requests are not
            # re-executed on the next iteration.
            batch = self.youtube.new_batch_http_request()

            for item in search_response.get('items', []):
                video_counter += 1
                self.videos[item['id']['videoId']] = item
                self.channels[item['snippet']['channelId']] = item

                batch.add(self.youtube.commentThreads().list(
                    part=self.commentSnippet,
                    videoId=item['id']['videoId'],
                    maxResults=50,
                    textFormat='plainText'), callback=self.comments)
                batch.add(self.youtube.videos().list(
                    part=self.videoSnippet,
                    id=item['id']['videoId']), callback=self.videoStats)
                batch.add(self.youtube.channels().list(
                    part=self.channelSnippet,
                    id=item['snippet']['channelId'],
                    maxResults=30), callback=self.channelInfo)

            batch.execute()

            if video_counter >= n:
                break

        videos = self.delete_keys_from_dict(self.videos, self.DELETE_FIELDS)
        videos = list(videos.values())
        videos = translate().process_videos(data=videos)
        return videos
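
A minimal usage sketch for Example #2, assuming a hypothetical YoutubeScraper class that builds self.youtube with googleapiclient.discovery.build and defines the DEFAULT_VIDEO_SIZE, snippet, and callback members used above:

    # YoutubeScraper and its constructor are assumptions, not part of the
    # example itself.
    yt = YoutubeScraper(api_key='...')
    videos = yt.getvideos('machine learning', n=20)
    # videos holds the translated search results; comment, video, and channel
    # details arrive through the batch callbacks registered in getvideos.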
Example #3
    def insert_tweet_data(self, data):
        tweets = translate().process_tweets(data)
        # itertools.groupby only groups consecutive items, so sort by the
        # grouping key first.
        tweets = sorted(tweets, key=lambda x: x.get('searchKey'))
        grouped_tweets = groupby(tweets, lambda x: x.get('searchKey'))
        for key, value in grouped_tweets:
            inserted_ids = inserttweets(db=db, data=list(value))
            if len(inserted_ids) > 0:
                min_id = inserted_ids[0]
                max_id = inserted_ids[-1]
                log.info("Inserted {} documents for search key {} ...".format(
                    len(inserted_ids), key))
                self.change_alert_record(key, min_id, max_id, len(inserted_ids))
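
A small illustration of why Example #3 has to sort before grouping: itertools.groupby only merges consecutive items, so unsorted input yields one group per run rather than one per key. A self-contained sketch with made-up tweet dicts:

    from itertools import groupby

    tweets = [{'searchKey': 'a'}, {'searchKey': 'b'}, {'searchKey': 'a'}]
    # Unsorted: 'a' shows up as two separate groups.
    print([k for k, _ in groupby(tweets, lambda x: x.get('searchKey'))])
    # -> ['a', 'b', 'a']
    tweets.sort(key=lambda x: x.get('searchKey'))
    print([k for k, _ in groupby(tweets, lambda x: x.get('searchKey'))])
    # -> ['a', 'b']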
Example #4
    def getTweets(self, q, limit=100, **kwargs):

        logger.info("q param {}".format(q))
        flag = True
        tweets = []
        count = 0
        max_id = None
        kwargs['q'] = q
        kwargs['count'] = 100

        while flag:
            # max_id is inclusive in the Twitter search API, so step below
            # the lowest id already seen to avoid fetching the same tweet
            # twice.
            if max_id:
                kwargs['max_id'] = max_id - 1
            url = self.buildUrl(SEARCH_URL, **kwargs)
            data, resp = self.oauth_req(url)

            data = json.loads(data)

            length = len(data['statuses'])

            count += length

            if length == 0:
                flag = False
            else:
                max_id = min(tweet.get('id') for tweet in data['statuses'])

            tweets += translate().process_tweets(data['statuses'])

            if count >= limit:
                return tweets[:limit]

        if count < limit:
            logger.warning(
                "Requested {} tweets but got {} tweets from the API".format(
                    limit, len(tweets)))
        return tweets
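
A minimal usage sketch for Example #4, assuming a hypothetical TwitterClient class that supplies buildUrl, oauth_req, and the SEARCH_URL constant:

    # TwitterClient and its constructor are assumptions about the
    # enclosing class.
    client = TwitterClient(consumer_key='...', consumer_secret='...')
    tweets = client.getTweets('python', limit=250)
    # Pagination walks backwards through max_id until the limit is reached
    # or the search results are exhausted.
    print(len(tweets))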
Example #5
    def get_user_tweets(self, q, ids, limit=50, **kwargs):

        tweets_final = []
        logger.info("userid param {}".format(ids))

        for user_id in ids:

            logger.info("userid param {}".format(user_id))
            flag = True
            tweets = []
            count = 0
            max_id = None
            kwargs['user_id'] = user_id
            kwargs['count'] = 50

            while flag:
                # max_id is inclusive in the Twitter API, so step below the
                # lowest id already seen to avoid refetching the same tweet.
                if max_id:
                    kwargs['max_id'] = max_id - 1
                url = self.buildUrl(USER_TIME_LINE, **kwargs)
                data, resp = self.oauth_req(url)
                data = json.loads(data)

                if len(data) == 0:
                    flag = False
                else:
                    # Paginate on the raw timeline, then keep only tweets
                    # that mention the query string anywhere in their JSON.
                    # Do not stop when a single page has no matches; deeper
                    # pages may still contain matching tweets.
                    max_id = min(tweet.get('id') for tweet in data)
                    data = [tweet for tweet in data if q in json.dumps(tweet)]
                    count += len(data)

                    tweets += translate().process_tweets(data)

                    if count >= limit:
                        flag = False

            if count < limit:
                logger.warning(
                    "Requested {} tweets but got {} tweets from the API".format(
                        limit, len(tweets)))

            tweets_final += tweets

        return tweets_final
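
Note that the filter in Example #5 matches the query against the whole serialized tweet, not just its text, so a hit in a URL or user field also counts. A self-contained sketch of that behavior:

    import json

    tweet = {'text': 'hello world', 'user': {'screen_name': 'python_dev'}}
    q = 'python'
    # True: the match comes from the screen_name, not the tweet text.
    print(q in json.dumps(tweet))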
Example #6
    def getPosts(self, pageid, limit=100, commentLimit=50, **kwargs):

        if not isinstance(pageid, str):
            pageid = str(pageid)

        page = self.getPage(pageid=pageid)
        url = self._get_api_url() + '/{}/posts'.format(pageid)

        # The Graph API returns at most 100 posts per request.
        url = self._prepare_url(
            url + '?fields={}&limit={}'.format(
                ','.join(POST_FIELDS), min(limit, 100)))

        has_next_page = True
        num_processed = 0

        after = ''

        # Only add since/until to the query string when they were supplied.
        since = "&since={}".format(kwargs['since']) if 'since' in kwargs else ''
        until = "&until={}".format(kwargs['until']) if 'until' in kwargs else ''

        logger.info("Scraping Facebook Page: {}".format(pageid))

        posts = []
        posts_indexed = {}

        def indexposts(posts):
            for post in posts:
                posts_indexed[post.get('id')] = post

        while has_next_page:
            after = '' if after == '' else "&after={}".format(after)
            base_url = url + after + since + until

            response = self._request_until_succeed(base_url)
            if response:
                statuses = translate().process_posts(data=response['data'])
            else:
                statuses = []
                response = {}

            indexposts(statuses)

            # TODO: when commentLimit > 50, fetch the extra comments for each
            # post (e.g. via getCommentsPosts) instead of only logging.
            if commentLimit > 50:
                logger.info("Comment limit above the default page size")
            else:
                logger.info("Going with the default comment limit")

            for status in list(posts_indexed.values()):
                comments = translate().process_comments(
                    status['comments']['data'])
                status['comments']['data'] = comments
                status['page'] = page
                posts.append(Post(data=status))
                num_processed += 1
                if num_processed % 100 == 0:
                    logger.info("{} statuses processed: {}".format(
                        num_processed, datetime.datetime.now()))

            # If there is no next page, we're done.
            if 'paging' in response:
                after = response['paging']['cursors']['after']
            else:
                has_next_page = False

            posts_indexed = {}

            if num_processed >= limit:
                return posts[:limit]

        if num_processed < limit:
            logger.warning('Requested {} posts but got {} from page {}'.format(
                limit, num_processed, pageid))

        return posts
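
A minimal usage sketch for Example #6, assuming a hypothetical PageScraper class that provides the Graph API helpers (_get_api_url, _prepare_url, _request_until_succeed) and the getPage method used above:

    # PageScraper and its constructor are assumptions; since/until are
    # forwarded verbatim into the Graph API query string.
    scraper = PageScraper(access_token='...')
    posts = scraper.getPosts('somepage', limit=50, since=1546300800)
    print(len(posts))  # at most 50 Post objects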