示例#1
0
    def fetch_tweet_by_id(self,
                          tweet_id=None,
                          write_to_handlers=None,
                          cmd_handlers=None,
                          bucket="tweets"):
        """Fetch one tweet by id and append it to every write handler.

        The lookup goes through ``self.show_status``. A rate-limit error is
        handed to ``self.rate_limit_error_occured`` and the lookup is repeated
        without consuming a retry. A Twython error with code 403 or 404 marks
        the tweet as unavailable and ends the lookup. Any other error consumes
        one of ``MAX_RETRY_CNT`` attempts, with a 10 second pause before the
        next one.

        Args:
            tweet_id: Id of the tweet to fetch; must be truthy.
            write_to_handlers: Iterable of handlers. Each one receives
                ``append(<json string>, bucket=bucket, key="tweetList")``.
                ``None`` (the default) means "no handlers".
            cmd_handlers: Accepted so the signature matches the sibling fetch
                methods; this method never reads it.
            bucket: Bucket name forwarded to every handler.

        Raises:
            Exception: If ``tweet_id`` is missing.
            MaxRetryReached: If every attempt failed with an error that was
                neither a rate limit nor a 403/404.
        """
        if not tweet_id:
            raise Exception("show_status: tweet_id cannot be None")

        # Avoid a shared mutable default; None stands for "no handlers".
        if write_to_handlers is None:
            write_to_handlers = []

        tweet = None
        retry_cnt = MAX_RETRY_CNT
        # Loop until the counter reaches zero so that the MaxRetryReached
        # branches below can actually fire (a `> 1` guard would leave the
        # loop one step before the counter hits zero).
        while retry_cnt > 0:
            try:
                tweet = self.show_status(id=tweet_id)
                logger.info("Fetched tweet [%s]", tweet_id)
                break
            except twython.exceptions.TwythonRateLimitError:
                # Rate limiting does not count against the retry budget.
                self.rate_limit_error_occured('statuses', '/statuses/show')
            except twython.exceptions.TwythonError as te:
                if te.error_code in (403, 404):
                    logger.info("Tweet [%s] unavailable. Error code: %d",
                                tweet_id, te.error_code)
                    break

                logger.error("exception: %s", te)
                retry_cnt -= 1
                if retry_cnt == 0:
                    raise MaxRetryReached("max retry reached due to %s" %
                                          (te,))
                time.sleep(10)  # back off before the next attempt
            except Exception as exc:
                logger.error("exception: %s, %s", exc, type(exc))
                retry_cnt -= 1
                if retry_cnt == 0:
                    raise MaxRetryReached("max retry reached due to %s" %
                                          (exc,))
                time.sleep(10)  # back off before the next attempt

        # An unavailable tweet is still recorded, as a stub holding only its id.
        record = tweet if tweet is not None else {"id": tweet_id}
        payload = json.dumps(record)
        for handler in write_to_handlers:
            handler.append(payload, bucket=bucket, key="tweetList")

        logger.debug("[%s] tweet fetched...", tweet_id)
示例#2
0
    def find_all_friend_ids(self,
                            user_id=None,
                            write_to_handlers=None,
                            cmd_handlers=None,
                            bucket="friend_ids"):
        """Page through a user's friend ids and hand every page to the handlers.

        Paging starts at cursor ``-1`` and follows ``next_cursor`` from each
        ``self.get_friends_ids`` response until it becomes ``0``. Each page is
        serialised to JSON and appended to every handler as soon as it
        arrives, followed by a 2 second pause. A rate-limit error is handed to
        ``self.rate_limit_error_occured`` and the same cursor is requested
        again without consuming a retry. Any other error consumes one of
        ``MAX_RETRY_CNT`` attempts, with a 10 second pause before the next one.

        Args:
            user_id: Id of the user whose friends are listed; must be truthy.
                Also used as the ``key`` passed to the handlers.
            write_to_handlers: Iterable of handlers receiving
                ``append(<json string>, bucket=bucket, key=user_id)``.
                ``None`` (the default) means "no handlers".
            cmd_handlers: Second iterable of handlers, called the same way
                after ``write_to_handlers``. ``None`` means "no handlers".
            bucket: Bucket name forwarded to every handler.

        Raises:
            MissingArgs: If ``user_id`` is missing.
            MaxRetryReached: If the retry budget is used up by errors other
                than rate limiting. Pages already handed to the handlers stay
                written.
        """
        if not user_id:
            raise MissingArgs("user_id cannot be None")

        # Avoid shared mutable defaults; None stands for "no handlers".
        if write_to_handlers is None:
            write_to_handlers = []
        if cmd_handlers is None:
            cmd_handlers = []

        retry_cnt = MAX_RETRY_CNT
        cursor = -1
        # Loop until the counter reaches zero so that the MaxRetryReached
        # branch below can actually fire (a `> 1` guard would leave the loop
        # one step before the counter hits zero).
        while cursor != 0 and retry_cnt > 0:
            try:
                friend_ids = self.get_friends_ids(user_id=user_id,
                                                  cursor=cursor,
                                                  count=200)

                page = json.dumps(friend_ids)
                for handler in write_to_handlers:
                    handler.append(page, bucket=bucket, key=user_id)

                for handler in cmd_handlers:
                    handler.append(page, bucket=bucket, key=user_id)

                cursor = int(friend_ids['next_cursor'])

                logger.debug("find #%d friend_ids... NEXT_CURSOR: %d",
                             len(friend_ids["ids"]), cursor)

                time.sleep(2)
            except twython.exceptions.TwythonRateLimitError:
                # Rate limiting does not count against the retry budget.
                self.rate_limit_error_occured('friends', '/friends/ids')
            except Exception as exc:
                logger.debug("exception: %s", exc)
                retry_cnt -= 1
                if retry_cnt == 0:
                    raise MaxRetryReached("max retry reached due to %s" %
                                          (exc,))
                time.sleep(10)  # back off before the next attempt

        logger.debug("finished find_all_friend_ids for %s...", user_id)
示例#3
0
    def search_by_query(self,
                        query=None,
                        geocode=None,
                        lang=None,
                        key=None,
                        write_to_handlers=None,
                        cmd_handlers=None,
                        bucket="tweets"):
        """Run a search, page backwards through the results, then store them.

        Pages of up to 100 statuses are requested through ``self.search``.
        After the first page every request passes ``max_id`` set to one below
        the lowest tweet id seen so far, and paging stops when a page yields
        no lower id. All statuses are held in memory and written to the
        handlers only after paging has finished. A rate-limit error is handed
        to ``self.rate_limit_error_occured`` without consuming a retry. Any
        other error consumes one of ``MAX_RETRY_CNT`` attempts, with a
        10 second pause before the next one.

        Args:
            query: Search query; must be truthy.
            geocode: Forwarded unchanged to ``self.search``.
            lang: Forwarded unchanged to ``self.search``.
            key: Key passed to the handlers. When falsy it is set to
                ``md5(str(query).encode('utf-8'))``, using whatever ``md5``
                the module has in scope.
            write_to_handlers: Iterable of handlers receiving
                ``append(<json string>, bucket=bucket, key=key)`` once per
                tweet. When nothing was found, each receives one ``{}``
                instead. ``None`` (the default) means "no handlers".
            cmd_handlers: Second iterable of handlers, called once per tweet
                after ``write_to_handlers``; it receives nothing when no
                tweets were found. ``None`` means "no handlers".
            bucket: Bucket name forwarded to every handler.

        Raises:
            Exception: If ``query`` is missing.
            MaxRetryReached: If the retry budget is used up by errors other
                than rate limiting. Statuses collected up to that point are
                not written.
        """
        if not query:
            raise Exception("search: query cannot be None")

        if not key:
            key = md5(str(query).encode('utf-8'))

        # Avoid shared mutable defaults; None stands for "no handlers".
        if write_to_handlers is None:
            write_to_handlers = []
        if cmd_handlers is None:
            cmd_handlers = []

        logger.info("received query: %s ", query)

        prev_max_id = -1
        # Despite its name this tracks the LOWEST tweet id seen so far; it
        # becomes the upper bound (max_id) of the next, older page.
        current_max_id = 0

        retry_cnt = MAX_RETRY_CNT
        result_tweets = []
        # Loop until the counter reaches zero so that the MaxRetryReached
        # branch below can actually fire (a `> 1` guard would leave the loop
        # one step before the counter hits zero).
        while current_max_id != prev_max_id and retry_cnt > 0:
            try:
                search_args = {
                    "q": query,
                    "geocode": geocode,
                    "lang": lang,
                    "count": 100,
                }
                if current_max_id > 0:
                    search_args["max_id"] = current_max_id - 1
                tweets = self.search(**search_args)

                # If this page brings no lower id, both values stay equal and
                # paging ends.
                prev_max_id = current_max_id

                statuses = tweets['statuses']
                for tweet in statuses:
                    tweet_id = int(tweet['id'])
                    if current_max_id == 0 or tweet_id < current_max_id:
                        current_max_id = tweet_id

                # no new tweets found
                if prev_max_id == current_max_id:
                    break

                result_tweets.extend(statuses)

                logger.debug('%d > %d ? %s', prev_max_id, current_max_id,
                             prev_max_id > current_max_id)

                time.sleep(1)

            except twython.exceptions.TwythonRateLimitError:
                # Rate limiting does not count against the retry budget.
                self.rate_limit_error_occured('search', '/search/tweets')
            except Exception as exc:
                logger.debug("exception: %s", exc)
                retry_cnt -= 1
                if retry_cnt == 0:
                    raise MaxRetryReached("max retry reached due to %s" %
                                          (exc,))
                time.sleep(10)  # back off before the next attempt

        if result_tweets:
            for tweet in result_tweets:
                payload = json.dumps(tweet)
                for handler in write_to_handlers:
                    handler.append(payload, bucket=bucket, key=key)

                for handler in cmd_handlers:
                    handler.append(payload, bucket=bucket, key=key)
        else:
            # Record an empty marker so the query still leaves a trace.
            for handler in write_to_handlers:
                handler.append(json.dumps({}), bucket=bucket, key=key)

        logger.info("[%s] total tweets: %d ", query, len(result_tweets))
示例#4
0
    def fetch_user_timeline(self,
                            user_id=None,
                            write_to_handlers=None,
                            cmd_handlers=None,
                            bucket="timelines"):
        """Download a user's timeline page by page, then store every tweet.

        Pages of up to 200 tweets are requested through
        ``self.get_user_timeline``. After the first page every request passes
        ``max_id`` set to one below the lowest tweet id seen so far, and
        paging stops when a page yields no lower id. All tweets are held in
        memory and written to the handlers only after paging has finished.
        A rate-limit error is handed to ``self.rate_limit_error_occured``
        without consuming a retry. Any other error consumes one of
        ``MAX_RETRY_CNT`` attempts, with a 10 second pause before the next one.

        Args:
            user_id: Id of the user whose timeline is fetched; must be truthy.
                Also used as the ``key`` passed to the handlers.
            write_to_handlers: Iterable of handlers receiving
                ``append(<json string>, bucket=bucket, key=user_id)`` once per
                tweet. When nothing was found, each receives one ``{}``
                instead. ``None`` (the default) means "no handlers".
            cmd_handlers: Second iterable of handlers, called once per tweet
                after ``write_to_handlers``; it receives nothing when no
                tweets were found. ``None`` means "no handlers".
            bucket: Bucket name forwarded to every handler.

        Raises:
            Exception: If ``user_id`` is missing.
            MaxRetryReached: If the retry budget is used up by errors other
                than rate limiting. Tweets collected up to that point are not
                written.
        """
        if not user_id:
            raise Exception("user_timeline: user_id cannot be None")

        # Avoid shared mutable defaults; None stands for "no handlers".
        if write_to_handlers is None:
            write_to_handlers = []
        if cmd_handlers is None:
            cmd_handlers = []

        prev_max_id = -1
        # Despite its name this tracks the LOWEST tweet id seen so far; it
        # becomes the upper bound (max_id) of the next, older page.
        current_max_id = 0

        retry_cnt = MAX_RETRY_CNT
        # Tweets are buffered in memory until paging is done.
        timeline = []
        # Loop until the counter reaches zero so that the MaxRetryReached
        # branch below can actually fire (a `> 1` guard would leave the loop
        # one step before the counter hits zero).
        while current_max_id != prev_max_id and retry_cnt > 0:
            try:
                request_args = {"user_id": user_id, "count": 200}
                if current_max_id > 0:
                    request_args["max_id"] = current_max_id - 1
                tweets = self.get_user_timeline(**request_args)

                # If this page brings no lower id, both values stay equal and
                # paging ends.
                prev_max_id = current_max_id

                for tweet in tweets:
                    tweet_id = int(tweet['id'])
                    if current_max_id == 0 or tweet_id < current_max_id:
                        current_max_id = tweet_id

                # no new tweets found
                if prev_max_id == current_max_id:
                    break

                timeline.extend(tweets)

                logger.debug('%d > %d ? %s', prev_max_id, current_max_id,
                             prev_max_id > current_max_id)

                time.sleep(1)

            except twython.exceptions.TwythonRateLimitError:
                # Rate limiting does not count against the retry budget.
                self.rate_limit_error_occured('statuses',
                                              '/statuses/user_timeline')
            except Exception as exc:
                logger.debug("exception: %s", exc)
                retry_cnt -= 1
                if retry_cnt == 0:
                    raise MaxRetryReached("max retry reached due to %s" %
                                          (exc,))
                time.sleep(10)  # back off before the next attempt

        if timeline:
            for tweet in timeline:
                payload = json.dumps(tweet)
                for handler in write_to_handlers:
                    handler.append(payload, bucket=bucket, key=user_id)

                for handler in cmd_handlers:
                    handler.append(payload, bucket=bucket, key=user_id)
        else:
            # Record an empty marker so the user still leaves a trace.
            for handler in write_to_handlers:
                handler.append(json.dumps({}), bucket=bucket, key=user_id)

        logger.debug("[%s] total tweets: %d ", user_id, len(timeline))