Exemplo n.º 1
0
    def test_rate_limit(self, api, wait=True, buffer=.1):
        """
        Tests whether the rate limit of the last request has been reached.
        :param api: The `tweepy` api instance. Its `last_response.headers`
                    must carry the `x-rate-limit-*` headers.
        :param wait: A flag indicating whether to wait for the rate limit reset
                    if the rate limit has been reached.
        :param buffer: A buffer time in seconds that is added on to the waiting
                    time as an extra safety margin.
        :return: True if it is ok to proceed with the next request. False otherwise.
        """
        headers = api.last_response.headers
        # Get the number of remaining requests
        remaining = int(headers['x-rate-limit-remaining'])
        if remaining != 0:
            # We have not reached the rate limit
            return True

        limit = int(headers['x-rate-limit-limit'])
        # The header is an epoch timestamp; fromtimestamp() turns it into a
        # naive *local* datetime, which is comparable with datetime.now().
        reset = datetime.fromtimestamp(int(headers['x-rate-limit-reset']))
        # Let the user know we have reached the rate limit. `reset` is a
        # datetime, so it has to be rendered with %s (a %d placeholder makes
        # the log record fail to format).
        log.info("0 of %d requests remaining until %s.", limit, reset)

        if not wait:
            # We have reached the rate limit. The user needs to handle the
            # rate limit manually.
            return False

        # Determine the delay and sleep. The reset time may already lie in
        # the past; clamp at zero because sleep() rejects negative values.
        delay = max(0.0, (reset - datetime.now()).total_seconds() + buffer)
        log.info("Sleeping for %.1f seconds", delay)
        sleep(delay)
        # We have waited for the rate limit reset. OK to proceed.
        return True
    def save_tweet(self, twitter) -> None:
        """Register a tweet in the `tweets` collection.

        The document matching the tweet's `id_str` has its `seq` counter
        incremented by one; when no such document exists it is created
        (upsert). Database errors are logged and swallowed so that one
        failing tweet does not stop the caller.

        Args:
            twitter: the tweet, either as an already-decoded dict or as a
                JSON string.
        """
        if isinstance(twitter, dict):
            json_data = twitter
        else:
            json_data = json.loads(twitter)

        try:
            # NOTE: a leftover `breakpoint()` used to sit here and dropped
            # every call into the debugger; it has been removed.
            self.db.tweets.find_one_and_update(
                {'id_str': json_data['id_str']},
                {'$inc': {
                    'seq': 1
                }},
                projection={
                    'seq': True,
                    '_id': False
                },
                upsert=True,
            )
        except Exception as e:
            log.error(e)
Exemplo n.º 3
0
def main(verbose: bool = False, config: str = None) -> None:
    """
    Entry point for the twitter consumer.

    Sets the log level (DEBUG when `verbose`, INFO otherwise), loads the
    configuration file when a path is given, and runs a `TweetConsumer`
    built from it (or from `None` when no path was supplied).
    """
    LOG.setLevel('DEBUG' if verbose else 'INFO')

    settings = read_config(config) if config else None

    TweetConsumer(settings).execute()
Exemplo n.º 4
0
def main(keywords: List[str],
         access_token: str,
         access_token_secret: str,
         config: str = None,
         api_type: str = None,
         token: int = 0,
         verbose: bool = False):
    """
    An entry point to twitter crawler application.

    The crawler configuration comes from the `config` file when given; if
    `keywords`, `access_token` and `access_token_secret` are all supplied,
    a configuration built from them takes precedence. When neither source
    yields a configuration, a hint is printed and nothing is crawled.
    """
    loglevel = 'DEBUG' if verbose else 'INFO'
    LOG.setLevel(loglevel)
    LOG.info(msg=f"Argument {config} {api_type}")
    crawler_config = None
    if config:
        crawler_config = read_config(config)

    if keywords and access_token and access_token_secret:
        crawler_config = construct_config(keywords, access_token,
                                          access_token_secret)

    if not crawler_config:
        # Only complain when there really is nothing to run; previously this
        # message was printed unconditionally, even after a successful crawl.
        click.echo("Option is required")
        return

    LOG.debug(crawler_config)
    LOG.debug(f"Api Type - {api_type}")
    crawler = Crawler.create_crawler_instance(api_type, crawler_config,
                                              int(token))
    crawler.execute()
Exemplo n.º 5
0
 def on_data(self, data):
     """Dispatch a raw stream message to the matching handler.

     `data` is the raw JSON text of one message. The message kind is
     detected by substring, in this order: status, delete, limit, warning.

     Returns False when the delete or limit handler returns False, and
     always after a warning message; returns None otherwise.
     """
     if 'in_reply_to_status' in data:
         self.on_status(data)
     elif 'delete' in data:
         delete = json.loads(data)['delete']['status']
         if self.on_delete(delete['id'], delete['user_id']) is False:
             return False
     elif 'limit' in data:
         if self.on_limit(json.loads(data)['limit']['track']) is False:
             return False
     elif 'warning' in data:
         payload = json.loads(data)
         warning = None
         if isinstance(payload, dict):
             # The branch is selected on 'warning', so that is the key to
             # read; the old lookup of 'warnings' raised KeyError. The
             # plural form is still accepted as a fallback.
             warning = payload.get('warning', payload.get('warnings'))
         if isinstance(warning, dict) and 'message' in warning:
             log.warning(warning['message'])
         else:
             # Unexpected shape: log the raw message instead of crashing.
             log.warning(data)
         return False
Exemplo n.º 6
0
 def execute(self):
     """Crawl every entry of `keyword_list`, forever.

     Each pass walks the whole keyword list; a failing crawl is logged
     with its traceback and followed by a pause before the next keyword.
     After a full pass the loop pauses again and starts over.
     """
     pause_seconds = 600  # ten minutes
     while True:
         log.info("Start crawling back....")
         for entry in self.keyword_list:
             log.info('Crawl data for %s', entry["keyword"])
             try:
                 self.crawl(entry)
             except Exception:
                 log.error('Error in Crawling process', exc_info=True)
                 log.info("Sleeping for %ds", pause_seconds)
                 sleep(pause_seconds)
         # A full pass over the keywords is done: rest, then go again.
         log.info("Sleeping for %ds...", pause_seconds)
         sleep(pause_seconds)
Exemplo n.º 7
0
    def save_tweet(self, twitter):
        """Store a tweet unless a document with its id already exists.

        Args:
            twitter: the tweet, either as a dict or as a JSON string.

        The tweet is run through `self.parse_tweet`; the result is saved
        only when `self.dbase.get` finds nothing under its "id". Errors
        raised by the save are logged and not propagated.
        """
        payload = twitter if isinstance(twitter, dict) else json.loads(twitter)
        record = self.parse_tweet(payload)

        # Already stored: nothing to do.
        if self.dbase.get(record["id"]) is not None:
            return

        try:
            self.dbase.save(record)
        except Exception as err:
            log.error(err)
Exemplo n.º 8
0
    def get(self, size: int) -> List:
        """Consume up to `size` messages from the queue and return them.

        Each message body is decoded from JSON and acknowledged right after
        it is read. Once `size` messages have been collected the consumer
        is cancelled and the number of requeued messages is logged.

        Args:
            size: maximum number of messages to fetch. A value of zero or
                less returns an empty list without touching the queue.

        Returns:
            The decoded message bodies, in delivery order.
        """
        # With the old `count == size` test a non-positive size never matched
        # and the loop drained (and acked) the whole queue.
        if size <= 0:
            return []

        tweets = []
        for method_frame, _properties, body in self.channel.consume(
                self.queue_name):
            tweets.append(json.loads(body))

            # Acknowledge the message
            self.channel.basic_ack(method_frame.delivery_tag)

            if len(tweets) >= size:
                break

        requeued_messages = self.channel.cancel()
        log.info(f'Requeued {requeued_messages} messages')

        return tweets
Exemplo n.º 9
0
 def execute(self):
     """Stream tweets until the filter call finishes without an error.

     A single `tweepy.Stream` is created and `filter` is called on it.
     On any exception the error is logged, the stream is disconnected,
     and the call is retried after a ten-minute sleep.
     """
     stream = tweepy.Stream(self.auth, self.listener)
     while True:
         try:
             log.info("Start stream tweets data")
             log.info(f"Area for stream -> {self.bounding_box}")
             # NOTE(review): the logged area is self.bounding_box, but the
             # filter below uses a hard-coded box - confirm which is meant.
             stream.filter(track=self.keywords,
                           locations=[94.9, -8.88, 140.9, 5.86])
             log.info("End stream tweets data")
         except Exception as err:
             log.error("There's an error, sleep for 10 minutes")
             log.error(err)
             stream.disconnect()
             time.sleep(600)
         else:
             # The stream ended normally; stop retrying.
             break
Exemplo n.º 10
0
 def on_status(self, status):
     """Decode an incoming status, count it and publish it.

     `status` is the JSON text of one tweet. On any failure the error and
     the raw status are logged and `on_timeout` is invoked.
     """
     try:
         payload = json.loads(status)
         self.tweets = self.tweets + 1
         self.messaging.publish(payload)
         log.info(f"Count {self.tweets}")
     except Exception as err:
         for detail in (err, status):
             log.error(detail)
         self.on_timeout()
Exemplo n.º 11
0
    def execute(self):
        """Pull tweet batches from the message queue and store them, forever.

        Batches of up to 50 tweets are requested; an empty batch triggers a
        two-minute sleep before the next request. Any exception ends the
        loop: it is logged and the method returns.
        """
        try:
            while True:
                batch = self.messaging.get(size=50)

                if len(batch) == 0:
                    # Nothing queued right now: back off before asking again.
                    log.info("Waiting for 2mins on next tweet batch")
                    time.sleep(120)
                    continue

                for item in batch:
                    self.store.save_tweet(item)
                    log.info(f"Successfully store tweet: {item['id_str']}")
        except Exception as err:
            log.error(err)
Exemplo n.º 12
0
 def on_timeout(self):
     """Log that the API limit was hit, then sleep for ten minutes."""
     pause_seconds = 600
     log.info("API Reach its limit, sleep for 10 minutes")
     time.sleep(pause_seconds)