def test_limit(self):
    """Check that ``limit_user_ids`` restricts which items are iterated."""
    # One of the two listed user ids matches content in the fixture WARCs.
    with_known_user = TwitterRestWarcIter(
        self.filepaths, limit_user_ids=("481186914", "999999"))
    matched_items = list(with_known_user)
    self.assertEqual(244, len(matched_items))

    # Limiting to only the other id is expected to yield nothing.
    with_unknown_user = TwitterRestWarcIter(
        self.filepaths, limit_user_ids=("999999", ))
    unmatched_items = list(with_unknown_user)
    self.assertEqual(0, len(unmatched_items))
def test_no_limit(self):
    """Iterate the fixture WARCs without a user filter and spot-check output."""
    tweets = list(TwitterRestWarcIter(self.filepaths))
    self.assertEqual(1473, len(tweets))

    first_item = tweets[0]
    # Index 1 of the yielded item is compared against the expected id string.
    self.assertEqual("721345764362948609", first_item[1])
    # Datetime is aware
    item_datetime = first_item[2]
    self.assertIsNotNone(item_datetime.tzinfo)
def process_search_warc(self, warc_filepath):
    """Process the tweets in a search WARC and update incremental state.

    The state key is derived from the token of the first seed in
    ``self.message``. When the ``incremental`` option is set, the stored
    ``since_id`` is replaced by the value returned from
    ``self._process_tweets`` if that value is larger.

    :param warc_filepath: path of the WARC file handed to
        ``TwitterRestWarcIter``.
    """
    incremental = self.message.get("options", {}).get("incremental", False)
    query = self.message["seeds"][0]["token"]
    state_key = u"{}.since_id".format(query)

    since_id = self.state_store.get_state(
        __name__, state_key) if incremental else None

    max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath))

    # Update state store.
    # Either side may be None (no saved state yet, or no tweet id returned);
    # ordering None against an int raises TypeError on Python 3, so treat a
    # missing value as 0 before comparing.
    if incremental and (max_tweet_id or 0) > (since_id or 0):
        self.state_store.set_state(__name__, state_key, max_tweet_id)
def process_search_warc(self, warc_filepath):
    """Process the tweets in a search WARC and update incremental state.

    The state key is built from ``self._search_id()``. When the
    ``incremental`` option is set and ``self._process_tweets`` returns a
    value larger than the stored ``since_id`` (missing values count as 0),
    that value is saved as the new ``since_id``.

    :param warc_filepath: path of the WARC file handed to
        ``TwitterRestWarcIter``.
    """
    options = self.message.get("options", {})
    is_incremental = options.get("incremental", False)

    # The previous high-water mark is only looked up for incremental runs.
    if is_incremental:
        previous_since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(self._search_id()))
    else:
        previous_since_id = None

    warc_items = TwitterRestWarcIter(warc_filepath)
    newest_tweet_id = self._process_tweets(warc_items)

    if not is_incremental:
        return

    # Update state store; a missing value on either side is treated as 0.
    if (newest_tweet_id or 0) > (previous_since_id or 0):
        self.state_store.set_state(
            __name__,
            u"{}.since_id".format(self._search_id()),
            newest_tweet_id)
def test_ignore_errors(self):
    """Check that ``_item_iter`` yields no items for API error payloads."""
    timeline_url = ('https://api.twitter.com/1.1/statuses/'
                    'user_timeline.json')
    error_payloads = (
        # "errors" list form.
        {
            'errors': [{
                'message': 'Rate limit exceeded',
                'code': 88
            }]
        },
        # Single "error" string form.
        {
            'request': '/1.1/statuses/user_timeline.json',
            'error': 'Not authorized.'
        },
    )
    for payload in error_payloads:
        yielded = list(
            TwitterRestWarcIter._item_iter(None, timeline_url, payload))
        self.assertEqual(0, len(yielded))
def process_user_timeline_warc(self, warc_filepath):
    """Process the tweets in a user-timeline WARC.

    Every item whose payload contains a ``"text"`` key is passed to
    ``self._process_tweet``. When the ``incremental`` option is set, the
    per-user ``timeline.<user id>.since_id`` state is raised to the
    largest tweet id seen so far.

    :param warc_filepath: path of the WARC file handed to
        ``TwitterRestWarcIter``.
    """
    incremental = self.message.get("options", {}).get("incremental", False)

    for count, status in enumerate(TwitterRestWarcIter(warc_filepath)):
        tweet = status.item
        if not count % 100:
            log.debug("Processing %s tweets", count)
        if "text" in tweet:
            user_id = tweet["user"]["id_str"]
            tweet_id = tweet.get("id")
            # Without an id there is nothing to advance the state to.
            if incremental and tweet_id is not None:
                # Update state
                key = "timeline.{}.since_id".format(user_id)
                # get_state may return None when nothing has been stored
                # yet; max(None, int) raises TypeError on Python 3, so a
                # missing value is treated as 0.
                previous_id = self.state_store.get_state(__name__, key) or 0
                self.state_store.set_state(
                    __name__, key, max(previous_id, tweet_id))
            self._process_tweet(tweet)