class TestTwitterHarvester(tests.TestCase):
    """Unit tests for TwitterHarvester with Twarc and the WARC iterator mocked out.

    The harvest tests patch ``twitter_harvester.Twarc``; the lookup tests
    assign a mock directly to ``harvester.twarc``; the process tests patch
    ``twitter_harvester.TwitterRestWarcIter``.
    """

    # Endpoint the user-lookup tests expect ``Twarc.get`` to be called with.
    USERS_SHOW_URL = 'https://api.twitter.com/1.1/users/show.json'

    def setUp(self):
        self.working_path = tempfile.mkdtemp()
        self.harvester = TwitterHarvester(self.working_path)
        self.harvester.state_store = DictHarvestStateStore()
        self.harvester.message = base_search_message
        self.harvester.result = HarvestResult()
        self.harvester.stop_harvest_seeds_event = threading.Event()

    def tearDown(self):
        if os.path.exists(self.working_path):
            shutil.rmtree(self.working_path)

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _install_mock_twarc(twarc_class):
        """Create a Twarc mock and return it when the patched class is instantiated."""
        mock_twarc = MagicMock(spec=Twarc)
        twarc_class.side_effect = [mock_twarc]
        return mock_twarc

    @staticmethod
    def _response(status_code, body):
        """Build a mock HTTP response with the given status code and JSON body."""
        response = MagicMock()
        response.status_code = status_code
        response.json.return_value = body
        return response

    def _assert_twarc_created(self, twarc_class):
        """Assert the patched Twarc class was instantiated once with the test credentials."""
        twarc_class.assert_called_once_with(tests.TWITTER_CONSUMER_KEY,
                                            tests.TWITTER_CONSUMER_SECRET,
                                            tests.TWITTER_ACCESS_TOKEN,
                                            tests.TWITTER_ACCESS_TOKEN_SECRET,
                                            http_errors=5,
                                            connection_errors=5,
                                            tweet_mode="extended")

    def _assert_user_show_called(self, mock_twarc, params):
        """Assert a single users/show lookup was issued with ``params``."""
        mock_twarc.get.assert_called_once_with(self.USERS_SHOW_URL, allow_404=True, params=params)

    @staticmethod
    def _iter_items(items):
        """Wrap each item in an IterItem; useful for mocking out a warc iter."""
        return [IterItem(None, None, None, None, item) for item in items]

    def _install_mock_warc_iter(self, iter_class, tweets):
        """Make the patched TwitterRestWarcIter yield ``tweets`` when iterated."""
        mock_iter = MagicMock(spec=TwitterRestWarcIter)
        mock_iter.__iter__.side_effect = [iter(self._iter_items(tweets))]
        # Return mock_iter when instantiating a TwitterRestWarcIter.
        iter_class.side_effect = [mock_iter]
        return mock_iter

    # ------------------------------------------------------------------
    # harvest_seeds: search
    # ------------------------------------------------------------------
    @patch("twitter_harvester.Twarc", autospec=True)
    def test_search(self, mock_twarc_class):
        mock_twarc = self._install_mock_twarc(mock_twarc_class)
        mock_twarc.search.side_effect = [(tweet1, tweet2)]

        self.harvester.message = base_search_message
        self.harvester.harvest_seeds()

        self._assert_twarc_created(mock_twarc_class)
        self.assertEqual([call("gelman", geocode=None, since_id=None)], mock_twarc.search.mock_calls)
        self.assertDictEqual({"tweets": 2}, self.harvester.result.harvest_counter)

    @patch("twitter_harvester.Twarc", autospec=True)
    def test_incremental_search(self, twarc_class):
        message = copy.deepcopy(base_search_message)
        message["options"]["incremental"] = True

        mock_twarc = self._install_mock_twarc(twarc_class)
        mock_twarc.search.side_effect = [(tweet2,)]

        self.harvester.state_store.set_state("twitter_harvester", "gelman.since_id", 605726286741434400)
        self.harvester.message = message
        self.harvester.harvest_seeds()

        self._assert_twarc_created(twarc_class)
        self.assertEqual([call("gelman", geocode=None, since_id=605726286741434400)],
                         mock_twarc.search.mock_calls)
        self.assertDictEqual({"tweets": 1}, self.harvester.result.harvest_counter)

    @patch("twitter_harvester.Twarc", autospec=True)
    def test_new_search(self, mock_twarc_class):
        # The new search style has separate query and geocode parameters for search.
        # However, the legacy style is still accepted.
        mock_twarc = self._install_mock_twarc(mock_twarc_class)
        mock_twarc.search.side_effect = [(tweet1, tweet2)]

        search_message = copy.deepcopy(base_search_message)
        search_message["seeds"][0]["token"] = {"query": "gelman", "geocode": "38.899434,-77.036449,50mi"}

        self.harvester.message = search_message
        self.harvester.harvest_seeds()

        self._assert_twarc_created(mock_twarc_class)
        self.assertEqual([call("gelman", since_id=None, geocode="38.899434,-77.036449,50mi")],
                         mock_twarc.search.mock_calls)
        self.assertDictEqual({"tweets": 2}, self.harvester.result.harvest_counter)

    # ------------------------------------------------------------------
    # harvest_seeds: user timeline
    # ------------------------------------------------------------------
    @patch("twitter_harvester.Twarc", autospec=True)
    def test_user_timeline(self, mock_twarc_class):
        mock_twarc = self._install_mock_twarc(mock_twarc_class)
        # Expecting 2 user timelines. First returns 2 tweets. Second returns none.
        mock_twarc.timeline.side_effect = [(tweet1, tweet2), ()]
        # Expecting 2 calls to get for user lookup
        mock_twarc.get.side_effect = [
            self._response(200, {"screen_name": "gwtweets", "protected": False}),
            self._response(200, {"id_str": "9710852", "protected": False}),
        ]

        self.harvester.message = base_timeline_message
        self.harvester.harvest_seeds()

        self._assert_twarc_created(mock_twarc_class)
        self.assertEqual([call(user_id="28101965", since_id=None),
                          call(user_id="9710852", since_id=None)],
                         mock_twarc.timeline.mock_calls)
        self.assertDictEqual({"tweets": 2}, self.harvester.result.harvest_counter)

    @patch("twitter_harvester.Twarc", autospec=True)
    def test_incremental_user_timeline(self, twarc_class):
        message = copy.deepcopy(base_timeline_message)
        message["options"]["incremental"] = True

        mock_twarc = self._install_mock_twarc(twarc_class)
        # Expecting 2 timelines. First returns 1 tweet. Second returns none.
        mock_twarc.timeline.side_effect = [(tweet2,), ()]
        # Expecting 2 calls to get for user lookup
        mock_twarc.get.side_effect = [
            self._response(200, {"screen_name": "gwtweets", "protected": False}),
            self._response(200, {"id_str": "9710852", "protected": False}),
        ]

        self.harvester.message = message
        self.harvester.state_store.set_state("twitter_harvester", "timeline.28101965.since_id",
                                             605726286741434400)
        self.harvester.harvest_seeds()

        self._assert_twarc_created(twarc_class)
        self.assertEqual([call(user_id="28101965", since_id=605726286741434400),
                          call(user_id="9710852", since_id=None)],
                         mock_twarc.timeline.mock_calls)
        self.assertDictEqual({"tweets": 1}, self.harvester.result.harvest_counter)

    @patch("twitter_harvester.Twarc", autospec=True)
    def test_user_timeline_with_missing_users(self, mock_twarc_class):
        mock_twarc = self._install_mock_twarc(mock_twarc_class)
        # Every user lookup fails with a 404 "User not found." error.
        not_found = self._response(404, {"errors": [{"code": 50, "message": "User not found."}]})
        mock_twarc.get.side_effect = HTTPError(response=not_found)

        message = copy.deepcopy(base_timeline_message)
        message["seeds"] = [
            {"id": "seed_id1", "token": "missing1"},
            {"id": "seed_id2", "token": "missing2"},
        ]
        self.harvester.message = message
        self.harvester.harvest_seeds()

        self._assert_twarc_created(mock_twarc_class)
        self.assertEqual(
            [call(self.USERS_SHOW_URL, allow_404=True, params={'screen_name': 'missing1'}),
             call(self.USERS_SHOW_URL, allow_404=True, params={'screen_name': 'missing2'})],
            mock_twarc.get.mock_calls)
        self.assertEqual(2, len(self.harvester.result.warnings))
        self.assertEqual(CODE_TOKEN_NOT_FOUND, self.harvester.result.warnings[0].code)
        self.assertEqual("seed_id1", self.harvester.result.warnings[0].extras["seed_id"])

    # ------------------------------------------------------------------
    # _lookup_user
    # ------------------------------------------------------------------
    def test_lookup_screen_name(self):
        mock_twarc = MagicMock(spec=Twarc)
        mock_twarc.get.return_value = self._response(
            200, {"screen_name": "justin_littman", "protected": False})
        self.harvester.twarc = mock_twarc

        self.assertEqual(('OK', {'protected': False, 'screen_name': 'justin_littman'}),
                         self.harvester._lookup_user(id="481186914", id_type="user_id"))
        self._assert_user_show_called(mock_twarc, {'user_id': '481186914'})

    def test_lookup_protected_screen_name(self):
        mock_twarc = MagicMock(spec=Twarc)
        mock_twarc.get.return_value = self._response(
            200, {"screen_name": "justin_littman", "protected": True})
        self.harvester.twarc = mock_twarc

        self.assertEqual(('unauthorized', {'protected': True, 'screen_name': 'justin_littman'}),
                         self.harvester._lookup_user(id="481186914", id_type="user_id"))
        self._assert_user_show_called(mock_twarc, {'user_id': '481186914'})

    def test_lookup_missing_screen_name(self):
        mock_twarc = MagicMock(spec=Twarc)
        not_found = self._response(404, {"errors": [{"code": 50, "message": "User not found."}]})
        mock_twarc.get.side_effect = HTTPError(response=not_found)
        self.harvester.twarc = mock_twarc

        self.assertEqual(('not_found', None),
                         self.harvester._lookup_user(id="481186914", id_type="user_id"))
        self._assert_user_show_called(mock_twarc, {'user_id': '481186914'})

    def test_lookup_suspended_screen_name(self):
        mock_twarc = MagicMock(spec=Twarc)
        suspended = self._response(403, {"errors": [{"code": 63, "message": "User has been suspended."}]})
        mock_twarc.get.side_effect = HTTPError(response=suspended)
        self.harvester.twarc = mock_twarc

        self.assertEqual(('suspended', None),
                         self.harvester._lookup_user(id="481186914", id_type="user_id"))
        self._assert_user_show_called(mock_twarc, {'user_id': '481186914'})

    def test_lookup_user_id(self):
        mock_twarc = MagicMock(spec=Twarc)
        mock_twarc.get.return_value = self._response(200, {"user_id": "481186914", "protected": False})
        self.harvester.twarc = mock_twarc

        self.assertEqual(('OK', {'protected': False, 'user_id': '481186914'}),
                         self.harvester._lookup_user(id="justin_littman", id_type="screen_name"))
        self._assert_user_show_called(mock_twarc, {'screen_name': 'justin_littman'})

    def test_lookup_missing_user_id(self):
        mock_twarc = MagicMock(spec=Twarc)
        not_found = self._response(404, {"errors": [{"code": 50, "message": "User not found."}]})
        mock_twarc.get.side_effect = [HTTPError(response=not_found)]
        self.harvester.twarc = mock_twarc

        self.assertEqual(('not_found', None),
                         self.harvester._lookup_user(id="justin_littman", id_type="screen_name"))
        self._assert_user_show_called(mock_twarc, {'screen_name': 'justin_littman'})

    # ------------------------------------------------------------------
    # process_warc
    # ------------------------------------------------------------------
    @patch("twitter_harvester.TwitterRestWarcIter", autospec=True)
    def test_process_search(self, iter_class):
        self._install_mock_warc_iter(iter_class, [tweet2])

        self.harvester.message = base_search_message
        self.harvester.process_warc("test.warc.gz")

        self.assertDictEqual({"tweets": 1}, self.harvester.result.stats_summary())
        iter_class.assert_called_once_with("test.warc.gz")
        # No since_id is expected in the state store for this message.
        self.assertIsNone(self.harvester.state_store.get_state("twitter_harvester", "gelman.since_id"))

    @patch("twitter_harvester.TwitterRestWarcIter", autospec=True)
    def test_process_search_incremental(self, iter_class):
        message = copy.deepcopy(base_search_message)
        message["options"]["incremental"] = True

        self._install_mock_warc_iter(iter_class, [tweet2])

        self.harvester.state_store.set_state("twitter_harvester", "gelman.since_id", 605726286741434400)
        self.harvester.message = message
        self.harvester.process_warc("test.warc.gz")

        self.assertDictEqual({"tweets": 1}, self.harvester.result.stats_summary())
        iter_class.assert_called_once_with("test.warc.gz")
        # State updated
        self.assertEqual(660065173563158500,
                         self.harvester.state_store.get_state("twitter_harvester", "gelman.since_id"))

    @patch("twitter_harvester.TwitterRestWarcIter", autospec=True)
    def test_process_user_timeline(self, iter_class):
        self._install_mock_warc_iter(iter_class, [tweet1, tweet2])

        self.harvester.message = base_timeline_message
        self.harvester.process_warc("test.warc.gz")

        self.assertDictEqual({"tweets": 2}, self.harvester.result.stats_summary())
        iter_class.assert_called_once_with("test.warc.gz")
        # Nothing added to state
        self.assertEqual(0, len(self.harvester.state_store.state))

    @patch("twitter_harvester.TwitterRestWarcIter", autospec=True)
    def test_process_incremental_user_timeline(self, iter_class):
        message = copy.deepcopy(base_timeline_message)
        message["options"]["incremental"] = True

        self._install_mock_warc_iter(iter_class, [tweet2])

        self.harvester.message = message
        self.harvester.state_store.set_state("twitter_harvester", "timeline.481186914.since_id",
                                             605726286741434400)
        self.harvester.process_warc("test.warc.gz")

        self.assertDictEqual({"tweets": 1}, self.harvester.result.stats_summary())
        iter_class.assert_called_once_with("test.warc.gz")
        # State updated
        self.assertEqual(660065173563158500,
                         self.harvester.state_store.get_state("twitter_harvester",
                                                              "timeline.481186914.since_id"))
class TestTwitterHarvester(tests.TestCase):
    """Unit tests for TwitterHarvester with Twarc and the WARC iterator mocked out.

    The harvest tests patch ``twitter_harvester.Twarc``; the lookup tests
    assign a mock directly to ``harvester.twarc``; the process tests patch
    ``twitter_harvester.TwitterRestWarcIter``.
    """

    def setUp(self):
        self.working_path = tempfile.mkdtemp()
        self.harvester = TwitterHarvester(self.working_path)
        self.harvester.state_store = DictHarvestStateStore()
        self.harvester.message = base_search_message
        self.harvester.result = HarvestResult()
        self.harvester.stop_harvest_seeds_event = threading.Event()

    def tearDown(self):
        if os.path.exists(self.working_path):
            shutil.rmtree(self.working_path)

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _install_mock_twarc(twarc_class):
        """Create a Twarc mock and return it when the patched class is instantiated."""
        mock_twarc = MagicMock(spec=Twarc)
        twarc_class.side_effect = [mock_twarc]
        return mock_twarc

    @staticmethod
    def _http_error(status_code):
        """Build an HTTPError whose mock response carries ``status_code``."""
        response = MagicMock()
        response.status_code = status_code
        return HTTPError(response=response)

    def _assert_twarc_created(self, twarc_class):
        """Assert the patched Twarc class was instantiated once with the test credentials."""
        twarc_class.assert_called_once_with(tests.TWITTER_CONSUMER_KEY,
                                            tests.TWITTER_CONSUMER_SECRET,
                                            tests.TWITTER_ACCESS_TOKEN,
                                            tests.TWITTER_ACCESS_TOKEN_SECRET,
                                            http_errors=5,
                                            connection_errors=5)

    @staticmethod
    def _iter_items(items):
        """Wrap each item in an IterItem; useful for mocking out a warc iter."""
        return [IterItem(None, None, None, None, item) for item in items]

    def _install_mock_warc_iter(self, iter_class, tweets):
        """Make the patched TwitterRestWarcIter yield ``tweets`` when iterated."""
        mock_iter = MagicMock(spec=TwitterRestWarcIter)
        mock_iter.__iter__.side_effect = [iter(self._iter_items(tweets))]
        # Return mock_iter when instantiating a TwitterRestWarcIter.
        iter_class.side_effect = [mock_iter]
        return mock_iter

    # ------------------------------------------------------------------
    # harvest_seeds: search
    # ------------------------------------------------------------------
    @patch("twitter_harvester.Twarc", autospec=True)
    def test_search(self, mock_twarc_class):
        mock_twarc = self._install_mock_twarc(mock_twarc_class)
        mock_twarc.search.side_effect = [(tweet1, tweet2)]

        self.harvester.message = base_search_message
        self.harvester.harvest_seeds()

        self._assert_twarc_created(mock_twarc_class)
        self.assertEqual([call("gelman", since_id=None)], mock_twarc.search.mock_calls)
        self.assertDictEqual({"tweets": 2}, self.harvester.result.harvest_counter)

    @patch("twitter_harvester.Twarc", autospec=True)
    def test_incremental_search(self, twarc_class):
        message = copy.deepcopy(base_search_message)
        message["options"]["incremental"] = True

        mock_twarc = self._install_mock_twarc(twarc_class)
        mock_twarc.search.side_effect = [(tweet2,)]

        self.harvester.state_store.set_state("twitter_harvester", "gelman.since_id", 605726286741434400)
        self.harvester.message = message
        self.harvester.harvest_seeds()

        self._assert_twarc_created(twarc_class)
        self.assertEqual([call("gelman", since_id=605726286741434400)], mock_twarc.search.mock_calls)
        self.assertDictEqual({"tweets": 1}, self.harvester.result.harvest_counter)

    # ------------------------------------------------------------------
    # harvest_seeds: user timeline
    # ------------------------------------------------------------------
    @patch("twitter_harvester.Twarc", autospec=True)
    def test_user_timeline(self, mock_twarc_class):
        mock_twarc = self._install_mock_twarc(mock_twarc_class)
        # Expecting 2 user timelines. First returns 2 tweets. Second returns none.
        mock_twarc.timeline.side_effect = [(tweet1, tweet2), ()]
        # Expecting 2 calls to user_lookup
        mock_twarc.user_lookup.side_effect = [[{"screen_name": "gwtweets"}], [{"id_str": "9710852"}]]

        self.harvester.message = base_timeline_message
        self.harvester.harvest_seeds()

        self._assert_twarc_created(mock_twarc_class)
        self.assertEqual([call(user_id="28101965", since_id=None),
                          call(user_id="9710852", since_id=None)],
                         mock_twarc.timeline.mock_calls)
        self.assertDictEqual({"tweets": 2}, self.harvester.result.harvest_counter)

    @patch("twitter_harvester.Twarc", autospec=True)
    def test_incremental_user_timeline(self, twarc_class):
        message = copy.deepcopy(base_timeline_message)
        message["options"]["incremental"] = True

        mock_twarc = self._install_mock_twarc(twarc_class)
        # Expecting 2 timelines. First returns 1 tweet. Second returns none.
        mock_twarc.timeline.side_effect = [(tweet2,), ()]
        # Expecting 2 calls to user_lookup
        mock_twarc.user_lookup.side_effect = [[{"screen_name": "gwtweets"}], [{"id_str": "9710852"}]]

        self.harvester.message = message
        self.harvester.state_store.set_state("twitter_harvester", "timeline.28101965.since_id",
                                             605726286741434400)
        self.harvester.harvest_seeds()

        self._assert_twarc_created(twarc_class)
        self.assertEqual([call(user_id="28101965", since_id=605726286741434400),
                          call(user_id="9710852", since_id=None)],
                         mock_twarc.timeline.mock_calls)
        self.assertDictEqual({"tweets": 1}, self.harvester.result.harvest_counter)

    @patch("twitter_harvester.Twarc", autospec=True)
    def test_user_timeline_with_missing_users(self, mock_twarc_class):
        mock_twarc = self._install_mock_twarc(mock_twarc_class)
        # Expecting 2 calls to user_lookup, both of which return nothing
        mock_twarc.user_lookup.side_effect = [[], []]

        message = copy.deepcopy(base_timeline_message)
        message["seeds"] = [
            {"id": "seed_id1", "token": "missing1"},
            {"id": "seed_id2", "token": "missing2"},
        ]
        self.harvester.message = message
        self.harvester.harvest_seeds()

        self._assert_twarc_created(mock_twarc_class)
        self.assertEqual([call(screen_names=("missing1",)), call(screen_names=("missing2",))],
                         mock_twarc.user_lookup.mock_calls)
        self.assertEqual(2, len(self.harvester.result.warnings))
        self.assertEqual(CODE_TOKEN_NOT_FOUND, self.harvester.result.warnings[0].code)
        self.assertEqual("seed_id1", self.harvester.result.warnings[0].extras["seed_id"])

    @patch("twitter_harvester.Twarc", autospec=True)
    def test_user_timeline_with_private_timeline(self, mock_twarc_class):
        mock_twarc = self._install_mock_twarc(mock_twarc_class)
        # Expecting 2 user timelines. First returns 2 tweets. Second raises a 401.
        mock_twarc.timeline.side_effect = [(tweet1, tweet2), self._http_error(401)]
        # Expecting 2 calls to user_lookup
        mock_twarc.user_lookup.side_effect = [[{"screen_name": "gwtweets"}], [{"id_str": "9710852"}]]

        self.harvester.message = base_timeline_message
        self.harvester.harvest_seeds()

        self._assert_twarc_created(mock_twarc_class)
        self.assertEqual([call(user_id="28101965", since_id=None),
                          call(user_id="9710852", since_id=None)],
                         mock_twarc.timeline.mock_calls)
        self.assertEqual(1, len(self.harvester.result.warnings))
        self.assertEqual(CODE_TOKEN_UNAUTHORIZED, self.harvester.result.warnings[0].code)
        self.assertEqual("seed_id2", self.harvester.result.warnings[0].extras["seed_id"])
        self.assertDictEqual({"tweets": 2}, self.harvester.result.harvest_counter)

    # ------------------------------------------------------------------
    # _lookup_screen_name / _lookup_user_id
    # ------------------------------------------------------------------
    def test_lookup_screen_name(self):
        mock_twarc = MagicMock(spec=Twarc)
        mock_twarc.user_lookup.side_effect = [[{"screen_name": "justin_littman"}]]
        self.harvester.twarc = mock_twarc

        self.assertEqual("justin_littman", self.harvester._lookup_screen_name("481186914"))
        mock_twarc.user_lookup.assert_called_once_with(user_ids=("481186914",))

    def test_lookup_missing_screen_name(self):
        mock_twarc = MagicMock(spec=Twarc)
        mock_twarc.user_lookup.side_effect = [self._http_error(404)]
        self.harvester.twarc = mock_twarc

        self.assertIsNone(self.harvester._lookup_screen_name("481186914"))
        mock_twarc.user_lookup.assert_called_once_with(user_ids=("481186914",))

    def test_lookup_user_id(self):
        mock_twarc = MagicMock(spec=Twarc)
        mock_twarc.user_lookup.side_effect = [[{"id_str": "481186914"}]]
        self.harvester.twarc = mock_twarc

        self.assertEqual("481186914", self.harvester._lookup_user_id("justin_littman"))
        mock_twarc.user_lookup.assert_called_once_with(screen_names=("justin_littman",))

    def test_lookup_missing_user_id(self):
        mock_twarc = MagicMock(spec=Twarc)
        mock_twarc.user_lookup.side_effect = [self._http_error(404)]
        self.harvester.twarc = mock_twarc

        self.assertIsNone(self.harvester._lookup_user_id("justin_littman"))
        mock_twarc.user_lookup.assert_called_once_with(screen_names=("justin_littman",))

    # ------------------------------------------------------------------
    # _process_tweets: URL extraction options
    # ------------------------------------------------------------------
    def test_harvest_options_web(self):
        self.harvester.extract_media = False
        self.harvester.extract_web_resources = True
        self.harvester.extract_user_profile_images = False

        # This would normally be passed a warc iter.
        self.harvester._process_tweets(self._iter_items([tweet2, tweet3, tweet4, tweet5]))

        self.assertSetEqual(
            {
                'http://bit.ly/1ipwd0B',  # url
                'http://nlp.stanford.edu/IR-book/html/htmledition/the-url-frontier-1.html',  # from retweet
                'http://bit.ly/1NoNeBF'  # from base tweet of quoted status
            },
            self.harvester.result.urls_as_set())

    def test_harvest_options_media(self):
        self.harvester.extract_media = True
        self.harvester.extract_web_resources = False
        self.harvester.extract_user_profile_images = False

        self.harvester._process_tweets(self._iter_items([tweet2, tweet3, tweet4, tweet5]))

        self.assertSetEqual(
            {
                'http://pbs.twimg.com/tweet_video_thumb/Chn_42fWwAASuva.jpg',  # media/extended entity
                'http://pbs.twimg.com/media/Bv4ekbqIYAAcmXY.jpg',  # from quoted status
            },
            self.harvester.result.urls_as_set())

    def test_harvest_options_user__images(self):
        self.harvester.extract_media = False
        self.harvester.extract_web_resources = False
        self.harvester.extract_user_profile_images = True

        self.harvester._process_tweets(self._iter_items([tweet2]))

        self.assertSetEqual(
            {
                'http://pbs.twimg.com/profile_images/496478011533713408/GjecBUNj_normal.jpeg',
                'http://abs.twimg.com/images/themes/theme1/bg.png'
            },
            self.harvester.result.urls_as_set())

    def test_default_harvest_options(self):
        self.harvester.extract_media = False
        self.harvester.extract_web_resources = False

        self.harvester._process_tweets(self._iter_items([tweet2, tweet3, tweet4, tweet5]))

        self.assertSetEqual(set(), self.harvester.result.urls_as_set())

    # ------------------------------------------------------------------
    # process_warc
    # ------------------------------------------------------------------
    @patch("twitter_harvester.TwitterRestWarcIter", autospec=True)
    def test_process_search(self, iter_class):
        self._install_mock_warc_iter(iter_class, [tweet2])

        self.harvester.message = base_search_message
        self.harvester.process_warc("test.warc.gz")

        self.assertDictEqual({"tweets": 1}, self.harvester.result.stats_summary())
        self.assertEqual(0, len(self.harvester.result.urls_as_set()))
        iter_class.assert_called_once_with("test.warc.gz")
        # No since_id is expected in the state store for this message.
        self.assertIsNone(self.harvester.state_store.get_state("twitter_harvester", "gelman.since_id"))

    @patch("twitter_harvester.TwitterRestWarcIter", autospec=True)
    def test_process_search_incremental(self, iter_class):
        message = copy.deepcopy(base_search_message)
        message["options"]["incremental"] = True
        self.harvester.extract_media = False
        self.harvester.extract_web_resources = True

        self._install_mock_warc_iter(iter_class, [tweet2])

        self.harvester.state_store.set_state("twitter_harvester", "gelman.since_id", 605726286741434400)
        self.harvester.message = message
        self.harvester.process_warc("test.warc.gz")

        self.assertDictEqual({"tweets": 1}, self.harvester.result.stats_summary())
        self.assertSetEqual({"http://bit.ly/1ipwd0B"}, self.harvester.result.urls_as_set())
        iter_class.assert_called_once_with("test.warc.gz")
        # State updated
        self.assertEqual(660065173563158500,
                         self.harvester.state_store.get_state("twitter_harvester", "gelman.since_id"))

    @patch("twitter_harvester.TwitterRestWarcIter", autospec=True)
    def test_process_user_timeline(self, iter_class):
        self._install_mock_warc_iter(iter_class, [tweet1, tweet2])

        self.harvester.extract_media = False
        self.harvester.extract_web_resources = True
        self.harvester.message = base_timeline_message
        self.harvester.process_warc("test.warc.gz")

        self.assertDictEqual({"tweets": 2}, self.harvester.result.stats_summary())
        self.assertSetEqual({"http://bit.ly/1ipwd0B"}, self.harvester.result.urls_as_set())
        iter_class.assert_called_once_with("test.warc.gz")
        # Nothing added to state
        self.assertEqual(0, len(self.harvester.state_store.state))

    @patch("twitter_harvester.TwitterRestWarcIter", autospec=True)
    def test_process_incremental_user_timeline(self, iter_class):
        message = copy.deepcopy(base_timeline_message)
        message["options"]["incremental"] = True

        self._install_mock_warc_iter(iter_class, [tweet2])

        self.harvester.extract_media = False
        self.harvester.extract_web_resources = True
        self.harvester.message = message
        self.harvester.state_store.set_state("twitter_harvester", "timeline.481186914.since_id",
                                             605726286741434400)
        self.harvester.process_warc("test.warc.gz")

        self.assertDictEqual({"tweets": 1}, self.harvester.result.stats_summary())
        self.assertSetEqual({"http://bit.ly/1ipwd0B"}, self.harvester.result.urls_as_set())
        iter_class.assert_called_once_with("test.warc.gz")
        # State updated
        self.assertEqual(660065173563158500,
                         self.harvester.state_store.get_state("twitter_harvester",
                                                              "timeline.481186914.since_id"))
class TestTwitterHarvester(tests.TestCase): def setUp(self): self.working_path = tempfile.mkdtemp() self.harvester = TwitterHarvester(self.working_path) self.harvester.state_store = DictHarvestStateStore() self.harvester.message = base_search_message self.harvester.result = HarvestResult() self.harvester.stop_harvest_seeds_event = threading.Event() def tearDown(self): if os.path.exists(self.working_path): shutil.rmtree(self.working_path) @patch("twitter_harvester.Twarc", autospec=True) def test_search(self, mock_twarc_class): mock_twarc = MagicMock(spec=Twarc) mock_twarc.search.side_effect = [(tweet1, tweet2)] # Return mock_twarc when instantiating a twarc. mock_twarc_class.side_effect = [mock_twarc] self.harvester.message = base_search_message self.harvester.harvest_seeds() mock_twarc_class.assert_called_once_with(tests.TWITTER_CONSUMER_KEY, tests.TWITTER_CONSUMER_SECRET, tests.TWITTER_ACCESS_TOKEN, tests.TWITTER_ACCESS_TOKEN_SECRET, http_errors=5, connection_errors=5) self.assertEqual([call("gelman", since_id=None)], mock_twarc.search.mock_calls) self.assertDictEqual({"tweets": 2}, self.harvester.result.harvest_counter) @patch("twitter_harvester.Twarc", autospec=True) def test_incremental_search(self, twarc_class): message = copy.deepcopy(base_search_message) message["options"]["incremental"] = True mock_twarc = MagicMock(spec=Twarc) mock_twarc.search.side_effect = [(tweet2,)] # Return mock_twarc when instantiating a twarc. 
twarc_class.side_effect = [mock_twarc] self.harvester.state_store.set_state("twitter_harvester", "gelman.since_id", 605726286741434400) self.harvester.message = message self.harvester.harvest_seeds() twarc_class.assert_called_once_with(tests.TWITTER_CONSUMER_KEY, tests.TWITTER_CONSUMER_SECRET, tests.TWITTER_ACCESS_TOKEN, tests.TWITTER_ACCESS_TOKEN_SECRET, http_errors=5, connection_errors=5) self.assertEqual([call("gelman", since_id=605726286741434400)], mock_twarc.search.mock_calls) self.assertDictEqual({"tweets": 1}, self.harvester.result.harvest_counter) @patch("twitter_harvester.Twarc", autospec=True) def test_user_timeline(self, mock_twarc_class): mock_twarc = MagicMock(spec=Twarc) # Expecting 2 user timelines. First returns 2 tweets. Second returns none. mock_twarc.timeline.side_effect = [(tweet1, tweet2), ()] # Expecting 2 calls to user_lookup mock_twarc.user_lookup.side_effect = [[{"screen_name": "gwtweets"}], [{"id_str": "9710852"}]] # Return mock_twarc when instantiating a twarc. mock_twarc_class.side_effect = [mock_twarc] self.harvester.message = base_timeline_message self.harvester.harvest_seeds() mock_twarc_class.assert_called_once_with(tests.TWITTER_CONSUMER_KEY, tests.TWITTER_CONSUMER_SECRET, tests.TWITTER_ACCESS_TOKEN, tests.TWITTER_ACCESS_TOKEN_SECRET, http_errors=5, connection_errors=5) self.assertEqual([call(user_id="28101965", since_id=None), call(user_id="9710852", since_id=None)], mock_twarc.timeline.mock_calls) self.assertDictEqual({"tweets": 2}, self.harvester.result.harvest_counter) @patch("twitter_harvester.Twarc", autospec=True) def test_incremental_user_timeline(self, twarc_class): message = copy.deepcopy(base_timeline_message) message["options"]["incremental"] = True mock_twarc = MagicMock(spec=Twarc) # Expecting 2 timelines. First returns 1 tweets. Second returns none. 
mock_twarc.timeline.side_effect = [(tweet2,), ()] # Expecting 2 calls to user_lookup mock_twarc.user_lookup.side_effect = [[{"screen_name": "gwtweets"}], [{"id_str": "9710852"}]] # Return mock_twarc when instantiating a twarc. twarc_class.side_effect = [mock_twarc] self.harvester.message = message self.harvester.state_store.set_state("twitter_harvester", "timeline.28101965.since_id", 605726286741434400) self.harvester.harvest_seeds() twarc_class.assert_called_once_with(tests.TWITTER_CONSUMER_KEY, tests.TWITTER_CONSUMER_SECRET, tests.TWITTER_ACCESS_TOKEN, tests.TWITTER_ACCESS_TOKEN_SECRET, http_errors=5, connection_errors=5) self.assertEqual( [call(user_id="28101965", since_id=605726286741434400), call(user_id="9710852", since_id=None)], mock_twarc.timeline.mock_calls) self.assertDictEqual({"tweets": 1}, self.harvester.result.harvest_counter) @patch("twitter_harvester.Twarc", autospec=True) def test_user_timeline_with_missing_users(self, mock_twarc_class): mock_twarc = MagicMock(spec=Twarc) # Expecting 2 calls to user_lookup, both which return nothing mock_twarc.user_lookup.side_effect = [[], []] # Return mock_twarc when instantiating a twarc. 
mock_twarc_class.side_effect = [mock_twarc] message = copy.deepcopy(base_timeline_message) message["seeds"] = [ { "id": "seed_id1", "token": "missing1" }, { "id": "seed_id2", "token": "missing2" } ] self.harvester.message = message self.harvester.harvest_seeds() mock_twarc_class.assert_called_once_with(tests.TWITTER_CONSUMER_KEY, tests.TWITTER_CONSUMER_SECRET, tests.TWITTER_ACCESS_TOKEN, tests.TWITTER_ACCESS_TOKEN_SECRET, http_errors=5, connection_errors=5) self.assertEqual([call(screen_names=("missing1",)), call(screen_names=("missing2",))], mock_twarc.user_lookup.mock_calls) self.assertEqual(2, len(self.harvester.result.warnings)) self.assertEqual(CODE_TOKEN_NOT_FOUND, self.harvester.result.warnings[0].code) @patch("twitter_harvester.Twarc", autospec=True) def test_user_timeline_with_private_timeline(self, mock_twarc_class): mock_twarc = MagicMock(spec=Twarc) mock_response = MagicMock() mock_response.status_code = 401 # Expecting 2 user timelines. First returns 2 tweets. Second returns a 404. mock_twarc.timeline.side_effect = [(tweet1, tweet2), HTTPError(response=mock_response)] # Expecting 2 calls to user_lookup mock_twarc.user_lookup.side_effect = [[{"screen_name": "gwtweets"}], [{"id_str": "9710852"}]] # Return mock_twarc when instantiating a twarc. 
mock_twarc_class.side_effect = [mock_twarc] self.harvester.message = base_timeline_message self.harvester.harvest_seeds() mock_twarc_class.assert_called_once_with(tests.TWITTER_CONSUMER_KEY, tests.TWITTER_CONSUMER_SECRET, tests.TWITTER_ACCESS_TOKEN, tests.TWITTER_ACCESS_TOKEN_SECRET, http_errors=5, connection_errors=5) self.assertEqual([call(user_id="28101965", since_id=None), call(user_id="9710852", since_id=None)], mock_twarc.timeline.mock_calls) self.assertEqual(1, len(self.harvester.result.warnings)) self.assertEqual(CODE_TOKEN_UNAUTHORIZED, self.harvester.result.warnings[0].code) self.assertDictEqual({"tweets": 2}, self.harvester.result.harvest_counter) def test_lookup_screen_name(self): mock_twarc = MagicMock(spec=Twarc) mock_twarc.user_lookup.side_effect = [[{"screen_name": "justin_littman"}]] self.harvester.twarc = mock_twarc self.assertEqual("justin_littman", self.harvester._lookup_screen_name("481186914")) mock_twarc.user_lookup.assert_called_once_with(user_ids=("481186914",)) def test_lookup_missing_screen_name(self): mock_twarc = MagicMock(spec=Twarc) mock_twarc.user_lookup.side_effect = [[]] self.harvester.twarc = mock_twarc self.assertIsNone(self.harvester._lookup_screen_name("481186914")) mock_twarc.user_lookup.assert_called_once_with(user_ids=("481186914",)) def test_lookup_user_id(self): mock_twarc = MagicMock(spec=Twarc) mock_twarc.user_lookup.side_effect = [[{"id_str": "481186914"}]] self.harvester.twarc = mock_twarc self.assertEqual("481186914", self.harvester._lookup_user_id("justin_littman")) mock_twarc.user_lookup.assert_called_once_with(screen_names=("justin_littman",)) def test_lookup_missing_user_id(self): mock_twarc = MagicMock(spec=Twarc) mock_twarc.user_lookup.side_effect = [[]] self.harvester.twarc = mock_twarc self.assertIsNone(self.harvester._lookup_user_id("justin_littman")) mock_twarc.user_lookup.assert_called_once_with(screen_names=("justin_littman",)) @staticmethod def _iter_items(items): # This is useful for mocking out a warc 
iter iter_items = [] for item in items: iter_items.append(IterItem(None, None, None, None, item)) return iter_items def test_harvest_options_web(self): self.harvester.extract_media = False self.harvester.extract_web_resources = True self.harvester.extract_user_profile_images = False # This would normally be passed a warc iter. self.harvester._process_tweets(self._iter_items([tweet2, tweet3, tweet4, tweet5])) self.assertSetEqual({'http://bit.ly/1ipwd0B', # url 'http://nlp.stanford.edu/IR-book/html/htmledition/the-url-frontier-1.html', # from retweet 'http://bit.ly/1NoNeBF' # from base tweet of quoted status }, self.harvester.result.urls_as_set()) def test_harvest_options_media(self): self.harvester.extract_media = True self.harvester.extract_web_resources = False self.harvester.extract_user_profile_images = False self.harvester._process_tweets(self._iter_items([tweet2, tweet3, tweet4, tweet5])) self.assertSetEqual({ 'http://pbs.twimg.com/tweet_video_thumb/Chn_42fWwAASuva.jpg', # media/extended entity 'http://pbs.twimg.com/media/Bv4ekbqIYAAcmXY.jpg', # from quoted status }, self.harvester.result.urls_as_set()) def test_harvest_options_user__images(self): self.harvester.extract_media = False self.harvester.extract_web_resources = False self.harvester.extract_user_profile_images = True self.harvester._process_tweets(self._iter_items([tweet2])) self.assertSetEqual({ 'http://pbs.twimg.com/profile_images/496478011533713408/GjecBUNj_normal.jpeg', 'http://abs.twimg.com/images/themes/theme1/bg.png' }, self.harvester.result.urls_as_set()) def test_default_harvest_options(self): self.harvester.extract_media = False self.harvester.extract_web_resources = False self.harvester._process_tweets(self._iter_items([tweet2, tweet3, tweet4, tweet5])) self.assertSetEqual(set(), self.harvester.result.urls_as_set()) @patch("twitter_harvester.TwitterRestWarcIter", autospec=True) def test_process_search(self, iter_class): mock_iter = MagicMock(spec=TwitterRestWarcIter) 
mock_iter.__iter__.side_effect = [self._iter_items([tweet2]).__iter__()] # Return mock_iter when instantiating a TwitterRestWarcIter. iter_class.side_effect = [mock_iter] self.harvester.message = base_search_message self.harvester.process_warc("test.warc.gz") self.assertDictEqual({"tweets": 1}, self.harvester.result.stats_summary()) self.assertEqual(0, len(self.harvester.result.urls_as_set())) iter_class.assert_called_once_with("test.warc.gz") # State updated self.assertEqual(None, self.harvester.state_store.get_state("twitter_harvester", "gelman.since_id")) @patch("twitter_harvester.TwitterRestWarcIter", autospec=True) def test_process_search_incremental(self, iter_class): message = copy.deepcopy(base_search_message) message["options"]["incremental"] = True self.harvester.extract_media = False self.harvester.extract_web_resources = True mock_iter = MagicMock(spec=TwitterRestWarcIter) mock_iter.__iter__.side_effect = [self._iter_items([tweet2]).__iter__()] # Return mock_iter when instantiating a TwitterRestWarcIter. iter_class.side_effect = [mock_iter] self.harvester.state_store.set_state("twitter_harvester", "gelman.since_id", 605726286741434400) self.harvester.message = message self.harvester.process_warc("test.warc.gz") self.assertDictEqual({"tweets": 1}, self.harvester.result.stats_summary()) self.assertSetEqual({"http://bit.ly/1ipwd0B"}, self.harvester.result.urls_as_set()) iter_class.assert_called_once_with("test.warc.gz") # State updated self.assertEqual(660065173563158500, self.harvester.state_store.get_state("twitter_harvester", "gelman.since_id")) @patch("twitter_harvester.TwitterRestWarcIter", autospec=True) def test_process_user_timeline(self, iter_class): mock_iter = MagicMock(spec=TwitterRestWarcIter) mock_iter.__iter__.side_effect = [self._iter_items([tweet1, tweet2]).__iter__()] # Return mock_iter when instantiating a TwitterRestWarcIter. 
iter_class.side_effect = [mock_iter] self.harvester.extract_media = False self.harvester.extract_web_resources = True self.harvester.message = base_timeline_message self.harvester.process_warc("test.warc.gz") self.assertDictEqual({"tweets": 2}, self.harvester.result.stats_summary()) self.assertSetEqual({"http://bit.ly/1ipwd0B"}, self.harvester.result.urls_as_set()) iter_class.assert_called_once_with("test.warc.gz") # # Nothing added to state self.assertEqual(0, len(self.harvester.state_store.state)) @patch("twitter_harvester.TwitterRestWarcIter", autospec=True) def test_process_incremental_user_timeline(self, iter_class): message = copy.deepcopy(base_timeline_message) message["options"]["incremental"] = True mock_iter = MagicMock(spec=TwitterRestWarcIter) mock_iter.__iter__.side_effect = [self._iter_items([tweet2]).__iter__()] # Return mock_iter when instantiating a TwitterRestWarcIter. iter_class.side_effect = [mock_iter] self.harvester.extract_media = False self.harvester.extract_web_resources = True self.harvester.message = message self.harvester.state_store.set_state("twitter_harvester", "timeline.481186914.since_id", 605726286741434400) self.harvester.process_warc("test.warc.gz") self.assertDictEqual({"tweets": 1}, self.harvester.result.stats_summary()) self.assertSetEqual({"http://bit.ly/1ipwd0B"}, self.harvester.result.urls_as_set()) iter_class.assert_called_once_with("test.warc.gz") # State updated self.assertEqual(660065173563158500, self.harvester.state_store.get_state("twitter_harvester", "timeline.481186914.since_id"))