class TestWeiboHarvesterVCR(tests.TestCase):
    """Run ``WeiboHarvester.harvest_seeds`` for a timeline message under VCR.

    Every test is wrapped in ``vcr.use_cassette`` with the ``access_token``
    query parameter filtered.
    """

    def setUp(self):
        """Create a temp working directory and a harvester wired for testing."""
        self.working_path = tempfile.mkdtemp()

        harvester = WeiboHarvester(self.working_path)
        harvester.state_store = DictHarvestStateStore()
        harvester.result = HarvestResult()
        harvester.stop_harvest_seeds_event = threading.Event()
        # A fresh dict per test, so tests may mutate it freely.
        harvester.message = {
            "id": "test:2",
            "type": "weibo_timeline",
            "path": "/collections/test_collection_set",
            "credentials": {"access_token": tests.WEIBO_ACCESS_TOKEN},
            "collection_set": {"id": "test_collection_set"},
            "collection": {"id": "test_collection"},
            "options": {},
        }
        self.harvester = harvester

    def tearDown(self):
        """Remove the temp working directory if it still exists."""
        if not os.path.exists(self.working_path):
            return
        shutil.rmtree(self.working_path)

    @vcr.use_cassette(filter_query_parameters=["access_token"])
    def test_search_vcr(self):
        """A plain timeline harvest succeeds and counts 181 weibos."""
        self.harvester.harvest_seeds()

        outcome = self.harvester.result
        # Total number of weibos expected for this harvest.
        self.assertEqual(outcome.harvest_counter["weibos"], 181)
        # The harvest as a whole must be reported as successful.
        self.assertTrue(outcome.success)

    @vcr.use_cassette(filter_query_parameters=["access_token"])
    def test_incremental_search_vcr(self):
        """An incremental harvest seeded with a since_id counts 5 weibos."""
        message = self.harvester.message
        message["options"]["incremental"] = True

        # Seed the state store with the since_id, keyed by collection set id.
        state_key = u"{}.since_id".format(message["collection_set"]["id"])
        self.harvester.state_store.set_state(
            "weibo_harvester", state_key, 3935747172100551
        )

        self.harvester.harvest_seeds()

        outcome = self.harvester.result
        self.assertTrue(outcome.success)
        # Number of weibos expected with the since_id above in place.
        self.assertEqual(outcome.harvest_counter["weibos"], 5)
class TestWeiboHarvester(tests.TestCase):
    """Unit tests for ``WeiboHarvester`` with its collaborators mocked out.

    ``weibo_harvester.Weiboarc`` is patched for the ``harvest_seeds`` tests and
    ``weibo_harvester.WeiboWarcIter`` is patched for the ``process_warc`` tests.
    """

    def setUp(self):
        """Create a temp working directory and a harvester wired for testing."""
        self.working_path = tempfile.mkdtemp()

        harvester = WeiboHarvester(self.working_path)
        harvester.state_store = DictHarvestStateStore()
        harvester.result = HarvestResult()
        harvester.stop_harvest_seeds_event = threading.Event()
        harvester.message = {
            "id": "test:1",
            "type": "weibo_timeline",
            "path": "/collections/test_collection_set",
            "credentials": {"access_token": tests.WEIBO_ACCESS_TOKEN},
            "collection_set": {"id": "test_collection_set"},
            "collection": {"id": "test_collection"},
            "options": {"web_resources": True, "image_sizes": ["Large"]},
        }
        self.harvester = harvester

    def tearDown(self):
        """Remove the temp working directory if it still exists."""
        if not os.path.exists(self.working_path):
            return
        shutil.rmtree(self.working_path)

    @staticmethod
    def _stub_weiboarc(weiboarc_class, pages):
        """Make the patched class return a mock whose search yields ``pages``."""
        client = MagicMock(spec=Weiboarc)
        client.search_friendships.side_effect = pages
        # Instantiating the patched class hands back our mock client.
        weiboarc_class.side_effect = [client]
        return client

    def _stub_warc_iter(self, iter_class, weibos):
        """Make the patched iterator class yield ``weibos`` as IterItems."""
        warc_iter = MagicMock(spec=WeiboWarcIter)
        warc_iter.__iter__.side_effect = [iter(self._iter_items(weibos))]
        iter_class.side_effect = [warc_iter]

    def _set_harvest_options(self, web_resources, image_sizes, incremental):
        """Set the three option attributes directly on the harvester."""
        self.harvester.extract_web_resources = web_resources
        self.harvester.extract_images_sizes = image_sizes
        self.harvester.incremental = incremental

    @patch("weibo_harvester.Weiboarc", autospec=True)
    def test_search_timeline(self, mock_weiboarc_class):
        """A non-incremental harvest counts both weibos and passes no since_id."""
        # Two weibos on the first call; the empty tuple is a spare second result.
        client = self._stub_weiboarc(mock_weiboarc_class, [(weibo1, weibo2), ()])

        self.harvester.harvest_seeds()

        self.assertDictEqual({"weibos": 2}, self.harvester.result.harvest_counter)
        mock_weiboarc_class.assert_called_once_with(tests.WEIBO_ACCESS_TOKEN)
        self.assertEqual([call(since_id=None)], client.search_friendships.mock_calls)

    @patch("weibo_harvester.Weiboarc", autospec=True)
    def test_incremental_search(self, mock_weiboarc_class):
        """An incremental harvest passes the stored since_id to the search."""
        # One weibo on the first call; the empty tuple is a spare second result.
        client = self._stub_weiboarc(mock_weiboarc_class, [(weibo2,), ()])

        # Incremental means that only new results are retrieved.
        self.harvester.message["options"] = {"incremental": True}
        state_key = u"{}.since_id".format(self.harvester.message["collection_set"]["id"])
        self.harvester.state_store.set_state("weibo_harvester", state_key, 3927348724716740)

        self.harvester.harvest_seeds()

        self.assertDictEqual({"weibos": 1}, self.harvester.result.harvest_counter)
        mock_weiboarc_class.assert_called_once_with(tests.WEIBO_ACCESS_TOKEN)
        # The stored since_id must be what was sent, never None.
        search_calls = client.search_friendships.mock_calls
        self.assertEqual([call(since_id=3927348724716740)], search_calls)
        self.assertNotEqual([call(since_id=None)], search_calls)

    @staticmethod
    def _iter_items(items):
        """Wrap each item in an ``IterItem``, as a WARC iterator would yield it."""
        return [IterItem(None, None, None, None, item) for item in items]

    @patch("weibo_harvester.WeiboWarcIter", autospec=True)
    def test_process(self, iter_class):
        """With every extraction option off, no URLs are queued and no state is set."""
        self._stub_warc_iter(iter_class, [weibo3, weibo4, weibo5])
        self._set_harvest_options(False, [], False)

        self.harvester.process_warc("test.warc.gz")

        # Nothing is sent on for web harvesting with these options.
        self.assertSetEqual(set(), self.harvester.result.urls_as_set())
        iter_class.assert_called_once_with("test.warc.gz")
        self.assertEqual(3, self.harvester.result.stats_summary()["weibos"])
        # Non-incremental: the since_id state stays unset.
        stored = self.harvester.state_store.get_state(
            "weibo_harvester", "test_collection_set.since_id"
        )
        self.assertIsNone(stored)

    @patch("weibo_harvester.WeiboWarcIter", autospec=True)
    def test_process_incremental(self, iter_class):
        """Incremental processing replaces the stored since_id."""
        self._stub_warc_iter(iter_class, [weibo3, weibo4, weibo5])
        self._set_harvest_options(False, [], True)
        self.harvester.state_store.set_state(
            "weibo_harvester", "test_collection_set.since_id", 3927348724716740
        )

        self.harvester.process_warc("test.warc.gz")

        # Nothing is sent on for web harvesting with these options.
        self.assertSetEqual(set(), self.harvester.result.urls_as_set())
        iter_class.assert_called_once_with("test.warc.gz")
        self.assertEqual(3, self.harvester.result.stats_summary()["weibos"])
        # The since_id state has been replaced with a new value.
        stored = self.harvester.state_store.get_state(
            "weibo_harvester", "test_collection_set.since_id"
        )
        self.assertEqual(3973784090711192, stored)

    @patch("weibo_harvester.WeiboWarcIter", autospec=True)
    def test_process_harvest_options_web(self, iter_class):
        """With web resources enabled, the expected URLs are collected."""
        self._stub_warc_iter(iter_class, [weibo3, weibo4, weibo5])
        self._set_harvest_options(True, [], False)

        self.harvester.process_warc("test.warc.gz")

        expected_urls = {
            "http://t.cn/RqmQ3ko",
            "http://m.weibo.cn/1618051664/3973767505640890",
        }
        self.assertSetEqual(expected_urls, self.harvester.result.urls_as_set())
        iter_class.assert_called_once_with("test.warc.gz")

    @patch("weibo_harvester.WeiboWarcIter", autospec=True)
    def test_process_harvest_options_media(self, iter_class):
        """With all three image sizes requested, each size's URLs are collected."""
        self._stub_warc_iter(iter_class, [weibo3, weibo4, weibo5])
        self._set_harvest_options(False, ["Large", "Medium", "Thumbnail"], False)

        self.harvester.process_warc("test.warc.gz")

        expected_urls = {
            "http://ww2.sinaimg.cn/large/6b23a52bgw1f3pjhhyofnj208p06c3yq.jpg",
            "http://ww4.sinaimg.cn/large/60718250jw1f3qtzyhai3j20de0vin32.jpg",
            "http://ww2.sinaimg.cn/bmiddle/6b23a52bgw1f3pjhhyofnj208p06c3yq.jpg",
            "http://ww4.sinaimg.cn/bmiddle/60718250jw1f3qtzyhai3j20de0vin32.jpg",
            "http://ww2.sinaimg.cn/thumbnail/6b23a52bgw1f3pjhhyofnj208p06c3yq.jpg",
            "http://ww4.sinaimg.cn/thumbnail/60718250jw1f3qtzyhai3j20de0vin32.jpg",
        }
        self.assertSetEqual(expected_urls, self.harvester.result.urls_as_set())
        iter_class.assert_called_once_with("test.warc.gz")
class TestWeiboHarvesterVCR(tests.TestCase):
    """Run ``WeiboHarvester.harvest_seeds`` for timeline and search messages under VCR.

    Every test is wrapped in ``vcr.use_cassette`` with the ``access_token``
    query parameter filtered. Each test works on its own deep copy of the
    module-level message templates, so a mutation made during one test cannot
    leak into another.
    """

    def setUp(self):
        """Create a temp working directory and a harvester wired for testing."""
        self.working_path = tempfile.mkdtemp()
        self.harvester = WeiboHarvester(self.working_path)
        self.harvester.state_store = DictHarvestStateStore()
        self.harvester.result = HarvestResult()
        self.harvester.stop_harvest_seeds_event = threading.Event()
        self._use_message(base_timeline_message)

    def tearDown(self):
        """Remove the temp working directory if it still exists."""
        if os.path.exists(self.working_path):
            shutil.rmtree(self.working_path)

    def _use_message(self, template, incremental=False):
        """Give the harvester a private deep copy of ``template``.

        When ``incremental`` is true the copy's ``options`` get
        ``incremental = True``. Returns the copy that was installed.
        """
        message = copy.deepcopy(template)
        if incremental:
            message["options"]["incremental"] = True
        self.harvester.message = message
        return message

    @vcr.use_cassette(filter_query_parameters=['access_token'])
    def test_timeline_vcr(self):
        """A plain timeline harvest succeeds and counts 181 weibos."""
        self._use_message(base_timeline_message)
        self.harvester.harvest_seeds()
        # Total number of weibos expected for this harvest.
        self.assertEqual(self.harvester.result.harvest_counter["weibos"], 181)
        # The harvest as a whole must be reported as successful.
        self.assertTrue(self.harvester.result.success)

    @vcr.use_cassette(filter_query_parameters=['access_token'])
    def test_incremental_timeline_vcr(self):
        """An incremental timeline harvest seeded with a since_id counts 5 weibos."""
        message = self._use_message(base_timeline_message, incremental=True)
        collection_set_id = message["collection_set"]["id"]
        self.harvester.state_store.set_state(
            "weibo_harvester", u"{}.since_id".format(collection_set_id),
            3935747172100551)
        self.harvester.harvest_seeds()
        self.assertTrue(self.harvester.result.success)
        # Number of weibos expected with the since_id above in place.
        self.assertEqual(self.harvester.result.harvest_counter["weibos"], 5)

    @vcr.use_cassette(filter_query_parameters=['access_token'])
    def test_search_topic_vcr(self):
        """A topic search harvest succeeds and counts 200 weibos."""
        self._use_message(base_search_message)
        self.harvester.harvest_seeds()
        # One search is expected to return 200 weibos.
        self.assertEqual(self.harvester.result.harvest_counter["weibos"], 200)
        self.assertTrue(self.harvester.result.success)

    @vcr.use_cassette(filter_query_parameters=['access_token'])
    def test_search_topic_empty_vcr(self):
        """A topic search with no results still succeeds and counts 0 weibos."""
        self._use_message(base_search_message)
        self.harvester.harvest_seeds()
        # The search returns an empty list, so nothing is counted.
        self.assertEqual(self.harvester.result.harvest_counter["weibos"], 0)
        self.assertTrue(self.harvester.result.success)

    @vcr.use_cassette(filter_query_parameters=['access_token'])
    def test_incremental_search_topic_vcr(self):
        """An incremental topic search seeded with a since_id counts 6 weibos."""
        message = self._use_message(base_search_message, incremental=True)
        # For searches the since_id state is keyed by the seed token.
        query = message["seeds"][0]["token"]
        self.harvester.state_store.set_state(
            "weibo_harvester", u"{}.since_id".format(query), 4061065610091375)
        self.harvester.harvest_seeds()
        self.assertTrue(self.harvester.result.success)
        # Number of weibos expected with the since_id above in place.
        self.assertEqual(self.harvester.result.harvest_counter["weibos"], 6)
class TestWeiboHarvester(tests.TestCase):
    """Unit tests for ``WeiboHarvester`` with its collaborators mocked out.

    ``weibo_harvester.Weiboarc`` is patched for the ``harvest_seeds`` tests and
    ``weibo_harvester.WeiboWarcIter`` is patched for the ``process_warc`` tests.
    Each test works on its own deep copy of the module-level message
    templates, so a mutation made during one test cannot leak into another.
    """

    def setUp(self):
        """Create a temp working directory and a harvester wired for testing."""
        self.working_path = tempfile.mkdtemp()
        self.harvester = WeiboHarvester(self.working_path)
        self.harvester.state_store = DictHarvestStateStore()
        self.harvester.result = HarvestResult()
        self.harvester.stop_harvest_seeds_event = threading.Event()
        self.harvester.message = copy.deepcopy(base_timeline_message)

    def tearDown(self):
        """Remove the temp working directory if it still exists."""
        if os.path.exists(self.working_path):
            shutil.rmtree(self.working_path)

    @patch("weibo_harvester.Weiboarc", autospec=True)
    def test_search_timeline(self, mock_weiboarc_class):
        """A non-incremental timeline harvest counts both weibos, with no since_id."""
        mock_weiboarc = MagicMock(spec=Weiboarc)
        # Two weibos on the first call; the empty tuple is a spare second result.
        mock_weiboarc.search_friendships.side_effect = [(weibo1, weibo2), ()]
        # Return mock_weiboarc when instantiating a Weiboarc.
        mock_weiboarc_class.side_effect = [mock_weiboarc]

        self.harvester.message = copy.deepcopy(base_timeline_message)
        self.harvester.harvest_seeds()

        self.assertDictEqual({"weibos": 2}, self.harvester.result.harvest_counter)
        mock_weiboarc_class.assert_called_once_with(tests.WEIBO_ACCESS_TOKEN)
        self.assertEqual([call(since_id=None)],
                         mock_weiboarc.search_friendships.mock_calls)

    @patch("weibo_harvester.Weiboarc", autospec=True)
    def test_incremental_search_timeline(self, mock_weiboarc_class):
        """An incremental timeline harvest passes the stored since_id to the search."""
        mock_weiboarc = MagicMock(spec=Weiboarc)
        # One weibo on the first call; the empty tuple is a spare second result.
        mock_weiboarc.search_friendships.side_effect = [(weibo2, ), ()]
        # Return mock_weiboarc when instantiating a Weiboarc.
        mock_weiboarc_class.side_effect = [mock_weiboarc]

        message = copy.deepcopy(base_timeline_message)
        message["options"]["incremental"] = True
        self.harvester.message = message
        collection_set_id = self.harvester.message["collection_set"]["id"]
        self.harvester.state_store.set_state(
            "weibo_harvester", u"{}.since_id".format(collection_set_id),
            3927348724716740)
        self.harvester.harvest_seeds()

        self.assertDictEqual({"weibos": 1}, self.harvester.result.harvest_counter)
        mock_weiboarc_class.assert_called_once_with(tests.WEIBO_ACCESS_TOKEN)
        # The stored since_id must be what was sent, never None.
        self.assertEqual([call(since_id=3927348724716740)],
                         mock_weiboarc.search_friendships.mock_calls)
        self.assertNotEqual([call(since_id=None)],
                            mock_weiboarc.search_friendships.mock_calls)

    @patch("weibo_harvester.Weiboarc", autospec=True)
    def test_search_topic(self, mock_weiboarc_class):
        """A non-incremental topic search passes the seed token and no since_id."""
        mock_weiboarc = MagicMock(spec=Weiboarc)
        # Two weibos on the first call; the empty tuple is a spare second result.
        mock_weiboarc.search_topic.side_effect = [(weibo6, weibo7), ()]
        # Return mock_weiboarc when instantiating a Weiboarc.
        mock_weiboarc_class.side_effect = [mock_weiboarc]

        self.harvester.message = copy.deepcopy(base_search_message)
        self.harvester.harvest_seeds()

        query = self.harvester.message["seeds"][0]["token"]
        self.assertDictEqual({"weibos": 2}, self.harvester.result.harvest_counter)
        mock_weiboarc_class.assert_called_once_with(tests.WEIBO_ACCESS_TOKEN)
        self.assertEqual([call(query, since_id=None)],
                         mock_weiboarc.search_topic.mock_calls)

    @patch("weibo_harvester.Weiboarc", autospec=True)
    def test_incremental_search_topic(self, mock_weiboarc_class):
        """An incremental topic search passes the since_id stored for the token."""
        mock_weiboarc = MagicMock(spec=Weiboarc)
        # One weibo on the first call; the empty tuple is a spare second result.
        mock_weiboarc.search_topic.side_effect = [(weibo7, ), ()]
        # Return mock_weiboarc when instantiating a Weiboarc.
        mock_weiboarc_class.side_effect = [mock_weiboarc]

        message = copy.deepcopy(base_search_message)
        message["options"]["incremental"] = True
        self.harvester.message = message
        # For searches the since_id state is keyed by the seed token.
        query = self.harvester.message["seeds"][0]["token"]
        self.harvester.state_store.set_state(
            "weibo_harvester", u"{}.since_id".format(query), 4060927646547531)
        self.harvester.harvest_seeds()

        mock_weiboarc_class.assert_called_once_with(tests.WEIBO_ACCESS_TOKEN)
        self.assertEqual([call(query, since_id=4060927646547531)],
                         mock_weiboarc.search_topic.mock_calls)
        self.assertDictEqual({"weibos": 1}, self.harvester.result.harvest_counter)

    @staticmethod
    def _iter_items(items):
        """Wrap each item in an ``IterItem``, as a WARC iterator would yield it."""
        return [IterItem(None, None, None, None, item) for item in items]

    @patch("weibo_harvester.WeiboWarcIter", autospec=True)
    def test_process_timeline(self, iter_class):
        """Non-incremental timeline processing counts weibos and sets no state."""
        mock_iter = MagicMock(spec=WeiboWarcIter)
        mock_iter.__iter__.side_effect = [
            iter(self._iter_items([weibo3, weibo4, weibo5]))
        ]
        iter_class.side_effect = [mock_iter]

        # Process without incremental state tracking.
        self.harvester.incremental = False
        self.harvester.message = copy.deepcopy(base_timeline_message)
        self.harvester.process_warc("test.warc.gz")

        iter_class.assert_called_once_with("test.warc.gz")
        self.assertEqual(3, self.harvester.result.stats_summary()["weibos"])
        # State not set.
        self.assertIsNone(
            self.harvester.state_store.get_state(
                "weibo_harvester", "test_collection_set.since_id"))

    @patch("weibo_harvester.WeiboWarcIter", autospec=True)
    def test_process_timeline_incremental(self, iter_class):
        """Incremental timeline processing replaces the stored since_id."""
        mock_iter = MagicMock(spec=WeiboWarcIter)
        mock_iter.__iter__.side_effect = [
            iter(self._iter_items([weibo3, weibo4, weibo5]))
        ]
        iter_class.side_effect = [mock_iter]

        # Process with incremental state tracking, starting from a stored since_id.
        self.harvester.incremental = True
        self.harvester.state_store.set_state(
            "weibo_harvester", "test_collection_set.since_id", 3927348724716740)
        self.harvester.message = copy.deepcopy(base_timeline_message)
        self.harvester.process_warc("test.warc.gz")

        iter_class.assert_called_once_with("test.warc.gz")
        self.assertEqual(3, self.harvester.result.stats_summary()["weibos"])
        # State updated.
        self.assertEqual(
            3973784090711192,
            self.harvester.state_store.get_state(
                "weibo_harvester", "test_collection_set.since_id"))

    @patch("weibo_harvester.WeiboWarcIter", autospec=True)
    def test_process_search_topic(self, iter_class):
        """Non-incremental search processing counts weibos and sets no state."""
        mock_iter = MagicMock(spec=WeiboWarcIter)
        mock_iter.__iter__.side_effect = [
            iter(self._iter_items([weibo6, weibo7]))
        ]
        iter_class.side_effect = [mock_iter]

        self.harvester.message = copy.deepcopy(base_search_message)
        self.harvester.process_warc("test.warc.gz")

        self.assertDictEqual({"weibos": 2}, self.harvester.result.stats_summary())
        iter_class.assert_called_once_with("test.warc.gz")
        # State not set: nothing is stored under the seed token.
        query = self.harvester.message["seeds"][0]["token"]
        self.assertIsNone(
            self.harvester.state_store.get_state(
                "weibo_harvester", u"{}.since_id".format(query)))

    @patch("weibo_harvester.WeiboWarcIter", autospec=True)
    def test_process_search_topic_incremental(self, iter_class):
        """Incremental search processing replaces the since_id stored for the token."""
        mock_iter = MagicMock(spec=WeiboWarcIter)
        mock_iter.__iter__.side_effect = [
            iter(self._iter_items([weibo6, weibo7]))
        ]
        iter_class.side_effect = [mock_iter]

        self.harvester.message = copy.deepcopy(base_search_message)
        self.harvester.incremental = True
        # For searches the since_id state is keyed by the seed token.
        query = self.harvester.message["seeds"][0]["token"]
        self.harvester.state_store.set_state(
            "weibo_harvester", u"{}.since_id".format(query), 4060927646547530)
        self.harvester.process_warc("test.warc.gz")

        self.assertDictEqual({"weibos": 2}, self.harvester.result.stats_summary())
        iter_class.assert_called_once_with("test.warc.gz")
        # State updated.
        self.assertEqual(
            4060928330955796,
            self.harvester.state_store.get_state(
                "weibo_harvester", u"{}.since_id".format(query)))