def test_add_priority_urls(self, mc):
    ''' Test adding some priority urls '''
    mc.return_value = mongomock.MongoClient()
    qm = QueueManager("queues-names", START_DELAY, CONFIGURATION)
    urls = [
        "www.daniele.com",
        "www.daniele1.com",
        "www.daniele2.com",
        "www.daniele3.com",
    ]
    qm.init_priority_list(urls)
    count = 0
    while True:
        doc = qm.pop()
        if not doc.url:
            break
        self.assertEqual(doc.depth, 0)
        self.assertEqual(doc.source, Source.priority)
        count += 1
    self.assertEqual(count, len(urls))
def test_pop_ordering(self, mc):
    ''' Test adding urls to the priority, normal and refetch queues and
    checking that the pop ordering is correct '''
    mc.return_value = mongomock.MongoClient()
    qm = QueueManager("queues-names", START_DELAY, CONFIGURATION)
    # inserting a priority url.
    urls = [
        "www.daniele.com",
    ]
    qm.init_priority_list(urls)
    # inserting a normal url.
    burls = [
        {"url": "www.daniele1.com", "depth": "2"},
    ]
    qm.add_bootstrap_urls(burls)
    # inserting a refetch url
    dm = DocumentMetadata("http://www.randomurl8.it")
    dm.depth = 1
    dm.dhash = 121212
    dm.source = Source.normal
    dm.delay = 500
    dm.alternatives = ["http://www.randomurl8.it"]
    qm.add_seen_and_reschedule(dm)
    # make sure all the inserted urls are ready to be popped
    with mock.patch("time.time", mock_time):
        doc = qm.pop()  # first one from priority
        self.assertEqual(doc.depth, 0)
        self.assertEqual(doc.source, Source.priority)
        doc = qm.pop()  # second one from normal
        self.assertEqual(doc.source, Source.normal)
        doc = qm.pop()  # third one from refetching
        self.assertEqual(doc.source, Source.refetch)
def test_add_bootstrap(self, mc):
    ''' Test adding some bootstrap urls '''
    mc.return_value = mongomock.MongoClient()
    qm = QueueManager("queues-names", START_DELAY, CONFIGURATION)
    burls = [
        {"url": "www.daniele.com", "depth": "3"},
        {"url": "www.daniele1.com", "depth": "2"},
        {"url": "www.daniele2.com", "depth": "1"},
        {"url": "www.daniele3.com", "depth": "3"},
        {"url": "www.daniele4.com", "depth": "5"},
        {"url": "www.daniele5.com", "depth": "1"},
    ]
    max_initial_depth = 5
    qm.add_bootstrap_urls(burls)
    # extracting from the normal list should return urls ordered by depth
    count = 0
    max_depth = 0
    while True:
        doc = qm.pop()
        if not doc.url:
            break
        self.assertTrue(doc.depth >= max_depth)
        max_depth = doc.depth
        count += 1
    self.assertEqual(count, len(burls))
    # when adding bootstrap urls, 2 is added to the depth (hardcoded for now)
    self.assertEqual(max_depth, max_initial_depth + 2)
class Crawler():
    """ Given some configuration, retrieve urls and their links.

    A crawler uses the configuration of a spider together with the global
    configuration. Based on that configuration it downloads urls and stores
    the content in the appropriate collection.
    """

    def __init__(self, spider, cfg):
        self.logger = logging.getLogger(spider.name)
        self.sleep = threading.Event()
        self.spider = spider
        redis_config = cfg["redis"]
        mongodb_config = cfg["mongodb"]
        self.queue = QueueManager(spider.name, spider.restart_delay, cfg)
        # setup output method
        output = cfg.get('output')
        if output:
            if output.get("type") == "json":
                self.documentStore = ds.JsonStore(output.get("filename"))
            elif output.get("type") == "redis":
                self.documentStore = ds.RedisStore(spider.name,
                                                   redis_config['host'],
                                                   redis_config['port'],
                                                   redis_config['db'])
            elif output.get("type") == "mongodb":
                self.documentStore = ds.MongoDBStore(spider.name,
                                                     mongodb_config['host'],
                                                     mongodb_config['port'],
                                                     mongodb_config['db'])
        else:
            self.documentStore = ds.StandardStore()

    def start(self):
        """ Start the crawling phase.

        The job continues until a sigterm is caught.
        """
        # starting urls end up in the priority queue
        self.queue.init_priority_list(self.spider.start_urls)
        self.queue.add_bootstrap_urls(self.spider.urllist)
        while not GracefulKiller.kill_now:
            dmeta = self.queue.pop()
            dmeta.spider = self.spider.name
            if dmeta.url:
                # in case the spider filters changed it is better to recheck
                nurl, toremove = self.spider.check_and_normalize(dmeta.url)
                if toremove:
                    self.queue.remove_seen(dmeta.url)
                    # INFO: in case of normalization we still want to fetch
                    # the url, but we want to discard the other cases.
                    if nurl == dmeta.url:
                        continue
                dmeta.url = nurl
                dmeta.alternatives = [nurl]
                dmeta = fetcher.fetch(self.spider.headers, dmeta)
                if dmeta.status == fetcher.Status.ConnectionError:
                    # CHECK: check if this is still correct
                    self.queue.add_seen_and_reschedule(dmeta)
                elif dmeta.response:
                    r_url = self.spider.normalize_url(dmeta.response.url)
                    dmeta.alternatives.append(r_url)
                    document, dmeta = self.spider.parse(dmeta)
                    self.queue.add_normal_urls(dmeta)
                    # INFO: in case of status != 200 previous data will not
                    # be overwritten
                    self.documentStore.store(document)
                    self.queue.add_seen_and_reschedule(dmeta)
            self.sleep.wait(self.spider.delay)
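# The snippet below is a minimal usage sketch for the Crawler class above,
# not part of the project itself. The spider class, its module path and the
# exact configuration layout are assumptions inferred from the attributes
# accessed in __init__ and start() (cfg["redis"], cfg["mongodb"],
# cfg.get("output"), spider.name, spider.start_urls, ...).
if __name__ == "__main__":
    import logging

    from spiders.news import NewsSpider  # hypothetical spider implementation

    logging.basicConfig(level=logging.INFO)
    cfg = {
        "redis": {"host": "localhost", "port": 6379, "db": 0},
        "mongodb": {"host": "localhost", "port": 27017, "db": "crawler"},
        # "type" may be "json", "redis" or "mongodb"; omitting the "output"
        # section falls back to the StandardStore
        "output": {"type": "json", "filename": "documents.json"},
    }
    crawler = Crawler(NewsSpider(), cfg)
    crawler.start()  # loops until GracefulKiller catches a SIGTERM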
class TestNewsQueueRescheduling(BaseTestClass):

    @mock.patch('redis.StrictRedis', mock_strict_redis_client)
    @mock.patch('pymongo.MongoClient')
    def setUp(self, mc):
        super(TestNewsQueueRescheduling, self).setUp()
        mc.return_value = mongomock.MongoClient()
        self.qm = QueueManager("queues-names", START_DELAY, CONFIGURATION_NEWS)
        ############################
        # insert some urls into seen
        dm_seen_1 = DocumentMetadata("http://www.randomurl1.it")
        # these two are considered the same page with different urls
        dm_seen_1.alternatives = [
            "http://www.randomurl1.it",
            "http://www.randomurl3.it",
        ]
        dm_seen_1.dhash = 12345
        dm_seen_2 = DocumentMetadata("http://www.randomurl4.it")
        dm_seen_2.alternatives = [
            "http://www.randomurl4.it",
        ]
        dm_seen_2.dhash = 98765
        already_seen_urls = set(dm_seen_1.alternatives).union(
            dm_seen_2.alternatives)
        self.qm.seen.add(dm_seen_1)
        self.qm.seen.add(dm_seen_2)
        self.qm.seen.incr_n(
            dm_seen_1.url)  # increase the counter to check it later
        counter = self.qm.seen.get(dm_seen_1.url).get("count")
        self.assertEqual(counter, 2)

    @freeze_time(CURRENT_TIME)
    def test_reschedule_samecontent(self):
        ############################################
        # testing rescheduling some urls
        # this one is in seen with the same hash and
        # not taken from the priority queue
        # I expect: the delay is doubled and the seen counter set to 1
        dm = DocumentMetadata("http://www.randomurl1.it")
        dm.depth = 1
        dm.dhash = 12345
        dm.source = Source.normal
        dm.delay = 10
        # alternatives always contains at least one url.
        dm.alternatives = ["http://www.randomurl1.it"]
        # we want to check that former alternatives are also correctly
        # updated even if the new alternatives field is different.
        alternatives = self.qm.seen.get(dm.url).get("alternatives")
        self.assertNotEqual(len(dm.alternatives), len(alternatives))
        self.qm.add_seen_and_reschedule(dm)
        # check all the parameters
        counter = self.qm.seen.get(dm.url).get("count")
        dhash = self.qm.seen.get(dm.url).get("page_hash")
        self.assertEqual(counter, 1)
        # check that all the alternatives were updated
        for urls in alternatives:
            counter = self.qm.seen.get(urls).get("count")
            dhash = self.qm.seen.get(dm.url).get("page_hash")
            self.assertEqual(counter, 1)
            self.assertEqual(dhash, dm.dhash)
        with mock.patch("time.time", mock_time):
            refetching_data = self.qm.pop()
        self.assertEqual(refetching_data.delay, dm.delay * 2)
        self.assertEqual(refetching_data.source, Source.refetch)

    @freeze_time(CURRENT_TIME)
    def test_reschedule_samecontent_lastdelay(self):
        ############################################
        # testing rescheduling some urls
        # this one is in seen with the same hash and
        # not taken from the priority queue
        # I expect: the seen counter set to 1, but since the delay has
        # already reached its limit the url is not rescheduled
        # (pop returns an empty document)
        dm = DocumentMetadata("http://www.randomurl1.it")
        dm.depth = 1
        dm.dhash = 12345
        dm.source = Source.normal
        dm.delay = 40
        # alternatives always contains at least one url.
        dm.alternatives = ["http://www.randomurl1.it"]
        # we want to check that former alternatives are also correctly
        # updated even if the new alternatives field is different.
        alternatives = self.qm.seen.get(dm.url).get("alternatives")
        self.assertNotEqual(len(dm.alternatives), len(alternatives))
        self.qm.add_seen_and_reschedule(dm)
        # check all the parameters
        counter = self.qm.seen.get(dm.url).get("count")
        dhash = self.qm.seen.get(dm.url).get("page_hash")
        self.assertEqual(counter, 1)
        # check that all the alternatives were updated
        for urls in alternatives:
            counter = self.qm.seen.get(urls).get("count")
            dhash = self.qm.seen.get(dm.url).get("page_hash")
            self.assertEqual(counter, 1)
            self.assertEqual(dhash, dm.dhash)
        with mock.patch("time.time", mock_time):
            refetching_data = self.qm.pop()
        self.assertEqual(refetching_data.url, "")
        self.assertEqual(refetching_data.source, Source.unknown)

    @freeze_time(CURRENT_TIME)
    def test_reschedule_different_content(self):
        ############################################
        # this one is in seen with a different hash and
        # not taken from the priority queue
        # I expect: the delay is halved and the seen counter set to 1
        dm = DocumentMetadata("http://www.randomurl1.it")
        dm.depth = 1
        dm.dhash = 1936
        dm.source = Source.normal
        dm.delay = 20
        dm.alternatives = ["http://www.randomurl1.it"]
        self.qm.add_seen_and_reschedule(dm)
        # checking all the parameters
        counter = self.qm.seen.get(dm.url).get("count")
        dhash = self.qm.seen.get(dm.url).get("page_hash")
        self.assertEqual(counter, 1)
        self.assertEqual(dhash, dm.dhash)
        with mock.patch("time.time", mock_time):
            refetching_data = self.qm.pop()
        self.assertEqual(refetching_data.delay, dm.delay / 2)
        self.assertEqual(refetching_data.source, Source.refetch)

    @freeze_time(CURRENT_TIME)
    def test_reschedule_newurl(self):
        ############################################
        # inserting a new url
        dm = DocumentMetadata("http://www.randomurl8.it")
        dm.depth = 1
        dm.dhash = 121212
        dm.source = Source.normal
        dm.alternatives = ["http://www.randomurl8.it"]
        self.qm.add_seen_and_reschedule(dm)
        # checking all the parameters
        counter = self.qm.seen.get(dm.url).get("count")
        dhash = self.qm.seen.get(dm.url).get("page_hash")
        self.assertEqual(counter, 1)
        self.assertEqual(dhash, dm.dhash)
        with mock.patch("time.time", mock_time):
            refetching_data = self.qm.pop()
        self.assertEqual(refetching_data.delay, TWO_HOURS)
        self.assertEqual(refetching_data.source, Source.refetch)

    @freeze_time(CURRENT_TIME)
    def test_reschedule_priority(self):
        ############################################
        # inserting a priority url
        dm = DocumentMetadata("http://www.randomurl8.it")
        dm.depth = 1
        dm.dhash = 121212
        dm.source = Source.priority
        dm.delay = 500
        dm.alternatives = ["http://www.randomurl8.it"]
        self.qm.add_seen_and_reschedule(dm)
        # checking all the parameters
        counter = self.qm.seen.get(dm.url).get("count")
        dhash = self.qm.seen.get(dm.url).get("page_hash")
        self.assertEqual(counter, 1)
        self.assertEqual(dhash, dm.dhash)
        with mock.patch("time.time", mock_time):
            refetching_data = self.qm.pop()
        self.assertEqual(refetching_data.delay, START_DELAY)
        self.assertEqual(refetching_data.source, Source.priority)
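# The function below is NOT the QueueManager implementation: it is a
# standalone sketch of the rescheduling policy that the tests above assert,
# written only to make the expected behaviour easier to read at a glance.
# The parameter names, the return convention (None = do not reschedule) and
# the new_url_delay / max_delay defaults are assumptions; the doubling,
# halving, new-url and priority rules come straight from the assertions.
def next_refetch_delay(old_delay, start_delay, content_changed,
                       is_priority, is_new_url,
                       new_url_delay=2 * 60 * 60, max_delay=None):
    """Return the next refetch delay, or None when the url is dropped."""
    if is_priority:
        # priority urls are always rescheduled with the configured start
        # delay (START_DELAY in the tests)
        return start_delay
    if is_new_url:
        # urls seen for the first time get a fixed initial refetch delay
        # (the TWO_HOURS constant in the tests; the unit is an assumption)
        return new_url_delay
    if content_changed:
        # the page changed: refetch sooner
        return old_delay / 2
    # the page did not change: back off by doubling the delay
    doubled = old_delay * 2
    if max_delay is not None and doubled > max_delay:
        # delay limit reached: stop rescheduling this url
        # (the tests then expect pop() to return an empty document)
        return None
    return doubled


# Values mirroring the tests above; max_delay=60 is an arbitrary choice that
# is merely consistent with delay 10 being rescheduled and delay 40 not.
assert next_refetch_delay(10, 500, content_changed=False,
                          is_priority=False, is_new_url=False,
                          max_delay=60) == 20
assert next_refetch_delay(40, 500, content_changed=False,
                          is_priority=False, is_new_url=False,
                          max_delay=60) is None
assert next_refetch_delay(20, 500, content_changed=True,
                          is_priority=False, is_new_url=False) == 10
assert next_refetch_delay(500, 300, content_changed=False,
                          is_priority=True, is_new_url=False) == 300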