def test_is_new(self, mc):
    mc.return_value = mongomock.MongoClient()
    sm = SeenManager("test", "host", 0, "db")
    dmeta = DocumentMetadata("http://www.google.com")
    dmeta.alternatives = [
        "http://www.google.com",
        "http://www.google2.com/",
        "https://www.google3.com",
    ]
    dmeta.dhash = 2413242
    other_urls = [
        "www.test.com",
        "www.other.com",
    ]
    # adding urls
    sm.add(dmeta)
    for u in dmeta.alternatives:
        self.assertFalse(sm.is_new(canonize(u)))
    for u in other_urls:
        self.assertTrue(sm.is_new(canonize(u)))

def test_remove_seen(self, mc):
    ''' Test deleting some seen urls '''
    mc.return_value = mongomock.MongoClient()
    qm = QueueManager("queues-names", START_DELAY, CONFIGURATION)
    ############################
    # insert some urls into seen
    dm_seen_1 = DocumentMetadata("http://www.randomurl1.it")
    # these two are considered the same page with different urls
    dm_seen_1.alternatives = [
        "http://www.randomurl1.it",
        "http://www.randomurl3.it",
    ]
    dm_seen_1.dhash = 12345
    dm_seen_2 = DocumentMetadata("http://www.randomurl4.it")
    dm_seen_2.alternatives = [
        "http://www.randomurl4.it",
    ]
    dm_seen_2.dhash = 98765
    qm.seen.add(dm_seen_1)
    qm.seen.add(dm_seen_2)
    self.assertIsNotNone(qm.seen.get("www.randomurl1.it"))
    self.assertIsNotNone(qm.seen.get("www.randomurl3.it"))
    self.assertIsNotNone(qm.seen.get("www.randomurl4.it"))
    # deleting a url should remove all of its alternatives
    qm.remove_seen("www.randomurl1.it")
    self.assertIsNone(qm.seen.get("www.randomurl1.it"))
    self.assertIsNone(qm.seen.get("www.randomurl3.it"))
    self.assertIsNotNone(qm.seen.get("www.randomurl4.it"))

def test_update(self, mc):
    mc.return_value = mongomock.MongoClient()
    sm = SeenManager("test", "host", 0, "db")
    dmeta = DocumentMetadata("http://www.google.com?q=test")
    dmeta.alternatives = [
        "http://www.google.com?q=test",
        "http://www.google2.com/",
        "https://www.google3.com",
    ]
    dmeta.dhash = 2413242
    dmeta2 = DocumentMetadata("http://www.google2.com")
    dmeta2.alternatives = [
        "http://www.google2.com",
        "https://www.google3.com",
    ]
    dmeta2.dhash = 12121212
    # adding urls
    sm.add(dmeta)
    sm.add(dmeta2)
    output_alternatives = dmeta.alternatives + dmeta2.alternatives
    output_alternatives = list(
        set(canonize(i) for i in output_alternatives))
    # checking presence and that urls were not canonized twice
    for url in output_alternatives:
        self.assertIn(url, sm.store)
        for idx, v in enumerate(sm.store.get(url)['alternatives']):
            self.assertEqual(v, output_alternatives[idx])

def setUp(self, mc):
    super(TestNewsQueueRescheduling, self).setUp()
    mc.return_value = mongomock.MongoClient()
    self.qm = QueueManager("queues-names", START_DELAY, CONFIGURATION_NEWS)
    ############################
    # insert some urls into seen
    dm_seen_1 = DocumentMetadata("http://www.randomurl1.it")
    # these two are considered the same page with different urls
    dm_seen_1.alternatives = [
        "http://www.randomurl1.it",
        "http://www.randomurl3.it",
    ]
    dm_seen_1.dhash = 12345
    dm_seen_2 = DocumentMetadata("http://www.randomurl4.it")
    dm_seen_2.alternatives = [
        "http://www.randomurl4.it",
    ]
    dm_seen_2.dhash = 98765
    self.qm.seen.add(dm_seen_1)
    self.qm.seen.add(dm_seen_2)
    # increase the counter to check it later
    self.qm.seen.incr_n(dm_seen_1.url)
    counter = self.qm.seen.get(dm_seen_1.url).get("count")
    self.assertEqual(counter, 2)

def test_add_and_delete(self, mc):
    mc.return_value = mongomock.MongoClient()
    sm = SeenManager("test", "host", 0, "db")
    dmeta = DocumentMetadata("http://www.google.com")
    dmeta.alternatives = [
        "http://www.google.com",
        "http://www.google2.com/",
        "https://www.google3.com",
    ]
    dmeta.dhash = 2413242
    other_urls = [
        "www.prova.com",
        "www.other.com",
    ]
    # adding urls
    sm.add(dmeta)
    # trying to remove urls that are not present
    for o in other_urls:
        sm.delete(o)
    # checking presence
    for i in dmeta.alternatives:
        self.assertIn(canonize(i), sm.store)
    self.assertEqual(len(dmeta.alternatives), len(sm.store))
    # checking absence
    for u in other_urls:
        self.assertNotIn(canonize(u), sm.store)
    # checking correctness
    for i in dmeta.alternatives:
        data = sm.store.get(canonize(i))
        self.assertEqual(data["count"], 1)
        self.assertEqual(data["page_hash"], dmeta.dhash)
    # deleting one url should remove all of its alternatives
    sm.delete(dmeta.alternatives[0])
    # checking the db is empty
    for i in dmeta.alternatives:
        self.assertNotIn(canonize(i), sm.store)
    self.assertEqual(0, len(sm.store))

def test_incr_n(self, mc):
    mc.return_value = mongomock.MongoClient()
    sm = SeenManager("test", "host", 0, "db")
    dmeta = DocumentMetadata("http://www.google.com")
    dmeta.alternatives = [
        "http://www.google.com",
        "http://www.google2.com/",
        "https://www.google3.com",
    ]
    dmeta.dhash = 2413242
    # adding urls
    sm.add(dmeta)
    # increasing the counter of one url should increase it
    # for all of its alternatives
    sm.incr_n(dmeta.alternatives[0])
    for i in dmeta.alternatives:
        data = sm.store.get(canonize(i))
        self.assertEqual(data["count"], 2)

def test_is_changed(self, mc):
    mc.return_value = mongomock.MongoClient()
    sm = SeenManager("test", "host", 0, "db")
    dmeta = DocumentMetadata("http://www.google.com")
    dmeta.alternatives = [
        "http://www.google.com",
        "http://www.google2.com/",
        "https://www.google3.com",
    ]
    dmeta.dhash = 2413242
    # adding urls
    sm.add(dmeta)
    # the stored hash: not changed
    for u in dmeta.alternatives:
        self.assertFalse(sm.is_changed(u, dmeta.dhash))
    # a hash within the similarity tolerance: still not changed
    for u in dmeta.alternatives:
        self.assertFalse(sm.is_changed(u, dmeta.dhash + 2))
    # a hash far enough from the stored one: changed
    for u in dmeta.alternatives:
        self.assertTrue(sm.is_changed(u, dmeta.dhash + 3))

def test_add_normal_urls(self, mc):
    '''testing insertion of new links'''
    mc.return_value = mongomock.MongoClient()
    qm = QueueManager("queues-names", START_DELAY, CONFIGURATION)
    dm = DocumentMetadata("http://random-url.com")
    dm.depth = 1
    dm.links = [
        "http://www.randomurl1.it",
        "http://www.randomurl2.it",
        "http://www.randomurl3.it",
        "http://www.randomurl4.it",
        "http://www.randomurl5.it",
        "http://www.randomurl6.it",
        "http://www.randomurl7.it",
    ]
    all_url_length = len(dm.links)
    # adding all the links found in the document to the normal list
    qm.add_normal_urls(dm)
    # checking that the urls are there:
    stored = qm.normal_store.getall()
    self.assertEqual(len(stored), all_url_length)
    links_set = set(dm.links)
    for s in stored:
        self.assertIn(s[1], links_set)
        # depth should be increased by 1
        self.assertEqual(s[0], dm.depth + 1)
    # checking that seen is still empty
    for u in dm.links:
        self.assertTrue(qm.seen.is_new(u))
    # adding a duplicate with the same depth:
    # the entry should be replaced.
    dm.links = ["http://www.randomurl1.it"]
    qm.add_normal_urls(dm)
    stored = qm.normal_store.getall()
    self.assertEqual(len(stored), all_url_length)
    # adding a duplicate with a different depth:
    # we should get duplicate entries.
    dm.depth = 3
    dm.links = ["http://www.randomurl1.it"]
    qm.add_normal_urls(dm)
    stored = qm.normal_store.getall()
    self.assertEqual(len(stored), all_url_length + 1)

def test_pop_ordering(self, mc):
    '''
    Test adding urls to the priority, normal and refetch queues and
    checking that the popping order is correct.
    '''
    mc.return_value = mongomock.MongoClient()
    qm = QueueManager("queues-names", START_DELAY, CONFIGURATION)
    # inserting a priority url.
    urls = [
        "www.daniele.com",
    ]
    qm.init_priority_list(urls)
    # inserting a normal url.
    burls = [
        {
            "url": "www.daniele1.com",
            "depth": "2",
        },
    ]
    qm.add_bootstrap_urls(burls)
    # inserting a refetch url
    dm = DocumentMetadata("http://www.randomurl8.it")
    dm.depth = 1
    dm.dhash = 121212
    dm.source = Source.normal
    dm.delay = 500
    dm.alternatives = ["http://www.randomurl8.it"]
    qm.add_seen_and_reschedule(dm)
    # make sure all the inserted urls are ready to be popped
    with mock.patch("time.time", mock_time):
        doc = qm.pop()
        # first one from priority
        self.assertEqual(doc.depth, 0)
        self.assertEqual(doc.source, Source.priority)
        doc = qm.pop()
        # second one from normal
        self.assertEqual(doc.source, Source.normal)
        doc = qm.pop()
        # third one from refetch
        self.assertEqual(doc.source, Source.refetch)

def test_fetcher(self, session):
    for i in self.input_data():
        # each record is a tab-separated "url<TAB>content<TAB>status" line
        url, content, status = i.split("\t")
        status = int(status)
        dm = DocumentMetadata(url)
        mr = MockResponse(content, status)
        session.return_value = mr
        new_dm = fetcher.fetch({}, dm, 1)
        self.assertEqual(dm.url, url)
        if status == 200:
            self.assertEqual(dm.status, 0)
            self.assertEqual(new_dm.status, 0)
            self.assertEqual(new_dm.response.content, content)
        elif status == 301:
            self.assertEqual(dm.status, fetcher.Status.SkipUrl)
        elif status == 0:
            # a status of 0 encodes a connection failure in the test data
            self.assertEqual(dm.status, fetcher.Status.ConnectionError)
        elif 500 <= status < 510:
            self.assertEqual(dm.status, fetcher.Status.SkipUrl)
        else:
            self.assertEqual(dm.status, fetcher.Status.GenericError)

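# For reference, MockResponse (defined elsewhere in this test module) can be
# as small as the sketch below. The attribute names are assumptions inferred
# from how test_fetcher uses it, not the module's actual definition:
#
#     class MockResponse(object):
#         def __init__(self, content, status_code):
#             self.content = content          # body handed back to the fetcher
#             self.status_code = status_code  # HTTP status; 0 marks a connection error
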
def test_reschedule_priority(self):
    ############################################
    # inserting a priority url
    dm = DocumentMetadata("http://www.randomurl8.it")
    dm.depth = 1
    dm.dhash = 121212
    dm.source = Source.priority
    dm.delay = 500
    dm.alternatives = ["http://www.randomurl8.it"]
    self.qm.add_seen_and_reschedule(dm)
    # checking all the parameters
    counter = self.qm.seen.get(dm.url).get("count")
    dhash = self.qm.seen.get(dm.url).get("page_hash")
    self.assertEqual(counter, 1)
    self.assertEqual(dhash, dm.dhash)
    with mock.patch("time.time", mock_time):
        refetching_data = self.qm.pop()
        self.assertEqual(refetching_data.delay, START_DELAY)
        self.assertEqual(refetching_data.source, Source.priority)

def test_reschedule_different_content(self):
    ############################################
    # this url is in seen with a different hash and
    # was not taken from the priority queue.
    # Expected: the delay is halved and the seen counter is reset to 1
    dm = DocumentMetadata("http://www.randomurl1.it")
    dm.depth = 1
    dm.dhash = 1936
    dm.source = Source.normal
    dm.delay = 20
    dm.alternatives = ["http://www.randomurl1.it"]
    self.qm.add_seen_and_reschedule(dm)
    # checking all the parameters
    counter = self.qm.seen.get(dm.url).get("count")
    dhash = self.qm.seen.get(dm.url).get("page_hash")
    self.assertEqual(counter, 1)
    self.assertEqual(dhash, dm.dhash)
    with mock.patch("time.time", mock_time):
        refetching_data = self.qm.pop()
        self.assertEqual(refetching_data.delay, dm.delay / 2)
        self.assertEqual(refetching_data.source, Source.refetch)

def test_reschedule4(self):
    ############################################
    # inserting a new url
    dm = DocumentMetadata("http://www.randomurl8.it")
    dm.depth = 1
    dm.dhash = 121212
    dm.source = Source.normal
    dm.delay = 500
    dm.alternatives = ["http://www.randomurl8.it"]
    self.qm.add_seen_and_reschedule(dm)
    # checking all the parameters
    counter = self.qm.seen.get(dm.url).get("count")
    dhash = self.qm.seen.get(dm.url).get("page_hash")
    self.assertEqual(counter, 1)
    self.assertEqual(dhash, dm.dhash)
    with mock.patch("time.time", mock_time):
        refetching_data = self.qm.pop()
        self.assertEqual(refetching_data.delay,
                         CONFIGURATION["queues"]["refetching-delay"])
        self.assertEqual(refetching_data.source, Source.refetch)

def test_reschedule3(self):
    ############################################
    # as before but with a small delay:
    # checking that the delay does not change
    dm = DocumentMetadata("http://www.randomurl1.it")
    dm.depth = 1
    dm.dhash = 121212
    dm.source = Source.normal
    dm.delay = 500
    dm.alternatives = ["http://www.randomurl1.it"]
    self.qm.add_seen_and_reschedule(dm)
    # checking all the parameters
    counter = self.qm.seen.get(dm.url).get("count")
    dhash = self.qm.seen.get(dm.url).get("page_hash")
    self.assertEqual(counter, 1)
    self.assertEqual(dhash, dm.dhash)
    with mock.patch("time.time", mock_time):
        refetching_data = self.qm.pop()
        self.assertEqual(refetching_data.delay, dm.delay)
        self.assertEqual(refetching_data.source, Source.refetch)

def test_reschedule_samecontent_lastdelay(self):
    ############################################
    # testing rescheduling some urls.
    # this url is in seen with the same hash and
    # was not taken from the priority queue.
    # Expected: the delay is doubled and the seen counter is reset to 1
    dm = DocumentMetadata("http://www.randomurl1.it")
    dm.depth = 1
    dm.dhash = 12345
    dm.source = Source.normal
    dm.delay = 40
    # alternatives always contains at least one url.
    dm.alternatives = ["http://www.randomurl1.it"]
    # we want to check that the former alternatives are also correctly
    # updated even if the new alternatives field is different.
    alternatives = self.qm.seen.get(dm.url).get("alternatives")
    self.assertNotEqual(len(dm.alternatives), len(alternatives))
    self.qm.add_seen_and_reschedule(dm)
    # check all the parameters
    counter = self.qm.seen.get(dm.url).get("count")
    dhash = self.qm.seen.get(dm.url).get("page_hash")
    self.assertEqual(counter, 1)
    # check that all the alternatives were updated
    for url in alternatives:
        counter = self.qm.seen.get(url).get("count")
        dhash = self.qm.seen.get(url).get("page_hash")
        self.assertEqual(counter, 1)
        self.assertEqual(dhash, dm.dhash)
    with mock.patch("time.time", mock_time):
        refetching_data = self.qm.pop()
        # the doubled delay is past the last allowed one, so the url is
        # not rescheduled and pop() returns an empty document
        self.assertEqual(refetching_data.url, "")
        self.assertEqual(refetching_data.source, Source.unknown)

def pop(self):
    """Return the next document to fetch.

    Queues are drained in order: priority first, then normal, then
    refetch. If all of them are empty, an empty DocumentMetadata
    (Source.unknown) is returned.
    """
    document_metadata = DocumentMetadata()
    item = self.priority_store.pop(int(time.time()))
    if item:
        logging.debug("Get priority: %s", item[1])
        document_metadata.url = item[1]
        document_metadata.depth = item[3]
        document_metadata.delay = item[2]
        document_metadata.source = Source.priority
    else:
        while not item:
            item = self.normal_store.pop()
            if not item:
                break
            # the following check is needed because urls are stored in seen
            # only after they have been fetched, so the normal list can
            # contain multiple identical urls, and we do not want the same
            # url to end up more than once in the refetching list
            if not self.seen.is_new(item[1]):
                item = None
        if item:
            # In case of a network error the url is pushed back on the
            # normal queue so it is not lost. Hence it is possible to find
            # something already seen here; refetching those cases is not
            # a problem.
            logging.debug("Get normal: %s", item[1])
            document_metadata.url = item[1]
            document_metadata.depth = item[0]
            document_metadata.delay = 0
            document_metadata.source = Source.normal
        else:
            item = self.refetch_store.pop(int(time.time()))
            if item:
                logging.debug("Get Refetch: %s", item[1])
                document_metadata.url = item[1]
                document_metadata.depth = item[3]
                document_metadata.delay = item[2]
                document_metadata.source = Source.refetch
    return document_metadata

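# A minimal consumption sketch (not part of the module; `process` is a
# hypothetical handler): callers can drain all three queues by popping
# until an empty document comes back.
#
#     while True:
#         dm = queue_manager.pop()
#         if dm.source == Source.unknown:  # every store is empty
#             break
#         process(dm)
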
def test_add_normal_urls_some_seen(self, mc):
    '''
    Testing insertion of a mix of new and already seen links.
    '''
    mc.return_value = mongomock.MongoClient()
    qm = QueueManager("queues-names", START_DELAY, CONFIGURATION)
    ############################
    # insert some urls into seen
    dm_seen_1 = DocumentMetadata("http://www.randomurl1.it")
    # these two are considered the same page with different urls
    dm_seen_1.alternatives = [
        "http://www.randomurl1.it",
        "http://www.randomurl3.it",
    ]
    dm_seen_1.dhash = 12345
    dm_seen_2 = DocumentMetadata("http://www.randomurl4.it")
    dm_seen_2.alternatives = [
        "http://www.randomurl4.it",
    ]
    dm_seen_2.dhash = 98765
    already_seen_urls = set(dm_seen_1.alternatives).union(
        dm_seen_2.alternatives)
    qm.seen.add(dm_seen_1)
    qm.seen.add(dm_seen_2)
    ############################################
    # testing add normal url with some seen urls
    dm = DocumentMetadata("http://random-url.com")
    dm.depth = 1
    dm.links = [
        "http://www.randomurl1.it",
        "http://www.randomurl2.it",
        "http://www.randomurl3.it",
        "http://www.randomurl4.it",
        "http://www.randomurl5.it",
        "http://www.randomurl6.it",
        "http://www.randomurl7.it",
    ]
    # adding all the links found in the document to the normal list
    qm.add_normal_urls(dm)
    # checking that the urls are there (all except the 3 already seen):
    links_set = set(dm.links).difference(already_seen_urls)
    stored = qm.normal_store.getall()
    self.assertEqual(len(stored), len(links_set))
    # the count for these urls should be 3
    for i in dm_seen_1.alternatives:
        data = qm.seen.get(i)
        # this is 3 because we inserted the page once in seen, then
        # tried to insert www.randomurl1.it and www.randomurl3.it
        # into the normal list (one increment each)
        self.assertEqual(data["count"], 3)
    # adding a duplicate that is already in seen:
    # it should not be added to the normal list, but the counters of
    # all its alternatives should be updated (+1)
    dm.links = [
        dm.links[0],
    ]
    qm.add_normal_urls(dm)
    stored = qm.normal_store.getall()
    for i in dm_seen_1.alternatives:
        data = qm.seen.get(i)
        self.assertEqual(data["count"], 4)
    # adding again with a different depth should not
    # change the behaviour
    dm.depth = 3
    dm.links = [
        dm.links[0],
    ]
    qm.add_normal_urls(dm)
    for i in dm_seen_1.alternatives:
        data = qm.seen.get(i)
        self.assertEqual(data["count"], 5)