def test_add_normal_urls(self, mc):
    """Test insertion of new links into the normal store.

    Covers three cases: a first insertion of links, a duplicate link
    at the same depth, and a duplicate link at a different depth.
    """
    # NOTE(review): `mc` is presumably a patched MongoClient injected by a
    # decorator outside this view; it is made to hand back a mongomock
    # client.
    mc.return_value = mongomock.MongoClient()
    qm = QueueManager("queues-names", START_DELAY, CONFIGURATION)

    dm = DocumentMetadata("http://random-url.com")
    dm.depth = 1
    dm.links = [
        "http://www.randomurl1.it",
        "http://www.randomurl2.it",
        "http://www.randomurl3.it",
        "http://www.randomurl4.it",
        "http://www.randomurl5.it",
        "http://www.randomurl6.it",
        "http://www.randomurl7.it",
    ]
    all_url_length = len(dm.links)

    # Add all the links found in the document to the normal list.
    qm.add_normal_urls(dm)

    # Every link should now be stored.
    stored = qm.normal_store.getall()
    self.assertEqual(len(stored), all_url_length)

    links_set = set(dm.links)
    for s in stored:
        # Each stored entry is indexed as (depth, url).
        # assertIn shows the unexpected URL in the failure message.
        self.assertIn(s[1], links_set)
        # Depth should be increased by 1.
        self.assertEqual(s[0], dm.depth + 1)

    # `seen` should still report every link as new.
    for u in dm.links:
        self.assertEqual(qm.seen.is_new(u), True)

    # Adding a duplicate with the same depth:
    # the entry should be replaced, so the total does not change.
    dm.links = ["http://www.randomurl1.it"]
    qm.add_normal_urls(dm)
    stored = qm.normal_store.getall()
    self.assertEqual(len(stored), all_url_length)

    # Adding a duplicate with a different depth:
    # we should end up with duplicate entries (one extra).
    dm.depth = 3
    dm.links = ["http://www.randomurl1.it"]
    qm.add_normal_urls(dm)
    stored = qm.normal_store.getall()
    self.assertEqual(len(stored), all_url_length + 1)
def test_add_normal_urls_some_seen(self, mc):
    """Test insertion of a mix of new and already-seen links.

    Links already registered in ``seen`` must not reach the normal
    store; instead the ``count`` of every alternative URL of the seen
    document is expected to grow.
    """
    # NOTE(review): `mc` is presumably a patched MongoClient injected by a
    # decorator outside this view; it is made to hand back a mongomock
    # client.
    mc.return_value = mongomock.MongoClient()
    qm = QueueManager("queues-names", START_DELAY, CONFIGURATION)

    ############################
    # Insert some URLs into seen.
    dm_seen_1 = DocumentMetadata("http://www.randomurl1.it")
    # These two are considered the same page reachable via different URLs.
    dm_seen_1.alternatives = [
        "http://www.randomurl1.it",
        "http://www.randomurl3.it",
    ]
    dm_seen_1.dhash = 12345

    dm_seen_2 = DocumentMetadata("http://www.randomurl4.it")
    dm_seen_2.alternatives = [
        "http://www.randomurl4.it",
    ]
    dm_seen_2.dhash = 98765

    already_seen_urls = set(dm_seen_1.alternatives).union(
        dm_seen_2.alternatives)
    qm.seen.add(dm_seen_1)
    qm.seen.add(dm_seen_2)

    ############################################
    # Testing add_normal_urls with some seen URLs.
    dm = DocumentMetadata("http://random-url.com")
    dm.depth = 1
    dm.links = [
        "http://www.randomurl1.it",
        "http://www.randomurl2.it",
        "http://www.randomurl3.it",
        "http://www.randomurl4.it",
        "http://www.randomurl5.it",
        "http://www.randomurl6.it",
        "http://www.randomurl7.it",
    ]

    # Add all the links found in the document to the normal list.
    qm.add_normal_urls(dm)

    # Check the URLs are there (all except 3 because already seen).
    links_set = set(dm.links).difference(already_seen_urls)
    stored = qm.normal_store.getall()
    self.assertEqual(len(stored), len(links_set))

    # The count for these URLs should be 3.
    for i in dm_seen_1.alternatives:
        data = qm.seen.get(i)
        # This is 3 because we inserted one in seen +
        # we tried to insert www.randomurl1.it in normal +
        # we tried to insert www.randomurl3.it in normal.
        self.assertEqual(data["count"], 3)

    # Adding a duplicate that is already in seen:
    # it should not be added to the normal list, but the counters of
    # all alternatives should be updated (+1).
    dm.links = [
        dm.links[0],
    ]
    qm.add_normal_urls(dm)
    stored = qm.normal_store.getall()
    # The normal list must be unchanged by the already-seen duplicate.
    self.assertEqual(len(stored), len(links_set))
    for i in dm_seen_1.alternatives:
        data = qm.seen.get(i)
        self.assertEqual(data["count"], 4)

    # Adding again with a different depth should not
    # change the behaviour.
    dm.depth = 3
    dm.links = [
        dm.links[0],
    ]
    qm.add_normal_urls(dm)
    for i in dm_seen_1.alternatives:
        # Should be 5: one more increment on top of the previous 4.
        data = qm.seen.get(i)
        self.assertEqual(data["count"], 5)