示例#1
0
    def test_add_normal_urls(self, mc):
        '''
        Testing insertion of new links into the normal store.

        Three scenarios are exercised in sequence:
        1. a batch of links, none of which has been seen before;
        2. one duplicate link at the same depth (store size unchanged);
        3. one duplicate link at a different depth (store grows by one).

        `mc` is a mock supplied from outside this method (presumably by
        a patch decorator on the Mongo client -- TODO confirm against
        the decorator); its return value is replaced here with an
        in-memory mongomock client.
        '''
        mc.return_value = mongomock.MongoClient()
        qm = QueueManager("queues-names", START_DELAY, CONFIGURATION)
        dm = DocumentMetadata("http://random-url.com")
        dm.depth = 1
        dm.links = [
            "http://www.randomurl1.it",
            "http://www.randomurl2.it",
            "http://www.randomurl3.it",
            "http://www.randomurl4.it",
            "http://www.randomurl5.it",
            "http://www.randomurl6.it",
            "http://www.randomurl7.it",
        ]
        all_url_length = len(dm.links)
        # adding all the links found in the document to the normal list
        qm.add_normal_urls(dm)

        # checking if the urls are there:
        stored = qm.normal_store.getall()
        self.assertEqual(len(stored), all_url_length)

        links_set = set(dm.links)
        for s in stored:
            # each stored entry is indexed as (depth, url)
            self.assertIn(s[1], links_set)
            # depth should be increased by 1
            self.assertEqual(s[0], dm.depth + 1)

        # checking that seen is still empty
        for u in dm.links:
            self.assertEqual(qm.seen.is_new(u), True)

        # adding a duplicate with same depth:
        # the entry should be replaced, so the size does not change.
        dm.links = ["http://www.randomurl1.it"]
        qm.add_normal_urls(dm)
        stored = qm.normal_store.getall()
        self.assertEqual(len(stored), all_url_length)

        # adding a duplicate with different depth:
        # we should now have two entries for the same url.
        dm.depth = 3
        dm.links = ["http://www.randomurl1.it"]
        qm.add_normal_urls(dm)
        stored = qm.normal_store.getall()
        self.assertEqual(len(stored), all_url_length + 1)
示例#2
0
    def test_add_normal_urls_some_seen(self, mc):
        '''
        Testing insertion of some new and some already-seen links.

        Two documents are first registered in `qm.seen`; a third
        document then links to a mix of seen and unseen urls. The test
        checks that only the unseen urls reach the normal store and
        that the "count" of the seen entries grows on every attempt to
        re-insert one of their urls.

        `mc` is a mock supplied from outside this method (presumably by
        a patch decorator on the Mongo client -- TODO confirm against
        the decorator); its return value is replaced here with an
        in-memory mongomock client.
        '''
        mc.return_value = mongomock.MongoClient()
        qm = QueueManager("queues-names", START_DELAY, CONFIGURATION)

        ############################
        # insert some urls into seen
        dm_seen_1 = DocumentMetadata("http://www.randomurl1.it")
        # these two are considered the same page with different urls
        dm_seen_1.alternatives = [
            "http://www.randomurl1.it",
            "http://www.randomurl3.it",
        ]
        dm_seen_1.dhash = 12345
        dm_seen_2 = DocumentMetadata("http://www.randomurl4.it")
        dm_seen_2.alternatives = [
            "http://www.randomurl4.it",
        ]
        dm_seen_2.dhash = 98765
        already_seen_urls = set(dm_seen_1.alternatives).union(
            dm_seen_2.alternatives)
        qm.seen.add(dm_seen_1)
        qm.seen.add(dm_seen_2)

        ############################################
        # testing add normal url with some seen urls
        dm = DocumentMetadata("http://random-url.com")
        dm.depth = 1
        dm.links = [
            "http://www.randomurl1.it",
            "http://www.randomurl2.it",
            "http://www.randomurl3.it",
            "http://www.randomurl4.it",
            "http://www.randomurl5.it",
            "http://www.randomurl6.it",
            "http://www.randomurl7.it",
        ]

        # adding all the links found in the document to the normal list
        qm.add_normal_urls(dm)

        # checking if the urls are there (all except 3 because already seen):
        links_set = set(dm.links).difference(already_seen_urls)
        stored = qm.normal_store.getall()
        self.assertEqual(len(stored), len(links_set))

        # count for these urls should be 3
        for i in dm_seen_1.alternatives:
            data = qm.seen.get(i)
            # this is 3 because we inserted one in seen +
            # we tried to insert in normal www.randomurl1.it +
            # we tried to insert in normal www.randomurl3.it
            self.assertEqual(data["count"], 3)

        # adding a duplicate that is already in seen.
        # should not be added in normal list, but counters of
        # all alternatives should be updated (+1)
        dm.links = [
            dm.links[0],
        ]
        qm.add_normal_urls(dm)
        stored = qm.normal_store.getall()
        # the normal list must be unchanged by the already-seen url
        self.assertEqual(len(stored), len(links_set))
        for i in dm_seen_1.alternatives:
            data = qm.seen.get(i)
            self.assertEqual(data["count"], 4)

        # adding again with different depth should not
        # change the behaviour
        dm.depth = 3
        dm.links = [
            dm.links[0],
        ]
        qm.add_normal_urls(dm)

        for i in dm_seen_1.alternatives:
            # should be 5: the previous 4 plus this one more attempt
            data = qm.seen.get(i)
            self.assertEqual(data["count"], 5)