Example #1
    def pop(self):
        """Return the next document to fetch"""
        document_metadata = DocumentMetadata()
        item = self.priority_store.pop(int(time.time()))
        if item:
            logging.debug("Get priority:" + str(item[1]))
            document_metadata.url = item[1]
            document_metadata.depth = item[3]
            document_metadata.delay = item[2]
            document_metadata.source = Source.priority
        else:
            while not item:
                item = self.normal_store.pop()
                if not item:
                    break
                # This check is needed because urls are stored in seen
                # only after they have been fetched, so the normal list
                # can contain several copies of the same url, and we do
                # not want duplicates in the refetching list.
                if not self.seen.is_new(item[1]):
                    item = None
            if item:
                # On network errors urls are pushed back onto the normal
                # queue so they are not lost, so an already-seen url can
                # show up here. Refetching those cases is not a problem.
                logging.debug("Get normal:" + str(item[1]))
                document_metadata.url = item[1]
                document_metadata.depth = item[0]
                document_metadata.delay = 0
                document_metadata.source = Source.normal
            else:
                item = self.refetch_store.pop(int(time.time()))
                if item:
                    logging.debug("Get Refetch:" + str(item[1]))
                    document_metadata.url = item[1]
                    document_metadata.depth = item[3]
                    document_metadata.delay = item[2]
                    document_metadata.source = Source.refetch

        return document_metadata
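
A note on the stores this method relies on (not shown above): pop() assumes that priority_store and refetch_store expose a time-gated pop(now) that returns an item only once its scheduled timestamp has passed, with items laid out so that item[1] is the url, item[2] the delay and item[3] the depth, while normal_store.pop() yields (depth, url). A minimal sketch of such a time-gated store, with the class name and item layout assumed rather than taken from the real code:

import heapq


class TimeGatedStore:
    """Hypothetical sketch of a store whose items become poppable
    only once their scheduled timestamp has passed."""

    def __init__(self):
        self._heap = []  # entries are (ready_at, url, delay, depth)

    def push(self, ready_at, url, delay, depth):
        heapq.heappush(self._heap, (ready_at, url, delay, depth))

    def pop(self, now):
        # Return the earliest scheduled entry if it is due, otherwise
        # None, mirroring the falsy check done by pop() above.
        if self._heap and self._heap[0][0] <= now:
            return heapq.heappop(self._heap)
        return None
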
Example #2
    def test_pop_ordering(self, mc):
        '''
        Test adding urls to the priority, normal and refetch queues
        and check that they are popped in that order
        '''
        mc.return_value = mongomock.MongoClient()
        qm = QueueManager("queues-names", START_DELAY, CONFIGURATION)

        # inserting a priority url.
        urls = [
            "www.daniele.com",
        ]
        qm.init_priority_list(urls)

        # inserting a normal url.
        burls = [
            {
                "url": "www.daniele1.com",
                "depth": "2"
            },
        ]
        qm.add_bootstrap_urls(burls)

        # inserting a refetch url
        dm = DocumentMetadata("http://www.randomurl8.it")
        dm.depth = 1
        dm.dhash = 121212
        dm.source = Source.normal
        dm.delay = 500
        dm.alternatives = ["http://www.randomurl8.it"]
        qm.add_seen_and_reschedule(dm)

        # make sure all the inserted urls are ready to be popped
        with mock.patch("time.time", mock_time):
            doc = qm.pop()
            # first one from priority
            self.assertEqual(doc.depth, 0)
            self.assertEqual(doc.source, Source.priority)

            doc = qm.pop()
            # second one from normal
            self.assertEqual(doc.source, Source.normal)

            doc = qm.pop()
            # third from refetching
            self.assertEqual(doc.source, Source.refetch)
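
The tests patch time.time with a mock_time helper that is not shown in these snippets; presumably it returns a fixed timestamp far enough in the future that every scheduled url counts as due. A hypothetical stand-in could be as simple as:

def mock_time():
    # Hypothetical replacement for time.time used by the tests:
    # a fixed timestamp far in the future so every scheduled url
    # (priority, normal or refetch) is considered ready to pop.
    return 2 ** 32
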
Example #3
    def test_reschedule_newurl(self):
        ############################################
        # inserting a new url
        dm = DocumentMetadata("http://www.randomurl8.it")
        dm.depth = 1
        dm.dhash = 121212
        dm.source = Source.normal
        dm.alternatives = ["http://www.randomurl8.it"]

        self.qm.add_seen_and_reschedule(dm)

        # checking all the parameters
        counter = self.qm.seen.get(dm.url).get("count")
        dhash = self.qm.seen.get(dm.url).get("page_hash")
        self.assertEqual(counter, 1)
        self.assertEqual(dhash, dm.dhash)
        with mock.patch("time.time", mock_time):
            refetching_data = self.qm.pop()

        self.assertEqual(refetching_data.delay, TWO_HOURS)
        self.assertEqual(refetching_data.source, Source.refetch)
Example #4
    def test_reschedule_different_content(self):
        ############################################
        # this url is already in seen with a different hash and
        # is not taken from the priority queue.
        # Expected: the delay is halved and the seen counter is set to 1
        dm = DocumentMetadata("http://www.randomurl1.it")
        dm.depth = 1
        dm.dhash = 1936
        dm.source = Source.normal
        dm.delay = 20
        dm.alternatives = ["http://www.randomurl1.it"]

        self.qm.add_seen_and_reschedule(dm)

        # checking all the parameters
        counter = self.qm.seen.get(dm.url).get("count")
        dhash = self.qm.seen.get(dm.url).get("page_hash")
        self.assertEqual(counter, 1)
        self.assertEqual(dhash, dm.dhash)
        with mock.patch("time.time", mock_time):
            refetching_data = self.qm.pop()
        self.assertEqual(refetching_data.delay, dm.delay / 2)
        self.assertEqual(refetching_data.source, Source.refetch)
Example #5
    def test_reschedule3(self):
        ############################################
        # as before but with a small delay.
        # checking that the delay does not change
        dm = DocumentMetadata("http://www.randomurl1.it")
        dm.depth = 1
        dm.dhash = 121212
        dm.source = Source.normal
        dm.delay = 500
        dm.alternatives = ["http://www.randomurl1.it"]

        self.qm.add_seen_and_reschedule(dm)

        # checking all the parameters
        counter = self.qm.seen.get(dm.url).get("count")
        dhash = self.qm.seen.get(dm.url).get("page_hash")
        self.assertEqual(counter, 1)
        self.assertEqual(dhash, dm.dhash)
        with mock.patch("time.time", mock_time):
            refetching_data = self.qm.pop()

        self.assertEqual(refetching_data.delay, dm.delay)
        self.assertEqual(refetching_data.source, Source.refetch)
Example #6
    def test_reschedule_samecontent_lastdelay(self):
        ############################################
        # testing rescheduling of urls:
        # this url is already in seen with the same hash and
        # is not taken from the priority queue.
        # Expected: the delay is doubled and the seen counter is set to 1
        dm = DocumentMetadata("http://www.randomurl1.it")
        dm.depth = 1
        dm.dhash = 12345
        dm.source = Source.normal
        dm.delay = 40
        # alternatives always contains at least one url.
        dm.alternatives = ["http://www.randomurl1.it"]
        # we want to check that the previously stored alternatives are
        # also updated correctly even if the new alternatives field differs.
        alternatives = self.qm.seen.get(dm.url).get("alternatives")
        self.assertNotEqual(len(dm.alternatives), len(alternatives))

        self.qm.add_seen_and_reschedule(dm)

        # check all the parameters
        counter = self.qm.seen.get(dm.url).get("count")
        dhash = self.qm.seen.get(dm.url).get("page_hash")
        self.assertEqual(counter, 1)

        # check updated all the alternatives
        for url in alternatives:
            counter = self.qm.seen.get(url).get("count")
            dhash = self.qm.seen.get(url).get("page_hash")
            self.assertEqual(counter, 1)
            self.assertEqual(dhash, dm.dhash)

        with mock.patch("time.time", mock_time):
            refetching_data = self.qm.pop()
        self.assertEqual(refetching_data.url, "")
        self.assertEqual(refetching_data.source, Source.unknown)
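
Taken together, the reschedule tests suggest the policy that add_seen_and_reschedule applies to the refetch delay: a url seen for the first time gets a default delay (TWO_HOURS in these tests), a url whose content hash changed gets its delay halved, and a url whose content is unchanged gets its delay doubled. A rough sketch of that rule, with the function name and any bounds purely illustrative:

TWO_HOURS = 2 * 60 * 60


def next_delay(previous_delay, old_hash, new_hash):
    # Illustrative only: the real add_seen_and_reschedule may apply
    # additional rules (minimum/maximum bounds, priority handling, ...).
    if old_hash is None:
        # first time this url is seen: start from the default interval
        return TWO_HOURS
    if new_hash != old_hash:
        # content changed since the last fetch: poll it more often
        return previous_delay / 2
    # content unchanged: back off and poll less often
    return previous_delay * 2
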