def test_updating_works(self):
        """
        Updating a stored URI must overwrite its row with the new values.

        The URI is added, updated under the same URL with a different
        priority and then read back directly from the `queues` table.
        """
        uri = ("http://localhost", 1, "etag", int(time.time() * 1000),
               int(time.time() * 1000), 1)

        q = SQLiteMultipleHostUriQueue(":memory:")
        try:
            q.add_uri(uri)

            # Same URL and queue; only the priority is guaranteed to differ
            # (the timestamps are re-sampled and may or may not change).
            uri = ("http://localhost", 1, "etag", int(time.time() * 1000),
                   int(time.time() * 1000), 2)

            q.update_uri(uri)

            cursor = q._connection.execute(
                "SELECT * FROM queues WHERE queue=1")
            uri_res = cursor.fetchone()
            # Report a missing row as a test failure instead of a TypeError
            # raised by the tuple unpacking below.
            self.assertTrue(uri_res is not None)

            (url, queue, etag, mod_date, next_date, prio) = uri
            (url_res, queue_res, etag_res, mod_date_res, next_date_res,
             prio_res) = uri_res
            self.assertEqual(url, url_res)
            self.assertEqual(queue, queue_res)
            self.assertEqual(etag, etag_res)
            self.assertEqual(mod_date, mod_date_res)
            self.assertEqual(prio, prio_res)
            self.assertEqual(next_date, next_date_res)
        finally:
            # Release the connection even when one of the assertions fails.
            q.close()
    def test_that_queues_work(self):
        """
        Queues can be created, listed and looked up again by identifier.

        The listing is compared as a whole after each step, so a missing or
        surplus queue fails the test instead of silently skipping the
        assertions of an empty loop.
        """
        q = SQLiteMultipleHostUriQueue(':memory:')

        def stored_queues():
            # Normalise every row to a plain tuple so the comparison does not
            # depend on the row type returned by the queue implementation.
            return [tuple(row) for row in q.get_all_queues()]

        # A fresh store must not contain any queue.
        self.assertEqual([], stored_queues())

        qid1 = q.add_or_create_queue('test')
        self.assertEqual([(qid1, 'test')], stored_queues())

        qid2 = q.add_or_create_queue('test2')
        self.assertEqual([(qid1, 'test'), (qid2, 'test2')], stored_queues())

        # Asking for an existing identifier must return the existing id.
        self.assertEqual(qid1, q.add_or_create_queue('test'))
    def test_removing_lists_works(self):
        """
        After adding two URIs and removing the same list again, no row with
        `queue=1` may be left in the `queues` table.
        """
        local_uri = ("http://localhost", 1, "etag", int(time.time() * 1000),
                     int(time.time() * 1000), 1)
        foreign_uri = ("http://fogeignhost", 1, "ETAG",
                       int(time.time() * 1000), int(time.time() * 1000), 2)
        batch = [local_uri, foreign_uri]

        uri_queue = SQLiteMultipleHostUriQueue(":memory:")
        uri_queue.add_uris(batch)
        uri_queue.remove_uris(batch)

        # Both URIs were stored with queue id 1, so that queue must be empty.
        remaining = uri_queue._connection.execute(
            "SELECT * FROM queues WHERE queue=1").fetchone()
        self.assertTrue(remaining is None)
示例#4
0
    def __init__(self, settings, log_handler):
        """
        Set up the frontier from the given `settings`.

        The abstract base frontier is initialized with a
        `SQLiteMultipleHostUriQueue` opened on `settings.FRONTIER_STATE_FILE`
        and an instance of the class named by `settings.PRIORITIZER_CLASS`.
        Afterwards the delay and budget parameters are copied from the
        settings, the first element of every pair returned by
        `get_all_queues()` is remembered, and the queue selector and queue
        assignment objects are created.
        """
        prioritizer_cls = import_class(settings.PRIORITIZER_CLASS)
        front_end_queues = SQLiteMultipleHostUriQueue(
            settings.FRONTIER_STATE_FILE)
        AbstractBaseFrontier.__init__(self, settings, log_handler,
                                      front_end_queues,
                                      prioritizer_cls(settings))

        # Plain configuration values.
        self._num_active_queues = settings.FRONTIER_ACTIVE_QUEUES
        self._max_queue_budget = settings.FRONTIER_QUEUE_BUDGET
        self._budget_punishment = settings.FRONTIER_QUEUE_BUDGET_PUNISH
        self._delay_factor = settings.FRONTIER_CRAWL_DELAY_FACTOR
        self._min_delay = settings.FRONTIER_MIN_DELAY

        # NOTE(review): `_front_end_queues` is not assigned here; presumably
        # AbstractBaseFrontier.__init__ stores the queue object under it.
        self._queue_ids = [
            queue_id
            for (queue_id, _) in self._front_end_queues.get_all_queues()
        ]

        selector_cls = import_class(settings.QUEUE_SELECTOR_CLASS)
        self._backend_selector = selector_cls(len(self._queue_ids))

        # NOTE(review): `_dns_cache` is likewise expected to come from the
        # base class initializer.
        assignment_cls = import_class(settings.QUEUE_ASSIGNMENT_CLASS)
        self._backend_assignment = assignment_cls(self._dns_cache)

        # Bookkeeping containers, all empty at start.
        self._current_queues = {}
        self._current_queues_in_heap = []
        self._time_politeness = {}
        self._budget_politeness = {}
    def test_that_queues_work(self):
        """
        Creating queues must make them show up, in creation order, in
        `get_all_queues()`, and re-adding a known identifier must return the
        id it already has.

        Each listing is materialised and compared as a whole; asserting
        inside a `for` loop would pass without checking anything whenever the
        listing came back empty.
        """
        q = SQLiteMultipleHostUriQueue(':memory:')

        # No queue may exist before the first one is created.
        listing = [tuple(row) for row in q.get_all_queues()]
        self.assertEqual([], listing)

        qid1 = q.add_or_create_queue('test')

        # tuple() keeps the comparison independent of the concrete row type.
        listing = [tuple(row) for row in q.get_all_queues()]
        self.assertEqual([(qid1, 'test')], listing)

        qid2 = q.add_or_create_queue('test2')

        listing = [tuple(row) for row in q.get_all_queues()]
        self.assertEqual([(qid1, 'test'), (qid2, 'test2')], listing)

        self.assertEqual(qid1, q.add_or_create_queue('test'))
    def test_adding_lists_works(self):
        """
        A URI added through `add_uris` must be readable from the `queues`
        table with the same url, etag, dates and priority.
        """
        expected = ("http://localhost", 1, "etag", int(time.time() * 1000),
                    int(time.time() * 1010), 1)

        uri_queue = SQLiteMultipleHostUriQueue(":memory:")
        uri_queue.add_uris([expected])

        stored = uri_queue._connection.execute(
            "SELECT * FROM queues WHERE queue=1").fetchone()

        (url, _, etag, mod_date, next_date, prio) = expected
        (url_res, _, etag_res, mod_date_res, next_date_res,
         prio_res) = stored

        # The queue column is deliberately left out, as in the query filter.
        comparisons = (
            (url, url_res),
            (etag, etag_res),
            (mod_date, mod_date_res),
            (prio, prio_res),
            (next_date, next_date_res),
        )
        for wanted, actual in comparisons:
            self.assertEqual(wanted, actual)
    def test_iterating_over_all_uris_works(self):
        """
        Stored URIs can be fetched individually and iterated as a whole.

        `get_uri` must return the stored tuple for a known URL and raise
        `UriNotFound` for an unknown one; `all_uris` must yield one known URL
        per stored URI.
        """
        uris = [("http://localhost", 1, "etag", int(time.time()*1000),
                int(time.time() * 1000), 1),
            ("http://foreignhost", 1, "ETAG", int(time.time()*1000),
             int(time.time() * 1000), 2),
        ]
        urls = ["http://localhost", "http://foreignhost"]

        q = SQLiteMultipleHostUriQueue(":memory:")
        q.add_uris(uris)

        uri = q.get_uri("http://foreignhost")
        self.assertEqual(uris[1], uri)

        self.assertRaises(UriNotFound, q.get_uri, "http://gibtsnuesch")

        # Materialise the iterator and check its size first: asserting only
        # inside the loop would pass vacuously if nothing were yielded.
        found = list(q.all_uris())
        self.assertEqual(len(urls), len(found))
        for url in found:
            self.assertTrue(url in urls)
    def test_removing_lists_works(self):
        """
        Removing the very list of URIs that was added must leave the
        `queues` table without any row for queue 1.
        """
        to_store = []
        to_store.append(("http://localhost", 1, "etag",
                         int(time.time() * 1000), int(time.time() * 1000),
                         1))
        to_store.append(("http://fogeignhost", 1, "ETAG",
                         int(time.time() * 1000), int(time.time() * 1000),
                         2))

        store = SQLiteMultipleHostUriQueue(":memory:")
        store.add_uris(to_store)
        store.remove_uris(to_store)

        rows = store._connection.execute("SELECT * FROM queues WHERE queue=1")
        first_row = rows.fetchone()
        # fetchone() yields None once the result set has no (more) rows.
        self.assertTrue(first_row is None)
    def test_iterating_over_all_uris_works(self):
        """
        Check single lookups via `get_uri` and the full iteration via
        `all_uris`.

        A known URL must return the tuple that was stored, an unknown URL
        must raise `UriNotFound`, and the iteration must yield exactly as
        many entries as were added, each being one of the known URLs.
        """
        uris = [
            ("http://localhost", 1, "etag", int(time.time() * 1000),
             int(time.time() * 1000), 1),
            ("http://foreignhost", 1, "ETAG", int(time.time() * 1000),
             int(time.time() * 1000), 2),
        ]
        urls = ["http://localhost", "http://foreignhost"]

        q = SQLiteMultipleHostUriQueue(":memory:")
        q.add_uris(uris)

        uri = q.get_uri("http://foreignhost")
        self.assertEqual(uris[1], uri)

        self.assertRaises(UriNotFound, q.get_uri, "http://gibtsnuesch")

        # Collect first so that an empty iteration is reported as a failure
        # rather than skipping the membership checks altogether.
        yielded = list(q.all_uris())
        self.assertEqual(len(urls), len(yielded))
        for url in yielded:
            self.assertTrue(url in urls)
    def test_queue_head_works(self):
        """
        `queue_head` must return the expected URI for a given queue and
        offset, also after a URI has been ignored and a second queue has
        been filled.

        Every head lookup is materialised and required to contain exactly
        one row, so an empty result fails the test instead of skipping the
        field comparisons.
        """
        uris = [("http://localhost", 1, "etag", int(time.time()*1000),
                int(time.time() * 1000), 1),
            ("http://fogeignhost", 1, "ETAG", int(time.time()*1000),
             int(time.time() * 1001), 2),
        ]

        q = SQLiteMultipleHostUriQueue(":memory:")

        def assert_head(expected, queue, offset):
            # Compare the single row at `offset` of `queue` with `expected`.
            heads = [tuple(row)
                     for row in q.queue_head(queue, n=1, offset=offset)]
            self.assertEqual(1, len(heads))
            (url, queue_id, etag, mod_date, next_date, prio) = expected
            (url_res, queue_res, etag_res, mod_date_res, next_date_res,
             prio_res) = heads[0]
            self.assertEqual(url, url_res)
            self.assertEqual(queue_id, queue_res)
            self.assertEqual(etag, etag_res)
            self.assertEqual(mod_date, mod_date_res)
            self.assertEqual(prio, prio_res)
            self.assertEqual(next_date, next_date_res)

        q.add_uris(uris)

        self.assertEqual(2, q.qsize())
        self.assertEqual(2, q.qsize(queue=1))

        assert_head(uris[0], 1, 0)
        assert_head(uris[1], 1, 1)

        uris.append(("http://localhost/1", 1, "eTag", int(time.time()*1000),
                    int(time.time()*1002), 1))
        q.add_uri(uris[2])

        self.assertEqual(3, q.qsize())
        self.assertEqual(3, q.qsize(queue=1))

        q.ignore_uri("http://localhost", 404)

        # With "http://localhost" ignored, the newly added URI is expected
        # at offset 1 of queue 1.
        assert_head(uris[2], 1, 1)

        uris.append(("http://localhost2/1", 2, "eTag", int(time.time()*1000),
                    int(time.time()*1002), 1))
        q.add_uri(uris[3])

        self.assertEqual(4, q.qsize())
        self.assertEqual(2, q.qsize(queue=1))
        self.assertEqual(1, q.qsize(queue=2))

        assert_head(uris[3], 2, 0)
    def test_queue_head_works(self):
        """
        Exercise `queue_head` together with `qsize` and `ignore_uri`.

        Two URIs are stored in queue 1 and read back by offset; a third one
        is added, the first one is ignored, and finally a URI is stored in
        queue 2. Each head lookup must yield exactly one row matching the
        expected URI field by field; an empty head is a failure, not a
        silently skipped loop body.
        """
        uris = [
            ("http://localhost", 1, "etag", int(time.time() * 1000),
             int(time.time() * 1000), 1),
            ("http://fogeignhost", 1, "ETAG", int(time.time() * 1000),
             int(time.time() * 1001), 2),
        ]

        q = SQLiteMultipleHostUriQueue(":memory:")

        def check_head(queue, offset, expected):
            # Fetch the one-element head and compare it with `expected`.
            rows = [tuple(row)
                    for row in q.queue_head(queue, n=1, offset=offset)]
            self.assertEqual(1, len(rows))
            (url, queue_id, etag, mod_date, next_date, prio) = expected
            (url_res, queue_res, etag_res, mod_date_res, next_date_res,
             prio_res) = rows[0]
            self.assertEqual(url, url_res)
            self.assertEqual(queue_id, queue_res)
            self.assertEqual(etag, etag_res)
            self.assertEqual(mod_date, mod_date_res)
            self.assertEqual(prio, prio_res)
            self.assertEqual(next_date, next_date_res)

        q.add_uris(uris)

        self.assertEqual(2, q.qsize())
        self.assertEqual(2, q.qsize(queue=1))

        check_head(1, 0, uris[0])
        check_head(1, 1, uris[1])

        uris.append(("http://localhost/1", 1, "eTag", int(time.time() * 1000),
                     int(time.time() * 1002), 1))
        q.add_uri(uris[2])

        self.assertEqual(3, q.qsize())
        self.assertEqual(3, q.qsize(queue=1))

        q.ignore_uri("http://localhost", 404)

        # After ignoring "http://localhost" the third URI is expected at
        # offset 1 of queue 1.
        check_head(1, 1, uris[2])

        uris.append(("http://localhost2/1", 2, "eTag", int(time.time() * 1000),
                     int(time.time() * 1002), 1))
        q.add_uri(uris[3])

        self.assertEqual(4, q.qsize())
        self.assertEqual(2, q.qsize(queue=1))
        self.assertEqual(1, q.qsize(queue=2))

        check_head(2, 0, uris[3])