示例#1
0
    def test_that_adding_uris_works(self):
        """Check that URIs of two hosts end up in two separate queues.

        Two URIs with different host names are added to a
        `MultipleHostFrontier` whose state file is ``":memory:"``. The test
        then reads the ``queue_identifiers`` and ``queues`` tables through the
        frontier's cursor and asserts that every stored URL sits in the queue
        registered for its host, and that two queues exist.
        """
        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout))

        # Rebuilding the datetime from the first six tuple fields drops the
        # microseconds.
        now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
        curi = CrawlUri("http://localhost")
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
        curi.current_priority = 2

        frontier.add_uri(curi)

        cur = frontier._front_end_queues._cursor

        curi = CrawlUri("http://foreignhost")
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
        curi.current_priority = 1

        frontier.add_uri(curi)

        # The expected identifiers are kept apart from the url -> queue map so
        # that the membership check is not weakened by entries added in the
        # loop itself.
        expected_hosts = ("localhost", "foreignhost")
        queue_by_url = {}
        cur.execute("SELECT * FROM queue_identifiers")
        for row in cur:
            self.assertIn(row['identifier'], expected_hosts)
            queue_by_url["http://%s" % row['identifier']] = row['queue']

        cur.execute("SELECT * FROM queues")
        for row in cur:
            self.assertEqual(queue_by_url[row['url']], row['queue'])

        self.assertEqual(2, frontier._front_end_queues.get_queue_count())
示例#2
0
    def test_crawluri_from_uri_with_credentials(self):
        """Check that credentials embedded in a URL reach the `CrawlUri`.

        A URI tuple ``(url, etag, modification timestamp, 1, next crawl
        timestamp)`` is converted with ``_crawluri_from_uri``. The resulting
        object must keep the URL, expose the etag and the modification date
        as ``Etag`` / ``Last-Modified`` request headers, and carry the user
        name and password in ``optional_vars``.
        """
        now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
        now_timestamp = time.mktime(now.timetuple())
        next_crawl_date = now + timedelta(days=1)
        next_crawl_date_timestamp = time.mktime(next_crawl_date.timetuple())

        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        frontier = AbstractBaseFrontier(s, StreamHandler(sys.stdout),
                SQLiteSingleHostUriQueue(s.FRONTIER_STATE_FILE),
                SimpleTimestampPrioritizer(s))

        # The URL has to contain the very credentials asserted below; a
        # masked "*****:*****" user info part cannot yield "user"/"passwd".
        url = "http://user:passwd@localhost"
        uri = (url, "123", now_timestamp, 1, next_crawl_date_timestamp)

        curi = frontier._crawluri_from_uri(uri)

        self.assertEqual(url, curi.url)
        self.assertEqual("123", curi.req_header["Etag"])
        self.assertEqual(serialize_date_time(now),
            curi.req_header["Last-Modified"])
        self.assertEqual("user", curi.optional_vars[CURI_SITE_USERNAME])
        self.assertEqual("passwd", curi.optional_vars[CURI_SITE_PASSWORD])
示例#3
0
    def test_with_multiple_active_queues(self):
        """Check that two queues become active when two are allowed.

        With ``FRONTIER_ACTIVE_QUEUES = 2`` and one URI for each of two
        hosts, no queue is active before ``_maybe_add_queues`` is called and
        two are active afterwards.
        """
        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"
        s.FRONTIER_ACTIVE_QUEUES = 2
        s.FRONTIER_QUEUE_BUDGET = 4
        s.FRONTIER_QUEUE_BUDGET_PUNISH = 5

        frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout))

        curi1 = CrawlUri("http://localhost")
        curi1.current_priority = 2
        curi1.req_time = 0.4

        frontier.add_uri(curi1)

        curi2 = CrawlUri("http://www.google.de")
        curi2.current_priority = 1
        curi2.req_time = 1.4

        frontier.add_uri(curi2)

        self.assertEqual(0, len(frontier._current_queues))
        frontier._maybe_add_queues()

        self.assertEqual(2, len(frontier._current_queues))

        # The returned URI is not inspected; the call only has to succeed.
        frontier.get_next()
示例#4
0
    def test_that_adding_uris_works(self):
        """Verify the queue bookkeeping after adding URIs of two hosts.

        After ``http://localhost`` and ``http://foreignhost`` were added to a
        `MultipleHostFrontier`, each row of ``queue_identifiers`` must name
        one of the two hosts, each row of ``queues`` must reference the queue
        recorded for its URL, and the queue count must be two.
        """
        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout))

        # Only year..second are kept, i.e. the microseconds are cut off.
        now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
        headers = {"Etag": "123", "Date": serialize_date_time(now)}

        curi = CrawlUri("http://localhost")
        curi.rep_header = dict(headers)
        curi.current_priority = 2

        frontier.add_uri(curi)

        cur = frontier._front_end_queues._cursor

        curi = CrawlUri("http://foreignhost")
        curi.rep_header = dict(headers)
        curi.current_priority = 1

        frontier.add_uri(curi)

        # Host names to expect and the url -> queue mapping live in separate
        # containers; reusing one dict for both would let keys added inside
        # the loop satisfy the membership assertion.
        known_hosts = ("localhost", "foreignhost")
        queue_of_url = {}
        cur.execute("SELECT * FROM queue_identifiers")
        for row in cur:
            self.assertIn(row['identifier'], known_hosts)
            queue_of_url["http://%s" % row['identifier']] = row['queue']

        cur.execute("SELECT * FROM queues")
        for row in cur:
            self.assertEqual(queue_of_url[row['url']], row['queue'])

        self.assertEqual(2, frontier._front_end_queues.get_queue_count())
示例#5
0
    def test_sinks(self):
        """Run every ``process_*`` callback once with a sink registered.

        The test holds no assertions; it passes when none of the calls
        raises.
        """
        fields = datetime.fromtimestamp(time.time()).timetuple()[0:6]
        timestamp = datetime(*fields)

        settings = Settings()
        settings.FRONTIER_STATE_FILE = ":memory:"

        frontier = AbstractBaseFrontier(
            settings, StreamHandler(sys.stdout),
            SQLiteSingleHostUriQueue(settings.FRONTIER_STATE_FILE),
            SimpleTimestampPrioritizer(settings))
        frontier.add_sink(AbstractCrawlUriSink())

        curi = CrawlUri("http://localhost")
        curi.rep_header = {
            "Etag": "123",
            "Date": serialize_date_time(timestamp),
        }
        curi.current_priority = 2

        callbacks = (
            frontier.process_successful_crawl,
            frontier.process_not_found,
            frontier.process_redirect,
            frontier.process_server_error,
        )
        for callback in callbacks:
            # The URI is put on the heap again before each callback.
            frontier._add_to_heap(frontier._uri_from_curi(curi), 0)
            callback(curi)
示例#6
0
    def test_crawluri_from_uri_with_credentials(self):
        """Verify that URL credentials are extracted by ``_crawluri_from_uri``.

        The input is a five element URI tuple: URL, etag, modification
        timestamp, the value ``1`` and the next crawl timestamp. The test
        asserts the URL, the ``Etag`` and ``Last-Modified`` request headers
        and the user name / password stored in ``optional_vars``.
        """
        now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
        now_timestamp = time.mktime(now.timetuple())
        next_crawl_date = now + timedelta(days=1)
        next_crawl_date_timestamp = time.mktime(next_crawl_date.timetuple())

        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        frontier = AbstractBaseFrontier(
            s, StreamHandler(sys.stdout),
            SQLiteSingleHostUriQueue(s.FRONTIER_STATE_FILE),
            SimpleTimestampPrioritizer(s))

        # A masked "*****:*****" user info part would contradict the
        # "user" / "passwd" assertions at the end of this test, so the URL
        # spells the credentials out.
        url_with_credentials = "http://user:passwd@localhost"
        uri = (url_with_credentials, "123", now_timestamp, 1,
               next_crawl_date_timestamp)

        curi = frontier._crawluri_from_uri(uri)

        self.assertEqual(url_with_credentials, curi.url)
        self.assertEqual("123", curi.req_header["Etag"])
        self.assertEqual(serialize_date_time(now),
                         curi.req_header["Last-Modified"])
        self.assertEqual("user", curi.optional_vars[CURI_SITE_USERNAME])
        self.assertEqual("passwd", curi.optional_vars[CURI_SITE_PASSWORD])
示例#7
0
    def test_sinks(self):
        """Smoke test: call each ``process_*`` method with a sink attached.

        Nothing is asserted; the test succeeds as long as no call raises.
        """
        truncated = datetime.fromtimestamp(time.time()).timetuple()[0:6]
        now = datetime(*truncated)

        conf = Settings()
        conf.FRONTIER_STATE_FILE = ":memory:"

        frontier = AbstractBaseFrontier(conf, StreamHandler(sys.stdout),
                SQLiteSingleHostUriQueue(conf.FRONTIER_STATE_FILE),
                SimpleTimestampPrioritizer(conf))
        frontier.add_sink(AbstractCrawlUriSink())

        curi = CrawlUri("http://localhost")
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
        curi.current_priority = 2

        method_names = (
            "process_successful_crawl",
            "process_not_found",
            "process_redirect",
            "process_server_error",
        )
        for method_name in method_names:
            # Push the URI onto the heap, then hand the curi to the method.
            frontier._add_to_heap(frontier._uri_from_curi(curi), 0)
            getattr(frontier, method_name)(curi)
示例#8
0
    def test_with_multiple_active_queues(self):
        """Verify that ``_maybe_add_queues`` activates two queues.

        The settings allow two active queues. One URI per host is added for
        two hosts; the number of current queues must be zero before and two
        after ``_maybe_add_queues``.
        """
        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"
        s.FRONTIER_ACTIVE_QUEUES = 2
        s.FRONTIER_QUEUE_BUDGET = 4
        s.FRONTIER_QUEUE_BUDGET_PUNISH = 5

        frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout))

        # (url, priority, request time) for the two hosts.
        for url, priority, req_time in (("http://localhost", 2, 0.4),
                                        ("http://www.google.de", 1, 1.4)):
            curi = CrawlUri(url)
            curi.current_priority = priority
            curi.req_time = req_time
            frontier.add_uri(curi)

        self.assertEqual(0, len(frontier._current_queues))
        frontier._maybe_add_queues()

        self.assertEqual(2, len(frontier._current_queues))

        # Only exercised for side effects; the result is not checked.
        frontier.get_next()
示例#9
0
    def test_create_frontier_works(self):
        """``masterprocess.create_frontier`` must not return ``None``."""
        log_handler = logging.StreamHandler(sys.stdout)

        settings = Settings()
        settings.FRONTIER_STATE_FILE = ":memory:"

        created = masterprocess.create_frontier(settings, log_handler)
        self.assertTrue(created is not None)
示例#10
0
    def test_create_frontier_works(self):
        """Build a frontier via ``masterprocess`` and check it is not None."""
        stdout_handler = logging.StreamHandler(sys.stdout)
        conf = Settings()
        conf.FRONTIER_STATE_FILE = ":memory:"

        result = masterprocess.create_frontier(conf, stdout_handler)

        is_present = result is not None
        self.assertTrue(is_present)
示例#11
0
    def test_that_updating_heap_works(self):
        """Check which URIs a `SingleHostFrontier` hands out first.

        Nineteen URIs are added. Those with an odd index are remembered in
        ``q1``; those with an even index are additionally rewritten through
        ``update_uri`` with a lowered ``next_date`` and remembered in ``q2``.
        The first nine ``get_next`` calls must return exactly the nine ``q2``
        URIs, after which ``get_next`` must raise ``Empty``.
        """
        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        frontier = SingleHostFrontier(s, StreamHandler(sys.stdout))

        q1 = []
        q2 = []

        now = datetime(
            *datetime.fromtimestamp(time.time()).timetuple()[0:6]) - timedelta(
                days=2)

        for i in range(1, 20):
            curi = CrawlUri("http://localhost/test/%s" % i)
            curi.current_priority = (i % 2 + 1)
            curi.rep_header = {
                "Etag": "123%s" % i,
                "Date": serialize_date_time(now)
            }

            frontier.add_uri(curi)

            if i % 2 == 0:
                (url, etag, mod_date, next_date,
                 prio) = frontier._uri_from_curi(curi)
                next_date = next_date - 1000 * 60 * 5
                frontier._front_end_queues.update_uri(
                    (url, etag, mod_date, next_date, prio))
                q2.append(curi.url)
            else:
                q1.append(curi.url)

        self.assertRaises(Empty, frontier._heap.get_nowait)

        for _ in range(1, 10):
            frontier._next_possible_crawl = time.time()
            candidate_uri = frontier.get_next()

            # Assert directly that the URI is one of the updated ones that
            # has not been returned yet, instead of re-asserting a condition
            # that was just tested by an ``if``.
            self.assertIn(candidate_uri.url, q2)
            q2.remove(candidate_uri.url)

        self.assertEqual(10, len(q1))
        self.assertEqual(0, len(q2))

        self.assertRaises(Empty, frontier.get_next)
示例#12
0
    def test_that_updating_heap_works(self):
        """Verify the order in which updated and untouched URIs are returned.

        URIs 1..19 are added to a `SingleHostFrontier`. Even-numbered ones
        are stored again via ``update_uri`` with ``next_date`` reduced by
        ``1000 * 60 * 5`` and tracked in ``q2``; odd-numbered ones are tracked
        in ``q1``. Nine ``get_next`` calls must drain ``q2`` completely and
        leave ``q1`` untouched; a further ``get_next`` must raise ``Empty``.
        """
        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        frontier = SingleHostFrontier(s, StreamHandler(sys.stdout))

        q1 = []
        q2 = []

        now = datetime(*datetime.fromtimestamp(
            time.time()).timetuple()[0:6]) - timedelta(days=2)

        for i in range(1, 20):
            curi = CrawlUri("http://localhost/test/%s" % i)
            curi.current_priority = (i % 2 + 1)
            curi.rep_header = {
                "Etag": "123%s" % i,
                "Date": serialize_date_time(now),
            }

            frontier.add_uri(curi)

            if i % 2 == 0:
                (url, etag, mod_date, next_date,
                 prio) = frontier._uri_from_curi(curi)
                next_date = next_date - 1000 * 60 * 5
                frontier._front_end_queues.update_uri(
                    (url, etag, mod_date, next_date, prio))
                q2.append(curi.url)
            else:
                q1.append(curi.url)

        self.assertRaises(Empty, frontier._heap.get_nowait)

        for _ in range(1, 10):
            frontier._next_possible_crawl = time.time()
            candidate_uri = frontier.get_next()

            # A real check replaces the former ``if x in q: assertTrue(x in
            # q)`` pairs, which could never fail.
            self.assertIn(candidate_uri.url, q2)
            q2.remove(candidate_uri.url)

        self.assertEqual(10, len(q1))
        self.assertEqual(0, len(q2))

        self.assertRaises(Empty, frontier.get_next)
示例#13
0
    def test_that_time_based_politeness_works(self):
        """Check that a successful crawl pushes the next possible crawl time.

        After ``process_successful_crawl`` the frontier's
        ``_next_possible_crawl`` must be larger than before and larger than
        the current time, and ``get_next`` must raise ``Empty``.
        """
        settings = Settings()
        settings.FRONTIER_STATE_FILE = ":memory:"

        frontier = SingleHostFrontier(settings, StreamHandler(sys.stdout))

        fields = datetime.fromtimestamp(time.time()).timetuple()[0:6]
        two_days_ago = datetime(*fields) - timedelta(days=2)

        curi = CrawlUri("http://localhost/test")
        curi.current_priority = 3
        curi.rep_header = {
            "Etag": "123",
            "Date": serialize_date_time(two_days_ago),
        }
        curi.req_time = 0.5

        heap_entry = frontier._uri_from_curi(curi)
        frontier._add_to_heap(heap_entry, 0)

        before = frontier._next_possible_crawl
        frontier.process_successful_crawl(curi)

        self.assertTrue(frontier._next_possible_crawl > before)
        self.assertTrue(frontier._next_possible_crawl > time.time())
        self.assertRaises(Empty, frontier.get_next)
示例#14
0
    def test_that_time_based_politeness_works(self):
        """Verify the politeness delay applied after a successful crawl.

        ``_next_possible_crawl`` is sampled before and after
        ``process_successful_crawl``; it has to grow and to lie in the future,
        and ``get_next`` has to raise ``Empty``.
        """
        conf = Settings()
        conf.FRONTIER_STATE_FILE = ":memory:"
        frontier = SingleHostFrontier(conf, StreamHandler(sys.stdout))

        current = datetime.fromtimestamp(time.time())
        reference = datetime(*current.timetuple()[0:6]) - timedelta(days=2)

        curi = CrawlUri("http://localhost/test")
        curi.current_priority = 3
        curi.rep_header = {"Etag": "123",
                           "Date": serialize_date_time(reference)}
        curi.req_time = 0.5

        frontier._add_to_heap(frontier._uri_from_curi(curi), 0)

        earlier = frontier._next_possible_crawl
        frontier.process_successful_crawl(curi)

        grew = frontier._next_possible_crawl > earlier
        self.assertTrue(grew)
        in_future = frontier._next_possible_crawl > time.time()
        self.assertTrue(in_future)
        self.assertRaises(Empty, frontier.get_next)
示例#15
0
    def test_adding_uri_works(self):
        """Check the tuple stored in the front end queue for an added URI.

        One URI with an ``Etag`` and a ``Date`` header is added. For every
        tuple returned by ``queue_head`` the URL, the etag and the
        modification date are compared with the values of the added URI, and
        the tuple is recorded in ``_current_uris``.
        """
        # Microseconds are dropped so the value can be compared with the
        # timestamp read back from the queue.
        now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])

        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        curi = CrawlUri("http://localhost")
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
        curi.current_priority = 2

        frontier = AbstractBaseFrontier(s, StreamHandler(sys.stdout),
                SQLiteSingleHostUriQueue(s.FRONTIER_STATE_FILE),
                SimpleTimestampPrioritizer(s))
        frontier.add_uri(curi)

        for uri in frontier._front_end_queues.queue_head():
            # The last two fields are not checked by this test.
            (url, etag, mod_date, _queue, _next_date) = uri
            self.assertEqual("http://localhost", url)
            self.assertEqual("123", etag)
            self.assertEqual(now, datetime.fromtimestamp(mod_date))
            frontier._current_uris[url] = uri
示例#16
0
    def test_adding_uri_works(self):
        """Verify URL, etag and modification date of a freshly added URI.

        The URI is added to an `AbstractBaseFrontier` using a
        `SQLiteSingleHostUriQueue`; each five element tuple yielded by
        ``queue_head`` is checked and then stored in ``_current_uris``.
        """
        # Truncated to whole seconds for the comparison further down.
        now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])

        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        curi = CrawlUri("http://localhost")
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
        curi.current_priority = 2

        frontier = AbstractBaseFrontier(
            s, StreamHandler(sys.stdout),
            SQLiteSingleHostUriQueue(s.FRONTIER_STATE_FILE),
            SimpleTimestampPrioritizer(s))
        frontier.add_uri(curi)

        for uri in frontier._front_end_queues.queue_head():
            # Fields four and five are unpacked but not asserted on.
            (url, etag, mod_date, _queue, _next_date) = uri
            self.assertEqual("http://localhost", url)
            self.assertEqual("123", etag)
            self.assertEqual(now, datetime.fromtimestamp(mod_date))
            frontier._current_uris[url] = uri
示例#17
0
    def test_queues_work(self):
        """Check the per-queue budget of a `MultipleHostFrontier`.

        Only one queue may be active (``FRONTIER_ACTIVE_QUEUES = 1``), each
        queue starts with a budget of 4 and the punishment is 5. The test
        asserts that

        * a queue is activated by ``_maybe_add_queues`` with budget 4,
        * a server error leaves the budget at -1,
        * after ``_cleanup_budget_politeness`` one queue with budget 4 is
          active again, and
        * a successful crawl leaves that budget at 3.
        """
        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"
        s.FRONTIER_ACTIVE_QUEUES = 1
        s.FRONTIER_QUEUE_BUDGET = 4
        s.FRONTIER_QUEUE_BUDGET_PUNISH = 5

        frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout))

        curi1 = CrawlUri("http://localhost")
        curi1.current_priority = 2
        curi1.req_time = 0.4

        frontier.add_uri(curi1)

        curi2 = CrawlUri("http://foreignhost")
        curi2.current_priority = 1
        curi2.req_time = 1.4

        frontier.add_uri(curi2)

        self.assertEqual(0, len(frontier._current_queues))
        frontier._maybe_add_queues()

        self.assertEqual(1, len(frontier._current_queues))
        # Exactly one queue is active, so take its identifier.
        q1 = next(iter(frontier._current_queues.keys()))

        self.assertEqual(4, frontier._budget_politeness[q1])
        frontier._cleanup_budget_politeness()
        self.assertEqual(4, frontier._budget_politeness[q1])

        frontier._update_heap()
        self.assertEqual(1, len(frontier._current_queues))

        # NOTE(review): queue 1 is presumably the queue of curi1 (added
        # first) -- confirm against the queue implementation.
        failed = curi1 if q1 == 1 else curi2
        # Mark the URI that is actually processed; the status used to be set
        # on curi1 even when curi2 was handed to process_server_error.
        failed.status_code = 500
        frontier.process_server_error(failed)

        self.assertEqual(-1, frontier._budget_politeness[q1])

        frontier._cleanup_budget_politeness()

        self.assertEqual(1, len(frontier._current_queues))
        q2 = next(iter(frontier._current_queues.keys()))

        self.assertEqual(4, frontier._budget_politeness[q2])
        frontier._cleanup_budget_politeness()
        self.assertEqual(4, frontier._budget_politeness[q2])

        frontier._update_heap()
        self.assertEqual(1, len(frontier._current_queues))

        crawled = curi1 if q2 == 1 else curi2
        crawled.status_code = 200
        frontier.process_successful_crawl(crawled)

        self.assertEqual(3, frontier._budget_politeness[q2])

        frontier._cleanup_budget_politeness()
示例#18
0
    def test_queues_work(self):
        """Verify budget accounting with a single active queue.

        Settings: one active queue, queue budget 4, punishment 5. Two URIs on
        different hosts are added. The test walks through two rounds: in the
        first the active queue's URI is reported as a server error and the
        budget must read -1; after the clean-up a queue with budget 4 must be
        active, and reporting a successful crawl must bring it to 3.
        """
        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"
        s.FRONTIER_ACTIVE_QUEUES = 1
        s.FRONTIER_QUEUE_BUDGET = 4
        s.FRONTIER_QUEUE_BUDGET_PUNISH = 5

        frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout))

        curi1 = CrawlUri("http://localhost")
        curi1.current_priority = 2
        curi1.req_time = 0.4

        frontier.add_uri(curi1)

        curi2 = CrawlUri("http://foreignhost")
        curi2.current_priority = 1
        curi2.req_time = 1.4

        frontier.add_uri(curi2)

        self.assertEqual(0, len(frontier._current_queues))
        frontier._maybe_add_queues()

        self.assertEqual(1, len(frontier._current_queues))
        # The single active queue's identifier.
        q1 = next(iter(frontier._current_queues.keys()))

        self.assertEqual(4, frontier._budget_politeness[q1])
        frontier._cleanup_budget_politeness()
        self.assertEqual(4, frontier._budget_politeness[q1])

        frontier._update_heap()
        self.assertEqual(1, len(frontier._current_queues))

        # NOTE(review): assumes queue id 1 belongs to curi1's host -- verify.
        if q1 == 1:
            curi1.status_code = 500
            frontier.process_server_error(curi1)
        else:
            # Set the status on the URI that is processed (was curi1, a
            # copy-paste slip).
            curi2.status_code = 500
            frontier.process_server_error(curi2)

        self.assertEqual(-1, frontier._budget_politeness[q1])

        frontier._cleanup_budget_politeness()

        self.assertEqual(1, len(frontier._current_queues))
        q2 = next(iter(frontier._current_queues.keys()))

        self.assertEqual(4, frontier._budget_politeness[q2])
        frontier._cleanup_budget_politeness()
        self.assertEqual(4, frontier._budget_politeness[q2])

        frontier._update_heap()
        self.assertEqual(1, len(frontier._current_queues))

        if q2 == 1:
            curi1.status_code = 200
            frontier.process_successful_crawl(curi1)
        else:
            curi2.status_code = 200
            frontier.process_successful_crawl(curi2)

        self.assertEqual(3, frontier._budget_politeness[q2])

        frontier._cleanup_budget_politeness()