def test_updating_lists_works(self):
    """An ``update_uris`` call must overwrite the stored row for a URI."""
    queue = SQLiteSingleHostUriQueue(":memory:")
    queue.add_uris([
        ("http://localhost", "etag", int(time.time() * 1000),
         int(time.time() * 1000), 1),
    ])

    # Re-submit the same URL with a new priority; the row must be replaced.
    updated = [
        ("http://localhost", "etag", int(time.time() * 1000),
         int(time.time() * 1000), 2),
    ]
    queue.update_uris(updated)

    row = queue._connection.execute("SELECT * FROM queue").fetchone()
    (url, etag, mod_date, next_date, prio) = updated[0]
    (url_db, etag_db, mod_date_db, next_date_db, prio_db) = row
    self.assertEqual(url, url_db)
    self.assertEqual(etag, etag_db)
    self.assertEqual(mod_date, mod_date_db)
    self.assertEqual(prio, prio_db)
    self.assertEqual(next_date, next_date_db)
def test_crawluri_from_uri_with_credentials(self):
    """Credentials embedded in a URI must surface as optional vars.

    ``_crawluri_from_uri`` is expected to copy the username and
    password from the URL into ``curi.optional_vars`` under
    ``CURI_SITE_USERNAME`` / ``CURI_SITE_PASSWORD``.

    Fix: the URL literal had been scrubbed to ``http://*****:*****@...``,
    which can never satisfy the ``"user"`` / ``"passwd"`` assertions
    below; the real credentials are restored here.
    """
    now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
    now_timestamp = time.mktime(now.timetuple())
    next_crawl_date = now + timedelta(days=1)
    next_crawl_date_timestamp = time.mktime(next_crawl_date.timetuple())

    s = Settings()
    s.FRONTIER_STATE_FILE = ":memory:"
    frontier = AbstractBaseFrontier(
        s, StreamHandler(sys.stdout),
        SQLiteSingleHostUriQueue(s.FRONTIER_STATE_FILE),
        SimpleTimestampPrioritizer(s))

    uri = ("http://user:passwd@localhost", "123", now_timestamp, 1,
           next_crawl_date_timestamp)

    curi = frontier._crawluri_from_uri(uri)

    self.assertEqual("http://user:passwd@localhost", curi.url)
    self.assertEqual("123", curi.req_header["Etag"])
    self.assertEqual(serialize_date_time(now),
                     curi.req_header["Last-Modified"])
    self.assertEqual("user", curi.optional_vars[CURI_SITE_USERNAME])
    self.assertEqual("passwd", curi.optional_vars[CURI_SITE_PASSWORD])
def test_sinks(self):
    """Every frontier processing hook must run with a registered sink."""
    now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])

    settings = Settings()
    settings.FRONTIER_STATE_FILE = ":memory:"
    frontier = AbstractBaseFrontier(
        settings, StreamHandler(sys.stdout),
        SQLiteSingleHostUriQueue(settings.FRONTIER_STATE_FILE),
        SimpleTimestampPrioritizer(settings))
    frontier.add_sink(AbstractCrawlUriSink())

    curi = CrawlUri("http://localhost")
    curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
    curi.current_priority = 2

    # Exercise each notification path in turn; the URI is re-queued onto
    # the heap before every call, exactly as the individual calls did.
    for process in (frontier.process_successful_crawl,
                    frontier.process_not_found,
                    frontier.process_redirect,
                    frontier.process_server_error):
        frontier._add_to_heap(frontier._uri_from_curi(curi), 0)
        process(curi)
def __init__(self, settings, log_handler):
    """Initialize the frontier with a SQLite-backed single-host queue.

    The prioritizer class is loaded dynamically from the settings and
    handed to the base frontier together with the front-end queue.
    """
    prioritizer_cls = import_class(settings.PRIORITIZER_CLASS)
    frontend_queue = SQLiteSingleHostUriQueue(settings.FRONTIER_STATE_FILE)
    AbstractBaseFrontier.__init__(self, settings, log_handler,
                                  frontend_queue, prioritizer_cls(settings))

    # Politeness knobs: delay scale factor, minimum delay, and the
    # earliest point in time the next crawl may happen.
    self._crawl_delay = settings.FRONTIER_CRAWL_DELAY_FACTOR
    self._min_delay = settings.FRONTIER_MIN_DELAY
    self._next_possible_crawl = time.time()
def test_queue_head_works(self):
    """``queue_head`` must honour n/offset and skip ignored URIs."""
    def check_head(rows, expected):
        # Compare every yielded row against one expected URI tuple.
        (url, etag, mod_date, next_date, prio) = expected
        for (url_db, etag_db, mod_date_db, next_date_db, prio_db) in rows:
            self.assertEqual(url, url_db)
            self.assertEqual(etag, etag_db)
            self.assertEqual(mod_date, mod_date_db)
            self.assertEqual(prio, prio_db)
            self.assertEqual(next_date, next_date_db)

    uris = [
        ("http://localhost", "etag", int(time.time() * 1000),
         int(time.time() * 1000), 1),
        ("http://fogeignhost", "ETAG", int(time.time() * 1000),
         int(time.time() * 1001), 2),
    ]
    queue = SQLiteSingleHostUriQueue(":memory:")
    queue.add_uris(uris)

    check_head(queue.queue_head(n=1, offset=0), uris[0])
    check_head(queue.queue_head(n=1, offset=1), uris[1])

    uris.append(("http://localhost/1", "eTag", int(time.time() * 1000),
                 int(time.time() * 1002), 1))
    queue.add_uri(uris[2])
    queue.ignore_uri("http://localhost", 404)

    # The ignored URI must have dropped out of the head.
    check_head(queue.queue_head(n=1, offset=1), uris[2])
def test_removing_lists_works(self):
    """After ``remove_uris`` the backing queue table must be empty."""
    uris = [
        ("http://localhost", "etag", int(time.time() * 1000),
         int(time.time() * 1000), 1),
        ("http://fogeignhost", "ETAG", int(time.time() * 1000),
         int(time.time() * 1000), 2),
    ]
    queue = SQLiteSingleHostUriQueue(":memory:")
    queue.add_uris(uris)

    queue.remove_uris(uris)

    remaining = queue._connection.execute("SELECT * FROM queue").fetchone()
    self.assertTrue(remaining is None)
def test_iterating_over_all_uris_works(self):
    """``get_uri`` and ``all_uris`` must expose every stored URI."""
    uris = [
        ("http://localhost", "etag", int(time.time() * 1000),
         int(time.time() * 1000), 1),
        ("http://foreignhost", "ETAG", int(time.time() * 1000),
         int(time.time() * 1000), 2),
    ]
    known_urls = ["http://localhost", "http://foreignhost"]
    queue = SQLiteSingleHostUriQueue(":memory:")
    queue.add_uris(uris)

    self.assertEqual(uris[1], queue.get_uri("http://foreignhost"))
    # An unknown URL must raise instead of returning anything.
    self.assertRaises(UriNotFound, queue.get_uri, "http://gibtsnuesch")

    for url in queue.all_uris():
        self.assertTrue(url in known_urls)
def test_adding_works(self):
    """A single added URI must be retrievable from the backing table."""
    uri = ("http://localhost", "etag", int(time.time() * 1000),
           int(time.time() * 1000), 1)
    queue = SQLiteSingleHostUriQueue(":memory:")
    queue.add_uri(uri)
    self.assertEqual(1, len(queue))

    row = queue._connection.execute("SELECT * FROM queue").fetchone()
    (url, etag, mod_date, next_date, prio) = uri
    (url_db, etag_db, mod_date_db, next_date_db, prio_db) = row
    self.assertEqual(url, url_db)
    self.assertEqual(etag, etag_db)
    self.assertEqual(mod_date, mod_date_db)
    self.assertEqual(prio, prio_db)
    self.assertEqual(next_date, next_date_db)
    queue.close()
def test_adding_uri_works(self):
    """Adding a ``CrawlUri`` must place it in the front-end queue.

    Fix: the original computed ``next_crawl_date = now + timedelta(days=1)``
    but never used it in this test; the dead local has been removed.
    """
    now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])

    s = Settings()
    s.FRONTIER_STATE_FILE = ":memory:"

    curi = CrawlUri("http://localhost")
    curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
    curi.current_priority = 2

    frontier = AbstractBaseFrontier(
        s, StreamHandler(sys.stdout),
        SQLiteSingleHostUriQueue(s.FRONTIER_STATE_FILE),
        SimpleTimestampPrioritizer(s))
    frontier.add_uri(curi)

    for uri in frontier._front_end_queues.queue_head():
        # NOTE(review): rows here unpack as (url, etag, mod_date, queue,
        # next_date), unlike the 5-tuples in the plain queue tests —
        # confirm against AbstractBaseFrontier._uri_from_curi.
        (url, etag, mod_date, queue, next_date) = uri
        self.assertEqual("http://localhost", url)
        self.assertEqual("123", etag)
        self.assertEqual(now, datetime.fromtimestamp(mod_date))
        frontier._current_uris[url] = uri