def test_static_dns_mapping(self):
    """
    Check that an entry from `STATIC_DNS_MAPPINGS` is returned as configured.

    `localhost:123` is statically mapped and must come back verbatim, while
    `localhost:80` has no static entry and is expected to yield
    `('127.0.0.1', 80)`. Afterwards `dns._cache` must hold exactly one entry.
    """
    s = Settings()
    s.STATIC_DNS_MAPPINGS = {"localhost:123": ("-1.-1.-1.-1", 123)}
    dns = DnsCache(s)

    self.assertEqual(("-1.-1.-1.-1", 123), dns["localhost:123"])
    self.assertEqual(('127.0.0.1', 80), dns["localhost:80"])
    # assertEqual instead of assertTrue: `assertTrue(1, x)` treats `x` as the
    # failure message and therefore can never fail.
    self.assertEqual(1, len(dns._cache))
def test_dns_cache(self):
    """
    Check lookups through a `DnsCache` limited to `SIZE_DNS_CACHE = 1`.

    Two different host:port keys are looked up; both must yield the loopback
    address, and afterwards `dns._cache` must hold exactly one entry.
    """
    s = Settings()
    s.SIZE_DNS_CACHE = 1
    dns = DnsCache(s)

    self.assertEqual(('127.0.0.1', 80), dns["localhost:80"])
    self.assertEqual(('127.0.0.1', 81), dns["localhost:81"])
    # assertEqual instead of assertTrue: `assertTrue(1, x)` treats `x` as the
    # failure message and therefore can never fail.
    self.assertEqual(1, len(dns._cache))
def test_crawluri_from_uri_with_credentials(self):
    """
    Check that `_crawluri_from_uri` exposes credentials embedded in the URL.

    A URI tuple (url, etag, modification timestamp, queue, next crawl
    timestamp) is converted into a CrawlUri. The url, the `Etag` and
    `Last-Modified` request headers and the username/password stored in
    `optional_vars` are verified.
    """
    # `now` truncated to whole seconds (year..second of the time tuple)
    now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
    now_timestamp = time.mktime(now.timetuple())
    next_crawl_date = now + timedelta(days=1)
    next_crawl_date_timestamp = time.mktime(next_crawl_date.timetuple())

    s = Settings()
    s.FRONTIER_STATE_FILE = ":memory:"

    frontier = AbstractBaseFrontier(
        s,
        StreamHandler(sys.stdout),
        SQLiteSingleHostUriQueue(s.FRONTIER_STATE_FILE),
        SimpleTimestampPrioritizer(s))

    # The URL has to carry exactly the credentials asserted below.
    url = "http://user:passwd@localhost"
    uri = (url, "123", now_timestamp, 1, next_crawl_date_timestamp)

    curi = frontier._crawluri_from_uri(uri)

    self.assertEqual(url, curi.url)
    self.assertEqual("123", curi.req_header["Etag"])
    self.assertEqual(serialize_date_time(now),
                     curi.req_header["Last-Modified"])
    self.assertEqual("user", curi.optional_vars[CURI_SITE_USERNAME])
    self.assertEqual("passwd", curi.optional_vars[CURI_SITE_PASSWORD])
def test_that_adding_uris_works(self):
    """
    Check that URIs of two hosts end up in two separate front end queues.

    After adding one URI per host, the `queue_identifiers` table must only
    contain the two known host names, every row of `queues` must reference
    the queue recorded for its url, and the queue count must be 2.
    """
    s = Settings()
    s.FRONTIER_STATE_FILE = ":memory:"

    frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout))

    # `now` truncated to whole seconds
    now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])

    curi = CrawlUri("http://localhost")
    curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
    curi.current_priority = 2
    frontier.add_uri(curi)

    cur = frontier._front_end_queues._cursor

    curi = CrawlUri("http://foreignhost")
    curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
    curi.current_priority = 1
    frontier.add_uri(curi)

    idents = {"localhost": -1, "foreignhost": -1}
    cur.execute("SELECT * FROM queue_identifiers")
    for row in cur:
        self.assertTrue(row['identifier'] in idents)
        # remember the queue of each host under its full url
        idents["http://%s" % row['identifier']] = row['queue']

    cur.execute("SELECT * FROM queues")
    for row in cur:
        self.assertEqual(idents[row['url']], row['queue'])

    self.assertEqual(2, frontier._front_end_queues.get_queue_count())
def test_with_multiple_active_queues(self):
    """
    Check that a frontier with `FRONTIER_ACTIVE_QUEUES = 2` activates both queues.

    Two URIs of different hosts are added. No queue is current before
    `_maybe_add_queues()` is called and two are current afterwards. Finally
    one URI is requested via `get_next()`; its value is not inspected, the
    call only has to succeed.
    """
    s = Settings()
    s.FRONTIER_STATE_FILE = ":memory:"
    s.FRONTIER_ACTIVE_QUEUES = 2
    s.FRONTIER_QUEUE_BUDGET = 4
    s.FRONTIER_QUEUE_BUDGET_PUNISH = 5

    frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout))

    curi1 = CrawlUri("http://localhost")
    curi1.current_priority = 2
    curi1.req_time = 0.4
    frontier.add_uri(curi1)

    curi2 = CrawlUri("http://www.google.de")
    curi2.current_priority = 1
    curi2.req_time = 1.4
    frontier.add_uri(curi2)

    self.assertEqual(0, len(frontier._current_queues))
    frontier._maybe_add_queues()
    self.assertEqual(2, len(frontier._current_queues))

    frontier.get_next()
def test_sinks(self):
    """
    Run every `process_*` result handler of a frontier that has a sink attached.

    The same CrawlUri is put on the heap before each handler is invoked. The
    test contains no assertions; it passes when none of the calls raises.
    """
    # timestamp truncated to whole seconds
    timestamp = datetime(
        *datetime.fromtimestamp(time.time()).timetuple()[0:6])

    settings = Settings()
    settings.FRONTIER_STATE_FILE = ":memory:"

    frontier = AbstractBaseFrontier(
        settings,
        StreamHandler(sys.stdout),
        SQLiteSingleHostUriQueue(settings.FRONTIER_STATE_FILE),
        SimpleTimestampPrioritizer(settings))
    frontier.add_sink(AbstractCrawlUriSink())

    curi = CrawlUri("http://localhost")
    curi.rep_header = {
        "Etag": "123",
        "Date": serialize_date_time(timestamp),
    }
    curi.current_priority = 2

    result_handlers = (
        frontier.process_successful_crawl,
        frontier.process_not_found,
        frontier.process_redirect,
        frontier.process_server_error,
    )
    for handle_result in result_handlers:
        frontier._add_to_heap(frontier._uri_from_curi(curi), 0)
        handle_result(curi)
def test_crawluri_from_uri_with_credentials(self):
    """
    Check that `_crawluri_from_uri` exposes credentials embedded in the URL.

    A URI tuple (url, etag, modification timestamp, queue, next crawl
    timestamp) is converted into a CrawlUri. The url, the `Etag` and
    `Last-Modified` request headers and the username/password stored in
    `optional_vars` are verified.
    """
    # `now` truncated to whole seconds (year..second of the time tuple)
    now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
    now_timestamp = time.mktime(now.timetuple())
    next_crawl_date = now + timedelta(days=1)
    next_crawl_date_timestamp = time.mktime(next_crawl_date.timetuple())

    s = Settings()
    s.FRONTIER_STATE_FILE = ":memory:"

    frontier = AbstractBaseFrontier(
        s, StreamHandler(sys.stdout),
        SQLiteSingleHostUriQueue(s.FRONTIER_STATE_FILE),
        SimpleTimestampPrioritizer(s))

    # The URL has to carry exactly the credentials asserted below.
    url = "http://user:passwd@localhost"
    uri = (url, "123", now_timestamp, 1, next_crawl_date_timestamp)

    curi = frontier._crawluri_from_uri(uri)

    self.assertEqual(url, curi.url)
    self.assertEqual("123", curi.req_header["Etag"])
    self.assertEqual(serialize_date_time(now),
                     curi.req_header["Last-Modified"])
    self.assertEqual("user", curi.optional_vars[CURI_SITE_USERNAME])
    self.assertEqual("passwd", curi.optional_vars[CURI_SITE_PASSWORD])
def test_that_adding_uris_works(self):
    """
    Check that URIs of two hosts end up in two separate front end queues.

    After adding one URI per host, the `queue_identifiers` table must only
    contain the two known host names, every row of `queues` must reference
    the queue recorded for its url, and the queue count must be 2.
    """
    s = Settings()
    s.FRONTIER_STATE_FILE = ":memory:"

    frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout))

    # `now` truncated to whole seconds
    now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])

    curi = CrawlUri("http://localhost")
    curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
    curi.current_priority = 2
    frontier.add_uri(curi)

    cur = frontier._front_end_queues._cursor

    curi = CrawlUri("http://foreignhost")
    curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
    curi.current_priority = 1
    frontier.add_uri(curi)

    idents = {"localhost": -1, "foreignhost": -1}
    cur.execute("SELECT * FROM queue_identifiers")
    for row in cur:
        self.assertTrue(row['identifier'] in idents)
        # remember the queue of each host under its full url
        idents["http://%s" % row['identifier']] = row['queue']

    cur.execute("SELECT * FROM queues")
    for row in cur:
        self.assertEqual(idents[row['url']], row['queue'])

    self.assertEqual(2, frontier._front_end_queues.get_queue_count())
def test_sinks(self):
    """
    Exercise all four `process_*` handlers of a frontier with a sink added.

    Before each handler call the CrawlUri is converted with
    `_uri_from_curi` and put on the heap. There are no assertions; the test
    succeeds when no call raises.
    """
    # current time without the sub-second part
    truncated_now = datetime(
        *datetime.fromtimestamp(time.time()).timetuple()[0:6])

    conf = Settings()
    conf.FRONTIER_STATE_FILE = ":memory:"

    uri_queue = SQLiteSingleHostUriQueue(conf.FRONTIER_STATE_FILE)
    frontier = AbstractBaseFrontier(
        conf, StreamHandler(sys.stdout), uri_queue,
        SimpleTimestampPrioritizer(conf))
    frontier.add_sink(AbstractCrawlUriSink())

    curi = CrawlUri("http://localhost")
    curi.rep_header = {
        "Etag": "123",
        "Date": serialize_date_time(truncated_now),
    }
    curi.current_priority = 2

    for process in (frontier.process_successful_crawl,
                    frontier.process_not_found,
                    frontier.process_redirect,
                    frontier.process_server_error):
        frontier._add_to_heap(frontier._uri_from_curi(curi), 0)
        process(curi)
def test_create_frontier_works(self):
    """
    Check that `masterprocess.create_frontier` returns something other than None.
    """
    settings = Settings()
    settings.FRONTIER_STATE_FILE = ":memory:"
    log_handler = logging.StreamHandler(sys.stdout)

    created = masterprocess.create_frontier(settings, log_handler)

    self.assertTrue(created is not None)
def test_that_updating_heap_works(self):
    """
    Check which URIs a `SingleHostFrontier` hands out after queue updates.

    19 URIs are added. Those with an even index additionally get their next
    crawl date moved back (by `1000 * 60 * 5`) through `update_uri`. After
    nine `get_next()` calls all even-indexed URIs must have been handed out,
    none of the ten odd-indexed ones, and a further `get_next()` must raise
    `Empty`.
    """
    settings = Settings()
    settings.FRONTIER_STATE_FILE = ":memory:"
    frontier = SingleHostFrontier(settings, StreamHandler(sys.stdout))

    # two days back, truncated to whole seconds
    truncated_now = datetime(
        *datetime.fromtimestamp(time.time()).timetuple()[0:6])
    two_days_ago = truncated_now - timedelta(days=2)

    odd_urls = []
    even_urls = []
    for i in range(1, 20):
        curi = CrawlUri("http://localhost/test/%s" % i)
        curi.current_priority = (i % 2 + 1)
        curi.rep_header = {
            "Etag": "123%s" % i,
            "Date": serialize_date_time(two_days_ago),
        }
        frontier.add_uri(curi)

        if i % 2 != 0:
            odd_urls.append(curi.url)
            continue

        (url, etag, mod_date, next_date, prio) = \
            frontier._uri_from_curi(curi)
        next_date = next_date - 1000 * 60 * 5
        frontier._front_end_queues.update_uri(
            (url, etag, mod_date, next_date, prio))
        even_urls.append(curi.url)

    # nothing has been moved onto the heap yet
    self.assertRaises(Empty, frontier._heap.get_nowait)

    for _ in range(1, 10):
        # lift the politeness delay so the next URI may be fetched right away
        frontier._next_possible_crawl = time.time()
        fetched_url = frontier.get_next().url

        if fetched_url in odd_urls:
            self.assertTrue(fetched_url in odd_urls)
            odd_urls.remove(fetched_url)
        elif fetched_url in even_urls:
            self.assertTrue(fetched_url in even_urls)
            even_urls.remove(fetched_url)

    self.assertEqual(10, len(odd_urls))
    self.assertEqual(0, len(even_urls))

    self.assertRaises(Empty, frontier.get_next)
def test_fetching_works(self):
    """
    Check that a worker running a `FetchProcessor` returns the fetched body.

    A CrawlUri pointing at `/robots.txt` on `self.port` is pushed to the
    worker. The callback registered on the `master_sub` socket compares the
    returned `content_body` with the local `static/robots.txt` and then
    publishes the worker quit message.
    """
    settings = Settings()
    fetcher = FetchProcessor(settings, io_loop=self._io_loop)

    worker = AsyncZmqWorker(
        self._worker_sockets['worker_pull'],
        self._worker_sockets['worker_pub'],
        self._mgmt,
        fetcher,
        StreamHandler(sys.stdout),
        logging.DEBUG,
        self._io_loop)
    worker.start()

    curi = CrawlUri(
        url="http://localhost:%s/robots.txt" % self.port,
        effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
    )
    msg = DataMessage()
    msg.identity = "me"
    msg.curi = curi

    self._worker_sockets['master_push'].send_multipart(msg.serialize())

    def assert_expected_result_and_stop(raw_msg):
        msg = DataMessage(raw_msg)
        robots_path = os.path.join(os.path.dirname(__file__),
                                   "static/robots.txt")
        # `with` closes the file even when the assertion below fails
        with open(robots_path) as robots_file:
            robots = robots_file.read()
        self.assertEqual(robots, msg.curi.content_body)

        death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                            data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
        self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

    self._worker_sockets['master_sub'].on_recv(
        assert_expected_result_and_stop)

    self._io_loop.start()
def spyder_management(settings):
    """
    Start new master/worker/logsink processes.

    The process type is taken from the lower-cased command line arguments
    (`sys.argv`): `master`, `worker` or `logsink`, checked in that order.
    `settings` is wrapped in a `Settings` object that is handed to the
    selected `main` function. When none of the three words is present, a
    usage message is written to stderr and the process exits with status 1.
    """
    from spyder import logsink
    import spyder.workerprocess as worker
    import spyder.masterprocess as master

    effective_settings = Settings(settings)

    args = [argument.lower() for argument in sys.argv]

    if "master" in args:
        args.remove("master")
        master.main(effective_settings)
        return

    if "worker" in args:
        worker.main(effective_settings)
        return

    if "logsink" in args:
        logsink.main(effective_settings)
        return

    print >> sys.stderr, """Usage: spyder-ctrl [master|worker|logsink] 'master'\t\tstart a master process. 'worker'\t\tstart a worker process. 'logsink'\t\tstart a sink for logmessages. """
    sys.exit(1)
def setUp(self):
    """
    Prepare the ZeroMQ fixture shared by the tests.

    Creates the IO loop and the zmq context, points all ZeroMQ addresses in
    the settings to `inproc://` endpoints, lets the helper methods set up the
    management and data sockets and finally starts a `ZmqMgmt` instance whose
    `ZMQ_SPYDER_MGMT_WORKER` messages are handled by `self.on_mgmt_end`.
    """
    # the io_loop and the zmq context
    self._io_loop = IOLoop.instance()
    self._ctx = zmq.Context(1)

    # in-process endpoints; the worker side reuses the master addresses
    settings = Settings()
    settings.ZEROMQ_MASTER_PUSH = 'inproc://spyder-zmq-master-push'
    settings.ZEROMQ_WORKER_PROC_FETCHER_PULL = settings.ZEROMQ_MASTER_PUSH
    settings.ZEROMQ_MASTER_SUB = 'inproc://spyder-zmq-master-sub'
    settings.ZEROMQ_WORKER_PROC_EXTRACTOR_PUB = settings.ZEROMQ_MASTER_SUB
    settings.ZEROMQ_MGMT_MASTER = 'inproc://spyder-zmq-mgmt-master'
    settings.ZEROMQ_MGMT_WORKER = 'inproc://spyder-zmq-mgmt-worker'
    self._settings = settings

    # management sockets first, then the data sockets
    self._setup_mgmt_sockets()
    self._setup_data_servers()

    # the management interface
    # NOTE(review): `self._mgmt_sockets` is presumably created by
    # `_setup_mgmt_sockets()` -- confirm.
    mgmt_sockets = self._mgmt_sockets
    self._mgmt = ZmqMgmt(mgmt_sockets['worker_sub'],
                         mgmt_sockets['worker_pub'],
                         io_loop=self._io_loop)
    self._mgmt.start()
    self._mgmt.add_callback(ZMQ_SPYDER_MGMT_WORKER, self.on_mgmt_end)
def test_that_updating_heap_works(self):
    """
    Check which URIs a `SingleHostFrontier` hands out after queue updates.

    19 URIs are added; the even-indexed ones get an earlier next crawl date
    (reduced by `1000 * 60 * 5`) via `update_uri`. Nine `get_next()` calls
    must hand out exactly the even-indexed URIs, leaving the ten odd-indexed
    ones untouched, and one more `get_next()` must raise `Empty`.
    """
    conf = Settings()
    conf.FRONTIER_STATE_FILE = ":memory:"
    frontier = SingleHostFrontier(conf, StreamHandler(sys.stdout))

    # two days in the past, without the sub-second part
    reference_date = datetime(
        *datetime.fromtimestamp(time.time()).timetuple()[0:6]
    ) - timedelta(days=2)

    untouched = []   # urls of odd-indexed uris
    advanced = []    # urls of even-indexed uris (next crawl date moved back)
    for index in range(1, 20):
        curi = CrawlUri("http://localhost/test/%s" % index)
        curi.current_priority = (index % 2 + 1)
        curi.rep_header = {"Etag": "123%s" % index,
                           "Date": serialize_date_time(reference_date)}
        frontier.add_uri(curi)

        if index % 2 == 0:
            (url, etag, mod_date, next_date, prio) = \
                frontier._uri_from_curi(curi)
            next_date = next_date - 1000 * 60 * 5
            updated = (url, etag, mod_date, next_date, prio)
            frontier._front_end_queues.update_uri(updated)
            advanced.append(curi.url)
        else:
            untouched.append(curi.url)

    # the heap itself is still empty at this point
    self.assertRaises(Empty, frontier._heap.get_nowait)

    for _ in range(1, 10):
        # reset the politeness delay before every fetch
        frontier._next_possible_crawl = time.time()
        candidate_url = frontier.get_next().url

        if candidate_url in untouched:
            self.assertTrue(candidate_url in untouched)
            untouched.remove(candidate_url)
        elif candidate_url in advanced:
            self.assertTrue(candidate_url in advanced)
            advanced.remove(candidate_url)

    self.assertEqual(10, len(untouched))
    self.assertEqual(0, len(advanced))

    self.assertRaises(Empty, frontier.get_next)
def test_loading_default_settings_works(self):
    """
    Check that a `Settings` object created without arguments exposes the
    `ZEROMQ_MGMT_MASTER` value of `spyder.defaultsettings`.
    """
    from spyder import defaultsettings
    from spyder.core.settings import Settings

    loaded = Settings()

    expected = defaultsettings.ZEROMQ_MGMT_MASTER
    self.assertEqual(expected, loaded.ZEROMQ_MGMT_MASTER)
def test_that_content_type_restriction_works(self):
    """
    Check `_restrict_content_type` of the default HTML link extractor.

    It must be truthy for a `text/html` response and falsy for the made-up
    content type `pille/palle`.
    """
    extractor = DefaultHtmlLinkExtractor(Settings())

    curi = CrawlUri()
    curi.rep_header = {"Content-Type": "text/html"}
    self.assertTrue(extractor._restrict_content_type(curi))

    curi.rep_header["Content-Type"] = "pille/palle"
    self.assertFalse(extractor._restrict_content_type(curi))
def test_loading_custom_settings_works(self):
    """
    Check that a `Settings` object built from the `test_settings_settings`
    module exposes that module's `ZEROMQ_MGMT_WORKER` value.
    """
    from spyder import defaultsettings
    from spyder.core.settings import Settings
    import test_settings_settings

    custom = Settings(test_settings_settings)

    expected = test_settings_settings.ZEROMQ_MGMT_WORKER
    self.assertEqual(expected, custom.ZEROMQ_MGMT_WORKER)
def test_that_creating_mgmt_works(self):
    """
    Check that the management interface built by
    `workerprocess.create_worker_management` answers a quit message.

    The test binds a PUB and a SUB socket on in-process endpoints, publishes
    `ZMQ_SPYDER_MGMT_WORKER_QUIT` and asserts that the message received on
    the SUB side carries `ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK`. All streams and
    sockets are closed at the end and the context is terminated.
    """
    ctx = zmq.Context()
    io_loop = IOLoop.instance()

    # registered as management callback below; ends `io_loop.start()`
    def stop_looping(_msg):
        io_loop.stop()

    # route everything through in-process endpoints
    settings = Settings()
    settings.ZEROMQ_MASTER_PUSH = 'inproc://spyder-zmq-master-push'
    settings.ZEROMQ_WORKER_PROC_FETCHER_PULL = \
        settings.ZEROMQ_MASTER_PUSH
    settings.ZEROMQ_MASTER_SUB = 'inproc://spyder-zmq-master-sub'
    settings.ZEROMQ_WORKER_PROC_EXTRACTOR_PUB = \
        settings.ZEROMQ_MASTER_SUB

    settings.ZEROMQ_MGMT_MASTER = 'inproc://spyder-zmq-mgmt-master'
    settings.ZEROMQ_MGMT_WORKER = 'inproc://spyder-zmq-mgmt-worker'

    # publishing side used by this test to send the quit message
    pubsocket = ctx.socket(zmq.PUB)
    pubsocket.bind(settings.ZEROMQ_MGMT_MASTER)
    pub_stream = ZMQStream(pubsocket, io_loop)

    # subscribing side used by this test; the empty subscription matches
    # every topic
    subsocket = ctx.socket(zmq.SUB)
    subsocket.setsockopt(zmq.SUBSCRIBE, "")
    subsocket.bind(settings.ZEROMQ_MGMT_WORKER)
    sub_stream = ZMQStream(subsocket, io_loop)

    mgmt = workerprocess.create_worker_management(settings, ctx, io_loop)
    mgmt.add_callback(ZMQ_SPYDER_MGMT_WORKER, stop_looping)
    mgmt.start()

    def assert_quit_message(msg):
        self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, msg.data)

    sub_stream.on_recv(assert_quit_message)

    death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
            data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
    pub_stream.send_multipart(death.serialize())

    # runs until `stop_looping` is called
    io_loop.start()

    # tear down: everything is closed before the context is terminated
    mgmt._out_stream.close()
    mgmt._in_stream.close()
    mgmt._publisher.close()
    mgmt._subscriber.close()
    pub_stream.close()
    pubsocket.close()
    sub_stream.close()
    subsocket.close()
    ctx.term()
def test_regex_scoper(self):
    """
    Check that `RegexScoper` filters the extracted URLs of a CrawlUri.

    With a positive pattern for `.html` and a negative pattern for `.avi`,
    the `.html` URL must remain in `CURI_EXTRACTED_URLS` and the `.avi` URL
    must be gone after the scoper ran.
    """
    curi = CrawlUri()
    curi.optional_vars = dict()
    curi.optional_vars[CURI_EXTRACTED_URLS] = "\n".join([
        "http://www.google.de/index.html",
        "ftp://www.google.de/pillepalle.avi",
    ])

    settings = Settings()
    # raw strings: `\.` is a regex escape, not a string escape
    settings.REGEX_SCOPE_POSITIVE = [r'^.*\.html']
    settings.REGEX_SCOPE_NEGATIVE = [r'^.*\.avi']
    scoper = RegexScoper(settings)

    curi = scoper(curi)

    # parenthesised form prints the same text and is also valid Python 3
    print(curi.optional_vars[CURI_EXTRACTED_URLS])
    self.assertTrue("http://www.google.de/index.html" in
                    curi.optional_vars[CURI_EXTRACTED_URLS])
    self.assertFalse("ftp://www.google.de/pillepalle.avi" in
                     curi.optional_vars[CURI_EXTRACTED_URLS])
def test_that_time_based_politeness_works(self):
    """
    Check that a successful crawl pushes `_next_possible_crawl` forward.

    After `process_successful_crawl` the value must be larger than before
    and larger than the current time, and `get_next()` must raise `Empty`.
    """
    settings = Settings()
    settings.FRONTIER_STATE_FILE = ":memory:"
    frontier = SingleHostFrontier(settings, StreamHandler(sys.stdout))

    # two days back, truncated to whole seconds
    truncated_now = datetime(
        *datetime.fromtimestamp(time.time()).timetuple()[0:6])
    two_days_ago = truncated_now - timedelta(days=2)

    curi = CrawlUri("http://localhost/test")
    curi.current_priority = 3
    curi.rep_header = {
        "Etag": "123",
        "Date": serialize_date_time(two_days_ago),
    }
    curi.req_time = 0.5

    frontier._add_to_heap(frontier._uri_from_curi(curi), 0)

    before_crawl = frontier._next_possible_crawl
    frontier.process_successful_crawl(curi)

    self.assertTrue(frontier._next_possible_crawl > before_crawl)
    self.assertTrue(frontier._next_possible_crawl > time.time())
    self.assertRaises(Empty, frontier.get_next)
def test_that_time_based_politeness_works(self):
    """
    Check the time based politeness of a `SingleHostFrontier`.

    Processing a successful crawl must raise `_next_possible_crawl` above
    its previous value and above the current time; `get_next()` must then
    raise `Empty`.
    """
    conf = Settings()
    conf.FRONTIER_STATE_FILE = ":memory:"
    frontier = SingleHostFrontier(conf, StreamHandler(sys.stdout))

    # two days in the past, without the sub-second part
    response_date = datetime(
        *datetime.fromtimestamp(time.time()).timetuple()[0:6]
    ) - timedelta(days=2)

    curi = CrawlUri("http://localhost/test")
    curi.current_priority = 3
    curi.rep_header = {"Etag": "123",
                       "Date": serialize_date_time(response_date)}
    curi.req_time = 0.5

    heap_entry = frontier._uri_from_curi(curi)
    frontier._add_to_heap(heap_entry, 0)

    previous = frontier._next_possible_crawl
    frontier.process_successful_crawl(curi)

    self.assertTrue(frontier._next_possible_crawl > previous)
    self.assertTrue(frontier._next_possible_crawl > time.time())
    self.assertRaises(Empty, frontier.get_next)
def test_adding_uri_works(self):
    """
    Check that a URI added to the frontier shows up in the front end queue.

    Every entry returned by `queue_head()` must be a 5-tuple carrying the
    url, the etag and the modification date of the CrawlUri that was added.
    Each entry is also recorded in `frontier._current_uris`.
    """
    # `now` truncated to whole seconds so it survives the timestamp round trip
    now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])

    s = Settings()
    s.FRONTIER_STATE_FILE = ":memory:"

    curi = CrawlUri("http://localhost")
    curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
    curi.current_priority = 2

    frontier = AbstractBaseFrontier(
        s,
        StreamHandler(sys.stdout),
        SQLiteSingleHostUriQueue(s.FRONTIER_STATE_FILE),
        SimpleTimestampPrioritizer(s))
    frontier.add_uri(curi)

    for uri in frontier._front_end_queues.queue_head():
        # the last two fields (queue, next crawl date) are not checked
        (url, etag, mod_date, _queue, _next_date) = uri
        self.assertEqual("http://localhost", url)
        self.assertEqual("123", etag)
        self.assertEqual(now, datetime.fromtimestamp(mod_date))
        frontier._current_uris[url] = uri
def test_that_cleaning_qs_works(self):
    """
    Check `CleanupQueryString._cleanup_query_string` on three URLs: one
    with a fragment, one with an empty query plus fragment and one with a
    trailing `&`.
    """
    cleaner = CleanupQueryString(Settings())

    # (expected result, input url)
    cases = (
        ("http://tesT.com/t.html?p=a", "http://tesT.com/t.html?p=a#top"),
        ("http://test.com/t.html", "http://test.com/t.html?#top"),
        ("http://test.com/t.html?test=a", "http://test.com/t.html?test=a&"),
    )
    for expected, raw_url in cases:
        self.assertEqual(expected, cleaner._cleanup_query_string(raw_url))
def test_adding_uri_works(self):
    """
    Check that a URI added to the frontier shows up in the front end queue.

    Every entry returned by `queue_head()` must be a 5-tuple carrying the
    url, the etag and the modification date of the CrawlUri that was added.
    Each entry is also recorded in `frontier._current_uris`.
    """
    # `now` truncated to whole seconds so it survives the timestamp round trip
    now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])

    s = Settings()
    s.FRONTIER_STATE_FILE = ":memory:"

    curi = CrawlUri("http://localhost")
    curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
    curi.current_priority = 2

    frontier = AbstractBaseFrontier(
        s, StreamHandler(sys.stdout),
        SQLiteSingleHostUriQueue(s.FRONTIER_STATE_FILE),
        SimpleTimestampPrioritizer(s))
    frontier.add_uri(curi)

    for uri in frontier._front_end_queues.queue_head():
        # the last two fields (queue, next crawl date) are not checked
        (url, etag, mod_date, _queue, _next_date) = uri
        self.assertEqual("http://localhost", url)
        self.assertEqual("123", etag)
        self.assertEqual(now, datetime.fromtimestamp(mod_date))
        frontier._current_uris[url] = uri
def test_that_with_uri_works(self):
    """
    Check `StripSessionIds` when it is called with a whole CrawlUri.

    Four URLs with different session id parameters are stored newline
    separated in `CURI_EXTRACTED_URLS`; after processing, every URL must
    equal `http://preis.de/traeger/index.php?`.
    """
    strip_session_ids = StripSessionIds(Settings())

    extracted = "\n".join([
        "http://preis.de/traeger/index.php?sid=8429fb3ae210a2a0e28800b7f48d90f2",
        "http://preis.de/traeger/index.php?jsessionid=8429fb3ae210a2a0e28800b7f48d90f2",
        "http://preis.de/traeger/index.php?phpsessid=8429fb3ae210a2a0e28800b7f48d90f2",
        "http://preis.de/traeger/index.php?aspsessionid=8429fb3ae210a2a0e28800b7f48d90f2",
    ])

    curi = CrawlUri()
    curi.optional_vars = {CURI_EXTRACTED_URLS: extracted}

    processed = strip_session_ids(curi)

    for cleaned in processed.optional_vars[CURI_EXTRACTED_URLS].split('\n'):
        self.assertEqual("http://preis.de/traeger/index.php?", cleaned)
def test_missing_encoding_works(self):
    """
    Check link extraction when the `Content-Type` header has no charset.

    The first three extracted links must be the absolute link, the
    host-relative link and the path-relative link, each resolved against
    the url of the CrawlUri.
    """
    html = ("<a href='http://www.google.de' title='ups'> viel text</a>"
            "<a title='ups i did it again' href ='/relative.html'>und "
            "noch mehr!</a><a href='evenmorerelative.html'>")

    curi = CrawlUri()
    curi.rep_header = {"Content-Type": "text/html"}
    curi.url = "http://www.bmg.bund.de/test/"
    curi.content_body = html
    curi.optional_vars = dict()

    extractor = DefaultHtmlLinkExtractor(Settings())
    curi = extractor(curi)

    extracted = curi.optional_vars[CURI_EXTRACTED_URLS].split("\n")

    expected_links = (
        "http://www.google.de",
        "http://www.bmg.bund.de/relative.html",
        "http://www.bmg.bund.de/test/evenmorerelative.html",
    )
    for position, expected in enumerate(expected_links):
        self.assertEqual(expected, extracted[position])
def test_link_extraction_works(self):
    """
    Check link extraction from a `text/html; charset=utf-8` response.

    The document also contains a `mailto:` link; only the first three
    extracted links are verified: the absolute link, the host-relative link
    and the path-relative link, each resolved against the CrawlUri's url.
    """
    html = ("<a href='http://www.google.de' title='ups'> viel text</a>"
            "<a title='ups i did it again' href ='/relative.html'>und "
            "noch mehr!</a><a href='evenmorerelative.html'/>"
            "<a href='mailto:muster@bfarm.de'/>")

    curi = CrawlUri()
    curi.rep_header = {"Content-Type": "text/html; charset=utf-8"}
    curi.url = "http://www.bmg.bund.de/test/"
    curi.content_body = html
    curi.optional_vars = dict()

    extractor = DefaultHtmlLinkExtractor(Settings())
    curi = extractor(curi)

    extracted = curi.optional_vars[CURI_EXTRACTED_URLS].split("\n")

    expected_links = (
        "http://www.google.de",
        "http://www.bmg.bund.de/relative.html",
        "http://www.bmg.bund.de/test/evenmorerelative.html",
    )
    for position, expected in enumerate(expected_links):
        self.assertEqual(expected, extracted[position])
def test_that_creating_fetcher_works(self):
    """
    Check the object returned by `workerprocess.create_worker_fetcher`.

    It must be an `AsyncZmqWorker` whose `_processing` attribute is a
    `FetchProcessor`. A PUSH socket is bound on `ZEROMQ_MASTER_PUSH` before
    the fetcher is created; all sockets are closed and the context is
    terminated afterwards, even when an assertion fails.
    """
    ctx = zmq.Context()
    io_loop = IOLoop.instance()

    settings = Settings()

    master_push = ctx.socket(zmq.PUSH)
    master_push.bind(settings.ZEROMQ_MASTER_PUSH)

    fetcher = workerprocess.create_worker_fetcher(
        settings, {}, ctx, StreamHandler(sys.stdout), io_loop)

    try:
        self.assertTrue(isinstance(fetcher._processing, FetchProcessor))
        self.assertTrue(isinstance(fetcher, AsyncZmqWorker))
    finally:
        # always release the sockets so a failing assertion does not leave
        # them open for the tests that follow
        fetcher._insocket.close()
        fetcher._outsocket.close()
        master_push.close()
        ctx.term()
def test_that_stripping_session_stuff_works(self):
    """
    Check `StripSessionIds._remove_session_ids` for the parameters `sid`,
    `jsessionid`, `phpsessid` and `aspsessionid`.

    The parameter must be removed while the rest of the url, including the
    mixed-case host of the first example, stays as it is.
    """
    stripper = StripSessionIds(Settings())

    # (input url, expected result)
    cases = (
        ("http://pREis.de/traeger/index.php?sid=8429fb3ae210a2a0e28800b7f48d90f2",
         "http://pREis.de/traeger/index.php?"),
        ("http://preis.de/traeger/index.php?jsessionid=8429fb3ae210a2a0e28800b7f48d90f2",
         "http://preis.de/traeger/index.php?"),
        ("http://preis.de/traeger/index.php?phpsessid=8429fb3ae210a2a0e28800b7f48d90f2",
         "http://preis.de/traeger/index.php?"),
        ("http://preis.de/traeger/index.php?aspsessionid=8429fb3ae210a2a0e28800b7f48d90f2",
         "http://preis.de/traeger/index.php?"),
    )
    for url, expected in cases:
        self.assertEqual(expected, stripper._remove_session_ids(url))
def test_that_creating_processing_function_works(self):
    """
    Check `workerprocess.create_processing_function`.

    Appending `test_workerprocess` or `test_workerprocess_unspec` to the
    processor list must raise `ValueError`. The extractor pipeline followed
    by the scoper pipeline must yield a callable that sets
    `CURI_EXTRACTION_FINISHED` to `CURI_OPTIONAL_TRUE`.
    """
    settings = Settings()

    # Work on a copy: extending the list taken from `settings` in place would
    # also change the pipeline stored on the settings object.
    processors = list(settings.SPYDER_EXTRACTOR_PIPELINE)
    processors.extend(settings.SPYDER_SCOPER_PIPELINE)

    processors.append('test_workerprocess')
    self.assertRaises(ValueError,
                      workerprocess.create_processing_function,
                      settings, processors)

    processors.pop()
    processors.append('test_workerprocess_unspec')
    self.assertRaises(ValueError,
                      workerprocess.create_processing_function,
                      settings, processors)

    processors.pop()
    processing = workerprocess.create_processing_function(
        settings, processors)

    curi = CrawlUri(optional_vars=dict())
    curi.effective_url = "http://127.0.0.1/robots.txt"
    curi2 = processing(curi)

    self.assertEqual(CURI_OPTIONAL_TRUE,
                     curi2.optional_vars[CURI_EXTRACTION_FINISHED])
def test_fetching_last_modified_works(self):
    """
    Check a fetch whose request carries a `Last-Modified` header.

    The header value is the modification time of the local `robots.txt`.
    The callback registered on the `master_sub` socket expects status 304
    with an empty body and then publishes the worker quit message.
    """
    fetcher = FetchProcessor(Settings(), io_loop=self._io_loop)

    sockets = self._worker_sockets
    worker = AsyncZmqWorker(
        sockets['worker_pull'],
        sockets['worker_pub'],
        self._mgmt,
        fetcher,
        StreamHandler(sys.stdout),
        logging.DEBUG,
        self._io_loop)
    worker.start()

    robots_path = os.path.join(self._path, "robots.txt")
    modified_at = datetime.fromtimestamp(os.stat(robots_path).st_mtime)
    last_modified = serialize_date_time(modified_at)

    request = DataMessage()
    request.identity = "me"
    request.curi = CrawlUri(
        url="http://localhost:%s/robots.txt" % self.port,
        effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
        req_header={"Last-Modified": last_modified})

    def assert_expected_result_and_stop(raw_msg):
        reply = DataMessage(raw_msg)
        self.assertEqual(304, reply.curi.status_code)
        self.assertEqual("", reply.curi.content_body)

        quit_msg = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                               data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
        self._mgmt_sockets['master_pub'].send_multipart(
            quit_msg.serialize())

    # register the callback first, then send the request
    sockets['master_sub'].on_recv(assert_expected_result_and_stop)
    sockets['master_push'].send_multipart(request.serialize())

    self._io_loop.start()
def test_fetching_etag_works(self):
    """
    Check a fetch whose request carries an `Etag` header.

    The callback registered on the `master_sub` socket expects status 304
    with an empty body and then publishes the worker quit message.
    """
    fetcher = FetchProcessor(Settings(), io_loop=self._io_loop)

    sockets = self._worker_sockets
    worker = AsyncZmqWorker(
        sockets['worker_pull'],
        sockets['worker_pub'],
        self._mgmt,
        fetcher,
        StreamHandler(sys.stdout),
        logging.DEBUG,
        self._io_loop)
    worker.start()

    etag_header = {"Etag": "\"3926227169c58185234888b60000c6eb1169577d\""}

    request = DataMessage()
    request.identity = "me"
    request.curi = CrawlUri(
        url="http://localhost:%s/robots.txt" % self.port,
        effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
        req_header=etag_header)

    def assert_expected_result_and_stop(raw_msg):
        reply = DataMessage(raw_msg)
        self.assertEqual(304, reply.curi.status_code)
        self.assertEqual("", reply.curi.content_body)

        quit_msg = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                               data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
        self._mgmt_sockets['master_pub'].send_multipart(
            quit_msg.serialize())

    # register the callback first, then send the request
    sockets['master_sub'].on_recv(assert_expected_result_and_stop)
    sockets['master_push'].send_multipart(request.serialize())

    self._io_loop.start()
def test_queues_work(self):
    """
    Check the budget handling of a `MultipleHostFrontier` with one active queue.

    Settings: `FRONTIER_QUEUE_BUDGET = 4`, `FRONTIER_QUEUE_BUDGET_PUNISH = 5`.
    The active queue starts with a budget of 4. A server error must take it
    to -1; after `_cleanup_budget_politeness()` the active queue must again
    have a budget of 4. A successful crawl must then take that to 3.
    """
    s = Settings()
    s.FRONTIER_STATE_FILE = ":memory:"
    s.FRONTIER_ACTIVE_QUEUES = 1
    s.FRONTIER_QUEUE_BUDGET = 4
    s.FRONTIER_QUEUE_BUDGET_PUNISH = 5

    frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout))

    curi1 = CrawlUri("http://localhost")
    curi1.current_priority = 2
    curi1.req_time = 0.4
    frontier.add_uri(curi1)

    curi2 = CrawlUri("http://foreignhost")
    curi2.current_priority = 1
    curi2.req_time = 1.4
    frontier.add_uri(curi2)

    self.assertEqual(0, len(frontier._current_queues))
    frontier._maybe_add_queues()

    self.assertEqual(1, len(frontier._current_queues))
    # leaves `q1` bound to the key of the single active queue
    for q1 in frontier._current_queues.keys():
        pass

    self.assertEqual(4, frontier._budget_politeness[q1])
    frontier._cleanup_budget_politeness()
    self.assertEqual(4, frontier._budget_politeness[q1])

    frontier._update_heap()
    self.assertEqual(1, len(frontier._current_queues))

    # report a server error for the uri that matches the active queue
    if q1 == 1:
        curi1.status_code = 500
        frontier.process_server_error(curi1)
    else:
        # set the status on the uri that is actually processed
        curi2.status_code = 500
        frontier.process_server_error(curi2)

    self.assertEqual(-1, frontier._budget_politeness[q1])

    frontier._cleanup_budget_politeness()

    self.assertEqual(1, len(frontier._current_queues))
    # leaves `q2` bound to the key of the queue that is active now
    for q2 in frontier._current_queues.keys():
        pass

    self.assertEqual(4, frontier._budget_politeness[q2])
    frontier._cleanup_budget_politeness()
    self.assertEqual(4, frontier._budget_politeness[q2])

    frontier._update_heap()
    self.assertEqual(1, len(frontier._current_queues))

    if q2 == 1:
        curi1.status_code = 200
        frontier.process_successful_crawl(curi1)
    else:
        curi2.status_code = 200
        frontier.process_successful_crawl(curi2)

    self.assertEqual(3, frontier._budget_politeness[q2])

    frontier._cleanup_budget_politeness()