def test_do_not_process_robots_works(self):
    """Calling ``_do_not_process_robots`` (even repeatedly) marks the URI's
    extraction as finished via ``optional_vars``."""
    curi = CrawlUri()
    curi.effective_url = "http://127.0.0.1/robots.txt"
    curi.optional_vars = dict()

    default_limiter = limiter.DefaultLimiter(None)
    # Call twice to check the operation is idempotent.
    for _ in range(2):
        default_limiter._do_not_process_robots(curi)

    self.assertEqual(CURI_OPTIONAL_TRUE,
                     curi.optional_vars[CURI_EXTRACTION_FINISHED])
def _crawluri_from_uri(self, uri):
    """
    Convert an URI tuple to a :class:`CrawlUri`.

    Replace the hostname with the real IP in order to cache DNS queries.
    """
    (url, etag, mod_date, _next_date, prio) = uri
    parsed = urlparse(url)

    # Look up the cached DNS resolution for host:port; fall back to the
    # scheme's default port when the URL carries none (``not port`` also
    # covers an explicit port 0, matching the cache key convention).
    port = parsed.port or PROTOCOLS_DEFAULT_PORT[parsed.scheme]
    effective_netloc = self._dns_cache["%s:%s" % (parsed.hostname, port)]

    curi = CrawlUri(url)
    curi.effective_url = url.replace(parsed.netloc,
                                     "%s:%s" % effective_netloc)
    curi.current_priority = prio

    # Conditional-GET headers for URIs we have fetched before.
    curi.req_header = dict()
    if etag:
        curi.req_header["Etag"] = etag
    if mod_date:
        curi.req_header["Last-Modified"] = serialize_date_time(
            datetime.fromtimestamp(mod_date))

    curi.optional_vars = dict()
    if parsed.username and parsed.password:
        curi.optional_vars[CURI_SITE_USERNAME] = parsed.username.encode()
        curi.optional_vars[CURI_SITE_PASSWORD] = parsed.password.encode()

    return curi
def _crawluri_from_uri(self, uri):
    """
    Convert an URI tuple to a :class:`CrawlUri`.

    Replace the hostname with the real IP in order to cache DNS queries.
    """
    url, etag, mod_date, _next_date, prio = uri
    parsed_url = urlparse(url)

    # dns resolution and caching: the cache maps "host:port" to the
    # resolved (ip, port) pair used to build the effective netloc.
    port = parsed_url.port
    if not port:
        port = PROTOCOLS_DEFAULT_PORT[parsed_url.scheme]
    effective_netloc = self._dns_cache["%s:%s" % (parsed_url.hostname,
                                                  port)]

    curi = CrawlUri(url)
    curi.effective_url = url.replace(parsed_url.netloc,
                                     "%s:%s" % effective_netloc)
    curi.current_priority = prio
    curi.req_header = dict()

    # Revalidation headers when we already know this resource.
    if etag:
        curi.req_header["Etag"] = etag
    if mod_date:
        mod_date_time = datetime.fromtimestamp(mod_date)
        curi.req_header["Last-Modified"] = serialize_date_time(
            mod_date_time)

    curi.optional_vars = dict()
    # Only store credentials when both parts are present in the URL.
    if parsed_url.username and parsed_url.password:
        curi.optional_vars[CURI_SITE_USERNAME] = \
            parsed_url.username.encode()
        curi.optional_vars[CURI_SITE_PASSWORD] = \
            parsed_url.password.encode()

    return curi
def test_that_creating_processing_function_works(self):
    """``create_processing_function`` must reject unusable plugin modules
    with ``ValueError`` and otherwise return a callable processing chain."""
    settings = Settings()
    pipeline = settings.SPYDER_EXTRACTOR_PIPELINE
    pipeline.extend(settings.SPYDER_SCOPER_PIPELINE)

    # NOTE(review): these module names presumably lack the expected
    # plugin interface, so creation must fail — confirm against
    # workerprocess internals.
    pipeline.append('test_workerprocess')
    self.assertRaises(ValueError,
                      workerprocess.create_processing_function,
                      settings, pipeline)
    pipeline.pop()

    pipeline.append('test_workerprocess_unspec')
    self.assertRaises(ValueError,
                      workerprocess.create_processing_function,
                      settings, pipeline)
    pipeline.pop()

    # With only valid plugins the chain is built and usable.
    processing = workerprocess.create_processing_function(settings,
                                                          pipeline)
    curi = CrawlUri(optional_vars=dict())
    curi.effective_url = "http://127.0.0.1/robots.txt"
    processed = processing(curi)
    self.assertEqual(CURI_OPTIONAL_TRUE,
                     processed.optional_vars[CURI_EXTRACTION_FINISHED])
def test_that_creating_processing_function_works(self):
    """Bad plugin names raise ``ValueError``; a clean pipeline yields a
    processing callable that finishes extraction for robots.txt URIs."""
    settings = Settings()
    processors = settings.SPYDER_EXTRACTOR_PIPELINE
    processors.extend(settings.SPYDER_SCOPER_PIPELINE)

    for bad_plugin in ('test_workerprocess', 'test_workerprocess_unspec'):
        processors.append(bad_plugin)
        self.assertRaises(ValueError,
                          workerprocess.create_processing_function,
                          settings, processors)
        processors.pop()

    processing = workerprocess.create_processing_function(settings,
                                                          processors)

    curi = CrawlUri(optional_vars=dict())
    curi.effective_url = "http://127.0.0.1/robots.txt"
    curi2 = processing(curi)
    self.assertEqual(CURI_OPTIONAL_TRUE,
                     curi2.optional_vars[CURI_EXTRACTION_FINISHED])