def test_do_not_process_robots_works(self):
    curi = CrawlUri()
    curi.effective_url = "http://127.0.0.1/robots.txt"
    curi.optional_vars = dict()

    l = limiter.DefaultLimiter(None)

    for i in range(2):
        l._do_not_process_robots(curi)
        self.assertEqual(CURI_OPTIONAL_TRUE,
                curi.optional_vars[CURI_EXTRACTION_FINISHED])
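
# A minimal sketch of the behaviour the test above pins down (the real
# limiter.DefaultLimiter does more than this); the second loop iteration
# checks that marking a robots.txt URI as finished is idempotent. The
# function below is a hypothetical stand-in, not the limiter's actual code:
def _do_not_process_robots_sketch(curi):
    # never extract links from robots.txt responses
    if curi.effective_url.endswith("/robots.txt"):
        curi.optional_vars[CURI_EXTRACTION_FINISHED] = CURI_OPTIONAL_TRUE
    return curi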

def test_only_on_redirect(self):
    s = Settings()
    curi = CrawlUri("http://localhost")
    curi.status_code = 200
    curi.rep_header = {"Location": "http://localhost/index.html"}
    curi.optional_vars = dict()

    xtor = HttpExtractor(s)
    curi = xtor(curi)

    # a 200 response is not a redirect, so the Location header is ignored
    self.assertFalse(CURI_EXTRACTED_URLS in curi.optional_vars)

def test_relative_links(self):
    s = Settings()
    curi = CrawlUri("http://localhost")
    curi.status_code = 303
    curi.rep_header = {"Location": "/index.html"}
    curi.optional_vars = dict()

    xtor = HttpExtractor(s)
    curi = xtor(curi)

    self.assertTrue(CURI_EXTRACTED_URLS in curi.optional_vars)
    self.assertEqual("http://localhost/index.html",
            curi.optional_vars[CURI_EXTRACTED_URLS])
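
# A minimal sketch of the redirect handling the two tests above describe,
# assuming (this is not taken from the real HttpExtractor) that 3xx
# responses get their Location header resolved against the request URL
# while all other status codes pass through untouched:
from urlparse import urljoin

def _extract_redirect_sketch(curi):
    # hypothetical stand-in for HttpExtractor.__call__
    if 300 <= curi.status_code < 400 and "Location" in curi.rep_header:
        curi.optional_vars[CURI_EXTRACTED_URLS] = urljoin(
                curi.url, curi.rep_header["Location"])
    return curi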

def test_that_with_uri_works(self):
    s = StripSessionIds(Settings())

    urls = [
        "http://preis.de/traeger/index.php?sid=8429fb3ae210a2a0e28800b7f48d90f2",
        "http://preis.de/traeger/index.php?jsessionid=8429fb3ae210a2a0e28800b7f48d90f2",
        "http://preis.de/traeger/index.php?phpsessid=8429fb3ae210a2a0e28800b7f48d90f2",
        "http://preis.de/traeger/index.php?aspsessionid=8429fb3ae210a2a0e28800b7f48d90f2",
        ]

    curi = CrawlUri()
    curi.optional_vars = {CURI_EXTRACTED_URLS: "\n".join(urls)}

    curi = s(curi)

    clean_urls = curi.optional_vars[CURI_EXTRACTED_URLS].split('\n')

    for u in clean_urls:
        self.assertEqual("http://preis.de/traeger/index.php?", u)
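
# A minimal sketch of the stripping StripSessionIds presumably performs,
# judging only from the expected output above (note the trailing '?'
# survives, so only the 'name=value' part is removed). The regex is an
# assumption for illustration, not the class's actual pattern:
import re

_SESSION_ID_SKETCH_RE = re.compile(
        r"(sid|jsessionid|phpsessid|aspsessionid)=[0-9a-f]+", re.I)

def _strip_session_ids_sketch(urls_string):
    # drop the session parameter from every newline-separated URL
    return "\n".join(_SESSION_ID_SKETCH_RE.sub("", u)
            for u in urls_string.split("\n"))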

def test_missing_encoding_works(self):
    src = "<a href='http://www.google.de' title='ups'> viel text</a>" + \
        "<a title='ups i did it again' href ='/relative.html'>und " + \
        "noch mehr!</a><a href='evenmorerelative.html'>"

    curi = CrawlUri()
    curi.rep_header = dict()
    # no charset in the Content-Type header: the extractor has to cope
    curi.rep_header["Content-Type"] = "text/html"
    curi.url = "http://www.bmg.bund.de/test/"
    curi.content_body = src
    curi.optional_vars = dict()

    xtor = DefaultHtmlLinkExtractor(Settings())
    curi = xtor(curi)

    links = curi.optional_vars[CURI_EXTRACTED_URLS].split("\n")
    self.assertEqual("http://www.google.de", links[0])
    self.assertEqual("http://www.bmg.bund.de/relative.html", links[1])
    self.assertEqual("http://www.bmg.bund.de/test/evenmorerelative.html",
            links[2])

def test_regex_scoper(self):
    curi = CrawlUri()
    curi.optional_vars = dict()
    curi.optional_vars[CURI_EXTRACTED_URLS] = "\n".join([
        "http://www.google.de/index.html",
        "ftp://www.google.de/pillepalle.avi",
    ])

    settings = Settings()
    settings.REGEX_SCOPE_POSITIVE = [r'^.*\.html']
    settings.REGEX_SCOPE_NEGATIVE = [r'^.*\.avi']
    scoper = RegexScoper(settings)

    curi = scoper(curi)

    self.assertTrue("http://www.google.de/index.html" in
            curi.optional_vars[CURI_EXTRACTED_URLS])
    self.assertFalse("ftp://www.google.de/pillepalle.avi" in
            curi.optional_vars[CURI_EXTRACTED_URLS])
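
# A minimal sketch of the scoping rule the test above pins down, assuming
# this is how RegexScoper combines its settings: a URL survives only if it
# matches at least one positive pattern and no negative pattern.
import re

def _regex_scope_sketch(urls_string, positive, negative):
    pos = [re.compile(p) for p in positive]
    neg = [re.compile(n) for n in negative]
    keep = [u for u in urls_string.split("\n")
            if any(p.match(u) for p in pos)
            and not any(n.match(u) for n in neg)]
    return "\n".join(keep)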

def test_link_extraction_works(self):
    src = "<a href='http://www.google.de' title='ups'> viel text</a>" + \
        "<a title='ups i did it again' href ='/relative.html'>und " + \
        "noch mehr!</a><a href='evenmorerelative.html'/>" + \
        "<a href='mailto:muster@bfarm.de'/>"

    curi = CrawlUri()
    curi.rep_header = dict()
    curi.rep_header["Content-Type"] = "text/html; charset=utf-8"
    curi.url = "http://www.bmg.bund.de/test/"
    curi.content_body = src
    curi.optional_vars = dict()

    xtor = DefaultHtmlLinkExtractor(Settings())
    curi = xtor(curi)

    links = curi.optional_vars[CURI_EXTRACTED_URLS].split("\n")
    self.assertEqual("http://www.google.de", links[0])
    self.assertEqual("http://www.bmg.bund.de/relative.html", links[1])
    self.assertEqual("http://www.bmg.bund.de/test/evenmorerelative.html",
            links[2])
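
# The expected absolute URLs in both extractor tests follow standard
# RFC 3986 relative resolution; a quick way to double-check them by hand
# (this is not the extractor's own code):
from urlparse import urljoin

assert urljoin("http://www.bmg.bund.de/test/", "/relative.html") == \
        "http://www.bmg.bund.de/relative.html"
assert urljoin("http://www.bmg.bund.de/test/", "evenmorerelative.html") == \
        "http://www.bmg.bund.de/test/evenmorerelative.html"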

def _crawluri_from_uri(self, uri):
    """
    Convert a URI tuple to a :class:`CrawlUri`.

    Replace the hostname with the real IP in order to cache DNS queries.
    """
    (url, etag, mod_date, _next_date, prio) = uri

    parsed_url = urlparse(url)

    # dns resolution and caching
    port = parsed_url.port
    if not port:
        port = PROTOCOLS_DEFAULT_PORT[parsed_url.scheme]

    effective_netloc = self._dns_cache["%s:%s" % (parsed_url.hostname,
            port)]

    curi = CrawlUri(url)
    curi.effective_url = url.replace(parsed_url.netloc,
            "%s:%s" % effective_netloc)
    curi.current_priority = prio
    curi.req_header = dict()
    if etag:
        curi.req_header["Etag"] = etag
    if mod_date:
        mod_date_time = datetime.fromtimestamp(mod_date)
        curi.req_header["Last-Modified"] = serialize_date_time(
                mod_date_time)

    curi.optional_vars = dict()
    if parsed_url.username and parsed_url.password:
        curi.optional_vars[CURI_SITE_USERNAME] = \
            parsed_url.username.encode()
        curi.optional_vars[CURI_SITE_PASSWORD] = \
            parsed_url.password.encode()

    return curi
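
# How a frontier instance might call the helper above, assuming a DNS cache
# that maps "hostname:port" to an (ip, port) tuple; the tuple layout follows
# the unpacking at the top of _crawluri_from_uri. The hostnames and values
# here are made up for illustration:
#
#     frontier._dns_cache["www.example.org:80"] = ("192.0.2.1", 80)
#     uri = ("http://www.example.org/", "etag-value", 1304245349, None, 1)
#     curi = frontier._crawluri_from_uri(uri)
#     # curi.effective_url == "http://192.0.2.1:80/"
#     # curi.req_header == {"Etag": "etag-value",
#     #                     "Last-Modified": <serialized date>}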