示例#1
0
    def test_do_not_process_robots_works(self):
        """
        The robots.txt check flags the URI as extraction-finished.

        The check is run twice against the same :class:`CrawlUri` and the
        flag has to be present after each run.
        """
        robots_curi = CrawlUri()
        robots_curi.effective_url = "http://127.0.0.1/robots.txt"
        robots_curi.optional_vars = dict()

        default_limiter = limiter.DefaultLimiter(None)

        for _ in range(2):
            default_limiter._do_not_process_robots(robots_curi)
            finished_flag = robots_curi.optional_vars[CURI_EXTRACTION_FINISHED]
            self.assertEqual(CURI_OPTIONAL_TRUE, finished_flag)
示例#2
0
    def test_only_on_redirect(self):
        """
        A 200 response yields no extracted URLs, even with a ``Location``.
        """
        settings = Settings()

        request = CrawlUri("http://localhost")
        request.status_code = 200
        request.rep_header = {"Location": "http://localhost/index.html"}
        request.optional_vars = dict()

        extractor = HttpExtractor(settings)
        result = extractor(request)

        # nothing may have been recorded for a non-redirect status code
        has_links = CURI_EXTRACTED_URLS in result.optional_vars
        self.assertFalse(has_links)
示例#3
0
    def test_relative_links(self):
        """
        A relative ``Location`` on a 303 response is made absolute.

        The extractor has to resolve ``/index.html`` against the URL of the
        :class:`CrawlUri` and store the result as the extracted URL.
        """
        s = Settings()

        curi = CrawlUri("http://localhost")
        curi.status_code = 303
        curi.rep_header = {"Location": "/index.html"}
        curi.optional_vars = dict()

        xtor = HttpExtractor(s)
        curi = xtor(curi)

        self.assertTrue(CURI_EXTRACTED_URLS in curi.optional_vars)
        # `assertEquals` is a deprecated alias (removed in Python 3.12); use
        # `assertEqual` like the rest of the test suite does.
        self.assertEqual("http://localhost/index.html",
                         curi.optional_vars[CURI_EXTRACTED_URLS])
示例#4
0
    def test_that_with_uri_works(self):
        """
        Session id parameters are stripped from every extracted URL.

        Covers the ``sid``, ``jsessionid``, ``phpsessid`` and
        ``aspsessionid`` query parameters.
        """
        stripper = StripSessionIds(Settings())

        session_urls = [
            "http://preis.de/traeger/index.php?sid=8429fb3ae210a2a0e28800b7f48d90f2",
            "http://preis.de/traeger/index.php?jsessionid=8429fb3ae210a2a0e28800b7f48d90f2",
            "http://preis.de/traeger/index.php?phpsessid=8429fb3ae210a2a0e28800b7f48d90f2",
            "http://preis.de/traeger/index.php?aspsessionid=8429fb3ae210a2a0e28800b7f48d90f2",
        ]

        curi = CrawlUri()
        curi.optional_vars = {CURI_EXTRACTED_URLS: "\n".join(session_urls)}

        processed = stripper(curi)
        extracted = processed.optional_vars[CURI_EXTRACTED_URLS]

        # every URL must be reduced to the same session-free form
        expected = "http://preis.de/traeger/index.php?"
        for cleaned in extracted.split('\n'):
            self.assertEqual(expected, cleaned)
示例#5
0
    def test_that_with_uri_works(self):
        """
        Session id parameters are stripped from every extracted URL.

        Covers the ``sid``, ``jsessionid``, ``phpsessid`` and
        ``aspsessionid`` query parameters.
        """
        stripper = StripSessionIds(Settings())

        session_urls = [
            "http://preis.de/traeger/index.php?sid=8429fb3ae210a2a0e28800b7f48d90f2",
            "http://preis.de/traeger/index.php?jsessionid=8429fb3ae210a2a0e28800b7f48d90f2",
            "http://preis.de/traeger/index.php?phpsessid=8429fb3ae210a2a0e28800b7f48d90f2",
            "http://preis.de/traeger/index.php?aspsessionid=8429fb3ae210a2a0e28800b7f48d90f2",
        ]

        curi = CrawlUri()
        curi.optional_vars = {CURI_EXTRACTED_URLS: "\n".join(session_urls)}

        processed = stripper(curi)
        extracted = processed.optional_vars[CURI_EXTRACTED_URLS]

        # every URL must be reduced to the same session-free form
        expected = "http://preis.de/traeger/index.php?"
        for cleaned in extracted.split('\n'):
            self.assertEqual(expected, cleaned)
    def test_missing_encoding_works(self):
        """
        Links are extracted when ``Content-Type`` carries no charset.

        Checks an absolute link, a host-relative link and a path-relative
        link, in document order.
        """
        html = (
            "<a href='http://www.google.de' title='ups'> viel text</a>"
            "<a title='ups i did it again' href ='/relative.html'>und "
            "noch mehr!</a><a href='evenmorerelative.html'>"
        )

        curi = CrawlUri()
        curi.rep_header = dict()
        curi.rep_header["Content-Type"] = "text/html"
        curi.url = "http://www.bmg.bund.de/test/"
        curi.content_body = html
        curi.optional_vars = dict()

        extractor = DefaultHtmlLinkExtractor(Settings())
        curi = extractor(curi)

        links = curi.optional_vars[CURI_EXTRACTED_URLS].split("\n")
        expected_links = (
            "http://www.google.de",
            "http://www.bmg.bund.de/relative.html",
            "http://www.bmg.bund.de/test/evenmorerelative.html",
        )
        for position, wanted in enumerate(expected_links):
            self.assertEqual(wanted, links[position])
    def test_missing_encoding_works(self):
        """
        Links are extracted when ``Content-Type`` carries no charset.

        Checks an absolute link, a host-relative link and a path-relative
        link, in document order.
        """
        html = (
            "<a href='http://www.google.de' title='ups'> viel text</a>"
            "<a title='ups i did it again' href ='/relative.html'>und "
            "noch mehr!</a><a href='evenmorerelative.html'>"
        )

        curi = CrawlUri()
        curi.rep_header = dict()
        curi.rep_header["Content-Type"] = "text/html"
        curi.url = "http://www.bmg.bund.de/test/"
        curi.content_body = html
        curi.optional_vars = dict()

        extractor = DefaultHtmlLinkExtractor(Settings())
        curi = extractor(curi)

        links = curi.optional_vars[CURI_EXTRACTED_URLS].split("\n")
        expected_links = (
            "http://www.google.de",
            "http://www.bmg.bund.de/relative.html",
            "http://www.bmg.bund.de/test/evenmorerelative.html",
        )
        for position, wanted in enumerate(expected_links):
            self.assertEqual(wanted, links[position])
示例#8
0
    def test_regex_scoper(self):
        """
        The regex scoper keeps positive matches and drops negative ones.

        With ``.html`` as positive and ``.avi`` as negative pattern, the
        HTML URL has to survive scoping and the AVI URL has to be removed.
        """
        curi = CrawlUri()
        curi.optional_vars = dict()
        curi.optional_vars[CURI_EXTRACTED_URLS] = "\n".join([
            "http://www.google.de/index.html",
            "ftp://www.google.de/pillepalle.avi",
        ])

        settings = Settings()
        # raw strings: `\.` is not a valid string escape, the pattern text
        # itself is unchanged
        settings.REGEX_SCOPE_POSITIVE = [r'^.*\.html']
        settings.REGEX_SCOPE_NEGATIVE = [r'^.*\.avi']
        scoper = RegexScoper(settings)

        curi = scoper(curi)

        # single-argument call form prints the same on Python 2 and 3
        print(curi.optional_vars[CURI_EXTRACTED_URLS])
        self.assertTrue("http://www.google.de/index.html" in
                        curi.optional_vars[CURI_EXTRACTED_URLS])
        self.assertFalse("ftp://www.google.de/pillepalle.avi" in
                         curi.optional_vars[CURI_EXTRACTED_URLS])
示例#9
0
    def test_regex_scoper(self):
        """
        The regex scoper keeps positive matches and drops negative ones.

        With ``.html`` as positive and ``.avi`` as negative pattern, the
        HTML URL has to survive scoping and the AVI URL has to be removed.
        """
        curi = CrawlUri()
        curi.optional_vars = dict()
        curi.optional_vars[CURI_EXTRACTED_URLS] = "\n".join([
            "http://www.google.de/index.html",
            "ftp://www.google.de/pillepalle.avi",
        ])

        settings = Settings()
        # raw strings: `\.` is not a valid string escape, the pattern text
        # itself is unchanged
        settings.REGEX_SCOPE_POSITIVE = [r'^.*\.html']
        settings.REGEX_SCOPE_NEGATIVE = [r'^.*\.avi']
        scoper = RegexScoper(settings)

        curi = scoper(curi)

        # single-argument call form prints the same on Python 2 and 3
        print(curi.optional_vars[CURI_EXTRACTED_URLS])
        self.assertTrue("http://www.google.de/index.html" in
                        curi.optional_vars[CURI_EXTRACTED_URLS])
        self.assertFalse("ftp://www.google.de/pillepalle.avi" in
                         curi.optional_vars[CURI_EXTRACTED_URLS])
    def test_link_extraction_works(self):
        """
        Links are extracted from an HTML body with a declared charset.

        Checks an absolute link, a host-relative link and a path-relative
        link, in document order. The entity-encoded ``mailto:`` href is part
        of the input but is not asserted on.
        """
        html = (
            "<a href='http://www.google.de' title='ups'> viel text</a>"
            "<a title='ups i did it again' href ='/relative.html'>und "
            "noch mehr!</a><a href='evenmorerelative.html'/>"
            "<a href='&#109;&#97;&#105;&#108;&#116;&#111;&#58;&#109;&#117;&#115;&#116;&#101;&#114;&#64;&#98;&#102;&#97;&#114;&#109;&#46;&#100;&#101;'/>"
        )

        curi = CrawlUri()
        curi.rep_header = dict()
        curi.rep_header["Content-Type"] = "text/html; charset=utf-8"
        curi.url = "http://www.bmg.bund.de/test/"
        curi.content_body = html
        curi.optional_vars = dict()

        extractor = DefaultHtmlLinkExtractor(Settings())
        curi = extractor(curi)

        links = curi.optional_vars[CURI_EXTRACTED_URLS].split("\n")
        expected_links = (
            "http://www.google.de",
            "http://www.bmg.bund.de/relative.html",
            "http://www.bmg.bund.de/test/evenmorerelative.html",
        )
        for position, wanted in enumerate(expected_links):
            self.assertEqual(wanted, links[position])
    def test_link_extraction_works(self):
        """
        Links are extracted from an HTML body with a declared charset.

        Checks an absolute link, a host-relative link and a path-relative
        link, in document order. The entity-encoded ``mailto:`` href is part
        of the input but is not asserted on.
        """
        html = (
            "<a href='http://www.google.de' title='ups'> viel text</a>"
            "<a title='ups i did it again' href ='/relative.html'>und "
            "noch mehr!</a><a href='evenmorerelative.html'/>"
            "<a href='&#109;&#97;&#105;&#108;&#116;&#111;&#58;&#109;&#117;&#115;&#116;&#101;&#114;&#64;&#98;&#102;&#97;&#114;&#109;&#46;&#100;&#101;'/>"
        )

        curi = CrawlUri()
        curi.rep_header = dict()
        curi.rep_header["Content-Type"] = "text/html; charset=utf-8"
        curi.url = "http://www.bmg.bund.de/test/"
        curi.content_body = html
        curi.optional_vars = dict()

        extractor = DefaultHtmlLinkExtractor(Settings())
        curi = extractor(curi)

        links = curi.optional_vars[CURI_EXTRACTED_URLS].split("\n")
        expected_links = (
            "http://www.google.de",
            "http://www.bmg.bund.de/relative.html",
            "http://www.bmg.bund.de/test/evenmorerelative.html",
        )
        for position, wanted in enumerate(expected_links):
            self.assertEqual(wanted, links[position])
示例#12
0
文件: frontier.py 项目: mt3/Spyder
    def _crawluri_from_uri(self, uri):
        """
        Convert an URI tuple to a :class:`CrawlUri`.

        Replace the hostname with the real IP in order to cache DNS queries.

        :param uri: tuple of ``(url, etag, mod_date, next_date, prio)``;
            ``next_date`` is not used here.
        :returns: a :class:`CrawlUri` for ``url`` with ``effective_url``,
            ``current_priority``, ``req_header`` and ``optional_vars`` set.
        """
        (url, etag, mod_date, _next_date, prio) = uri

        parsed_url = urlparse(url)

        # dns resolution and caching
        port = parsed_url.port
        if not port:
            port = PROTOCOLS_DEFAULT_PORT[parsed_url.scheme]

        # the cache is keyed by "host:port"; the result is formatted with
        # "%s:%s" below, so it is presumably an (ip, port) pair -- TODO confirm
        effective_netloc = self._dns_cache["%s:%s" % (parsed_url.hostname,
            port)]

        # Only the authority may be rewritten: split at the first "//" and
        # replace a single occurrence after it. A plain `url.replace` would
        # also rewrite the host text wherever it shows up again, e.g. in a
        # redirect target inside the query string.
        prefix, slashes, remainder = url.partition("//")

        curi = CrawlUri(url)
        curi.effective_url = prefix + slashes + remainder.replace(
                parsed_url.netloc, "%s:%s" % effective_netloc, 1)
        curi.current_priority = prio
        curi.req_header = dict()
        # NOTE(review): "Etag" / "Last-Modified" are response header names;
        # confirm the fetcher maps them to conditional request headers.
        if etag:
            curi.req_header["Etag"] = etag
        if mod_date:
            mod_date_time = datetime.fromtimestamp(mod_date)
            curi.req_header["Last-Modified"] = serialize_date_time(
                    mod_date_time)

        # credentials embedded in the URL are handed over separately
        curi.optional_vars = dict()
        if parsed_url.username and parsed_url.password:
            curi.optional_vars[CURI_SITE_USERNAME] = \
                parsed_url.username.encode()
            curi.optional_vars[CURI_SITE_PASSWORD] = \
                parsed_url.password.encode()

        return curi
示例#13
0
    def _crawluri_from_uri(self, uri):
        """
        Convert an URI tuple to a :class:`CrawlUri`.

        Replace the hostname with the real IP in order to cache DNS queries.

        :param uri: tuple of ``(url, etag, mod_date, next_date, prio)``;
            ``next_date`` is not used here.
        :returns: a :class:`CrawlUri` for ``url`` with ``effective_url``,
            ``current_priority``, ``req_header`` and ``optional_vars`` set.
        """
        (url, etag, mod_date, _next_date, prio) = uri

        parsed_url = urlparse(url)

        # dns resolution and caching
        port = parsed_url.port
        if not port:
            port = PROTOCOLS_DEFAULT_PORT[parsed_url.scheme]

        # the cache is keyed by "host:port"; the result is formatted with
        # "%s:%s" below, so it is presumably an (ip, port) pair -- TODO confirm
        effective_netloc = self._dns_cache["%s:%s" %
                                           (parsed_url.hostname, port)]

        # Only the authority may be rewritten: split at the first "//" and
        # replace a single occurrence after it. A plain `url.replace` would
        # also rewrite the host text wherever it shows up again, e.g. in a
        # redirect target inside the query string.
        prefix, slashes, remainder = url.partition("//")

        curi = CrawlUri(url)
        curi.effective_url = prefix + slashes + remainder.replace(
            parsed_url.netloc, "%s:%s" % effective_netloc, 1)
        curi.current_priority = prio
        curi.req_header = dict()
        # NOTE(review): "Etag" / "Last-Modified" are response header names;
        # confirm the fetcher maps them to conditional request headers.
        if etag:
            curi.req_header["Etag"] = etag
        if mod_date:
            mod_date_time = datetime.fromtimestamp(mod_date)
            curi.req_header["Last-Modified"] = serialize_date_time(
                mod_date_time)

        # credentials embedded in the URL are handed over separately
        curi.optional_vars = dict()
        if parsed_url.username and parsed_url.password:
            curi.optional_vars[CURI_SITE_USERNAME] = \
                parsed_url.username.encode()
            curi.optional_vars[CURI_SITE_PASSWORD] = \
                parsed_url.password.encode()

        return curi