def test_base_url(self):
        """Check that relative links are resolved against ``<base href>``.

        Three kinds of base URL are exercised against the same relative link
        (``item/12.html``) on a page served from
        ``https://example.org/p/index.html``: a fully qualified URL on another
        domain, a host-relative absolute path, and a scheme-relative URL.
        """
        # NOTE(review): this name is defined more than once in this file; if
        # the definitions share a class, only the last one actually runs.
        reqx = BaseSgmlRequestExtractor()

        # base url is a fully qualified url on a different domain
        html = """<html><head><title>Page title<title>
        <base href="http://otherdomain.com/base/" />
        <body><p><a href="item/12.html">Item 12</a></p>
        </body></html>"""
        response = HtmlResponse("https://example.org/p/index.html", body=html)
        reqs = reqx.extract_requests(response)
        expected = [Request('http://otherdomain.com/base/item/12.html',
                            meta={'link_text': 'Item 12'})]
        # the extracted requests are passed as the failure message
        self.assertTrue(self._requests_equals(expected, reqs), reqs)

        # base url is an absolute path and relative to host
        html = """<html><head><title>Page title<title>
        <base href="/" />
        <body><p><a href="item/12.html">Item 12</a></p>
        </body></html>"""
        response = HtmlResponse("https://example.org/p/index.html", body=html)
        reqs = reqx.extract_requests(response)
        expected = [Request('https://example.org/item/12.html',
                            meta={'link_text': 'Item 12'})]
        self.assertTrue(self._requests_equals(expected, reqs), reqs)

        # base url has no scheme: the scheme of the response url is expected
        html = """<html><head><title>Page title<title>
        <base href="//noscheme.com/base/" />
        <body><p><a href="item/12.html">Item 12</a></p>
        </body></html>"""
        response = HtmlResponse("https://example.org/p/index.html", body=html)
        reqs = reqx.extract_requests(response)
        expected = [Request('https://noscheme.com/base/item/12.html',
                            meta={'link_text': 'Item 12'})]
        self.assertTrue(self._requests_equals(expected, reqs), reqs)
    def test_basic(self):
        """Check link extraction with the extractor's default settings.

        The page holds four ``<a href>`` links (relative, host-relative,
        parent-relative and a text-less link to ``/``) plus an ``<img>`` that
        must not produce a request. Each expected request carries the anchor
        text under ``meta['link_text']``; the text-less anchor yields ``''``.
        """
        # NOTE(review): this name is defined more than once in this file; if
        # the definitions share a class, only the last one actually runs.
        base_url = 'http://example.org/somepage/index.html'
        html = """<html><head><title>Page title<title>
        <body><p><a href="item/12.html">Item 12</a></p>
        <p><a href="/about.html">About us</a></p>
        <img src="/logo.png" alt="Company logo (not a link)" />
        <p><a href="../othercat.html">Other category</a></p>
        <p><a href="/" /></p></body></html>"""
        requests = [
            Request('http://example.org/somepage/item/12.html',
                    meta={'link_text': 'Item 12'}),
            Request('http://example.org/about.html',
                    meta={'link_text': 'About us'}),
            Request('http://example.org/othercat.html',
                    meta={'link_text': 'Other category'}),
            Request('http://example.org/',
                    meta={'link_text': ''}),
        ]

        response = HtmlResponse(base_url, body=html)
        reqx = BaseSgmlRequestExtractor()  # default: tag=a, attr=href

        extracted = reqx.extract_requests(response)
        # the extracted requests are passed as the failure message
        self.assertTrue(self._requests_equals(requests, extracted), extracted)
    def test_base_url(self):
        """Check that relative links are resolved against ``<base href>``.

        Three kinds of base URL are exercised against the same relative link
        (``item/12.html``) on a page served from
        ``https://example.org/p/index.html``: a fully qualified URL on another
        domain, a host-relative absolute path, and a scheme-relative URL.
        """
        # NOTE(review): this name is defined more than once in this file; if
        # the definitions share a class, only the last one actually runs.
        reqx = BaseSgmlRequestExtractor()

        # base url is a fully qualified url on a different domain
        html = """<html><head><title>Page title<title>
        <base href="http://otherdomain.com/base/" />
        <body><p><a href="item/12.html">Item 12</a></p>
        </body></html>"""
        response = HtmlResponse("https://example.org/p/index.html", body=html)
        reqs = reqx.extract_requests(response)
        expected = [Request('http://otherdomain.com/base/item/12.html',
                            meta={'link_text': 'Item 12'})]
        # the extracted requests are passed as the failure message
        self.assertTrue(self._requests_equals(expected, reqs), reqs)

        # base url is an absolute path and relative to host
        html = """<html><head><title>Page title<title>
        <base href="/" />
        <body><p><a href="item/12.html">Item 12</a></p>
        </body></html>"""
        response = HtmlResponse("https://example.org/p/index.html", body=html)
        reqs = reqx.extract_requests(response)
        expected = [Request('https://example.org/item/12.html',
                            meta={'link_text': 'Item 12'})]
        self.assertTrue(self._requests_equals(expected, reqs), reqs)

        # base url has no scheme: the scheme of the response url is expected
        html = """<html><head><title>Page title<title>
        <base href="//noscheme.com/base/" />
        <body><p><a href="item/12.html">Item 12</a></p>
        </body></html>"""
        response = HtmlResponse("https://example.org/p/index.html", body=html)
        reqs = reqx.extract_requests(response)
        expected = [Request('https://noscheme.com/base/item/12.html',
                            meta={'link_text': 'Item 12'})]
        self.assertTrue(self._requests_equals(expected, reqs), reqs)
    def test_base_url(self):
        """Check that a relative link is resolved against ``<base href>``.

        The page is served from ``http://example.org/somepage/index.html`` but
        declares ``http://otherdomain.com/base/`` as its base, so the relative
        link ``item/12.html`` is expected under the base URL.
        """
        # NOTE(review): this name is defined more than once in this file; if
        # the definitions share a class, only the last one actually runs.
        html = """<html><head><title>Page title<title>
        <base href="http://otherdomain.com/base/" />
        <body><p><a href="item/12.html">Item 12</a></p>
        </body></html>"""
        response = HtmlResponse("http://example.org/somepage/index.html",
                                body=html)
        reqx = BaseSgmlRequestExtractor()

        reqs = reqx.extract_requests(response)
        expected = [Request('http://otherdomain.com/base/item/12.html',
                            meta={'link_text': 'Item 12'})]
        # the extracted requests are passed as the failure message
        self.assertTrue(self._requests_equals(reqs, expected), reqs)
    def test_extraction_encoding(self):
        """Check extracted URLs and link texts for differently encoded pages.

        Three responses are built from two fixtures:

        * ``linkextractor_noenc.html`` with a ``charset=utf-8`` Content-Type
          header,
        * the same body without any header,
        * ``linkextractor_latin1.html`` without any header (presumably a
          latin1-encoded page, judging by the fixture name).

        The first two are expected to yield identical requests; the third is
        expected to yield latin1 percent-escapes in its URLs.
        """
        # NOTE(review): this name is defined more than once in this file; if
        # the definitions share a class, only the last one actually runs.
        #TODO: use own fixtures
        body = get_testdata('link_extractor', 'linkextractor_noenc.html')
        response_utf8 = HtmlResponse(
            url='http://example.com/utf8', body=body,
            headers={'Content-Type': ['text/html; charset=utf-8']})
        response_noenc = HtmlResponse(url='http://example.com/noenc',
                                      body=body)
        body = get_testdata('link_extractor', 'linkextractor_latin1.html')
        response_latin1 = HtmlResponse(url='http://example.com/latin1',
                                       body=body)

        reqx = BaseSgmlRequestExtractor()

        # Same body with and without the charset header: same expectations.
        for response in (response_utf8, response_noenc):
            extracted = reqx.extract_requests(response)
            expected = [
                Request(url='http://example.com/sample_%C3%B1.html',
                        meta={'link_text': ''}),
                # unicode literal (U+20AC, euro sign) instead of decoding a
                # utf-8 byte string: same value, no str.decode() needed
                Request(url='http://example.com/sample_%E2%82%AC.html',
                        meta={'link_text': u'sample \u20ac text'}),
            ]
            self.assertTrue(self._requests_equals(extracted, expected),
                            extracted)

        extracted = reqx.extract_requests(response_latin1)
        expected = [
            Request(url='http://example.com/sample_%F1.html',
                    meta={'link_text': ''}),
            # unicode literal (U+00E1) instead of decoding a latin1 byte string
            Request(url='http://example.com/sample_%E1.html',
                    meta={'link_text': u'sample \xe1 text'}),
        ]
        self.assertTrue(self._requests_equals(extracted, expected), extracted)
    def test_extraction_encoding(self):
        """Check extracted URLs and link texts for differently encoded pages.

        Three responses are built from two fixtures:

        * ``linkextractor_noenc.html`` with a ``charset=utf-8`` Content-Type
          header,
        * the same body without any header,
        * ``linkextractor_latin1.html`` without any header (presumably a
          latin1-encoded page, judging by the fixture name).

        The first two are expected to yield identical requests; the third is
        expected to yield latin1 percent-escapes in its URLs.
        """
        # NOTE(review): this name is defined more than once in this file; if
        # the definitions share a class, only the last one actually runs.
        #TODO: use own fixtures
        body = get_testdata('link_extractor', 'linkextractor_noenc.html')
        response_utf8 = HtmlResponse(
            url='http://example.com/utf8',
            body=body,
            headers={'Content-Type': ['text/html; charset=utf-8']})
        response_noenc = HtmlResponse(url='http://example.com/noenc',
                                      body=body)
        body = get_testdata('link_extractor', 'linkextractor_latin1.html')
        response_latin1 = HtmlResponse(url='http://example.com/latin1',
                                       body=body)

        reqx = BaseSgmlRequestExtractor()

        # Same body with and without the charset header: same expectations.
        for response in (response_utf8, response_noenc):
            extracted = reqx.extract_requests(response)
            expected = [
                Request(url='http://example.com/sample_%C3%B1.html',
                        meta={'link_text': ''}),
                # unicode literal (U+20AC, euro sign) instead of decoding a
                # utf-8 byte string: same value, no str.decode() needed
                Request(url='http://example.com/sample_%E2%82%AC.html',
                        meta={'link_text': u'sample \u20ac text'}),
            ]
            self.assertTrue(self._requests_equals(extracted, expected),
                            extracted)

        extracted = reqx.extract_requests(response_latin1)
        expected = [
            Request(url='http://example.com/sample_%F1.html',
                    meta={'link_text': ''}),
            # unicode literal (U+00E1) instead of decoding a latin1 byte string
            Request(url='http://example.com/sample_%E1.html',
                    meta={'link_text': u'sample \xe1 text'}),
        ]
        self.assertTrue(self._requests_equals(extracted, expected), extracted)
    def test_basic(self):
        """Check link extraction with the extractor's default settings.

        The page holds four ``<a href>`` links (relative, host-relative,
        parent-relative and a text-less link to ``/``) plus an ``<img>`` that
        must not produce a request. Each expected request carries the anchor
        text under ``meta['link_text']``; the text-less anchor yields ``''``.
        """
        # NOTE(review): this name is defined more than once in this file; if
        # the definitions share a class, only the last one actually runs.
        base_url = 'http://example.org/somepage/index.html'
        html = """<html><head><title>Page title<title>
        <body><p><a href="item/12.html">Item 12</a></p>
        <p><a href="/about.html">About us</a></p>
        <img src="/logo.png" alt="Company logo (not a link)" />
        <p><a href="../othercat.html">Other category</a></p>
        <p><a href="/" /></p></body></html>"""
        requests = [
            Request('http://example.org/somepage/item/12.html',
                    meta={'link_text': 'Item 12'}),
            Request('http://example.org/about.html',
                    meta={'link_text': 'About us'}),
            Request('http://example.org/othercat.html',
                    meta={'link_text': 'Other category'}),
            Request('http://example.org/', meta={'link_text': ''}),
        ]

        response = HtmlResponse(base_url, body=html)
        reqx = BaseSgmlRequestExtractor()  # default: tag=a, attr=href

        extracted = reqx.extract_requests(response)
        # the extracted requests are passed as the failure message
        self.assertTrue(self._requests_equals(requests, extracted), extracted)