Exemplo n.º 1
0
    def test_client_did_not_complete(self):
        '''Leave a session unfinished and expect a warning about it.

        The second half checks that an exception raised inside the session
        block reaches the caller.
        '''
        expected_message = 'HTTP session did not complete.'
        client = Client()

        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")

            with client.session() as session:
                yield from session.start(Request(self.get_url('/')))
                self.assertFalse(session.done())

            for entry in caught:
                print(entry)

            # Unrelated warnings may occur in PyPy, so only a lower bound
            # on the count is checked before searching for the message.
            # https://travis-ci.org/chfoo/wpull/jobs/51420202
            self.assertGreaterEqual(len(caught), 1)

            messages = [str(entry.message) for entry in caught]

            if expected_message not in messages:
                self.fail('Warning did not occur.')

        client = Client()

        with self.assertRaises(MyException):
            with client.session() as session:
                yield from session.start(Request(self.get_url('/')))
                raise MyException('Oops')
Exemplo n.º 2
0
    def test_css_detect(self):
        '''Check CSSReader detection on file data, URLs, requests, responses.'''
        def sniff(data):
            # Wrap raw bytes in a file object for CSSReader.is_file.
            return CSSReader.is_file(io.BytesIO(data))

        self.assertTrue(sniff('body { color: white }'.encode('utf-16le')))
        self.assertFalse(sniff('hello world!'.encode('utf-16le')))
        self.assertFalse(sniff(b'<html><body>hello'))
        # The result must be the VeryFalse object itself, not just falsy.
        self.assertTrue(sniff(b'<html><body>hello') is VeryFalse)
        self.assertTrue(sniff(b'h1 { background-color: red }'))
        self.assertTrue(sniff(b'@import url.css;'))

        css_location = 'example.com/index.css'
        image_location = 'example.com/image.jpg'

        self.assertTrue(CSSReader.is_url(URLInfo.parse(css_location)))
        self.assertFalse(CSSReader.is_url(URLInfo.parse(image_location)))
        self.assertTrue(CSSReader.is_request(Request(css_location)))
        self.assertFalse(CSSReader.is_request(Request(image_location)))

        response_cases = (
            (self.assertTrue, 'text/css'),
            (self.assertFalse, 'image/png'),
        )

        for check, content_type in response_cases:
            response = Response(200, 'OK')
            response.fields['Content-Type'] = content_type
            check(CSSReader.is_response(response))
Exemplo n.º 3
0
    def test_basic_requests(self):
        '''Fetch a good and a failing page three times through a local proxy.'''
        proxy_server = HTTPProxyServer(Client())
        proxy_socket, proxy_port = tornado.testing.bind_unused_port()

        yield from asyncio.start_server(proxy_server, sock=proxy_socket)

        pool = HTTPProxyConnectionPool(('127.0.0.1', proxy_port))
        http_client = Client(connection_pool=pool)

        for _ in range(3):
            with http_client.session() as session:
                ok_request = Request(self.get_url('/'))
                response = yield from session.start(ok_request)
                self.assertEqual(200, response.status_code)

                sink = io.BytesIO()
                yield from session.download(file=sink)
                text = sink.getvalue().decode('ascii', 'replace')
                self.assertTrue(text.endswith('</html>'))

            with http_client.session() as session:
                error_request = Request(self.get_url('/always_error'))
                response = yield from session.start(error_request)
                self.assertEqual(500, response.status_code)
                self.assertEqual('Dragon In Data Center', response.reason)

                sink = io.BytesIO()
                yield from session.download(file=sink)
                text = sink.getvalue().decode('ascii', 'replace')
                self.assertEqual('Error', text)
Exemplo n.º 4
0
    def test_http_request(self):
        '''Convert a request and check its host and a header value.'''
        original = Request('http://example.com')
        original.fields['hello'] = 'world'

        converted = convert_http_request(original)

        self.assertEqual('example.com', converted.host)
        # The header is looked up with different capitalisation than it
        # was stored with.
        self.assertEqual('world', converted.get_header('Hello'))
Exemplo n.º 5
0
    def test_xml_detect(self):
        '''Check XMLDetector on file data, URLs, requests and responses.'''
        def sniff(data):
            # Wrap raw bytes in a file object for XMLDetector.is_file.
            return XMLDetector.is_file(io.BytesIO(data))

        self.assertTrue(sniff('<?xml version='.encode('utf-16le')))
        self.assertFalse(
            sniff('<!DOCTYPE html><html><body>'.encode('utf-16le')))
        self.assertFalse(sniff(b'<html><body>hello'))
        self.assertTrue(sniff(b'<?xml version'))

        xml_location = 'example.com/index.xml'
        image_location = 'example.com/image.jpg'

        self.assertTrue(XMLDetector.is_url(URLInfo.parse(xml_location)))
        self.assertFalse(XMLDetector.is_url(URLInfo.parse(image_location)))
        self.assertTrue(XMLDetector.is_request(Request(xml_location)))
        self.assertFalse(XMLDetector.is_request(Request(image_location)))

        response_cases = (
            (self.assertTrue, 'text/xml'),
            (self.assertTrue, 'application/xml'),
            (self.assertFalse, 'image/png'),
        )

        for check, content_type in response_cases:
            response = Response(200, 'OK')
            response.fields['Content-Type'] = content_type
            check(XMLDetector.is_response(response))
Exemplo n.º 6
0
    def test_redirect_loop(self):
        '''Redirect robots.txt to itself until the mock raises an error.

        The checker is expected to report the URL as fetchable afterwards.
        '''
        checker = RobotsTxtChecker(web_client=MockWebClient())
        request = Request('http://example.com')
        request.prepare_for_send()

        call_count = 0

        def response_callback(request):
            nonlocal call_count

            request.prepare_for_send()
            self.assertTrue(request.url_info.url.endswith('robots.txt'))

            redirect = Response(302, 'See else')
            redirect.request = request
            redirect.fields['Location'] = '/robots.txt'

            call_count += 1

            # Break the otherwise endless loop after 20 redirects.
            if call_count > 20:
                raise ProtocolError('Mock redirect loop error.')

            return redirect

        checker.web_client.mock_response_callback = response_callback

        allowed = yield from checker.can_fetch(request)
        self.assertTrue(allowed)
        self.assertTrue(checker.can_fetch_pool(request))
Exemplo n.º 7
0
 def test_connection_reuse(self):
     '''Send the same HTTP/1.0 request twice over one stream.'''
     stream = self.new_stream()
     request = Request(self.get_url('/'))
     request.version = 'HTTP/1.0'

     for _ in range(2):
         response, _content = yield from self.fetch(stream, request)
         self.assertEqual(200, response.status_code)
Exemplo n.º 8
0
 def test_connection_reuse(self):
     '''Send the same HTTP/1.0 request twice over one stream.'''
     stream = self.new_stream()
     request = Request(self.get_url('/'))
     request.version = 'HTTP/1.0'

     for _ in range(2):
         response, _content = yield from self.fetch(stream, request)
         self.assertEqual(200, response.status_code)
Exemplo n.º 9
0
 def test_request(self):
     '''Serialize a prepared request and compare the raw bytes.'''
     request = Request('http://example.com/robots.txt')
     request.prepare_for_send()

     expected = b'GET /robots.txt HTTP/1.1\r\nHost: example.com\r\n\r\n'

     self.assertEqual(expected, request.to_bytes())
Exemplo n.º 10
0
    def test_fetch_allow_redirects(self):
        '''Serve robots.txt through two redirects and then an allow-all file.

        Each mock callback installs the next one, so the chain is
        example.com -> www.example.com -> www.example.net.
        '''
        checker = RobotsTxtChecker(web_client=MockWebClient())
        request = Request('http://example.com')
        request.prepare_for_send()

        def redirect_step(request, expected_url, target_url, next_callback):
            # Shared body of the two redirecting callbacks.
            request.prepare_for_send()
            self.assertEqual(expected_url, request.url_info.url)

            response = Response(301, 'Moved')
            response.fields['location'] = target_url
            response.request = request

            checker.web_client.mock_response_callback = next_callback
            checker.web_client.request = Request(target_url)

            return response

        # Try fetch example.com/ (need robots.txt)
        def response_callback_1(request):
            return redirect_step(
                request,
                'http://example.com/robots.txt',
                'http://www.example.com/robots.txt',
                response_callback_2,
            )

        # Try fetch www.example.com/robots.txt
        def response_callback_2(request):
            return redirect_step(
                request,
                'http://www.example.com/robots.txt',
                'http://www.example.net/robots.txt',
                response_callback_3,
            )

        # Try fetch www.example.net/robots.txt
        def response_callback_3(request):
            request.prepare_for_send()
            self.assertEqual('http://www.example.net/robots.txt',
                             request.url_info.url)

            response = Response(200, 'OK')
            response.request = request
            response.body = io.StringIO('User-agent:*\nAllow: /\n')

            checker.web_client.session_obj.done_value = True
            return response

        checker.web_client.mock_response_callback = response_callback_1

        allowed = yield from checker.can_fetch(request)
        self.assertTrue(allowed)
        self.assertTrue(checker.can_fetch_pool(request))
Exemplo n.º 11
0
    def test_overrun(self):
        '''Fetch '/overrun' three times, then '/' on the same stream.'''
        stream = self.new_stream()
        overrun_request = Request(self.get_url('/overrun'))

        for _ in range(3):
            _response, body = yield from self.fetch(stream, overrun_request)

            self.assertEqual(b'a' * 100, body)

        yield from self.fetch(stream, Request(self.get_url('/')))
Exemplo n.º 12
0
    def test_header_early_close(self):
        '''Expect NetworkError from '/header_early_close', then fetch '/'.

        The second fetch uses the same stream as the failed one.
        '''
        stream = self.new_stream()
        request = Request(self.get_url('/header_early_close'))

        # assertRaises names the missing exception on failure, whereas the
        # previous bare self.fail() gave no message at all.
        with self.assertRaises(NetworkError):
            yield from self.fetch(stream, request)

        request = Request(self.get_url('/'))
        yield from self.fetch(stream, request)
Exemplo n.º 13
0
    def test_html_detect(self):
        '''Check HTMLReader detection on file data, URLs, requests, responses.'''
        def sniff(data):
            # Wrap raw bytes in a file object for HTMLReader.is_file.
            return HTMLReader.is_file(io.BytesIO(data))

        self.assertTrue(
            sniff('<html><body>hi</body></html>'.encode('utf-16le')))
        self.assertFalse(sniff('hello world!'.encode('utf-16le')))

        html_samples = (
            b'<title>hello</title>hi',
            b'<html><body>hello',
            b'The document has moved <a href="somewhere.html">here</a>',
        )

        for sample in html_samples:
            self.assertTrue(sniff(sample))

        html_locations = (
            'example.com/index.htm',
            'example.com/index.html',
            'example.com/index.dhtm',
            'example.com/index.xhtml',
            'example.com/index.xht',
        )

        for location in html_locations:
            self.assertTrue(HTMLReader.is_url(URLInfo.parse(location)))

        self.assertFalse(
            HTMLReader.is_url(URLInfo.parse('example.com/image.jpg')))

        self.assertTrue(
            HTMLReader.is_request(Request('example.com/index.html')))
        self.assertFalse(
            HTMLReader.is_request(Request('example.com/image.jpg')))

        response_cases = (
            (self.assertTrue, 'text/html'),
            (self.assertFalse, 'image/png'),
        )

        for check, content_type in response_cases:
            response = Response(200, 'OK')
            response.fields['Content-Type'] = content_type
            check(HTMLReader.is_response(response))
Exemplo n.º 14
0
    def test_to_dict_body(self):
        '''Check the 'body' entry of to_dict() for Request and Response.

        A Body instance must give a truthy entry and NotImplemented a
        falsy one.
        '''
        for message_class in (Request, Response):
            message = message_class()
            message.body = Body()
            as_dict = message.to_dict()

            self.assertTrue(as_dict['body'])
            message.body.close()

            message = message_class()
            message.body = NotImplemented
            as_dict = message.to_dict()

            self.assertFalse(as_dict['body'])
Exemplo n.º 15
0
    def test_warc_max_size_and_append(self):
        '''Record with max_size=1 and appending=True next to existing files.

        Two empty WARC files exist beforehand; they must stay empty while
        files numbered 00002 and 00003 and the meta file receive data.
        '''
        file_prefix = 'asdf'
        existing_names = ('asdf-00000.warc', 'asdf-00001.warc')
        written_names = (
            'asdf-00002.warc', 'asdf-00003.warc', 'asdf-meta.warc')

        for name in existing_names:
            with open(name, 'w'):
                pass

        params = WARCRecorderParams(
            compress=False, max_size=1, appending=True)
        warc_recorder = WARCRecorder(file_prefix, params=params)

        request = HTTPRequest('http://example.com/1')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'BLAH')

        session = warc_recorder.new_http_recorder_session()

        session.begin_request(request)
        session.request_data(request.to_bytes())
        session.end_request(request)

        session.begin_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.end_response(response)

        session.close()
        warc_recorder.close()

        for name in existing_names + written_names:
            self.assertTrue(os.path.exists(name))

        for name in existing_names:
            self.assertEqual(0, os.path.getsize(name))

        for name in written_names:
            self.assertNotEqual(0, os.path.getsize(name))
Exemplo n.º 16
0
    def test_warc_recorder_rollback(self):
        '''Fail while writing a record and check the file size is unchanged.

        The '-wpullinc' file must also be gone afterwards.
        '''
        warc_filename = 'asdf.warc'
        warc_prefix = 'asdf'
        journal_filename = warc_filename + '-wpullinc'

        with open(warc_filename, 'wb') as warc_file:
            warc_file.write(b'a' * 10)

        params = WARCRecorderParams(compress=False)
        warc_recorder = WARCRecorder(warc_prefix, params=params)

        request = HTTPRequest('http://example.com/')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'KITTEH DOGE')

        session = warc_recorder.new_http_recorder_session()
        session.begin_request(request)
        session.request_data(request.to_bytes())

        class BadRecord(WARCRecord):
            # Stand-in record that yields data and then raises OSError.
            def __init__(self, original_record):
                super().__init__()
                self.block_file = original_record.block_file
                self.fields = original_record.fields

            def __iter__(self):
                chunk = b"where's my elephant?"

                for _ in range(1000):
                    yield chunk

                raise OSError('Oops')

        session._request_record = BadRecord(session._request_record)
        size_before = os.path.getsize(warc_filename)

        with self.assertRaises((OSError, IOError)):
            session.end_request(request)

        size_after = os.path.getsize(warc_filename)
        self.assertEqual(size_after, size_before)
        self.assertFalse(os.path.exists(journal_filename))

        _logger.debug('original offset {0}'.format(size_before))
Exemplo n.º 17
0
    def test_javascript_heavy_inline_monstrosity(self):
        '''Scrape a sample page as JavaScript and look for two known URLs.'''
        scraper = JavaScriptScraper()
        request = Request('http://example.com/test.js')
        response = Response(200, 'OK')
        response.body = Body()

        sample_path = os.path.join(
            ROOT_PATH, 'testing', 'samples', 'twitchplayspokemonfirered.html')

        with wpull.util.reset_file_offset(response.body):
            with open(sample_path, 'rb') as sample_file:
                # Copy the sample starting from byte offset 0x147.
                sample_file.seek(0x147)
                shutil.copyfileobj(sample_file, response.body)

        result = scraper.scrape(request, response)
        inline_urls = result.inline_links
        linked_urls = result.linked_links

        expected_inline = (
            'http://cdn.bulbagarden.net/upload/archive/a/a4/'
            '20090718115357%21195Quagsire.png'
        )
        expected_linked = (
            'http://www.google.com/url?q=http%3A%2F%2Fwww.reddit.com%2F'
            'user%2FGoldenSandslash15&sa=D&sntz=1&'
            'usg=AFQjCNElFBxZYdNm5mWoRSncf5tbdIJQ-A'
        )

        self.assertIn(expected_inline, inline_urls)
        self.assertIn(expected_linked, linked_urls)

        print('\n'.join(inline_urls))
        print('\n'.join(linked_urls))
Exemplo n.º 18
0
    def test_client_duration_timeout(self):
        '''Download '/sleep_long' with a 0.1 duration timeout and expect
        DurationTimeout.'''
        client = Client()

        with self.assertRaises(DurationTimeout):
            with client.session() as session:
                slow_request = Request(self.get_url('/sleep_long'))
                yield from session.start(slow_request)
                yield from session.download(duration_timeout=0.1)
Exemplo n.º 19
0
    def test_html_scraper_links_base_href(self):
        '''Scrape the 'basehref.html' sample and compare the link sets.'''
        walker = ElementWalker(
            css_scraper=CSSScraper(),
            javascript_scraper=JavaScriptScraper(),
        )
        scraper = HTMLScraper(HTMLParser(), walker)
        request = Request('http://example.com/')
        response = Response(200, 'OK')
        response.body = Body()

        sample_path = os.path.join(
            ROOT_PATH, 'testing', 'samples', 'basehref.html')

        with wpull.util.reset_file_offset(response.body):
            with open(sample_path, 'rb') as sample_file:
                shutil.copyfileobj(sample_file, response.body)

        result = scraper.scrape(request, response)
        inline_urls = result.inline_links
        linked_urls = result.linked_links

        self.assertEqual('utf-8', result.encoding)

        expected_inline = {
            'http://cdn.example.com/stylesheet1.css',
            'http://www.example.com/stylesheet2.css',
            'http://example.com/a/stylesheet3.css',
            'http://example.com/a/dir/image1.png',
            'http://example.com/dir/image2.png',
            'http://example.net/image3.png',
            'http://example.com/dir/image4.png',
        }
        expected_linked = {'http://example.com/a/'}

        self.assertEqual(expected_inline, inline_urls)
        self.assertEqual(expected_linked, linked_urls)
Exemplo n.º 20
0
    def test_rss_as_html(self):
        '''Run the HTML scraper on the 'rss.xml' sample served as RSS.'''
        walker = ElementWalker(
            css_scraper=CSSScraper(),
            javascript_scraper=JavaScriptScraper(),
        )
        scraper = HTMLScraper(HTMLParser(), walker)
        request = Request('http://example.com/')
        response = Response(200, '')
        response.body = Body()
        response.fields['content-type'] = 'application/rss+xml'

        sample_path = os.path.join(
            ROOT_PATH, 'testing', 'samples', 'rss.xml')

        with wpull.util.reset_file_offset(response.body):
            with open(sample_path, 'rb') as sample_file:
                shutil.copyfileobj(sample_file, response.body)

        result = scraper.scrape(request, response)

        self.assertTrue(result)

        expected_linked = {
            'http://www.someexamplerssdomain.com/main.html',
            'http://www.wikipedia.org/'
        }

        inline_urls = result.inline_links
        linked_urls = result.linked_links
        self.assertFalse(inline_urls)
        self.assertEqual(expected_linked, linked_urls)
Exemplo n.º 21
0
    def test_sitemap_scraper_xml(self):
        '''Scrape a one-entry XML sitemap and expect its <loc> as a link.'''
        sitemap_xml = b'''<?xml version="1.0" encoding="UTF-8"?>
                <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                   <url>
                      <loc>http://www.example.com/</loc>
                      <lastmod>2005-01-01</lastmod>
                      <changefreq>monthly</changefreq>
                      <priority>0.8</priority>
                   </url>
                </urlset>
            '''

        scraper = SitemapScraper(self.get_html_parser())
        request = Request('http://example.com/sitemap.xml')
        response = Response(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(sitemap_xml)

        result = scraper.scrape(request, response)
        inline_urls = result.inline_links
        linked_urls = result.linked_links

        self.assertEqual({'http://www.example.com/'}, linked_urls)
        self.assertFalse(inline_urls)
Exemplo n.º 22
0
    def test_html_soup(self):
        '''Scrape the 'soup.html' sample and compare the link sets.'''
        walker = ElementWalker(
            css_scraper=CSSScraper(),
            javascript_scraper=JavaScriptScraper(),
        )
        scraper = HTMLScraper(HTMLParser(), walker)
        request = Request('http://example.com/')
        response = Response(200, '')
        response.body = Body()
        response.fields['Refresh'] = 'yes'

        sample_path = os.path.join(
            ROOT_PATH, 'testing', 'samples', 'soup.html')

        with wpull.util.reset_file_offset(response.body):
            with open(sample_path, 'rb') as sample_file:
                shutil.copyfileobj(sample_file, response.body)

        result = scraper.scrape(request, response)
        inline_urls = result.inline_links
        linked_urls = result.linked_links

        expected_inline = {'http://example.com/ABOUTM~1.JPG'}
        expected_linked = {
            'http://example.com/BLOG',
            'http://example.com/web ring/Join.htm',
        }

        self.assertEqual(expected_inline, inline_urls)
        self.assertEqual(expected_linked, linked_urls)
Exemplo n.º 23
0
    def test_duration_timeout(self):
        '''Download '/sleep_long' through WebClient with a 0.1 duration
        timeout and expect DurationTimeout.'''
        client = WebClient()
        slow_request = Request(self.get_url('/sleep_long'))
        session = client.session(slow_request)

        with self.assertRaises(DurationTimeout):
            yield from session.start()
            yield from session.download(duration_timeout=0.1)
Exemplo n.º 24
0
 def test_content_length_and_chunked(self):
     '''Fetch '/content_length_and_chunked' and check the decoded body.'''
     stream = self.new_stream()
     url = self.get_url('/content_length_and_chunked')

     response, body = yield from self.fetch(stream, Request(url))

     self.assertEqual(200, response.status_code)
     self.assertEqual('chunked', response.fields['Transfer-Encoding'])
     self.assertEqual(b'hello world!', body)
Exemplo n.º 25
0
 def test_utf8_header(self):
     '''Fetch '/utf8_header' and check the 'whoa' header value.'''
     stream = self.new_stream()
     url = self.get_url('/utf8_header')

     response, _content = yield from self.fetch(stream, Request(url))

     # The expected value is the UTF-8 bytes of the emoji read back as
     # Latin-1 text.
     expected = '🐱'.encode('utf-8').decode('latin-1')

     self.assertEqual(200, response.status_code)
     self.assertEqual(expected, response.fields['whoa'])
Exemplo n.º 26
0
    def test_false_gzip(self):
        '''Fetch '/false_gzip' and expect the plain 100-byte body.'''
        stream = self.new_stream('127.0.0.1', self._port)
        url = self.get_url('/false_gzip')

        response, body = yield from self.fetch(stream, Request(url))

        self.assertEqual('gzip', response.fields['Content-Encoding'])
        self.assertEqual(b'a' * 100, body)
Exemplo n.º 27
0
    def test_status_line_only(self):
        '''Fetch '/status_line_only' and check status code and body.'''
        stream = self.new_stream('127.0.0.1', self._port)
        url = self.get_url('/status_line_only')

        response, body = yield from self.fetch(stream, Request(url))

        self.assertEqual(200, response.status_code)
        self.assertEqual(b'Hey', body)
Exemplo n.º 28
0
    def test_sitemap_scraper_xml_index(self):
        '''Scrape a one-entry sitemap index and expect its <loc> as a link.'''
        index_xml = b'''<?xml version="1.0" encoding="UTF-8"?>
                <sitemapindex
                xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                   <sitemap>
                      <loc>http://www.example.com/sitemap1.xml.gz</loc>
                      <lastmod>2004-10-01T18:23:17+00:00</lastmod>
                   </sitemap>
                </sitemapindex>
            '''

        scraper = SitemapScraper(self.get_html_parser())
        request = Request('http://example.com/sitemap.xml')
        response = Response(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(index_xml)

        result = scraper.scrape(request, response)
        inline_urls = result.inline_links
        linked_urls = result.linked_links

        self.assertEqual(
            {'http://www.example.com/sitemap1.xml.gz'}, linked_urls)
        self.assertFalse(inline_urls)
Exemplo n.º 29
0
    def end_request(self, request: HTTPRequest):
        '''Finish the request record and write it through the recorder.

        Args:
            request: The request whose serialized length is passed as the
                payload offset.
        '''
        serialized_length = len(request.to_bytes())
        record = self._request_record

        # Rewind the block file before lengths and checksums are set.
        record.block_file.seek(0)
        self._recorder.set_length_and_maybe_checksums(
            record, payload_offset=serialized_length)
        self._recorder.write_record(record)
Exemplo n.º 30
0
    def test_warc_max_size_and_append(self):
        '''Record with max_size=1 and appending=True next to existing files.

        Two empty WARC files exist beforehand; they must stay empty while
        files numbered 00002 and 00003 and the meta file receive data.
        '''
        file_prefix = 'asdf'
        existing_names = ('asdf-00000.warc', 'asdf-00001.warc')
        written_names = (
            'asdf-00002.warc', 'asdf-00003.warc', 'asdf-meta.warc')

        for name in existing_names:
            with open(name, 'w'):
                pass

        params = WARCRecorderParams(
            compress=False, max_size=1, appending=True)
        warc_recorder = WARCRecorder(file_prefix, params=params)

        request = HTTPRequest('http://example.com/1')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'BLAH')

        session = warc_recorder.new_http_recorder_session()

        session.begin_request(request)
        session.request_data(request.to_bytes())
        session.end_request(request)

        session.begin_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.end_response(response)

        session.close()
        warc_recorder.close()

        for name in existing_names + written_names:
            self.assertTrue(os.path.exists(name))

        for name in existing_names:
            self.assertEqual(0, os.path.getsize(name))

        for name in written_names:
            self.assertNotEqual(0, os.path.getsize(name))
Exemplo n.º 31
0
    def test_client_exception_recovery(self):
        '''Fail seven requests, then complete seven more on the same client.'''
        pool = ConnectionPool(
            connection_factory=functools.partial(Connection, timeout=2.0))
        client = Client(connection_pool=pool)

        for _ in range(7):
            with self.assertRaises(NetworkError):
                with client.session() as session:
                    bad_request = Request(
                        self.get_url('/header_early_close'))
                    yield from session.start(bad_request)

        for _ in range(7):
            with client.session() as session:
                good_request = Request(self.get_url('/'))
                response = yield from session.start(good_request)
                self.assertEqual(200, response.status_code)
                yield from session.download()
                self.assertTrue(session.done())
Exemplo n.º 32
0
    def end_request(self, request: HTTPRequest):
        '''Finish the request record and write it through the recorder.

        Args:
            request: The request whose serialized length is passed as the
                payload offset.
        '''
        serialized_length = len(request.to_bytes())
        record = self._request_record

        # Rewind the block file before lengths and checksums are set.
        record.block_file.seek(0)
        self._recorder.set_length_and_maybe_checksums(
            record, payload_offset=serialized_length)
        self._recorder.write_record(record)
Exemplo n.º 33
0
 def test_basic_chunked_trailer(self):
     '''Fetch '/chunked_trailer' and check the fields and body.'''
     stream = self.new_stream()
     url = self.get_url('/chunked_trailer')

     response, body = yield from self.fetch(stream, Request(url))

     self.assertEqual(200, response.status_code)
     self.assertEqual('chunked', response.fields['Transfer-Encoding'])
     self.assertEqual('dolphin', response.fields['Animal'])
     self.assertEqual(b'hello world!', body)
Exemplo n.º 34
0
 def test_connection_refused(self):
     '''Expect ConnectionRefused when fetching from 127.0.0.1 port 1.'''
     stream = self.new_stream('127.0.0.1', 1)

     # assertRaises names the missing exception on failure, whereas the
     # previous bare self.fail() gave no message at all.
     with self.assertRaises(ConnectionRefused):
         yield from self.fetch(stream, Request('http://localhost:1/'))
Exemplo n.º 35
0
 def test_basic_content_length(self):
     '''Fetch '/content_length' and check the header and 100-byte body.'''
     stream = self.new_stream()
     url = self.get_url('/content_length')

     response, body = yield from self.fetch(stream, Request(url))

     self.assertEqual(200, response.status_code)
     self.assertEqual('100', response.fields['Content-Length'])
     self.assertEqual(b'a' * 100, body)
     self.assertEqual(100, len(body))
Exemplo n.º 36
0
    def _add_referrer(cls, request: Request, url_record: URLRecord):
        '''Set the Referer field of the request to the parent URL.

        Nothing is set when the parent URL is HTTPS and the record's own
        scheme is plain HTTP.
        '''
        parent_url = url_record.parent_url

        # Prohibit leak of referrer from HTTPS to HTTP
        # rfc7231 section 5.5.2.
        is_downgrade = (
            parent_url.startswith('https://')
            and url_record.url_info.scheme == 'http'
        )

        if not is_downgrade:
            request.fields['Referer'] = parent_url
Exemplo n.º 37
0
    def test_warc_recorder_journal(self):
        '''Check the '-wpullinc' file exists only while a record is written.'''
        warc_filename = 'asdf.warc'
        warc_prefix = 'asdf'

        params = WARCRecorderParams(compress=False)
        warc_recorder = WARCRecorder(warc_prefix, params=params)

        request = HTTPRequest('http://example.com/')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'KITTEH DOGE')

        session = warc_recorder.new_http_recorder_session()
        session.begin_request(request)
        session.request_data(request.to_bytes())
        test_instance = self

        class MockRecord(WARCRecord):
            # Stand-in record that checks for the journal file when it is
            # iterated.
            def __init__(self, original_record):
                super().__init__()
                self.block_file = original_record.block_file
                self.fields = original_record.fields

            def __iter__(self):
                print(list(os.walk('.')))
                journal_exists = os.path.exists(warc_filename + '-wpullinc')
                test_instance.assertTrue(journal_exists)

                chunk = b"where's my elephant?"

                for _ in range(1000):
                    yield chunk

        session._request_record = MockRecord(session._request_record)

        session.end_request(request)

        self.assertFalse(os.path.exists(warc_filename + '-wpullinc'))
Exemplo n.º 38
0
    def _add_basic_auth_header(self, request: Request):
        '''Set a Basic Authorization field when credentials are available.

        Username and password from the URL take precedence over those on
        the request. Nothing is set unless both are truthy.
        '''
        username = request.url_info.username or request.username
        password = request.url_info.password or request.password

        if not (username and password):
            return

        _logger.debug("Add basic auth header")

        credentials = "{}:{}".format(username, password)
        encoded = base64.b64encode(credentials.encode("utf-8", "replace"))
        request.fields["Authorization"] = "Basic {}".format(
            encoded.decode("utf-8"))
Exemplo n.º 39
0
    def _read_request_header(self) -> Request:
        '''Read header lines from the reader and parse them into a request.

        Reading stops at the first line that is blank after stripping.

        Returns:
            The parsed request, or ``None`` when a line arrives that does not
            end with a newline.

        Raises:
            ProtocolError: 100 lines were read without reaching a blank line.

        Coroutine.
        '''
        request = Request()
        lines_left = 100

        while lines_left:
            lines_left -= 1
            line = yield from self._reader.readline()

            _logger.debug(__('Got line {0}', line))

            if line[-1:] != b'\n':
                return

            if not line.strip():
                return request

            request.parse(line)

        raise ProtocolError('Request has too many headers.')
Exemplo n.º 40
0
    def _add_post_data(self, request: Request):
        '''Turn the request into a form-encoded POST carrying the post data.

        The URL record's post data is used when it is set; otherwise the
        processor's fetch parameters supply it. The method, ``Content-Type``
        and ``Content-Length`` fields are overwritten, and the data is
        written into the request body (a new in-memory body is created when
        the request has none).

        Args:
            request: The request to modify in place.
        '''
        post_data = (
            self._item_session.url_record.post_data
            or self._processor.fetch_params.post_data
        )
        data = wpull.string.to_bytes(post_data)

        request.method = 'POST'
        request.fields['Content-Type'] = 'application/x-www-form-urlencoded'
        request.fields['Content-Length'] = str(len(data))

        # The logging module interpolates arguments with %-style formatting;
        # a '{0}' placeholder makes the record fail to format when emitted.
        _logger.debug('Posting with data %r.', data)

        if not request.body:
            request.body = Body(io.BytesIO())

        with wpull.util.reset_file_offset(request.body):
            request.body.write(data)
Exemplo n.º 41
0
    def start(self, request: Request) -> Response:
        '''Begin a HTTP request

        Sends the request header (and the body, if the request has one) over
        a connection acquired for the request, then reads the response
        header.

        Args:
            request: Request information.

        Returns:
            A response populated with the HTTP headers.

        Raises:
            RuntimeError: The session state is not ``SessionState.ready``.

        Once the headers are received, call :meth:`download`.

        Coroutine.
        '''
        if self._session_state != SessionState.ready:
            raise RuntimeError('Session already started')

        assert not self._request
        self._request = request
        _logger.debug(__('Client fetch request {0}.', request))

        connection = yield from self._acquire_request_connection(request)
        # Passed to write_request below: true only for a proxied connection
        # that is not tunneled.
        full_url = connection.proxied and not connection.tunneled

        self._stream = stream = self._stream_factory(connection)

        yield from self._stream.reconnect()

        request.address = connection.address

        self.event_dispatcher.notify(self.Event.begin_request, request)
        # Forward data written on the stream as request_data events for the
        # duration of the request write only.
        write_callback = functools.partial(self.event_dispatcher.notify, self.Event.request_data)
        stream.data_event_dispatcher.add_write_listener(write_callback)

        yield from stream.write_request(request, full_url=full_url)

        if request.body:
            # A request with a body must declare its length up front.
            assert 'Content-Length' in request.fields
            length = int(request.fields['Content-Length'])
            yield from stream.write_body(request.body, length=length)

        stream.data_event_dispatcher.remove_write_listener(write_callback)
        self.event_dispatcher.notify(self.Event.end_request, request)

        # Forward data read from the stream as response_data events. The
        # listener is not removed in this method.
        read_callback = functools.partial(self.event_dispatcher.notify, self.Event.response_data)
        stream.data_event_dispatcher.add_read_listener(read_callback)

        self._response = response = yield from stream.read_response()
        response.request = request

        self.event_dispatcher.notify(self.Event.begin_response, response)

        # The state only advances after the response header has been read.
        self._session_state = SessionState.request_sent

        return response
Exemplo n.º 42
0
Arquivo: web.py Projeto: chfoo/wpull
    def _add_basic_auth_header(self, request: Request):
        '''Add a Basic ``Authorization`` field if credentials are present.

        The username and password are taken from ``request.url_info`` first
        and from the request otherwise. Both must be truthy for the field to
        be set.
        '''
        username = request.url_info.username or request.username
        password = request.url_info.password or request.password

        if not username or not password:
            return

        _logger.debug('Add basic auth header')

        pair_bytes = '{}:{}'.format(username, password).encode(
            'utf-8', 'replace'
        )
        encoded_pair = base64.b64encode(pair_bytes).decode('utf-8')
        request.fields['Authorization'] = 'Basic {}'.format(encoded_pair)
Exemplo n.º 43
0
    def test_to_dict(self):
        '''Check the dictionary forms of a request and of its response.'''
        url = 'https://foofle.com'
        request = Request(url)
        request_info = request.to_dict()

        self.assertEqual(url, request_info['url'])
        self.assertEqual('https', request_info['url_info']['scheme'])

        for key, expected in (('method', 'GET'), ('protocol', 'http')):
            self.assertEqual(expected, request_info[key])

        response = Response(status_code=200, reason='OK', request=request)
        response_info = response.to_dict()

        self.assertEqual(url, response_info['request']['url'])

        expected_items = (
            ('protocol', 'http'),
            ('status_code', 200),
            ('response_code', 200),
            ('reason', 'OK'),
            ('response_message', 'OK'),
        )

        for key, expected in expected_items:
            self.assertEqual(expected, response_info[key])
Exemplo n.º 44
0
    def test_to_dict_body(self):
        '''Check the ``body`` entry of ``to_dict`` for requests and responses.

        With a ``Body`` attached the entry must be truthy; with
        ``NotImplemented`` as the body it must be falsy.
        '''
        for message_class in (Request, Response):
            with_body = message_class()
            with_body.body = Body()
            info = with_body.to_dict()

            self.assertTrue(info['body'])
            with_body.body.close()

            without_body = message_class()
            without_body.body = NotImplemented
            info = without_body.to_dict()

            self.assertFalse(info['body'])
Exemplo n.º 45
0
    def test_warc_move_max_size(self):
        '''Check that size-split WARC output ends up in the move directory.

        One request/response exchange is recorded with ``max_size=1`` and
        ``move_to='./blah/'``; afterwards the two numbered WARC files, the
        meta WARC file and the CDX file must exist under ``./blah/``.
        '''
        prefix = 'asdf'
        cdx_name = 'asdf.cdx'

        os.mkdir('./blah/')

        recorder = WARCRecorder(
            prefix,
            params=WARCRecorderParams(
                compress=False, cdx=True, move_to='./blah/', max_size=1,
            ),
        )

        http_request = HTTPRequest('http://example.com/1')
        http_request.address = ('0.0.0.0', 80)
        http_response = HTTPResponse(200, 'OK')
        http_response.body = Body()

        with wpull.util.reset_file_offset(http_response.body):
            http_response.body.write(b'BLAH')

        recorder_session = recorder.new_http_recorder_session()
        recorder_session.begin_request(http_request)
        recorder_session.request_data(http_request.to_bytes())
        recorder_session.end_request(http_request)
        recorder_session.begin_response(http_response)
        recorder_session.response_data(http_response.to_bytes())
        recorder_session.response_data(http_response.body.content())
        recorder_session.end_response(http_response)
        recorder_session.close()

        recorder.close()

        expected_names = (
            'asdf-00000.warc', 'asdf-00001.warc', 'asdf-meta.warc', cdx_name,
        )

        for name in expected_names:
            self.assertTrue(os.path.exists('./blah/' + name))
Exemplo n.º 46
0
    def test_cdx_dedup(self):
        '''Check revisit records are written when a URL table has a visit.

        A visit for ``/fennec`` is added to the URL table, then two exchanges
        with the body ``kitbit`` are recorded. The WARC output must contain a
        revisit record referring to the stored record ID, and the body must
        appear only once.
        '''
        visits = URLTable()
        recorder = WARCRecorder(
            'asdf',
            params=WARCRecorderParams(
                compress=False, cdx=True, url_table=visits
            )
        )

        visits.add_visits([
            (
                'http://example.com/fennec',
                '<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>',
                'B62D734VFEKIDLFAB7TTSCSZF64BKAYJ'
            )
        ])

        def record_exchange(request, response):
            '''Feed one request/response pair through a new session.'''
            session = recorder.new_http_recorder_session()
            session.begin_request(request)
            session.request_data(request.to_bytes())
            session.end_request(request)
            session.begin_response(response)
            session.response_data(response.to_bytes())
            session.response_data(response.body.content())
            session.end_response(response)
            session.close()

        fennec_request = HTTPRequest('http://example.com/fennec')
        fennec_request.address = ('0.0.0.0', 80)
        fennec_response = HTTPResponse(200, 'OK')
        fennec_response.body = Body()
        # Measured before any body data is written.
        header_size = len(fennec_response.to_bytes())

        with wpull.util.reset_file_offset(fennec_response.body):
            fennec_response.body.write(b'kitbit')

        record_exchange(fennec_request, fennec_response)

        horse_request = HTTPRequest('http://example.com/horse')
        horse_request.address = ('0.0.0.0', 80)
        horse_response = HTTPResponse(200, 'OKaaaaaaaaaaaaaaaaaaaaaaaaaa')
        horse_response.body = Body()

        with wpull.util.reset_file_offset(horse_response.body):
            horse_response.body.write(b'kitbit')

        record_exchange(horse_request, horse_response)

        _logger.info('FINISHED')

        recorder.close()

        with open('asdf.warc', 'rb') as warc_file:
            warc_bytes = warc_file.read()

        with open('asdf.cdx', 'rb') as cdx_file:
            cdx_bytes = cdx_file.read()

        self.assertTrue(warc_bytes.startswith(b'WARC/1.0'))

        expected_fragments = (
            b'WARC-Type: revisit\r\n',
            b'WARC-Refers-To: '
            b'<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>\r\n',
            b'WARC-Truncated: length\r\n',
            b'WARC-Profile: http://netpreserve.org/warc/1.0/revisit/'
            b'identical-payload-digest\r\n',
            b'Content-Length: ' + str(header_size).encode('ascii') + b'\r\n',
            b'WARC-Target-URI: http://example.com/fennec\r\n',
            b'WARC-Target-URI: http://example.com/horse\r\n',
        )

        for fragment in expected_fragments:
            self.assertIn(fragment, warc_bytes)

        self.assertEqual(1, warc_bytes.count(b'kitbit'))

        self.assertIn(b'http://example.com/horse ', cdx_bytes)
Exemplo n.º 47
0
    def _process_request(self, request: Request):
        '''Forward a client request upstream and relay the response back.

        The request is rejected when its method is ``CONNECT``, when its
        ``Connection`` field contains ``Upgrade``, or when a connected
        ``client_request`` / ``server_begin_response`` hook returns a falsy
        value. Otherwise the request is sent through the HTTP client and the
        response is written to the client writer.

        Args:
            request: The request read from the client.

        Raises:
            NetworkError: Re-raised when the failure occurs while relaying
                the response. A failure while starting the upstream request
                is answered with an error response instead.

        Coroutine.
        '''
        _logger.debug(__('Got request {0}', request))

        if request.method == 'CONNECT':
            self._reject_request('CONNECT is intentionally not supported')
            return

        if self._is_ssl_tunnel and request.url.startswith('http://'):
            # Since we are spying under a SSL tunnel, assume processed requests
            # are SSL
            request.url = request.url.replace('http://', 'https://', 1)

        if 'Upgrade' in request.fields.get('Connection', ''):
            _logger.warning(__(
                _('Connection Upgrade not supported for {}'),
                request.url
            ))
            self._reject_request('Upgrade not supported')
            return

        _logger.debug('Begin response.')

        try:
            action = self.hook_dispatcher.call(self.Event.client_request, request)
        except HookDisconnected:
            # No hook is connected: proceed with the request.
            pass
        else:
            if not action:
                _logger.debug('Proxy force reject request')
                self._reject_request()
                return

        with self._http_client.session() as session:
            if 'Content-Length' in request.fields:
                # The request body is taken directly from the client reader.
                request.body = self._reader

            try:
                response = yield from session.start(request)
            except NetworkError as error:
                _logger.debug('Upstream error', exc_info=True)
                self._write_error_response()
                self.event_dispatcher.notify(self.Event.server_response_error, error)
                return

            response.body = Body()

            try:
                action = self.hook_dispatcher.call(self.Event.server_begin_response, response)
            except HookDisconnected:
                # No hook is connected: relay the response.
                pass
            else:
                if not action:
                    _logger.debug('Proxy force reject request via response')
                    self._reject_request()
                    return

            try:
                # Send the response header to the client first.
                self._writer.write(response.to_bytes())
                yield from self._writer.drain()

                # Each response_data event from the session is written
                # straight to the client.
                session.event_dispatcher.add_listener(
                    Session.Event.response_data,
                    self._writer.write
                )

                yield from session.download(file=response.body, raw=True)

                yield from self._writer.drain()
            except NetworkError as error:
                _logger.debug('Upstream error', exc_info=True)
                self.event_dispatcher.notify(self.Event.server_response_error, error)
                raise

            self.event_dispatcher.notify(self.Event.server_end_response, response)

        _logger.debug('Response done.')
Exemplo n.º 48
0
    def test_copy(self):
        '''Copying a request must complete without raising.'''
        # Check for no crash
        Request('http://twitcharchivestheinternet.invalid/').copy()
Exemplo n.º 49
0
    def test_request_parse(self):
        '''Parse raw header lines and check the resulting URL and fields.

        Covers a request line with a relative path and one with an absolute
        HTTPS URL, each followed by a header value holding non-ASCII bytes.
        '''
        relative_request = Request()
        relative_lines = (
            b'GET /robots.txt HTTP/1.1\r\n',
            b'Host: example.com\r\n',
            'Accept: éxample\r\n'.encode('utf_8'),
            b'\r\n',
        )

        for raw_line in relative_lines:
            relative_request.parse(raw_line)

        self.assertEqual(
            'http://example.com/robots.txt', relative_request.url
        )
        self.assertEqual('example.com', relative_request.fields['host'])
        # The UTF-8 bytes are expected to come back decoded as Latin-1.
        self.assertEqual(
            'éxample'.encode('utf-8').decode('latin-1'),
            relative_request.fields['accept']
        )

        absolute_request = Request()
        absolute_lines = (
            b'GET https://example.com/robots.txt HTTP/1.1\r\n',
            b'Host: example.com\r\n',
            b'Accept: \xffexample\r\n',
            b'\r\n',
        )

        for raw_line in absolute_lines:
            absolute_request.parse(raw_line)

        self.assertEqual(
            'https://example.com/robots.txt', absolute_request.url
        )
        self.assertEqual('example.com', absolute_request.fields['host'])
        self.assertEqual('\xffexample', absolute_request.fields['accept'])
Exemplo n.º 50
0
    def test_warc_recorder(self):
        '''Record one exchange and check the WARC and CDX output.

        The WARC file must contain the expected record types, headers, the
        extra field, the response body and the logged ``FINISHED`` message.
        The CDX file must hold a header line and one entry whose length and
        offset locate a record in the WARC file containing the body.
        '''
        file_prefix = 'asdf'
        warc_filename = 'asdf.warc'
        cdx_filename = 'asdf.cdx'

        warc_recorder = WARCRecorder(
            file_prefix,
            params=WARCRecorderParams(
                compress=False,
                extra_fields=[('Extra-field', 'my_extra_field')],
                cdx=True,
            ),
        )

        request = HTTPRequest('http://example.com/')
        # NOTE(review): prepare_for_send() is called both before and after
        # the address is assigned; confirm whether both calls are needed.
        request.prepare_for_send()
        request.address = ('0.0.0.0', 80)
        request.prepare_for_send()
        response = HTTPResponse(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'KITTEH DOGE')

        session = warc_recorder.new_http_recorder_session()
        session.begin_request(request)
        session.request_data(request.to_bytes())
        session.end_request(request)
        session.begin_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.end_response(response)
        session.close()

        _logger.info('FINISHED')

        warc_recorder.close()

        with open(warc_filename, 'rb') as in_file:
            warc_file_content = in_file.read()

        with open(cdx_filename, 'rb') as in_file:
            cdx_file_content = in_file.read()

        self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))

        expected_fragments = (
            b'WARC-Type: warcinfo\r\n',
            b'Content-Type: application/warc-fields',
            b'WARC-Date: ',
            b'WARC-Record-ID: <urn:uuid:',
            b'WARC-Block-Digest: sha1:',
            b'WARC-Payload-Digest: sha1:',
            b'WARC-Type: request\r\n',
            b'WARC-Target-URI: http://',
            b'Content-Type: application/http;msgtype=request',
            b'WARC-Type: response',
            b'WARC-Concurrent-To: <urn:uuid:',
            b'Content-Type: application/http;msgtype=response',
            'Wpull/{0}'.format(wpull.version.__version__).encode('utf-8'),
            'Python/{0}'.format(
                wpull.util.python_version()).encode('utf-8'),
            b'Extra-Field: my_extra_field',
            b'GET / HTTP',
            b'KITTEH DOGE',
            b'FINISHED',
            b'WARC-Target-URI: urn:X-wpull:log',
            b'Content-Length:',
        )

        for fragment in expected_fragments:
            self.assertIn(fragment, warc_file_content)

        self.assertNotIn(b'Content-Length: 0', warc_file_content)

        cdx_lines = cdx_file_content.split(b'\n')
        cdx_labels = cdx_lines[0].strip().split(b' ')
        cdx_fields = cdx_lines[1].split(b' ')

        print(cdx_lines)

        self.assertEqual(3, len(cdx_lines))
        self.assertEqual(10, len(cdx_labels))
        self.assertEqual(9, len(cdx_fields))
        self.assertTrue(cdx_lines[0].startswith(b' CDX'))

        self.assertEqual(b'http://example.com/', cdx_fields[0])
        self.assertEqual(b'-', cdx_fields[2])
        self.assertEqual(b'200', cdx_fields[3])
        self.assertNotEqual(b'-', cdx_fields[4])
        self.assertNotEqual(b'0', cdx_fields[5])
        self.assertNotEqual(b'0', cdx_fields[6])
        self.assertEqual(
            os.path.basename(warc_filename), cdx_fields[7].decode('ascii'))

        length = int(cdx_fields[5])
        offset = int(cdx_fields[6])

        with open(warc_filename, 'rb') as in_file:
            in_file.seek(offset)
            data = in_file.read(length)

        # A unittest assertion instead of a bare ``assert``, which would be
        # skipped when Python runs with -O.
        self.assertEqual(length, len(data))

        self.assertEqual(b'WARC/1.0', data[:8])

        self.assertIn(b'KITTEH DOGE', data)

        self.validate_warc(warc_filename)
Exemplo n.º 51
0
    def test_warc_recorder_max_size(self):
        '''Check that ``max_size=1`` splits the output into numbered files.

        Two exchanges are recorded. Each must land in its own numbered WARC
        file, the CDX file must mention both URLs, and the logged
        ``FINISHED`` message must be in the meta WARC file.
        '''
        prefix = 'asdf'
        cdx_name = 'asdf.cdx'

        recorder = WARCRecorder(
            prefix,
            params=WARCRecorderParams(
                compress=False,
                extra_fields=[('Extra-field', 'my_extra_field')],
                cdx=True, max_size=1,
            )
        )

        exchanges = (
            ('http://example.com/1', b'KITTEH DOGE'),
            ('http://example.com/2', b'DOGE KITTEH'),
        )

        for url, payload in exchanges:
            http_request = HTTPRequest(url)
            http_request.address = ('0.0.0.0', 80)
            http_response = HTTPResponse(200, 'OK')
            http_response.body = Body()

            with wpull.util.reset_file_offset(http_response.body):
                http_response.body.write(payload)

            recorder_session = recorder.new_http_recorder_session()
            recorder_session.begin_request(http_request)
            recorder_session.request_data(http_request.to_bytes())
            recorder_session.end_request(http_request)
            recorder_session.begin_response(http_response)
            recorder_session.response_data(http_response.to_bytes())
            recorder_session.response_data(http_response.body.content())
            recorder_session.end_response(http_response)
            recorder_session.close()

        _logger.info('FINISHED')

        recorder.close()

        expected_outputs = (
            ('asdf-00000.warc', b'KITTEH DOGE'),
            ('asdf-00001.warc', b'DOGE KITTEH'),
        )

        for warc_name, payload in expected_outputs:
            with open(warc_name, 'rb') as warc_file:
                warc_bytes = warc_file.read()

            self.assertTrue(warc_bytes.startswith(b'WARC/1.0'))
            self.assertIn(b'WARC-Type: warcinfo', warc_bytes)
            self.assertIn(payload, warc_bytes)

        with open(cdx_name, 'rb') as cdx_file:
            cdx_bytes = cdx_file.read()

        cdx_lines = cdx_bytes.split(b'\n')
        cdx_labels = cdx_lines[0].strip().split(b' ')

        print(cdx_lines)

        self.assertEqual(4, len(cdx_lines))
        self.assertEqual(10, len(cdx_labels))

        for url_bytes in (b'http://example.com/1', b'http://example.com/2'):
            self.assertIn(url_bytes, cdx_bytes)

        with open('asdf-meta.warc', 'rb') as meta_file:
            meta_bytes = meta_file.read()

        self.assertIn(b'FINISHED', meta_bytes)

        for warc_name in ('asdf-00000.warc', 'asdf-00001.warc',
                          'asdf-meta.warc'):
            self.validate_warc(warc_name)