Example #1
 def response_callback(request):
     request.prepare_for_send()
     self.assertTrue(request.url_info.url.endswith('robots.txt'))
     response = Response(500, 'Oops')
     response.request = request
     checker.web_client.session_obj.done_value = True
     return response
Example #2
    def test_javascript_heavy_inline_monstrosity(self):
        scraper = JavaScriptScraper()
        request = Request('http://example.com/test.js')
        response = Response(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                          'twitchplayspokemonfirered.html')
            with open(html_file_path, 'rb') as in_file:
                in_file.seek(0x147)
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertIn(
            'http://cdn.bulbagarden.net/upload/archive/a/a4/'
            '20090718115357%21195Quagsire.png', inline_urls)
        self.assertIn(
            'http://www.google.com/url?q=http%3A%2F%2Fwww.reddit.com%2F'
            'user%2FGoldenSandslash15&sa=D&sntz=1&'
            'usg=AFQjCNElFBxZYdNm5mWoRSncf5tbdIJQ-A', linked_urls)

        print('\n'.join(inline_urls))
        print('\n'.join(linked_urls))
Example #3
    def test_sitemap_scraper_xml_index(self):
        scraper = SitemapScraper(self.get_html_parser())
        request = Request('http://example.com/sitemap.xml')
        response = Response(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'''<?xml version="1.0" encoding="UTF-8"?>
                <sitemapindex
                xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                   <sitemap>
                      <loc>http://www.example.com/sitemap1.xml.gz</loc>
                      <lastmod>2004-10-01T18:23:17+00:00</lastmod>
                   </sitemap>
                </sitemapindex>
            ''')

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertEqual({
            'http://www.example.com/sitemap1.xml.gz',
        }, linked_urls)
        self.assertFalse(inline_urls)
Example #4
    def test_response(self):
        response = Response(200, 'OK')
        response.fields['Cake'] = 'dolphin'

        self.assertEqual((b'HTTP/1.1 200 OK\r\n'
                          b'Cake: dolphin\r\n'
                          b'\r\n'), response.to_bytes())
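
Note: the header bytes produced by to_bytes() can be fed back through Response.parse() one line at a time, as Example #41 further down does. A minimal round-trip sketch assuming the same wpull Response class used in these tests (the import path is an assumption):

    from wpull.http.request import Response  # assumed import path

    response = Response(200, 'OK')
    response.fields['Cake'] = 'dolphin'
    raw = response.to_bytes()
    # raw == b'HTTP/1.1 200 OK\r\nCake: dolphin\r\n\r\n'

    # Feed the serialized header back one line at a time, as Example #41 does.
    parsed = Response()
    for line in raw.splitlines(keepends=True):
        parsed.parse(line)

    assert parsed.status_code == 200
    assert parsed.fields['Cake'] == 'dolphin'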
Example #5
    def test_adjust_extension(self):
        writer = AntiClobberFileWriter(self.get_path_namer(), adjust_extension=True)

        test_data = [
            ('text/html', '/mordor', 'mordor.html'),
            ('text/html', '/mordor?ring.asp', 'mordor?ring.asp.html'),
            ('text/html', '/mordor?ring.htm', 'mordor?ring.htm'),
            ('text/plain', '/static/my_file.txt', 'static/my_file.txt'),
            ('text/css', '/static/style.css', 'static/style.css'),
            ('text/css', '/static/style.css?hamster.exe', 'static/style.css?hamster.exe.css'),
            ('text/html', '/static/mojibake.html', 'static/mojibake.html'),
            ('text/html', '/static/mojibake.html?dolphin.png', 'static/mojibake.html?dolphin.png.html'),
        ]

        for mime_type, path, filename in test_data:
            session = writer.session()

            request = HTTPRequest('http://example.com' + path)
            response = HTTPResponse(status_code=200, reason='OK', request=request)
            response.fields['Content-Type'] = mime_type

            session.process_request(request)
            session.process_response(response)
            session.save_document(response)

            print(filename, list(os.walk('.')))
            self.assertTrue(os.path.exists(filename))
Example #6
    def test_javascript_heavy_inline_monstrosity(self):
        scraper = JavaScriptScraper()
        request = Request('http://example.com/test.js')
        response = Response(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH,
                                          'testing', 'samples',
                                          'twitchplayspokemonfirered.html')
            with open(html_file_path, 'rb') as in_file:
                in_file.seek(0x147)
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertIn(
            'http://cdn.bulbagarden.net/upload/archive/a/a4/'
            '20090718115357%21195Quagsire.png',
            inline_urls
        )
        self.assertIn(
            'http://www.google.com/url?q=http%3A%2F%2Fwww.reddit.com%2F'
            'user%2FGoldenSandslash15&sa=D&sntz=1&'
            'usg=AFQjCNElFBxZYdNm5mWoRSncf5tbdIJQ-A',
            linked_urls
        )

        print('\n'.join(inline_urls))
        print('\n'.join(linked_urls))
Example #7
File: html_test.py Project: charygao/wpull
    def test_html_scraper_links_base_href(self):
        element_walker = ElementWalker(css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(self.get_html_parser(), element_walker)
        request = Request("http://example.com/")
        response = Response(200, "OK")
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH, "testing", "samples", "basehref.html")
            with open(html_file_path, "rb") as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertEqual("utf-8", scrape_result.encoding)

        self.assertEqual(
            {
                "http://cdn.example.com/stylesheet1.css",
                "http://www.example.com/stylesheet2.css",
                "http://example.com/a/stylesheet3.css",
                "http://example.com/a/dir/image1.png",
                "http://example.com/dir/image2.png",
                "http://example.net/image3.png",
                "http://example.com/dir/image4.png",
            },
            inline_urls,
        )
        self.assertEqual({"http://example.com/a/"}, linked_urls)
Example #8
    def test_sitemap_scraper_xml_index(self):
        scraper = SitemapScraper(self.get_html_parser())
        request = Request("http://example.com/sitemap.xml")
        response = Response(200, "OK")
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(
                b"""<?xml version="1.0" encoding="UTF-8"?>
                <sitemapindex
                xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                   <sitemap>
                      <loc>http://www.example.com/sitemap1.xml.gz</loc>
                      <lastmod>2004-10-01T18:23:17+00:00</lastmod>
                   </sitemap>
                </sitemapindex>
            """
            )

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertEqual({"http://www.example.com/sitemap1.xml.gz"}, linked_urls)
        self.assertFalse(inline_urls)
Example #9
    def test_content_disposition(self):
        writer = AntiClobberFileWriter(self.get_path_namer(),
                                       content_disposition=True)

        test_data = [
            ('hello1.txt', 'hello1.txt'),
            ('hello2.txt;', 'hello2.txt'),
            ('"hello3.txt"', 'hello3.txt'),
            ('\'hello4.txt\'', 'hello4.txt'),
        ]

        for raw_filename, filename in test_data:
            session = writer.session()

            request = HTTPRequest('http://example.com')
            response = HTTPResponse(status_code=200,
                                    reason='OK',
                                    request=request)
            response.fields[
                'Content-Disposition'] = 'attachment; filename={}'.format(
                    raw_filename)

            session.process_request(request)
            session.process_response(response)
            session.save_document(response)

            print(list(os.walk('.')))
            self.assertTrue(os.path.exists(filename))
Example #10
File: html_test.py Project: charygao/wpull
    def test_html_wrong_charset(self):
        element_walker = ElementWalker(css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(self.get_html_parser(), element_walker)
        request = Request("http://example.com/")
        response = Response(200, "")
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH, "testing", "samples", "kcna.html")
            with open(html_file_path, "rb") as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertEqual("utf-16-le", scrape_result.encoding)

        self.assertEqual(
            {
                "http://example.com/utm/__utm.js",
                "http://example.com/Knewskage.gif",
                "http://example.com/Lline.gif",
                "http://example.com/Sline.gif",
                "http://example.com/korean01.gif",
                "http://example.com/korean02.gif",
                "http://example.com/english01.gif",
                "http://example.com/english02.gif",
                "http://example.com/Tongsinkage.gif",
                "http://example.com/Knewskage.gif",
            },
            inline_urls,
        )
        self.assertEqual({"http://example.com/index-k.htm", "http://example.com/index-e.htm"}, linked_urls)
Example #11
    def test_html_scraper_links_base_href(self):
        element_walker = ElementWalker(css_scraper=CSSScraper(),
                                       javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(HTMLParser(), element_walker)
        request = Request('http://example.com/')
        response = Response(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                          'basehref.html')
            with open(html_file_path, 'rb') as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertEqual('utf-8', scrape_result.encoding)

        self.assertEqual(
            {
                'http://cdn.example.com/stylesheet1.css',
                'http://www.example.com/stylesheet2.css',
                'http://example.com/a/stylesheet3.css',
                'http://example.com/a/dir/image1.png',
                'http://example.com/dir/image2.png',
                'http://example.net/image3.png',
                'http://example.com/dir/image4.png',
            }, inline_urls)
        self.assertEqual({'http://example.com/a/'}, linked_urls)
Example #12
    def test_html_soup(self):
        element_walker = ElementWalker(css_scraper=CSSScraper(),
                                       javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(HTMLParser(), element_walker)
        request = Request('http://example.com/')
        response = Response(200, '')
        response.body = Body()
        response.fields['Refresh'] = 'yes'

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                          'soup.html')
            with open(html_file_path, 'rb') as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertEqual({'http://example.com/ABOUTM~1.JPG'}, inline_urls)
        self.assertEqual(
            {
                'http://example.com/BLOG',
                'http://example.com/web ring/Join.htm',
            }, linked_urls)
Example #13
    def test_sitemap_scraper_xml(self):
        scraper = SitemapScraper(self.get_html_parser())
        request = Request('http://example.com/sitemap.xml')
        response = Response(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'''<?xml version="1.0" encoding="UTF-8"?>
                <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                   <url>
                      <loc>http://www.example.com/</loc>
                      <lastmod>2005-01-01</lastmod>
                      <changefreq>monthly</changefreq>
                      <priority>0.8</priority>
                   </url>
                </urlset>
            ''')

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertEqual({
            'http://www.example.com/',
        }, linked_urls)
        self.assertFalse(inline_urls)
Example #14
    def test_rss_as_html(self):
        element_walker = ElementWalker(css_scraper=CSSScraper(),
                                       javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(HTMLParser(), element_walker)
        request = Request('http://example.com/')
        response = Response(200, '')
        response.body = Body()
        response.fields['content-type'] = 'application/rss+xml'

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                          'rss.xml')
            with open(html_file_path, 'rb') as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response)

        self.assertTrue(scrape_result)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links
        self.assertFalse(inline_urls)
        self.assertEqual(
            {
                'http://www.someexamplerssdomain.com/main.html',
                'http://www.wikipedia.org/'
            }, linked_urls)
Example #15
    def get_http_header(self) -> Response:
        '''Return the HTTP header.

        It only attempts to read the first 4 KiB of the payload.

        Returns:
            Response, None: Returns an instance of
            :class:`.http.request.Response` or None.
        '''
        with wpull.util.reset_file_offset(self.block_file):
            data = self.block_file.read(4096)

        # re.DOTALL lets the lazy group span multiple header lines.
        match = re.match(br'(.*?\r?\n\r?\n)', data, re.DOTALL)

        if not match:
            return

        status_line, dummy, field_str = match.group(1).partition(b'\n')

        try:
            version, code, reason = Response.parse_status_line(status_line)
        except ValueError:
            return

        response = Response(status_code=code, reason=reason, version=version)

        try:
            response.fields.parse(field_str, strict=False)
        except ValueError:
            return

        return response
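
The pattern above captures everything up to the first blank line and then splits off the status line with partition(). One thing to note: Python's re treats '.' as not matching newlines unless re.DOTALL is given, so the lazy group only spans several header lines with that flag. A standalone sketch of the same split using only the standard library (the sample header bytes are made up for illustration):

    import re

    data = b'HTTP/1.1 404 Not Found\r\nContent-Type: text/html\r\n\r\n<html>...'

    # The lazy group stops at the first blank line; DOTALL lets it cross header lines.
    match = re.match(br'(.*?\r?\n\r?\n)', data, re.DOTALL)
    status_line, _, field_str = match.group(1).partition(b'\n')

    print(status_line)   # b'HTTP/1.1 404 Not Found\r'
    print(field_str)     # b'Content-Type: text/html\r\n\r\n'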
Example #16
    def test_adjust_extension(self):
        writer = AntiClobberFileWriter(self.get_path_namer(),
                                       adjust_extension=True)

        test_data = [
            ('text/html', '/mordor', 'mordor.html'),
            ('text/html', '/mordor?ring.asp', 'mordor?ring.asp.html'),
            ('text/html', '/mordor?ring.htm', 'mordor?ring.htm'),
            ('text/plain', '/static/my_file.txt', 'static/my_file.txt'),
            ('text/css', '/static/style.css', 'static/style.css'),
            ('text/css', '/static/style.css?hamster.exe',
             'static/style.css?hamster.exe.css'),
            ('text/html', '/static/mojibake.html', 'static/mojibake.html'),
            ('text/html', '/static/mojibake.html?dolphin.png',
             'static/mojibake.html?dolphin.png.html'),
        ]

        for mime_type, path, filename in test_data:
            session = writer.session()

            request = HTTPRequest('http://example.com' + path)
            response = HTTPResponse(status_code=200,
                                    reason='OK',
                                    request=request)
            response.fields['Content-Type'] = mime_type

            session.process_request(request)
            session.process_response(response)
            session.save_document(response)

            print(filename, list(os.walk('.')))
            self.assertTrue(os.path.exists(filename))
Example #17
    def test_sitemap_scraper_xml(self):
        scraper = SitemapScraper(self.get_html_parser())
        request = Request("http://example.com/sitemap.xml")
        response = Response(200, "OK")
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(
                b"""<?xml version="1.0" encoding="UTF-8"?>
                <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                   <url>
                      <loc>http://www.example.com/</loc>
                      <lastmod>2005-01-01</lastmod>
                      <changefreq>monthly</changefreq>
                      <priority>0.8</priority>
                   </url>
                </urlset>
            """
            )

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertEqual({"http://www.example.com/"}, linked_urls)
        self.assertFalse(inline_urls)
Example #18
    def test_http_response(self):
        response = Response(200, 'OK', version='HTTP/1.0')
        response.fields['hello'] = 'world'

        new_response = HTTPResponseInfoWrapper(response)
        info = new_response.info()

        self.assertEqual('world', info.get('hello'))
Example #19
 def response_callback(request):
     request.prepare_for_send()
     self.assertTrue(request.url_info.url.endswith('robots.txt'))
     response = Response(200, 'OK')
     response.request = request
     response.body = io.StringIO('User-agent:*\nDisallow: /\n')
     checker.web_client.session_obj.done_value = True
     return response
Example #20
    def test_response(self):
        response = Response(200, 'OK')
        response.fields['Cake'] = 'dolphin'

        self.assertEqual(
            (b'HTTP/1.1 200 OK\r\n'
             b'Cake: dolphin\r\n'
             b'\r\n'),
            response.to_bytes()
        )
Example #21
    def test_xml_detect(self):
        self.assertTrue(
            XMLDetector.is_file(io.BytesIO(
                '<?xml version='.encode('utf-16le'))))
        self.assertFalse(
            XMLDetector.is_file(
                io.BytesIO('<!DOCTYPE html><html><body>'.encode('utf-16le'))))
        self.assertFalse(XMLDetector.is_file(io.BytesIO(b'<html><body>hello')))
        self.assertTrue(XMLDetector.is_file(io.BytesIO(b'<?xml version')))
        self.assertTrue(
            XMLDetector.is_url(URLInfo.parse('example.com/index.xml')))
        self.assertFalse(
            XMLDetector.is_url(URLInfo.parse('example.com/image.jpg')))
        self.assertTrue(
            XMLDetector.is_request(Request('example.com/index.xml')))
        self.assertFalse(
            XMLDetector.is_request(Request('example.com/image.jpg')))

        response = Response(200, 'OK')
        response.fields['Content-Type'] = 'text/xml'
        self.assertTrue(XMLDetector.is_response(response))

        response = Response(200, 'OK')
        response.fields['Content-Type'] = 'application/xml'
        self.assertTrue(XMLDetector.is_response(response))

        response = Response(200, 'OK')
        response.fields['Content-Type'] = 'image/png'
        self.assertFalse(XMLDetector.is_response(response))
Example #22
        def response_callback_3(request):
            request.prepare_for_send()
            self.assertEqual('http://www.example.net/robots.txt',
                             request.url_info.url)

            response = Response(200, 'OK')
            response.request = request
            response.body = io.StringIO('User-agent:*\nAllow: /\n')

            checker.web_client.session_obj.done_value = True
            return response
Example #23
        def response_callback(request):
            request.prepare_for_send()
            self.assertTrue(request.url_info.url.endswith('robots.txt'))
            response = Response(302, 'See else')
            response.request = request
            response.fields['Location'] = '/robots.txt'

            nonlocal_dict['counter'] += 1

            if nonlocal_dict['counter'] > 20:
                raise ProtocolError('Mock redirect loop error.')

            return response
Example #24
File: html_test.py Project: charygao/wpull
    def test_html_scraper_reject_type(self):
        element_walker = ElementWalker(css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(self.get_html_parser(), element_walker)
        request = Request("http://example.com/")
        response = Response(200, "OK")
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH, "testing", "samples", "many_urls.html")
            with open(html_file_path, "rb") as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response, link_type=LinkType.css)
        self.assertFalse(scrape_result)
Example #25
    def test_html_detect(self):
        self.assertTrue(HTMLReader.is_file(
            io.BytesIO('<html><body>hi</body></html>'.encode('utf-16le'))
        ))
        self.assertFalse(HTMLReader.is_file(
            io.BytesIO('hello world!'.encode('utf-16le'))
        ))
        self.assertTrue(HTMLReader.is_file(
            io.BytesIO(b'<title>hello</title>hi')
        ))
        self.assertTrue(HTMLReader.is_file(
            io.BytesIO(b'<html><body>hello')
        ))
        self.assertTrue(HTMLReader.is_file(
            io.BytesIO(
                b'The document has moved <a href="somewhere.html">here</a>'
            )
        ))
        self.assertTrue(
            HTMLReader.is_url(URLInfo.parse('example.com/index.htm'))
        )
        self.assertTrue(
            HTMLReader.is_url(URLInfo.parse('example.com/index.html'))
        )
        self.assertTrue(
            HTMLReader.is_url(URLInfo.parse('example.com/index.dhtm'))
        )
        self.assertTrue(
            HTMLReader.is_url(URLInfo.parse('example.com/index.xhtml'))
        )
        self.assertTrue(
            HTMLReader.is_url(URLInfo.parse('example.com/index.xht'))
        )
        self.assertFalse(
            HTMLReader.is_url(URLInfo.parse('example.com/image.jpg'))
        )
        self.assertTrue(
            HTMLReader.is_request(Request('example.com/index.html'))
        )
        self.assertFalse(
            HTMLReader.is_request(Request('example.com/image.jpg'))
        )

        response = Response(200, 'OK')
        response.fields['Content-Type'] = 'text/html'
        self.assertTrue(HTMLReader.is_response(response))

        response = Response(200, 'OK')
        response.fields['Content-Type'] = 'image/png'
        self.assertFalse(HTMLReader.is_response(response))
Example #26
    def test_js_detect(self):
        self.assertTrue(JavaScriptReader.is_file(io.BytesIO("var a = 1;".encode("utf-16le"))))
        self.assertTrue(JavaScriptReader.is_file(io.BytesIO("setTimeout(".encode("utf-16le"))))
        self.assertFalse(JavaScriptReader.is_file(io.BytesIO("hello world!".encode("utf-16le"))))
        self.assertFalse(JavaScriptReader.is_file(io.BytesIO(b"<html><body>hello")))
        self.assertTrue(JavaScriptReader.is_file(io.BytesIO(b"<html><body>hello")) is VeryFalse)

        response = Response(200, "OK")
        response.fields["Content-Type"] = "application/javascript"
        self.assertTrue(JavaScriptReader.is_response(response))

        response = Response(200, "OK")
        response.fields["Content-Type"] = "image/png"
        self.assertFalse(JavaScriptReader.is_response(response))
Example #27
        def response_callback_2(request):
            request.prepare_for_send()
            self.assertEqual('http://www.example.com/robots.txt',
                             request.url_info.url)

            response = Response(301, 'Moved')
            response.fields['location'] = 'http://www.example.net/robots.txt'
            response.request = request

            checker.web_client.mock_response_callback = response_callback_3
            checker.web_client.request = Request(
                'http://www.example.net/robots.txt')

            return response
Example #28
    def test_to_dict_body(self):
        request = Request()
        request.body = Body()
        request_dict = request.to_dict()

        self.assertTrue(request_dict['body'])
        request.body.close()

        request = Request()
        request.body = NotImplemented
        request_dict = request.to_dict()

        self.assertFalse(request_dict['body'])

        response = Response()
        response.body = Body()
        response_dict = response.to_dict()

        self.assertTrue(response_dict['body'])
        response.body.close()

        response = Response()
        response.body = NotImplemented
        response_dict = response.to_dict()

        self.assertFalse(response_dict['body'])
Example #29
    def test_warc_max_size_and_append(self):
        file_prefix = 'asdf'

        with open('asdf-00000.warc', 'w'):
            pass

        with open('asdf-00001.warc', 'w'):
            pass

        warc_recorder = WARCRecorder(
            file_prefix,
            params=WARCRecorderParams(
                compress=False,
                max_size=1,
                appending=True
            ),
        )

        request = HTTPRequest('http://example.com/1')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'BLAH')

        session = warc_recorder.new_http_recorder_session()
        session.begin_request(request)
        session.request_data(request.to_bytes())
        session.end_request(request)
        session.begin_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.end_response(response)
        session.close()

        warc_recorder.close()

        self.assertTrue(os.path.exists('asdf-00000.warc'))
        self.assertTrue(os.path.exists('asdf-00001.warc'))
        self.assertTrue(os.path.exists('asdf-00002.warc'))
        self.assertTrue(os.path.exists('asdf-00003.warc'))
        self.assertTrue(os.path.exists('asdf-meta.warc'))

        self.assertEqual(0, os.path.getsize('asdf-00000.warc'))
        self.assertEqual(0, os.path.getsize('asdf-00001.warc'))
        self.assertNotEqual(0, os.path.getsize('asdf-00002.warc'))
        self.assertNotEqual(0, os.path.getsize('asdf-00003.warc'))
        self.assertNotEqual(0, os.path.getsize('asdf-meta.warc'))
Example #30
    def test_progress_http(self):
        progress = ProgressPrinter(stream=sys.stdout)

        request = HTTPRequest('http://example.com')
        response = HTTPResponse(206, 'OK')
        response.fields['Content-Size'] = '1024'
        response.fields['Content-Range'] = 'bytes 10-/2048'

        progress.update_from_begin_request(request)
        progress.update_from_begin_response(response)

        for dummy in range(100):
            progress.update_with_data(b'abc')

        progress.update_from_end_response(response)
Example #31
    def test_sitemap_scraper_invalid_robots(self):
        scraper = SitemapScraper(self.get_html_parser())
        request = Request("http://example.com/robots.txt")
        response = Response(200, "OK")
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b"dsfju3wrji kjasSItemapsdmjfkl wekie;er :Ads fkj3m /Dk")

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertFalse(linked_urls)
        self.assertFalse(inline_urls)
Example #32
File: css_test.py Project: Super-Rad/wpull
    def test_css_scraper_reject_type(self):
        scraper = CSSScraper()
        request = Request('http://example.com/styles.css')
        response = Response(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH,
                                          'testing', 'samples', 'styles.css')
            with open(html_file_path, 'rb') as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response,
                                       link_type=LinkType.html)
        self.assertFalse(scrape_result)
Example #33
    def test_sitemap_scraper_robots(self):
        scraper = SitemapScraper(self.get_html_parser())
        request = Request("http://example.com/robots.txt")
        response = Response(200, "OK")
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b"Sitemap: http://example.com/sitemap00.xml")

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertEqual({"http://example.com/sitemap00.xml"}, linked_urls)
        self.assertFalse(inline_urls)
Example #34
    def test_warc_recorder_rollback(self):
        warc_filename = 'asdf.warc'
        warc_prefix = 'asdf'

        with open(warc_filename, 'wb') as warc_file:
            warc_file.write(b'a' * 10)

        warc_recorder = WARCRecorder(
            warc_prefix,
            params=WARCRecorderParams(
                compress=False,
            )
        )

        request = HTTPRequest('http://example.com/')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'KITTEH DOGE')

        session = warc_recorder.new_http_recorder_session()
        session.begin_request(request)
        session.request_data(request.to_bytes())

        class BadRecord(WARCRecord):
            def __init__(self, original_record):
                super().__init__()
                self.block_file = original_record.block_file
                self.fields = original_record.fields

            def __iter__(self):
                for dummy in range(1000):
                    yield b"where's my elephant?"
                raise OSError('Oops')

        session._request_record = BadRecord(session._request_record)
        original_offset = os.path.getsize(warc_filename)

        with self.assertRaises((OSError, IOError)):
            session.end_request(request)

        new_offset = os.path.getsize(warc_filename)
        self.assertEqual(new_offset, original_offset)
        self.assertFalse(os.path.exists(warc_filename + '-wpullinc'))

        _logger.debug('original offset {0}'.format(original_offset))
Example #35
File: format.py Project: Super-Rad/wpull
    def get_http_header(self) -> Response:
        '''Return the HTTP header.

        It only attempts to read the first 4 KiB of the payload.

        Returns:
            Response, None: Returns an instance of
            :class:`.http.request.Response` or None.
        '''
        with wpull.util.reset_file_offset(self.block_file):
            data = self.block_file.read(4096)

        # re.DOTALL lets the lazy group span multiple header lines.
        match = re.match(br'(.*?\r?\n\r?\n)', data, re.DOTALL)

        if not match:
            return

        status_line, dummy, field_str = match.group(1).partition(b'\n')

        try:
            version, code, reason = Response.parse_status_line(status_line)
        except ValueError:
            return

        response = Response(status_code=code, reason=reason, version=version)

        try:
            response.fields.parse(field_str, strict=False)
        except ValueError:
            return

        return response
Example #36
    def test_javascript_reject_type(self):
        scraper = JavaScriptScraper()
        request = Request('http://example.com/script.js')
        response = Response(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                          'script.js')
            with open(html_file_path, 'rb') as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request,
                                       response,
                                       link_type=LinkType.css)
        self.assertFalse(scrape_result)
Example #37
File: html_test.py Project: charygao/wpull
    def test_html_serious_bad_encoding(self):
        element_walker = ElementWalker(css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(self.get_html_parser(), element_walker, encoding_override="utf8")
        request = Request("http://example.com/")
        response = Response(200, "")
        response.body = Body()
        response.fields["content-type"] = "text/html; charset=utf8"

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH, "testing", "samples", "xkcd_1_evil.html")
            with open(html_file_path, "rb") as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_info = scraper.scrape(request, response)

        self.assertTrue(scrape_info)
Example #38
File: html_test.py Project: charygao/wpull
    def test_html_encoding_lxml_name_mismatch(self):
        """It should accept encoding names with underscore."""
        element_walker = ElementWalker(css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(self.get_html_parser(), element_walker)
        request = Request("http://example.com/")
        response = Response(200, "")
        response.body = Body()
        response.fields["content-type"] = "text/html; charset=EUC_KR"

        with wpull.util.reset_file_offset(response.body):
            response.body.write("힖".encode("euc_kr"))

        scrape_info = scraper.scrape(request, response)

        self.assertTrue(scrape_info)
        self.assertEqual("euc_kr", scrape_info["encoding"])
Example #39
    def test_bad_xml(self):
        element_walker = ElementWalker(css_scraper=CSSScraper(),
                                       javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(HTMLParser(), element_walker)
        request = Request('http://example.com/')
        response = Response(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                          'foxstripcomics_bad_xml.html')
            with open(html_file_path, 'rb') as in_file:
                shutil.copyfileobj(in_file, response.body)

        # No crash
        scraper.scrape(request, response, link_type=LinkType.html)
Example #40
    def test_sitemap_scraper_invalid_robots(self):
        scraper = SitemapScraper(self.get_html_parser())
        request = Request('http://example.com/robots.txt')
        response = Response(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(
                b'dsfju3wrji kjasSItemapsdmjfkl wekie;er :Ads fkj3m /Dk')

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertFalse(linked_urls)
        self.assertFalse(inline_urls)
Example #41
    def test_response_empty_reason_line(self):
        response = Response()
        response.parse(b'HTTP/1.0 200\r\n')
        response.parse(b'Cake: dolphin\r\n')
        response.parse(b'\r\n')

        self.assertEqual(200, response.status_code)
        self.assertEqual('', response.reason)
        self.assertEqual('dolphin', response.fields['Cake'])
Example #42
    def test_warc_max_size_and_append(self):
        file_prefix = 'asdf'

        with open('asdf-00000.warc', 'w'):
            pass

        with open('asdf-00001.warc', 'w'):
            pass

        warc_recorder = WARCRecorder(
            file_prefix,
            params=WARCRecorderParams(compress=False,
                                      max_size=1,
                                      appending=True),
        )

        request = HTTPRequest('http://example.com/1')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'BLAH')

        session = warc_recorder.new_http_recorder_session()
        session.begin_request(request)
        session.request_data(request.to_bytes())
        session.end_request(request)
        session.begin_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.end_response(response)
        session.close()

        warc_recorder.close()

        self.assertTrue(os.path.exists('asdf-00000.warc'))
        self.assertTrue(os.path.exists('asdf-00001.warc'))
        self.assertTrue(os.path.exists('asdf-00002.warc'))
        self.assertTrue(os.path.exists('asdf-00003.warc'))
        self.assertTrue(os.path.exists('asdf-meta.warc'))

        self.assertEqual(0, os.path.getsize('asdf-00000.warc'))
        self.assertEqual(0, os.path.getsize('asdf-00001.warc'))
        self.assertNotEqual(0, os.path.getsize('asdf-00002.warc'))
        self.assertNotEqual(0, os.path.getsize('asdf-00003.warc'))
        self.assertNotEqual(0, os.path.getsize('asdf-meta.warc'))
Example #43
    def test_html_encoding_lxml_name_mismatch(self):
        '''It should accept encoding names with underscore.'''
        element_walker = ElementWalker(css_scraper=CSSScraper(),
                                       javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(HTMLParser(), element_walker)
        request = Request('http://example.com/')
        response = Response(200, '')
        response.body = Body()
        response.fields['content-type'] = 'text/html; charset=EUC_KR'

        with wpull.util.reset_file_offset(response.body):
            response.body.write('힖'.encode('euc_kr'))

        scrape_info = scraper.scrape(request, response)

        self.assertTrue(scrape_info)
        self.assertEqual('euc_kr', scrape_info['encoding'])
Example #44
    def test_sitemap_scraper_robots(self):
        scraper = SitemapScraper(self.get_html_parser())
        request = Request('http://example.com/robots.txt')
        response = Response(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'Sitemap: http://example.com/sitemap00.xml')

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertEqual({
            'http://example.com/sitemap00.xml',
        }, linked_urls)
        self.assertFalse(inline_urls)
Example #45
    def test_warc_recorder_rollback(self):
        warc_filename = 'asdf.warc'
        warc_prefix = 'asdf'

        with open(warc_filename, 'wb') as warc_file:
            warc_file.write(b'a' * 10)

        warc_recorder = WARCRecorder(
            warc_prefix,
            params=WARCRecorderParams(
                compress=False,
            )
        )

        request = HTTPRequest('http://example.com/')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'KITTEH DOGE')

        session = warc_recorder.new_http_recorder_session()
        session.begin_request(request)
        session.request_data(request.to_bytes())

        class BadRecord(WARCRecord):
            def __init__(self, original_record):
                super().__init__()
                self.block_file = original_record.block_file
                self.fields = original_record.fields

            def __iter__(self):
                for dummy in range(1000):
                    yield b"where's my elephant?"
                raise OSError('Oops')

        session._request_record = BadRecord(session._request_record)
        original_offset = os.path.getsize(warc_filename)

        with self.assertRaises((OSError, IOError)):
            session.end_request(request)

        new_offset = os.path.getsize(warc_filename)
        self.assertEqual(new_offset, original_offset)
        self.assertFalse(os.path.exists(warc_filename + '-wpullinc'))

        _logger.debug('original offset {0}'.format(original_offset))
Example #46
File: html_test.py Project: charygao/wpull
    def test_html_not_quite_charset(self):
        element_walker = ElementWalker(css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(self.get_html_parser(), element_walker)
        request = Request("http://example.com/")
        response = Response(200, "")
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH, "testing", "samples", "videogame_top.htm")
            with open(html_file_path, "rb") as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertIn("http://example.com/copyright_2001_2006_rtype.gif", inline_urls)
        self.assertIn("http://www.geocities.jp/gamehouse_grindcrusher/", linked_urls)
Example #47
    def test_html_garbage(self):
        element_walker = ElementWalker(css_scraper=CSSScraper(),
                                       javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(HTMLParser(), element_walker)
        request = Request('http://example.com/')
        response = Response(200, '')
        response.body = Body()
        response.fields['content-type'] = 'text/html'

        with wpull.util.reset_file_offset(response.body):
            response.body.write(
                b'\x01\x00\x01\x00l~Z\xff\x0f`y\x80\x00p<\x7f'
                b'\xffndo\xff\xff-\x83{d\xec</\xfe\x80\x00\xb4Bo'
                b'\x7f\xff\xff\xffV\xc1\xff\x7f\xff7')

        scrape_info = scraper.scrape(request, response)

        self.assertTrue(scrape_info)
Example #48
File: html_test.py Project: charygao/wpull
    def test_html_garbage(self):
        element_walker = ElementWalker(css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(self.get_html_parser(), element_walker)
        request = Request("http://example.com/")
        response = Response(200, "")
        response.body = Body()
        response.fields["content-type"] = "text/html"

        with wpull.util.reset_file_offset(response.body):
            response.body.write(
                b"\x01\x00\x01\x00l~Z\xff\x0f`y\x80\x00p<\x7f"
                b"\xffndo\xff\xff-\x83{d\xec</\xfe\x80\x00\xb4Bo"
                b"\x7f\xff\xff\xffV\xc1\xff\x7f\xff7"
            )

        scrape_info = scraper.scrape(request, response)

        self.assertTrue(scrape_info)
Example #49
    def test_html_scraper_reject_type(self):
        element_walker = ElementWalker(css_scraper=CSSScraper(),
                                       javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(HTMLParser(), element_walker)
        request = Request('http://example.com/')
        response = Response(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                          'many_urls.html')
            with open(html_file_path, 'rb') as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request,
                                       response,
                                       link_type=LinkType.css)
        self.assertFalse(scrape_result)
Example #50
File: html_test.py Project: charygao/wpull
    def test_xhtml_invalid(self):
        element_walker = ElementWalker(css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(self.get_html_parser(), element_walker)
        request = Request("http://example.com/")
        response = Response(200, "")
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH, "testing", "samples", "xhtml_invalid.html")
            with open(html_file_path, "rb") as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertEqual({"http://example.com/image.png", "http://example.com/script.js"}, inline_urls)
        self.assertEqual({"http://example.com/link"}, linked_urls)
Example #51
    def test_to_dict(self):
        request = Request('https://foofle.com')
        request_dict = request.to_dict()

        self.assertEqual('https://foofle.com', request_dict['url'])
        self.assertEqual('https', request_dict['url_info']['scheme'])
        self.assertEqual('GET', request_dict['method'])
        self.assertEqual('http', request_dict['protocol'])

        response = Response(status_code=200, reason='OK', request=request)
        response_dict = response.to_dict()

        self.assertEqual('https://foofle.com', response_dict['request']['url'])
        self.assertEqual('http', response_dict['protocol'])
        self.assertEqual(200, response_dict['status_code'])
        self.assertEqual(200, response_dict['response_code'])
        self.assertEqual('OK', response_dict['reason'])
        self.assertEqual('OK', response_dict['response_message'])
Example #52
    def test_warc_recorder_journal(self):
        warc_filename = 'asdf.warc'
        warc_prefix = 'asdf'

        warc_recorder = WARCRecorder(
            warc_prefix,
            params=WARCRecorderParams(
                compress=False,
            )
        )

        request = HTTPRequest('http://example.com/')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'KITTEH DOGE')

        session = warc_recorder.new_http_recorder_session()
        session.begin_request(request)
        session.request_data(request.to_bytes())
        test_instance = self

        class MockRecord(WARCRecord):
            def __init__(self, original_record):
                super().__init__()
                self.block_file = original_record.block_file
                self.fields = original_record.fields

            def __iter__(self):
                print(list(os.walk('.')))
                test_instance.assertTrue(
                    os.path.exists(warc_filename + '-wpullinc')
                )

                for dummy in range(1000):
                    yield b"where's my elephant?"

        session._request_record = MockRecord(session._request_record)

        session.end_request(request)

        self.assertFalse(os.path.exists(warc_filename + '-wpullinc'))
Example #53
    def test_redirect_tracker(self):
        tracker = RedirectTracker(5)

        self.assertFalse(tracker.is_redirect())
        self.assertFalse(tracker.is_repeat())
        self.assertFalse(tracker.exceeded())
        self.assertFalse(tracker.next_location(raw=True))
        self.assertEqual(0, tracker.count())

        response = Response(200, 'OK')

        tracker.load(response)

        self.assertFalse(tracker.is_redirect())
        self.assertFalse(tracker.is_repeat())
        self.assertFalse(tracker.exceeded())
        self.assertFalse(tracker.next_location())
        self.assertEqual(0, tracker.count())

        response = Response(303, 'See other')
        response.fields['location'] = '/test'

        tracker.load(response)

        self.assertTrue(tracker.is_redirect())
        self.assertFalse(tracker.is_repeat())
        self.assertFalse(tracker.exceeded())
        self.assertEqual('/test', tracker.next_location(raw=True))
        self.assertEqual(1, tracker.count())

        response = Response(307, 'Temporary redirect')
        response.fields['location'] = '/test'

        tracker.load(response)
        tracker.load(response)
        tracker.load(response)
        tracker.load(response)
        tracker.load(response)

        self.assertTrue(tracker.is_redirect())
        self.assertTrue(tracker.is_repeat())
        self.assertTrue(tracker.exceeded())
        self.assertEqual('/test', tracker.next_location(raw=True))
        self.assertEqual(6, tracker.count())
Example #54
File: html_test.py Project: charygao/wpull
    def test_html_soup(self):
        element_walker = ElementWalker(css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(self.get_html_parser(), element_walker)
        request = Request("http://example.com/")
        response = Response(200, "")
        response.body = Body()
        response.fields["Refresh"] = "yes"

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH, "testing", "samples", "soup.html")
            with open(html_file_path, "rb") as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertEqual({"http://example.com/ABOUTM~1.JPG"}, inline_urls)
        self.assertEqual({"http://example.com/BLOG", "http://example.com/web ring/Join.htm"}, linked_urls)
Example #55
    def test_response_empty_reason_line(self):
        response = Response()
        response.parse(b'HTTP/1.0 200\r\n')
        response.parse(b'Cake: dolphin\r\n')
        response.parse(b'\r\n')

        self.assertEqual(200, response.status_code)
        self.assertEqual('', response.reason)
        self.assertEqual('dolphin', response.fields['Cake'])
Example #56
File: html_test.py Project: charygao/wpull
    def test_rss_as_html(self):
        element_walker = ElementWalker(css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(self.get_html_parser(), element_walker)
        request = Request("http://example.com/")
        response = Response(200, "")
        response.body = Body()
        response.fields["content-type"] = "application/rss+xml"

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH, "testing", "samples", "rss.xml")
            with open(html_file_path, "rb") as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response)

        self.assertTrue(scrape_result)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links
        self.assertFalse(inline_urls)
        self.assertEqual({"http://www.someexamplerssdomain.com/main.html", "http://www.wikipedia.org/"}, linked_urls)
Example #57
    def test_javascript_scraper(self):
        scraper = JavaScriptScraper()
        request = Request('http://example.com/script.js')
        response = Response(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH,
                                          'testing', 'samples', 'script.js')
            with open(html_file_path, 'rb') as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertEqual({
            'http://example.com/script_variable.png',
            'http://example.com/dragonquery.js',
        },
            inline_urls
        )
        self.assertEqual({
            'http://example.com/document_write.html',
            'http://example.com/http_document_write.html',
            'http://example.com/http_document_write2.html',
            'http://example.com/http document write.html',
            'http://example.com/script_variable.html',
            'http://example.com/http_script_variable.html',
            'https://example.com/https_script_variable.html',
            'ftp://example.com/ftp_script_variable.html',
            'http://example.com/end_dir_script_variable/',
            'http://example.com/start_dir_script_variable',
            'http://example.com/../relative_dir_script_variable'
            if sys.version_info < (3, 5) else
            'http://example.com/relative_dir_script_variable',
            'http://example.com/script_json.html',
            'http://example.com/http_script_json.html?a=b',
        },
            linked_urls
        )
Example #58
File: stream.py Project: Super-Rad/wpull
    def read_response(self, response=None):
        '''Read the response's HTTP status line and header fields.

        Coroutine.
        '''
        _logger.debug('Reading header.')

        if response is None:
            response = Response()

        header_lines = []
        bytes_read = 0

        while True:
            try:
                data = yield from self._connection.readline()
            except ValueError as error:
                raise ProtocolError(
                    'Invalid header: {0}'.format(error)) from error

            self._data_event_dispatcher.notify_read(data)

            if not data.endswith(b'\n'):
                raise NetworkError('Connection closed.')
            elif data in (b'\r\n', b'\n'):
                break

            header_lines.append(data)
            assert data.endswith(b'\n')

            bytes_read += len(data)

            if bytes_read > 32768:
                raise ProtocolError('Header too big.')

        if not header_lines:
            raise ProtocolError('No header received.')

        response.parse(b''.join(header_lines))

        return response
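
For clarity, the header-reading loop above can be mirrored synchronously over an in-memory stream: collect lines until a blank line, enforce the 32 KiB cap, and join the result for parsing. The sketch below is a simplified standalone illustration, not wpull's API; standard exceptions stand in for wpull's NetworkError and ProtocolError:

    import io

    def read_header_block(stream, limit=32768):
        '''Collect raw header lines until a blank line, mimicking the loop above.'''
        header_lines = []
        bytes_read = 0

        while True:
            data = stream.readline()

            if not data.endswith(b'\n'):
                raise ConnectionError('Connection closed.')
            elif data in (b'\r\n', b'\n'):
                break

            header_lines.append(data)
            bytes_read += len(data)

            if bytes_read > limit:
                raise ValueError('Header too big.')

        if not header_lines:
            raise ValueError('No header received.')

        return b''.join(header_lines)

    raw = read_header_block(io.BytesIO(b'HTTP/1.1 200 OK\r\nCake: dolphin\r\n\r\nbody'))
    print(raw)   # b'HTTP/1.1 200 OK\r\nCake: dolphin\r\n'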