示例#1
0
    def test_app_args_warc(self):
        """Run the application with ``--warc-file test`` and check the output.

        Expects ``test.warc.gz`` to exist and to decompress to data containing
        ``FINISHED``, the exit code to be 0 and the statistics to report at
        least one file.
        """
        parser = AppArgumentParser()
        argv = [
            self.get_url('/'),
            '--no-parent',
            '--recursive',
            '--page-requisites',
            '--warc-file', 'test',
            '-4',
            '--no-robots',
            '--no-warc-digests',
        ]
        builder = Builder(parser.parse_args(argv), unit_test=True)
        application = builder.build()
        status = yield from application.run()

        warc_path = 'test.warc.gz'
        self.assertTrue(os.path.exists(warc_path))

        with gzip.GzipFile(warc_path) as warc_file:
            contents = warc_file.read()
            self.assertIn(b'FINISHED', contents)

        self.assertEqual(0, status)
        self.assertGreaterEqual(builder.factory['Statistics'].files, 1)
示例#2
0
    def test_app_args_warc_dedup(self):
        """Run with ``--warc-dedup dedup.cdx`` and inspect ``test.warc``.

        ``dedup.cdx`` is written first with a ``CDX a k u`` header and one
        line holding the target URL, ``KQ4IUKATKL63FT5GMAE2YDRV3WERNL34`` and
        ``<under-the-deer>``.  After the run the uncompressed WARC must
        contain both of those tokens as well as ``Type: revisit``.
        """
        parser = AppArgumentParser()

        with open('dedup.cdx', 'wb') as cdx_file:
            cdx_file.write(b' CDX a k u\n')
            url_bytes = self.get_url('/static/my_file.txt').encode('ascii')
            cdx_file.write(url_bytes)
            cdx_file.write(
                b' KQ4IUKATKL63FT5GMAE2YDRV3WERNL34' b' <under-the-deer>\n'
            )

        argv = [
            self.get_url('/static/my_file.txt'),
            '--no-parent',
            '--warc-file', 'test',
            '--no-warc-compression',
            '-4',
            '--no-robots',
            '--warc-dedup', 'dedup.cdx',
        ]
        builder = Builder(parser.parse_args(argv), unit_test=True)
        application = builder.build()
        status = yield from application.run()

        with open('test.warc', 'rb') as warc_file:
            contents = warc_file.read()

            for token in (
                b'KQ4IUKATKL63FT5GMAE2YDRV3WERNL34',
                b'Type: revisit',
                b'<under-the-deer>',
            ):
                self.assertIn(token, contents)

        self.assertEqual(0, status)
        self.assertGreaterEqual(builder.factory['Statistics'].files, 1)
示例#3
0
    def test_app_args_warc(self):
        """Run with WARC output enabled and verify the gzipped WARC file.

        The run must exit with 0, report at least one file in the statistics
        and leave a ``test.warc.gz`` whose decompressed bytes contain
        ``FINISHED``.
        """
        parser = AppArgumentParser()
        options = parser.parse_args([
            self.get_url('/'),
            '--no-parent',
            '--recursive',
            '--page-requisites',
            '--warc-file',
            'test',
            '-4',
            '--no-robots',
            '--no-warc-digests',
        ])
        builder = Builder(options, unit_test=True)
        application = builder.build()
        status = yield from application.run()

        warc_name = 'test.warc.gz'
        self.assertTrue(os.path.exists(warc_name))

        with gzip.GzipFile(warc_name) as warc_file:
            payload = warc_file.read()
            self.assertIn(b'FINISHED', payload)

        self.assertEqual(0, status)
        self.assertGreaterEqual(builder.factory['Statistics'].files, 1)
示例#4
0
    def test_propagate_ipv4_only_and_no_cert_check_to_youtube_dl(self):
        """Check that ``--inet4-only`` and ``--no-check-certificate`` show up
        in the logged youtube-dl command line.

        The application is run with ``--debug`` and ``--output-file test.log``
        so the "Starting process" log line can be searched afterwards for
        ``--force-ipv4`` and ``--no-check-certificate``.
        """
        arg_parser = AppArgumentParser()
        args = arg_parser.parse_args([
            'https://www.youtube.com/watch?v=tPEE9ZwTmy0',
            '--warc-file',
            'test',
            '--debug',  # to capture youtube-dl arguments in the log
            '--no-warc-compression',
            '--youtube-dl',
            '--inet4-only',
            '--no-check-certificate',
            '--output-file',
            'test.log'
        ])
        builder = Builder(args, unit_test=True)

        app = builder.build()
        exit_code = yield from app.run()

        self.assertEqual(0, exit_code)

        with open('test.log', 'rb') as test_log:
            data = test_log.read()

            # Raw bytes literals: ``\[`` inside a non-raw literal is an
            # invalid escape sequence and triggers a warning at compile time.
            # The resulting regular expressions are unchanged.
            self.assertTrue(
                re.search(rb"Starting process \['youtube-dl.*--force-ipv4",
                          data))
            self.assertTrue(
                re.search(
                    rb"Starting process \['youtube-dl.*--no-check-certificate",
                    data))
示例#5
0
    def test_app_args_warc_dedup(self):
        """Exercise ``--warc-dedup`` using a CDX file written by the test.

        The CDX file names the URL being fetched together with
        ``KQ4IUKATKL63FT5GMAE2YDRV3WERNL34`` and ``<under-the-deer>``; the
        resulting ``test.warc`` must mention both and ``Type: revisit``.
        """
        parser = AppArgumentParser()

        with open('dedup.cdx', 'wb') as cdx_file:
            cdx_file.write(b' CDX a k u\n')
            encoded_url = self.get_url('/static/my_file.txt').encode('ascii')
            cdx_file.write(encoded_url)
            cdx_file.write(b' KQ4IUKATKL63FT5GMAE2YDRV3WERNL34')
            cdx_file.write(b' <under-the-deer>\n')

        options = parser.parse_args([
            self.get_url('/static/my_file.txt'),
            '--no-parent',
            '--warc-file', 'test',
            '--no-warc-compression',
            '-4',
            '--no-robots',
            '--warc-dedup', 'dedup.cdx',
        ])

        builder = Builder(options, unit_test=True)
        application = builder.build()
        status = yield from application.run()

        with open('test.warc', 'rb') as warc_file:
            warc_bytes = warc_file.read()

            self.assertIn(b'KQ4IUKATKL63FT5GMAE2YDRV3WERNL34', warc_bytes)
            self.assertIn(b'Type: revisit', warc_bytes)
            self.assertIn(b'<under-the-deer>', warc_bytes)

        self.assertEqual(0, status)
        self.assertGreaterEqual(builder.factory['Statistics'].files, 1)
示例#6
0
    def test_save_cookie(self):
        """Load cookies from a temporary file and check what gets saved.

        The input file holds an ``isloggedin`` entry with expiry field
        ``9999999999`` and an ``admin`` entry with an empty expiry field.
        After the run, ``wpull_test_cookies.txt`` must contain the former
        and must not contain the latter.
        """
        parser = AppArgumentParser()

        with tempfile.NamedTemporaryFile() as cookie_file:
            for chunk in (
                b'# Kittens\n',
                b'localhost.local',
                b'\tFALSE\t/\tFALSE\t9999999999\tisloggedin\t1\n',
                b'\tFALSE\t/\tFALSE\t\tadmin\t1\n',
            ):
                cookie_file.write(chunk)
            # Flush so the application sees the data when it opens the path.
            cookie_file.flush()

            argv = [
                self.get_url('/some_page/'),
                '--load-cookies', cookie_file.name,
                '--tries', '1',
                '--save-cookies', 'wpull_test_cookies.txt',
            ]
            builder = Builder(parser.parse_args(argv), unit_test=True)
            application = builder.build()
            status = yield from application.run()

            self.assertEqual(0, status)
            self.assertEqual(1, builder.factory['Statistics'].files)

            with open('wpull_test_cookies.txt', 'rb') as saved_file:
                saved_bytes = saved_file.read()

            self.assertIn(b'isloggedin\t1', saved_bytes)
            self.assertNotIn(b'admin\t1', saved_bytes)
示例#7
0
 def test_app_args_post_data(self):
     """Run against ``/post/`` with ``--post-data text=hi``; expect exit 0."""
     parser = AppArgumentParser()
     argv = [
         self.get_url("/post/"),
         "--post-data", "text=hi",
     ]
     builder = Builder(parser.parse_args(argv), unit_test=True)
     application = builder.build()
     status = yield from application.run()
     self.assertEqual(0, status)
示例#8
0
    def test_big_payload(self):
        """Download ``/big_payload`` and compare it with a locally built copy.

        The expected bytes are 10001 concatenated SHA-1 digests: starting
        from ``sha1(b'foxfoxfox')``, each digest is fed back into the hash
        before the next one is taken.
        """
        hasher = hashlib.sha1(b'foxfoxfox')
        chunks = []

        for _ in range(10000):
            digest = hasher.digest()
            hasher.update(digest)
            chunks.append(digest)

        chunks.append(hasher.digest())
        expected_payload = b''.join(chunks)

        parser = AppArgumentParser()
        options = parser.parse_args([self.get_url('/big_payload')])
        builder = Builder(options, unit_test=True)
        application = builder.build()
        status = yield from application.run()

        self.assertTrue(os.path.exists('big_payload'))

        with open('big_payload', 'rb') as downloaded:
            self.assertEqual(expected_payload, downloaded.read())

        self.assertEqual(0, status)
        self.assertEqual(1, builder.factory['Statistics'].files)
示例#9
0
    def test_app_phantomjs_scroll(self):
        """Run with ``--phantomjs-scroll 20`` and check the HTML snapshot.

        The snapshot ``DEUUEAUGH.html.snapshot.html`` must contain
        ``Count: 10`` and the exit code must be 0.
        """
        parser = AppArgumentParser()

        # Change localhost into something else to test proxy
        page_url = self.get_url('/static/DEUUEAUGH.html').replace(
            'localhost', 'example.invalid')
        argv = [
            page_url,
            '-4',
            '--no-robots',
            '--phantomjs',
            '--phantomjs-wait', '0.4',
            '--phantomjs-scroll', '20',
            '--no-check-certificate',
        ]
        builder = Builder(parser.parse_args(argv), unit_test=True)
        # Swap in the mock resolver class before the application is built.
        builder.factory.class_map['Resolver'] = MockDNSResolver

        application = builder.build()
        status = yield from application.run()

        with open('DEUUEAUGH.html.snapshot.html', 'rb') as snapshot:
            snapshot_bytes = snapshot.read()
            self.assertIn(b'Count: 10', snapshot_bytes)

        self.assertEqual(0, status)
示例#10
0
    def test_timestamping_hit_orig(self):
        """Run ``--timestamping`` on ``/lastmod`` with a pre-existing file.

        A local ``lastmod`` file containing ``HI`` is created in the
        temporary directory and its times are set to 631152000.  After the
        run the file must still contain ``HI`` and the exit code must be 0.
        """
        parser = AppArgumentParser()
        options = parser.parse_args(
            [self.get_url('/lastmod'), '--timestamping'])

        filename = os.path.join(self.temp_dir.name, 'lastmod')
        # NOTE(review): this is the same path as ``filename``; presumably a
        # distinct "orig" file was intended -- confirm before changing.
        filename_orig = os.path.join(self.temp_dir.name, 'lastmod')

        for path in (filename, filename_orig):
            with open(path, 'wb') as out_file:
                out_file.write(b'HI')

        old_time = 631152000
        os.utime(filename_orig, (old_time, old_time))

        builder = Builder(options, unit_test=True)
        application = builder.build()
        status = yield from application.run()

        self.assertEqual(0, status)

        for path in (filename, filename_orig):
            with open(path, 'rb') as in_file:
                self.assertEqual(b'HI', in_file.read())
示例#11
0
    def test_session_cookie(self):
        """Load two session cookies and verify them with
        ``--keep-session-cookies``.

        The cookie jar is checked once from a ``pipeline_end`` listener (two
        cookies expected) and again after the run, where ``sessionid`` must
        be ``boxcat`` and ``test`` must be ``yes``.  The saved cookie file
        must contain ``test\\tyes``.
        """
        parser = AppArgumentParser()

        with tempfile.NamedTemporaryFile() as cookie_file:
            cookie_file.write(b"# Kittens\n")
            cookie_file.write(b"localhost.local")
            # session cookie, Python style
            cookie_file.write(b"\tFALSE\t/\tFALSE\t\ttest\tno\n")
            # session cookie, Firefox/Wget/Curl style
            cookie_file.write(b"\tFALSE\t/\tFALSE\t0\tsessionid\tboxcat\n")
            cookie_file.flush()

            argv = [
                self.get_url("/cookie"),
                "--load-cookies", cookie_file.name,
                "--tries", "1",
                "--save-cookies", "wpull_test_cookies.txt",
                "--keep-session-cookies",
            ]
            builder = Builder(parser.parse_args(argv), unit_test=True)
            application = builder.build()

            callback_called = False

            def callback(pipeline):
                # Only the first pipeline_end event is inspected.
                nonlocal callback_called

                if not callback_called:
                    callback_called = True
                    self.assertEqual(2, len(builder.factory["CookieJar"]))

            application.event_dispatcher.add_listener(
                Application.Event.pipeline_end, callback
            )

            status = yield from application.run()

            self.assertTrue(callback_called)

            self.assertEqual(0, status)
            self.assertEqual(1, builder.factory["Statistics"].files)

            cookies = sorted(
                builder.factory["CookieJar"], key=lambda cookie: cookie.name
            )
            _logger.debug("{0}".format(cookies))
            self.assertEqual(2, len(cookies))

            session_cookie, test_cookie = cookies[0], cookies[1]
            self.assertEqual("sessionid", session_cookie.name)
            self.assertEqual("boxcat", session_cookie.value)
            self.assertEqual("test", test_cookie.name)
            self.assertEqual("yes", test_cookie.value)

            with open("wpull_test_cookies.txt", "rb") as saved_file:
                saved_bytes = saved_file.read()

            self.assertIn(b"test\tyes", saved_bytes)
示例#12
0
    def test_app_python_plugin_script(self):
        """Run with ``extensive.plugin.py`` loaded via ``--plugin-script``.

        Expects exit code 42, a ``PipelineSeries`` concurrency of 2, three
        files in the statistics and a run duration below 10 seconds.
        """
        parser = AppArgumentParser()
        script_path = os.path.join(
            os.path.dirname(__file__),
            'sample_user_scripts',
            'extensive.plugin.py',
        )
        argv = [
            self.get_url('/'),
            self.get_url('/some_page'),
            self.get_url('/mordor'),
            'localhost:1/wolf',
            '--plugin-script', script_path,
            '--page-requisites',
            '--reject-regex', '/post/',
            '--wait', '12',
            '--retry-connrefused',
            '--tries', '1',
        ]
        builder = Builder(parser.parse_args(argv), unit_test=True)
        application = builder.build()
        status = yield from application.run()
        print(list(os.walk('.')))

        self.assertEqual(42, status)

        pipeline_series = builder.factory['PipelineSeries']
        self.assertEqual(2, pipeline_series.concurrency)

        statistics = builder.factory['Statistics']
        self.assertEqual(3, statistics.files)

        # duration should be virtually 0 but account for slowness on travis ci
        self.assertGreater(10.0, statistics.duration)
示例#13
0
    def test_timestamping_hit_orig(self):
        """Check ``--timestamping`` against an existing local ``lastmod`` file.

        The file is written with ``HI`` and given access/modification times
        of 631152000 before the run; afterwards it must be unchanged and the
        exit code must be 0.
        """
        parser = AppArgumentParser()
        options = parser.parse_args(
            [self.get_url("/lastmod"), "--timestamping"]
        )

        filename = os.path.join(self.temp_dir.name, "lastmod")
        # NOTE(review): identical to ``filename`` above; presumably a separate
        # "orig" path was meant -- confirm before changing.
        filename_orig = os.path.join(self.temp_dir.name, "lastmod")

        for target in (filename, filename_orig):
            with open(target, "wb") as out_file:
                out_file.write(b"HI")

        stamp = (631152000, 631152000)
        os.utime(filename_orig, stamp)

        builder = Builder(options, unit_test=True)
        application = builder.build()
        status = yield from application.run()

        self.assertEqual(0, status)

        for target in (filename, filename_orig):
            with open(target, "rb") as in_file:
                self.assertEqual(b"HI", in_file.read())
示例#14
0
 def test_app_python_plugin_script(self):
     """Run the application with the ``extensive.plugin.py`` plugin script.

     The run must exit with 42, use a ``PipelineSeries`` concurrency of 2,
     count three files and report a duration under 10 seconds.
     """
     parser = AppArgumentParser()
     script_dir = os.path.join(os.path.dirname(__file__),
                               'sample_user_scripts')
     script_path = os.path.join(script_dir, 'extensive.plugin.py')
     options = parser.parse_args([
         self.get_url('/'),
         self.get_url('/some_page'),
         self.get_url('/mordor'),
         'localhost:1/wolf',
         '--plugin-script',
         script_path,
         '--page-requisites',
         '--reject-regex',
         '/post/',
         '--wait',
         '12',
         '--retry-connrefused',
         '--tries',
         '1',
     ])
     builder = Builder(options, unit_test=True)
     application = builder.build()
     status = yield from application.run()
     print(list(os.walk('.')))

     self.assertEqual(42, status)

     self.assertEqual(2, builder.factory['PipelineSeries'].concurrency)

     statistics = builder.factory['Statistics']
     self.assertEqual(3, statistics.files)

     # duration should be virtually 0 but account for slowness on travis ci
     self.assertGreater(10.0, statistics.duration)
示例#15
0
    def test_save_cookie(self):
        """Round-trip cookies through ``--load-cookies`` / ``--save-cookies``.

        Of the two entries written to the input file, only ``isloggedin``
        (expiry field ``9999999999``) may appear in the saved file; the
        ``admin`` entry (empty expiry field) must be absent.
        """
        parser = AppArgumentParser()

        with tempfile.NamedTemporaryFile() as cookie_file:
            cookie_file.write(b"# Kittens\n")
            cookie_file.write(b"localhost.local")
            cookie_file.write(
                b"\tFALSE\t/\tFALSE\t9999999999\tisloggedin\t1\n"
            )
            cookie_file.write(b"\tFALSE\t/\tFALSE\t\tadmin\t1\n")
            cookie_file.flush()

            options = parser.parse_args([
                self.get_url("/some_page/"),
                "--load-cookies", cookie_file.name,
                "--tries", "1",
                "--save-cookies", "wpull_test_cookies.txt",
            ])
            builder = Builder(options, unit_test=True)
            application = builder.build()
            status = yield from application.run()

            self.assertEqual(0, status)
            self.assertEqual(1, builder.factory["Statistics"].files)

            with open("wpull_test_cookies.txt", "rb") as saved_file:
                saved_bytes = saved_file.read()

            self.assertIn(b"isloggedin\t1", saved_bytes)
            self.assertNotIn(b"admin\t1", saved_bytes)
示例#16
0
文件: main.py 项目: charygao/wpull
def main(exit=True, install_tornado_bridge=True, use_signals=True):
    """Parse the command line, build the application and run it.

    :param exit: when true, finish by calling ``sys.exit`` with the exit
        code; otherwise return the exit code to the caller.
    :param install_tornado_bridge: when true, install Tornado's
        ``AsyncIOMainLoop`` before anything else.
    :param use_signals: when true, call the application's
        ``setup_signal_handlers``.
    """
    if install_tornado_bridge:
        tornado.platform.asyncio.AsyncIOMainLoop().install()

    options = AppArgumentParser().parse_args()
    builder = Builder(options)
    app = builder.build()

    if use_signals:
        app.setup_signal_handlers()

    if options.debug_manhole:
        # Imported lazily: only needed when the debug manhole is requested.
        import manhole
        import wpull

        # Expose the builder on the package before installing the manhole.
        wpull.wpull_builder = builder
        manhole.install()

    status = app.run_sync()

    if not exit:
        return status

    sys.exit(status)
示例#17
0
    def test_ssl_bad_certificate(self):
        """Simulate an ``SSLVerificationError`` when a web session starts.

        The ``WebClient`` class in the factory is replaced by a subclass
        whose sessions raise on ``start``.  The run must exit with 7 and
        count no files.
        """
        parser = AppArgumentParser()
        argv = [
            self.get_url('/'),
            '--no-robots',
            '--no-check-certificate',
            '--tries', '1',
        ]
        builder = Builder(parser.parse_args(argv), unit_test=True)

        class MockWebSession(WebSession):
            @asyncio.coroutine
            def start(self):
                raise SSLVerificationError('A very bad certificate!')

        class MockWebClient(builder.factory.class_map['WebClient']):
            def session(self, request):
                redirect_tracker = self._redirect_tracker_factory()
                return MockWebSession(
                    request, self._http_client, redirect_tracker, Request
                )

        builder.factory.class_map['WebClient'] = MockWebClient

        application = builder.build()
        status = yield from application.run()

        self.assertEqual(7, status)
        self.assertEqual(0, builder.factory['Statistics'].files)
示例#18
0
def main(exit=True, install_tornado_bridge=True, use_signals=True):
    """Command-line entry point: build the application and run it
    synchronously.

    :param exit: if true, ``sys.exit`` is called with the exit code; if
        false, the exit code is returned instead.
    :param install_tornado_bridge: if true, Tornado's ``AsyncIOMainLoop``
        is installed first.
    :param use_signals: if true, the application's signal handlers are set
        up before running.
    """
    if install_tornado_bridge:
        tornado.platform.asyncio.AsyncIOMainLoop().install()

    parsed_args = AppArgumentParser().parse_args()
    builder = Builder(parsed_args)
    app = builder.build()

    if use_signals:
        app.setup_signal_handlers()

    if parsed_args.debug_manhole:
        # Local imports keep these modules optional unless the flag is set.
        import manhole
        import wpull
        wpull.wpull_builder = builder
        manhole.install()

    result = app.run_sync()

    if not exit:
        return result

    sys.exit(result)
示例#19
0
    def test_big_payload(self):
        """Fetch ``/big_payload`` and verify the saved file byte for byte.

        The reference payload is built locally from a SHA-1 chain seeded with
        ``foxfoxfox``: 10000 digests, each fed back into the hash, followed
        by one final digest.
        """
        sha1_chain = hashlib.sha1(b"foxfoxfox")
        pieces = []

        for _ in range(10000):
            piece = sha1_chain.digest()
            sha1_chain.update(piece)
            pieces.append(piece)

        pieces.append(sha1_chain.digest())
        expected_payload = b"".join(pieces)

        parser = AppArgumentParser()
        argv = [self.get_url("/big_payload")]
        builder = Builder(parser.parse_args(argv), unit_test=True)
        application = builder.build()
        status = yield from application.run()

        self.assertTrue(os.path.exists("big_payload"))

        with open("big_payload", "rb") as saved:
            self.assertEqual(expected_payload, saved.read())

        self.assertEqual(0, status)
        self.assertEqual(1, builder.factory["Statistics"].files)
示例#20
0
    def test_ssl_bad_certificate(self):
        """Make every web session raise ``SSLVerificationError`` on start.

        A mock ``WebClient`` subclass is registered in the factory's class
        map; the run is then expected to exit with 7 and count zero files.
        """
        parser = AppArgumentParser()
        options = parser.parse_args([
            self.get_url('/'),
            '--no-robots',
            '--no-check-certificate',
            '--tries',
            '1',
        ])
        builder = Builder(options, unit_test=True)

        class MockWebSession(WebSession):
            @asyncio.coroutine
            def start(self):
                raise SSLVerificationError('A very bad certificate!')

        class MockWebClient(builder.factory.class_map['WebClient']):
            def session(self, request):
                tracker = self._redirect_tracker_factory()
                return MockWebSession(
                    request, self._http_client, tracker, Request)

        builder.factory.class_map['WebClient'] = MockWebClient

        application = builder.build()
        status = yield from application.run()

        self.assertEqual(7, status)
        self.assertEqual(0, builder.factory['Statistics'].files)
示例#21
0
    def test_app_phantomjs(self):
        """Run with ``--phantomjs`` and check the snapshots and WARC output.

        Expects ``test.warc`` plus HTML and PDF snapshot files to exist, the
        HTML snapshot to contain ``Hello world!`` and the WARC to contain the
        snapshot URN, the expected content types, ``"set_scroll_top"`` and
        the custom ``Accept-Language`` header.
        """
        parser = AppArgumentParser()
        script_filename = os.path.join(
            os.path.dirname(__file__),
            'sample_user_scripts',
            'boring.plugin.py',
        )

        # Change localhost into something else to test proxy
        page_url = self.get_url('/static/simple_javascript.html').replace(
            'localhost', 'example.invalid')
        argv = [
            page_url,
            '--warc-file', 'test',
            '--no-warc-compression',
            '-4',
            '--no-robots',
            '--phantomjs',
            '--phantomjs-exe', 'phantomjs',
            '--phantomjs-wait', '0.1',
            '--phantomjs-scroll', '2',
            '--header', 'accept-language: dragon',
            '--plugin-script', script_filename,
            '--no-check-certificate',
        ]
        builder = Builder(parser.parse_args(argv), unit_test=True)
        builder.factory.class_map['Resolver'] = MockDNSResolver

        application = builder.build()
        status = yield from application.run()

        for expected_path in (
            'test.warc',
            'simple_javascript.html.snapshot.html',
            'simple_javascript.html.snapshot.pdf',
        ):
            self.assertTrue(os.path.exists(expected_path))

        with open('simple_javascript.html.snapshot.html', 'rb') as snapshot:
            snapshot_bytes = snapshot.read()
            self.assertIn(b'Hello world!', snapshot_bytes)

        with open('test.warc', 'rb') as warc_file:
            warc_bytes = warc_file.read()

            for token in (
                b'urn:X-wpull:snapshot?url=',
                b'text/html',
                b'application/pdf',
                b'application/json',
                b'"set_scroll_top"',
            ):
                self.assertIn(token, warc_bytes)

            try:
                self.assertIn(b'Accept-Encoding: identity', warc_bytes)
            except AssertionError:
                # webkit treats localhost differently
                self.assertNotIn(b'Accept-Encoding: gzip', warc_bytes)

            self.assertIn(b'Accept-Language: dragon', warc_bytes)

        self.assertEqual(0, status)
        self.assertGreaterEqual(builder.factory['Statistics'].files, 1)
示例#22
0
    def test_no_content(self):
        """Fetch ``/no_content`` with ``--tries=1``; expect exit 0, one file."""
        parser = AppArgumentParser()
        argv = [self.get_url("/no_content"), "--tries=1"]
        builder = Builder(parser.parse_args(argv), unit_test=True)
        application = builder.build()
        status = yield from application.run()

        self.assertEqual(0, status)
        self.assertEqual(1, builder.factory["Statistics"].files)
示例#23
0
    def test_strip_session_id(self):
        """Run ``-r --strip-session-id`` on ``/forum/``; expect exit 0 and
        exactly one file."""
        parser = AppArgumentParser()
        argv = [
            self.get_url("/forum/"),
            "-r",
            "--strip-session-id",
        ]
        builder = Builder(parser.parse_args(argv), unit_test=True)
        application = builder.build()
        status = yield from application.run()

        self.assertEqual(0, status)
        self.assertEqual(1, builder.factory["Statistics"].files)
示例#24
0
    def test_non_http_redirect(self):
        """Run recursively on ``/non_http_redirect``; expect exit 0 and no
        files."""
        parser = AppArgumentParser()
        argv = [
            self.get_url("/non_http_redirect"),
            "--recursive",
            "--no-robots",
        ]
        builder = Builder(parser.parse_args(argv), unit_test=True)
        application = builder.build()
        status = yield from application.run()

        self.assertEqual(0, status)
        self.assertEqual(0, builder.factory["Statistics"].files)
示例#25
0
    def test_ignore_length(self):
        """Fetch ``/underrun`` with ``--ignore-length``; expect exit 0 and
        one file."""
        parser = AppArgumentParser()
        argv = [
            self.get_url("/underrun"),
            "--ignore-length",
            "--no-robots",
        ]
        builder = Builder(parser.parse_args(argv), unit_test=True)
        application = builder.build()
        status = yield from application.run()

        self.assertEqual(0, status)
        self.assertEqual(1, builder.factory["Statistics"].files)
示例#26
0
    def test_referer_option(self):
        """Run ``-r --referer http://left.shark/`` on ``/referrer/``; expect
        exit 0 and two files."""
        parser = AppArgumentParser()
        argv = [
            self.get_url("/referrer/"),
            "-r",
            "--referer", "http://left.shark/",
        ]
        builder = Builder(parser.parse_args(argv), unit_test=True)
        application = builder.build()
        status = yield from application.run()

        self.assertEqual(0, status)
        self.assertEqual(2, builder.factory["Statistics"].files)
示例#27
0
    def test_session_timeout(self):
        """Fetch ``/sleep_long`` with ``--session-timeout=0.1``; expect exit
        code 4 and no files."""
        parser = AppArgumentParser()
        argv = [
            self.get_url("/sleep_long"),
            "--tries=1",
            "--session-timeout=0.1",
        ]
        builder = Builder(parser.parse_args(argv), unit_test=True)
        application = builder.build()
        status = yield from application.run()

        self.assertEqual(4, status)
        self.assertEqual(0, builder.factory["Statistics"].files)
示例#28
0
    def test_immediate_robots_forbidden(self):
        """Run ``--recursive`` on ``/forbidden``; expect exit 0 and no
        files."""
        parser = AppArgumentParser()
        argv = [self.get_url("/forbidden"), "--recursive"]
        builder = Builder(parser.parse_args(argv), unit_test=True)
        application = builder.build()
        status = yield from application.run()

        self.assertEqual(0, status)
        self.assertEqual(0, builder.factory["Statistics"].files)
示例#29
0
    def test_escaped_fragment_recursive(self):
        """Run ``-r --escaped-fragment`` on ``/escape_from_fragments/``;
        expect exit 0 and two files."""
        parser = AppArgumentParser()
        argv = [
            self.get_url("/escape_from_fragments/"),
            "-r",
            "--escaped-fragment",
        ]
        builder = Builder(parser.parse_args(argv), unit_test=True)
        application = builder.build()
        status = yield from application.run()

        self.assertEqual(0, status)
        self.assertEqual(2, builder.factory["Statistics"].files)
示例#30
0
    def test_misc_urls(self):
        """Run on the IPv6-literal URL ``http://[0:0:0:0:0:ffff:a00:0]/``;
        expect exit code 4."""
        parser = AppArgumentParser()
        argv = [
            "http://[0:0:0:0:0:ffff:a00:0]/",
            "--tries", "1",
            "--timeout", "0.5",
            "-r",
        ]
        builder = Builder(parser.parse_args(argv), unit_test=True)
        application = builder.build()
        status = yield from application.run()

        self.assertEqual(4, status)
示例#31
0
    def test_database_uri(self):
        """Run with ``--database-uri sqlite:///test.db``; expect exit 0."""
        parser = AppArgumentParser()
        argv = [
            self.get_url("/"),
            "--database-uri", "sqlite:///test.db",
        ]
        builder = Builder(parser.parse_args(argv), unit_test=True)
        application = builder.build()
        status = yield from application.run()

        self.assertEqual(0, status)
示例#32
0
    def test_basic_auth_fail(self):
        """Fetch ``/basic_auth`` as ``root`` / ``toothless``; expect exit 0
        and no files."""
        parser = AppArgumentParser()
        argv = [
            self.get_url("/basic_auth"),
            "--user", "root",
            "--password", "toothless",
        ]
        builder = Builder(parser.parse_args(argv), unit_test=True)
        application = builder.build()
        status = yield from application.run()

        self.assertEqual(0, status)
        self.assertEqual(0, builder.factory["Statistics"].files)
示例#33
0
    def test_app_phantomjs(self):
        """Exercise the ``--phantomjs`` options together with WARC output.

        After the run, ``test.warc`` and the HTML/PDF snapshots must exist;
        the HTML snapshot must contain ``Hello world!`` and the WARC must
        contain the snapshot URN, several content types,
        ``"set_scroll_top"`` and ``Accept-Language: dragon``.
        """
        parser = AppArgumentParser()
        scripts_dir = os.path.join(os.path.dirname(__file__),
                                   'sample_user_scripts')
        script_filename = os.path.join(scripts_dir, 'boring.plugin.py')

        # Change localhost into something else to test proxy
        options = parser.parse_args([
            self.get_url('/static/simple_javascript.html').replace(
                'localhost', 'example.invalid'),
            '--warc-file',
            'test',
            '--no-warc-compression',
            '-4',
            '--no-robots',
            '--phantomjs',
            '--phantomjs-exe',
            'phantomjs',
            '--phantomjs-wait',
            '0.1',
            '--phantomjs-scroll',
            '2',
            '--header',
            'accept-language: dragon',
            '--plugin-script',
            script_filename,
            '--no-check-certificate',
        ])
        builder = Builder(options, unit_test=True)
        builder.factory.class_map['Resolver'] = MockDNSResolver

        application = builder.build()
        status = yield from application.run()

        html_snapshot = 'simple_javascript.html.snapshot.html'
        pdf_snapshot = 'simple_javascript.html.snapshot.pdf'

        self.assertTrue(os.path.exists('test.warc'))
        self.assertTrue(os.path.exists(html_snapshot))
        self.assertTrue(os.path.exists(pdf_snapshot))

        with open(html_snapshot, 'rb') as snapshot_file:
            html_bytes = snapshot_file.read()
            self.assertIn(b'Hello world!', html_bytes)

        with open('test.warc', 'rb') as warc_file:
            warc_bytes = warc_file.read()

            self.assertIn(b'urn:X-wpull:snapshot?url=', warc_bytes)
            self.assertIn(b'text/html', warc_bytes)
            self.assertIn(b'application/pdf', warc_bytes)
            self.assertIn(b'application/json', warc_bytes)
            self.assertIn(b'"set_scroll_top"', warc_bytes)
            try:
                self.assertIn(b'Accept-Encoding: identity', warc_bytes)
            except AssertionError:
                # webkit treats localhost differently
                self.assertNotIn(b'Accept-Encoding: gzip', warc_bytes)
            self.assertIn(b'Accept-Language: dragon', warc_bytes)

        self.assertEqual(0, status)
        self.assertGreaterEqual(builder.factory['Statistics'].files, 1)
示例#34
0
    def test_database_path_question_mark(self):
        """Run with ``--database test?.db``; expect exit 0 and a file named
        ``test_.db`` to exist afterwards."""
        parser = AppArgumentParser()
        argv = [self.get_url("/"), "--database", "test?.db"]
        builder = Builder(parser.parse_args(argv), unit_test=True)
        application = builder.build()
        status = yield from application.run()

        self.assertEqual(0, status)
        self.assertTrue(os.path.exists("test_.db"))
示例#35
0
 def test_app_args_post_data(self):
     """Run on ``/post/`` with ``--post-data text=hi``; expect exit code 0."""
     parser = AppArgumentParser()
     argv = [self.get_url('/post/'), '--post-data', 'text=hi']
     builder = Builder(parser.parse_args(argv), unit_test=True)
     application = builder.build()
     status = yield from application.run()
     self.assertEqual(0, status)
示例#36
0
    def test_many_page_with_some_fail(self):
        """Run recursively on ``/blog/``; expect ``ExitStatus.server_error``,
        more than one file and a duration above 3."""
        parser = AppArgumentParser()
        argv = [
            self.get_url("/blog/"),
            "--no-parent",
            "--recursive",
            "--page-requisites",
            "-4",
        ]
        builder = Builder(parser.parse_args(argv), unit_test=True)
        application = builder.build()
        status = yield from application.run()

        self.assertEqual(ExitStatus.server_error, status)
        self.assertGreater(builder.factory["Statistics"].files, 1)
        self.assertGreater(builder.factory["Statistics"].duration, 3)
示例#37
0
    def test_no_iri(self):
        """Run with ``--no-iri --no-robots``; expect exit 0 and one file."""
        parser = AppArgumentParser()
        argv = [self.get_url('/'), '--no-iri', '--no-robots']
        builder = Builder(parser.parse_args(argv), unit_test=True)
        application = builder.build()
        status = yield from application.run()

        self.assertEqual(0, status)
        self.assertEqual(1, builder.factory['Statistics'].files)
示例#38
0
    def test_iri_handling(self):
        """Run recursively on ``/static/mojibake.html`` and check that the
        percent-encoded URL ``/%E6%96%87%E5%AD%97%E5%8C%96%E3%81%91`` is in
        the URL table; the exit code must be 0."""
        parser = AppArgumentParser()
        argv = [
            self.get_url("/static/mojibake.html"),
            "-r",
            "--database", "temp-unittest.db",
        ]
        builder = Builder(parser.parse_args(argv), unit_test=True)
        application = builder.build()
        status = yield from application.run()

        url_table = builder.factory["URLTable"]
        urls = tuple(record.url for record in url_table.get_all())
        expected_url = self.get_url("/%E6%96%87%E5%AD%97%E5%8C%96%E3%81%91")
        self.assertIn(expected_url, urls)

        self.assertEqual(0, status)
示例#39
0
    def test_database_uri(self):
        """Run with a SQLite ``--database-uri`` and expect exit code 0."""
        parser = AppArgumentParser()
        options = parser.parse_args([
            self.get_url('/'),
            '--database-uri',
            'sqlite:///test.db',
        ])
        builder = Builder(options, unit_test=True)
        application = builder.build()
        status = yield from application.run()

        self.assertEqual(0, status)
示例#40
0
    def test_quota(self):
        """Run ``--recursive --quota 1`` on ``/blog/``; expect exit 0 and
        exactly one file."""
        parser = AppArgumentParser()
        argv = [
            self.get_url("/blog/"),
            "--recursive",
            "--quota", "1",
        ]
        builder = Builder(parser.parse_args(argv), unit_test=True)
        application = builder.build()
        status = yield from application.run()

        self.assertEqual(0, status)
        self.assertEqual(1, builder.factory["Statistics"].files)
示例#41
0
    def test_escaped_fragment_recursive(self):
        """Run recursively with ``--escaped-fragment``; expect exit 0 and two
        files."""
        parser = AppArgumentParser()
        options = parser.parse_args([
            self.get_url('/escape_from_fragments/'),
            '-r',
            '--escaped-fragment',
        ])
        builder = Builder(options, unit_test=True)
        application = builder.build()
        status = yield from application.run()

        self.assertEqual(0, status)
        self.assertEqual(2, builder.factory['Statistics'].files)
示例#42
0
    def test_immediate_robots_error(self):
        """Run recursively on ``http://127.0.0.1:1`` plus the test server
        root; expect exit code 4 and one file."""
        parser = AppArgumentParser()
        argv = [
            "http://127.0.0.1:1",
            self.get_url("/"),
            "--recursive",
            "--tries", "1",
            "--timeout", "10",
        ]
        builder = Builder(parser.parse_args(argv), unit_test=True)
        application = builder.build()
        status = yield from application.run()

        self.assertEqual(4, status)
        self.assertEqual(1, builder.factory["Statistics"].files)
示例#43
0
    def test_referer_option(self):
        """Recursive crawl of ``/referrer/`` with ``--referer http://left.shark/``.

        Expects exit code 0 and two files in the statistics.
        """
        parser = AppArgumentParser()
        options = parser.parse_args([
            self.get_url('/referrer/'),
            '-r',
            '--referer', 'http://left.shark/',
        ])
        builder = Builder(options, unit_test=True)
        status = yield from builder.build().run()

        self.assertEqual(0, status)
        self.assertEqual(2, builder.factory['Statistics'].files)
示例#44
0
    def test_session_timeout(self):
        """Fetch ``/sleep_long`` with one try and ``--session-timeout=0.1``.

        Expects exit code 4 and no files in the statistics.
        """
        parser = AppArgumentParser()
        options = parser.parse_args([
            self.get_url('/sleep_long'),
            '--tries=1',
            '--session-timeout=0.1',
        ])
        builder = Builder(options, unit_test=True)
        status = yield from builder.build().run()

        self.assertEqual(4, status)
        self.assertEqual(0, builder.factory['Statistics'].files)
示例#45
0
    def test_check_certificate(self):
        """Fetch the root URL with ``--no-robots`` and expect exit code 5.

        NOTE(review): the test name suggests the server's certificate is
        expected to fail verification; the server fixture is not visible
        here -- confirm against the enclosing test case.
        """
        parser = AppArgumentParser()
        options = parser.parse_args([self.get_url('/'), '--no-robots'])

        builder = Builder(options, unit_test=True)
        status = yield from builder.build().run()

        self.assertEqual(5, status)
示例#46
0
    def test_escaped_fragment_input_url(self):
        """Fetch a ``#!`` input URL with ``--escaped-fragment``.

        Expects exit code 0, one file in the statistics, and a file named
        ``index.html?_escaped_fragment_=husky-cat`` in the working directory.
        """
        parser = AppArgumentParser()
        options = parser.parse_args([
            self.get_url("/escape_from_fragments/#!husky-cat"),
            "--escaped-fragment",
        ])
        builder = Builder(options, unit_test=True)
        status = yield from builder.build().run()

        self.assertEqual(0, status)
        self.assertEqual(1, builder.factory["Statistics"].files)

        saved_path = "index.html?_escaped_fragment_=husky-cat"
        self.assertTrue(os.path.exists(saved_path))
示例#47
0
    def test_database_path_question_mark(self):
        """Run with ``--database test?.db``.

        Expects exit code 0 and a file named ``test_.db`` to exist
        afterwards.
        """
        parser = AppArgumentParser()
        options = parser.parse_args([
            self.get_url('/'),
            '--database', 'test?.db',
        ])

        builder = Builder(options, unit_test=True)
        status = yield from builder.build().run()

        self.assertEqual(0, status)
        self.assertTrue(os.path.exists('test_.db'))
示例#48
0
    def test_page_requisite_level(self):
        """Recursive crawl of ``/infinite_iframe/`` with page requisites at level 1.

        Expects exit code 0 and two files in the statistics.
        """
        parser = AppArgumentParser()
        options = parser.parse_args([
            self.get_url("/infinite_iframe/"),
            "-r",
            "--page-requisites",
            "--page-requisites-level", "1",
        ])
        builder = Builder(options, unit_test=True)
        status = yield from builder.build().run()

        self.assertEqual(0, status)
        self.assertEqual(2, builder.factory["Statistics"].files)
示例#49
0
    def test_link_type(self):
        """Recursive crawl of ``/always200/`` with page requisites at level 2.

        Expects exit code 0 and four files in the statistics.
        """
        parser = AppArgumentParser()
        options = parser.parse_args([
            self.get_url("/always200/"),
            "-r",
            "--page-requisites",
            "--page-requisites-level", "2",
        ])
        builder = Builder(options, unit_test=True)
        status = yield from builder.build().run()

        self.assertEqual(0, status)
        self.assertEqual(4, builder.factory["Statistics"].files)
示例#50
0
    def test_globbing(self):
        """Fetch the wildcard URL ``/read*.txt``.

        Expects exit code 0 and one file in the statistics.
        """
        parser = AppArgumentParser()
        options = parser.parse_args([self.get_url('/read*.txt')])

        builder = Builder(options, unit_test=True)
        status = yield from builder.build().run()

        # Dump the working directory tree to stdout as a debugging aid.
        tree = list(os.walk('.'))
        print(tree)

        self.assertEqual(0, status)
        self.assertEqual(1, builder.factory['Statistics'].files)
示例#51
0
    def test_login_fail(self):
        """Fetch ``/example (copy).txt`` as user ``smaug`` with one try.

        Expects exit code 6 and no files in the statistics.
        """
        parser = AppArgumentParser()
        options = parser.parse_args([
            self.get_url('/example (copy).txt'),
            '--user', 'smaug',
            '--password', 'hunter2',
            '--tries', '1',
        ])
        builder = Builder(options, unit_test=True)
        status = yield from builder.build().run()

        self.assertEqual(6, status)
        self.assertEqual(0, builder.factory['Statistics'].files)
示例#52
0
    def test_immediate_robots_forbidden(self):
        """Recursive crawl starting at ``/forbidden``.

        Expects exit code 0 and no files in the statistics.
        """
        parser = AppArgumentParser()
        options = parser.parse_args([self.get_url('/forbidden'), '--recursive'])

        builder = Builder(options, unit_test=True)
        status = yield from builder.build().run()

        self.assertEqual(0, status)
        self.assertEqual(0, builder.factory['Statistics'].files)
示例#53
0
    def test_file_vs_directory(self):
        """Recursive, depth-1 crawl of ``/example2💎`` keeping listings.

        Runs without host directories and with ``--no-remove-listing``.
        Expects exit code 0 and ``example2💎/.listing`` to exist.
        """
        parser = AppArgumentParser()
        options = parser.parse_args([
            self.get_url('/example2💎'),
            '--no-host-directories',
            '--no-remove-listing',
            '-r',
            '-l=1',
            '--tries=1',
        ])
        builder = Builder(options, unit_test=True)
        status = yield from builder.build().run()

        # Dump the working directory tree to stdout as a debugging aid.
        tree = list(os.walk('.'))
        print(tree)

        self.assertEqual(0, status)
        self.assertTrue(os.path.exists('example2💎/.listing'))
示例#54
0
    def test_app_python_script_stop(self):
        """Run with the ``stopper.plugin.py`` sample script and expect exit code 1."""
        parser = AppArgumentParser()

        # The plugin lives in ``sample_user_scripts`` next to this module.
        scripts_dir = os.path.join(
            os.path.dirname(__file__), 'sample_user_scripts'
        )
        plugin_path = os.path.join(scripts_dir, 'stopper.plugin.py')

        options = parser.parse_args([
            self.get_url('/'),
            '--plugin-script', plugin_path,
        ])
        builder = Builder(options, unit_test=True)
        status = yield from builder.build().run()

        self.assertEqual(1, status)
示例#55
0
    def test_referer_option_negative(self):
        """Recursive crawl of ``/referrer/`` with a different ``--referer`` value.

        Uses ``http://superinformation.highway/`` as the referer with one
        try and a retry wait of ``.1``. Expects exit code 0 and no files
        in the statistics.
        """
        parser = AppArgumentParser()
        options = parser.parse_args([
            self.get_url('/referrer/'),
            '-r',
            '--referer', 'http://superinformation.highway/',
            '--tries', '1',
            '--waitretry', '.1',
        ])
        builder = Builder(options, unit_test=True)
        status = yield from builder.build().run()

        self.assertEqual(0, status)
        self.assertEqual(0, builder.factory['Statistics'].files)
示例#56
0
    def test_strip_session_id(self):
        """Recursive crawl of ``/forum/`` with ``--strip-session-id``.

        Expects exit code 0 and one file in the statistics.
        """
        parser = AppArgumentParser()
        options = parser.parse_args(
            [self.get_url('/forum/'), '-r', '--strip-session-id']
        )
        builder = Builder(options, unit_test=True)
        status = yield from builder.build().run()

        self.assertEqual(0, status)
        self.assertEqual(1, builder.factory['Statistics'].files)
示例#57
0
    def test_output_document(self):
        """Fetch the root URL with ``--output-document blah.dat``.

        Expects ``blah.dat`` to exist and be non-empty, and the
        application to exit with code 0.
        """
        arg_parser = AppArgumentParser()

        args = arg_parser.parse_args(
            [self.get_url('/'), '--output-document', 'blah.dat'])

        builder = Builder(args, unit_test=True)
        app = builder.build()
        exit_code = yield from app.run()

        self.assertTrue(os.path.exists('blah.dat'))
        # Compare the size explicitly so a failure reports the actual
        # byte count instead of "0 is not true".
        self.assertGreater(os.path.getsize('blah.dat'), 0)

        self.assertEqual(0, exit_code)
示例#58
0
    def test_invalid_char_dir_list(self):
        """Fetch ``/hidden/invalid_chars/`` keeping the listing file.

        Runs without host directories and with ``--no-remove-listing``.
        Expects exit code 0 and ``.listing`` to exist in the working
        directory.
        """
        parser = AppArgumentParser()
        options = parser.parse_args([
            self.get_url('/hidden/invalid_chars/'),
            '--no-host-directories', '--no-remove-listing',
        ])
        builder = Builder(options, unit_test=True)
        status = yield from builder.build().run()

        # Dump the working directory tree to stdout as a debugging aid.
        tree = list(os.walk('.'))
        print(tree)

        self.assertEqual(0, status)
        self.assertTrue(os.path.exists('.listing'))
示例#59
0
    def test_long_cookie(self):
        """Fetch ``/long_cookie`` and check that no cookie was stored.

        Expects exit code 0, one file in the statistics, and an empty
        cookie jar.
        """
        arg_parser = AppArgumentParser()
        args = arg_parser.parse_args([
            self.get_url('/long_cookie'),
        ])
        builder = Builder(args, unit_test=True)

        app = builder.build()
        exit_code = yield from app.run()
        self.assertEqual(0, exit_code)
        self.assertEqual(1, builder.factory['Statistics'].files)

        cookies = list(builder.factory['CookieJar'])
        # Pass the value as a logging argument so it is only formatted
        # when debug logging is actually enabled.
        _logger.debug('%s', cookies)
        self.assertEqual(0, len(cookies))
示例#60
0
    def test_app_input_file_arg(self):
        """Read the start URLs from a temporary file via ``--input-file``.

        Two URLs are written to the file, one per line, the second with
        non-ASCII characters in its query string. Expects exit code 0 and
        two files in the statistics.
        """
        parser = AppArgumentParser(real_exit=False)
        with tempfile.NamedTemporaryFile() as url_file:
            for chunk in (
                self.get_url('/').encode('utf-8'),
                b'\n',
                self.get_url('/blog/?ðfßðfëéå').encode('utf-8'),
            ):
                url_file.write(chunk)
            # Flush so the application sees the contents when it opens
            # the file by name.
            url_file.flush()

            options = parser.parse_args(['--input-file', url_file.name])
            builder = Builder(options, unit_test=True)
            status = yield from builder.build().run()

        self.assertEqual(0, status)
        self.assertEqual(builder.factory['Statistics'].files, 2)