예제 #1
0
def main(exit=True, install_tornado_bridge=True, prefer_trollius=True):
    """Build the application from command-line arguments and run it.

    Args:
        exit: When true, finish by calling ``sys.exit`` with the exit code
            instead of returning it.
        install_tornado_bridge: When true, install Tornado's
            ``AsyncIOMainLoop`` before the application is built.
        prefer_trollius: When true and ``asyncio`` can be imported, set
            ``asyncio``'s event loop policy to the one Trollius provides.

    Returns:
        The value of ``application.run_sync()``; only reached when
        ``exit`` is false.
    """
    if prefer_trollius:
        try:
            import asyncio
        except ImportError:
            # asyncio is unavailable here, so there is no policy to replace.
            asyncio = None

        if asyncio is not None:
            trollius_policy = trollius.get_event_loop_policy()
            asyncio.set_event_loop_policy(trollius_policy)

    if install_tornado_bridge:
        bridge = tornado.platform.asyncio.AsyncIOMainLoop()
        bridge.install()

    parser = AppArgumentParser()
    args = parser.parse_args()

    builder = Builder(args)
    builder.build()

    application = builder.factory['Application']
    application.setup_signal_handlers()

    if args.debug_manhole:
        import manhole
        import wpull
        # Publish the builder as a package attribute before manhole starts;
        # presumably so a debugging session can inspect it.
        wpull.wpull_builder = builder
        manhole.install()

    exit_code = application.run_sync()

    if not exit:
        return exit_code

    sys.exit(exit_code)
예제 #2
0
    def test_cookie(self):
        """Fetch ``/cookie`` with a preloaded cookie file and check the result.

        Verifies the exit code, the file count, the single cookie left in the
        jar (name ``test``, value ``yes``) and the saved cookie file.
        """
        parser = AppArgumentParser()

        with tempfile.NamedTemporaryFile() as cookie_file:
            # One comment line followed by one tab-separated cookie record.
            cookie_file.write(
                b'# Kittens\n'
                b'localhost.local'
                b'\tFALSE\t/\tFALSE\t\ttest\tno\n'
            )
            cookie_file.flush()

            argv = [
                self.get_url('/cookie'),
                '--load-cookies', cookie_file.name,
                '--tries', '1',
                '--save-cookies', 'wpull_test_cookies.txt',
                '--keep-session-cookies',
            ]
            builder = Builder(parser.parse_args(argv))

            with cd_tempdir():
                app = builder.build()
                exit_code = yield app.run()

                self.assertEqual(0, exit_code)
                statistics = builder.factory['Statistics']
                self.assertEqual(1, statistics.files)

                cookies = list(builder.factory['CookieJar'])
                _logger.debug('{0}'.format(cookies))
                self.assertEqual(1, len(cookies))

                only_cookie = cookies[0]
                self.assertEqual('test', only_cookie.name)
                self.assertEqual('yes', only_cookie.value)

                with open('wpull_test_cookies.txt', 'rb') as saved_file:
                    saved_bytes = saved_file.read()

                self.assertIn(b'test\tyes', saved_bytes)
예제 #3
0
    def test_new_file_and_clobber(self):
        """Download the same URL twice and expect a ``.1`` suffixed second file."""
        parser = AppArgumentParser()
        args = parser.parse_args([self.get_url('/static/my_file.txt')])

        with cd_tempdir() as temp_dir:
            # First run: the file is created under its plain name.
            first_app = Builder(args).build()
            exit_code = yield first_app.run()
            self.assertEqual(0, exit_code)

            first_path = os.path.join(temp_dir, 'my_file.txt')
            self.assertTrue(os.path.exists(first_path))

            with open(first_path, 'rb') as downloaded:
                self.assertIn(b'END', downloaded.read())

            # Second run with identical arguments: a numbered copy appears.
            second_app = Builder(args).build()
            exit_code = yield second_app.run()
            self.assertEqual(0, exit_code)

            second_path = os.path.join(temp_dir, 'my_file.txt.1')
            self.assertTrue(os.path.exists(second_path))
예제 #4
0
    def test_big_payload(self):
        """Download ``/big_payload`` and compare it with a locally built payload.

        The expected payload is 10001 chained SHA-1 digests: each digest is
        appended and then fed back into the same hash object.
        """
        hasher = hashlib.sha1(b'foxfoxfox')
        chunks = []

        for _ in range(10000):
            chunk = hasher.digest()
            chunks.append(chunk)
            hasher.update(chunk)

        chunks.append(hasher.digest())
        expected_payload = b''.join(chunks)

        parser = AppArgumentParser()
        builder = Builder(parser.parse_args([self.get_url('/big_payload')]))

        with cd_tempdir():
            app = builder.build()
            exit_code = yield app.run()
            self.assertTrue(os.path.exists('big_payload'))

            with open('big_payload', 'rb') as downloaded:
                self.assertEqual(expected_payload, downloaded.read())

        self.assertEqual(0, exit_code)
        statistics = builder.factory['Statistics']
        self.assertEqual(1, statistics.files)
예제 #5
0
    def test_app_args_warc_dedup(self):
        """Run with ``--warc-dedup`` and a hand-written CDX file.

        The resulting ``test.warc`` must contain the digest and record ID
        written to the CDX file and a ``Type: revisit`` marker.
        """
        parser = AppArgumentParser()

        with cd_tempdir():
            # Single CDX entry: URL, digest and record ID, space separated.
            cdx_entry = b''.join([
                self.get_url('/static/my_file.txt').encode('ascii'),
                b' KQ4IUKATKL63FT5GMAE2YDRV3WERNL34',
                b' <under-the-deer>\n',
            ])

            with open('dedup.cdx', 'wb') as cdx_file:
                cdx_file.write(b' CDX a k u\n')
                cdx_file.write(cdx_entry)

            argv = [
                self.get_url('/static/my_file.txt'),
                '--no-parent',
                '--warc-file', 'test',
                '--no-warc-compression',
                '-4',
                '--no-robots',
                '--warc-dedup', 'dedup.cdx',
            ]

            builder = Builder(parser.parse_args(argv))
            app = builder.build()
            exit_code = yield app.run()

            with open('test.warc', 'rb') as warc_file:
                warc_data = warc_file.read()

            expected_fragments = (
                b'KQ4IUKATKL63FT5GMAE2YDRV3WERNL34',
                b'Type: revisit',
                b'<under-the-deer>',
            )
            for fragment in expected_fragments:
                self.assertIn(fragment, warc_data)

        self.assertEqual(0, exit_code)
        self.assertGreaterEqual(builder.factory['Statistics'].files, 1)
예제 #6
0
    def test_local_encoding(self):
        """Pass UTF-32-LE encoded arguments and input file via ``--local-encoding``.

        Two URLs are supplied (one on the command line, one in the input
        file) and two files are expected.
        """
        parser = AppArgumentParser()

        with tempfile.NamedTemporaryFile() as url_file:
            url_file.write(self.get_url('/?qwerty').encode('utf-32-le'))
            url_file.write('\n'.encode('utf-32-le'))
            url_file.flush()

            text_opts = [
                self.get_url('/?asdf'),
                '--local-encoding', 'utf-32-le',
                '--input-file', url_file.name
            ]
            # Every argument is handed to the parser as encoded bytes.
            encoded_opts = [opt.encode('utf-32-le') for opt in text_opts]

            builder = Builder(parser.parse_args(encoded_opts))

            with cd_tempdir():
                app = builder.build()
                exit_code = yield app.run()

        self.assertEqual(0, exit_code)
        statistics = builder.factory['Statistics']
        self.assertEqual(2, statistics.files)
예제 #7
0
    def test_app_args_warc(self):
        """Crawl ``/`` recursively into a gzip WARC and look for ``FINISHED``."""
        parser = AppArgumentParser()
        argv = [
            self.get_url('/'),
            '--no-parent',
            '--recursive',
            '--page-requisites',
            '--warc-file', 'test',
            '-4',
            '--no-robots',
            '--no-warc-digests',
        ]
        builder = Builder(parser.parse_args(argv))

        with cd_tempdir():
            app = builder.build()
            exit_code = yield app.run()

            self.assertTrue(os.path.exists('test.warc.gz'))

            with wpull.backport.gzip.GzipFile('test.warc.gz') as warc_file:
                warc_data = warc_file.read()

            self.assertIn(b'FINISHED', warc_data)

        self.assertEqual(0, exit_code)
        self.assertGreaterEqual(builder.factory['Statistics'].files, 1)
예제 #8
0
    def test_timestamping_hit_orig(self):
        """Check ``--timestamping`` when a ``.orig`` backup file is present.

        A ``lastmod`` file and a ``lastmod.orig`` backup are created with the
        content ``HI``; the backup gets a 1990-01-01 modification time. After
        fetching ``/lastmod`` both files must still contain ``HI``.

        NOTE(review): presumably the downloader consults the ``.orig`` file's
        timestamp, as Wget does for ``-N`` combined with ``-K`` -- confirm
        against the writer implementation.
        """
        arg_parser = AppArgumentParser()
        args = arg_parser.parse_args([
            self.get_url('/lastmod'),
            '--timestamping'
        ])

        with cd_tempdir() as temp_dir:
            filename = os.path.join(temp_dir, 'lastmod')
            # The backup needs its own path. Reusing 'lastmod' here made the
            # second write and assertion duplicates of the first, and no
            # ``.orig`` file ever existed during the run.
            filename_orig = os.path.join(temp_dir, 'lastmod.orig')

            with open(filename, 'wb') as out_file:
                out_file.write(b'HI')

            with open(filename_orig, 'wb') as out_file:
                out_file.write(b'HI')

            # 631152000 == 1990-01-01T00:00:00Z, used as access and modified
            # time of the backup.
            os.utime(filename_orig, (631152000, 631152000))

            builder = Builder(args)
            app = builder.build()
            exit_code = yield app.run()

            self.assertEqual(0, exit_code)

            with open(filename, 'rb') as in_file:
                self.assertEqual(b'HI', in_file.read())

            with open(filename_orig, 'rb') as in_file:
                self.assertEqual(b'HI', in_file.read())
예제 #9
0
 def test_iri_handling(self):
     """Fetch ``/static/mojibake.html`` and expect exit code 0."""
     parser = AppArgumentParser()
     argv = [self.get_url('/static/mojibake.html')]
     args = parser.parse_args(argv)

     with cd_tempdir():
         app = Builder(args).build()
         exit_code = yield app.run()

     self.assertEqual(0, exit_code)
예제 #10
0
 def test_app_args_post_data(self):
     """Request ``/post/`` with ``--post-data`` and expect exit code 0."""
     parser = AppArgumentParser()
     argv = [
         self.get_url('/post/'),
         '--post-data', 'text=hi',
     ]
     args = parser.parse_args(argv)

     with cd_tempdir():
         app = Builder(args).build()
         exit_code = yield app.run()

     self.assertEqual(0, exit_code)
예제 #11
0
    def test_app_args(self):
        """Run a recursive crawl with a large mix of command-line options.

        Some arguments are deliberately passed as ``bytes`` and others as
        ``str``. Afterwards ``index.html`` and ``index.html.orig`` must exist
        under the protocol/host directory and two files must be counted.
        """
        parser = AppArgumentParser()
        argv = [
            '/',
            '--base', self.get_url('/').encode('utf-8'),
            '--no-parent',
            '--recursive',
            '--page-requisites',
            '--database', b'test.db',
            '--server-response',
            '--random-wait',
            b'--wait', b'0.1',
            '--protocol-directories',
            '--referer', 'http://test.test',
            '--accept-regex', r'.*',
            '--header', 'Hello: world!',
            '--exclude-domains', 'asdf.invalid',
            '--exclude-hostnames', 'qwerty.invalid,uiop.invalid',
            '--no-clobber',
            '--rotate-dns',
            '-4',
            '--concurrent', '2',
            '--no-check-certificate',
            '--ascii-print',
            '--progress', 'dot',
            '--secure-protocol', 'TLSv1',
            '--convert-links', '--backup-converted',
            '--accept', '*',
            '--restrict-file-names', 'windows,lower',
            '--quota', '10m',
            '--max-filename-length', '100',
            '--user-agent', 'ΑΒΓαβγ',
            '--remote-encoding', 'latin1',
            '--http-compression',
            '--bind-address', '127.0.0.1',
        ]
        args = parser.parse_args(argv)

        with cd_tempdir():
            builder = Builder(args)
            app = builder.build()
            exit_code = yield app.run()

            # Dump the directory tree to help diagnose a failing path check.
            print(list(os.walk('.')))

            path_templates = (
                'http/localhost+{0}/index.html',
                'http/localhost+{0}/index.html.orig',
            )
            for template in path_templates:
                expected_path = template.format(self.get_http_port())
                self.assertTrue(os.path.exists(expected_path))

        self.assertEqual(0, exit_code)
        self.assertEqual(builder.factory['Statistics'].files, 2)
예제 #12
0
    def test_app_phantomjs(self):
        """Run with PhantomJS enabled and inspect the snapshots and WARC.

        Expects HTML and PDF snapshot files, snapshot-related records in
        ``test.warc`` and the custom ``Accept-Language`` header.
        """
        parser = AppArgumentParser()
        tests_dir = os.path.dirname(__file__)
        script_filename = os.path.join(
            tests_dir, 'testing', 'boring_script.py')
        argv = [
            self.get_url('/static/simple_javascript.html'),
            '--warc-file', 'test',
            '--no-warc-compression',
            '-4',
            '--no-robots',
            '--phantomjs',
            '--phantomjs-exe', 'phantomjs',
            '--phantomjs-wait', '0.1',
            '--phantomjs-scroll', '2',
            '--header', 'accept-language: dragon',
            '--python-script', script_filename,
        ]
        builder = Builder(parser.parse_args(argv))

        with cd_tempdir():
            app = builder.build()
            exit_code = yield app.run()

            self.assertTrue(os.path.exists('test.warc'))
            self.assertTrue(
                os.path.exists('simple_javascript.html.snapshot.html')
            )
            self.assertTrue(
                os.path.exists('simple_javascript.html.snapshot.pdf')
            )

            with open('simple_javascript.html.snapshot.html', 'rb') as html:
                snapshot_data = html.read()

            self.assertIn(b'Hello world!', snapshot_data)

            with open('test.warc', 'rb') as warc_file:
                warc_data = warc_file.read()

            for fragment in (
                b'urn:X-wpull:snapshot?url=',
                b'text/html',
                b'application/pdf',
                b'application/json',
                b'"set_scroll_top"',
            ):
                self.assertIn(fragment, warc_data)

            try:
                self.assertIn(b'Accept-Encoding: identity', warc_data)
            except AssertionError:
                # webkit treats localhost differently
                self.assertNotIn(b'Accept-Encoding: gzip', warc_data)

            self.assertIn(b'Accept-Language: dragon', warc_data)

        self.assertEqual(0, exit_code)
        self.assertGreaterEqual(builder.factory['Statistics'].files, 1)
예제 #13
0
    def test_app_python_script_stop(self):
        """Run with the ``py_hook_script_stop.py`` script and expect exit code 1."""
        parser = AppArgumentParser()
        tests_dir = os.path.dirname(__file__)
        filename = os.path.join(
            tests_dir, 'testing', 'py_hook_script_stop.py')
        argv = [
            self.get_url('/'),
            '--python-script', filename,
        ]
        args = parser.parse_args(argv)

        with cd_tempdir():
            app = Builder(args).build()
            exit_code = yield app.run()

        self.assertEqual(1, exit_code)
예제 #14
0
    def test_immediate_robots_forbidden(self):
        """Crawl ``/forbidden`` recursively; expect exit code 0 and no files."""
        parser = AppArgumentParser()
        argv = [
            self.get_url('/forbidden'),
            '--recursive',
        ]
        builder = Builder(parser.parse_args(argv))

        with cd_tempdir():
            app = builder.build()
            exit_code = yield app.run()

        self.assertEqual(0, exit_code)
        statistics = builder.factory['Statistics']
        self.assertEqual(0, statistics.files)
예제 #15
0
파일: __main__.py 프로젝트: mback2k/wpull
def main(exit=True):
    """Build the application from command-line arguments and run it.

    Args:
        exit: When true, finish by calling ``sys.exit`` with the exit code
            instead of returning it.

    Returns:
        The value of ``application.run_sync()``; only reached when
        ``exit`` is false.
    """
    parser = AppArgumentParser()
    builder = Builder(parser.parse_args())
    builder.build()

    application = builder.factory['Application']
    application.setup_signal_handlers()
    exit_code = application.run_sync()

    if not exit:
        return exit_code

    sys.exit(exit_code)
예제 #16
0
    def test_ignore_length(self):
        """Fetch ``/underrun`` with ``--ignore-length``; expect one file."""
        parser = AppArgumentParser()
        argv = [
            self.get_url('/underrun'),
            '--ignore-length',
            '--no-robots',
        ]
        builder = Builder(parser.parse_args(argv))

        with cd_tempdir():
            app = builder.build()
            exit_code = yield app.run()

        self.assertEqual(0, exit_code)
        statistics = builder.factory['Statistics']
        self.assertEqual(1, statistics.files)
예제 #17
0
    def test_non_http_redirect(self):
        """Crawl ``/non_http_redirect``; expect exit code 0 and no files."""
        parser = AppArgumentParser()
        argv = [
            self.get_url('/non_http_redirect'),
            '--recursive',
            '--no-robots'
        ]
        builder = Builder(parser.parse_args(argv))

        with cd_tempdir():
            app = builder.build()
            exit_code = yield app.run()

        self.assertEqual(0, exit_code)
        statistics = builder.factory['Statistics']
        self.assertEqual(0, statistics.files)
예제 #18
0
    def test_long_cookie(self):
        """Fetch ``/long_cookie``; expect one file and an empty cookie jar."""
        parser = AppArgumentParser()
        argv = [
            self.get_url('/long_cookie'),
        ]
        builder = Builder(parser.parse_args(argv))

        with cd_tempdir():
            app = builder.build()
            exit_code = yield app.run()

        self.assertEqual(0, exit_code)
        statistics = builder.factory['Statistics']
        self.assertEqual(1, statistics.files)

        cookies = list(builder.factory['CookieJar'])
        _logger.debug('{0}'.format(cookies))
        self.assertEqual(0, len(cookies))
예제 #19
0
    def test_output_document(self):
        """Fetch ``/`` with ``--output-document`` and expect ``blah.dat``."""
        parser = AppArgumentParser()

        with cd_tempdir():
            argv = [
                self.get_url('/'),
                '--output-document', 'blah.dat'
            ]
            args = parser.parse_args(argv)

            app = Builder(args).build()
            exit_code = yield app.run()

            self.assertTrue(os.path.exists('blah.dat'))

        self.assertEqual(0, exit_code)
예제 #20
0
    def test_quota(self):
        """Crawl ``/blog/`` with ``--quota 1``; expect exactly one file."""
        parser = AppArgumentParser()
        argv = [
            self.get_url('/blog/'),
            '--recursive',
            '--quota', '1',
        ]
        args = parser.parse_args(argv)

        with cd_tempdir():
            builder = Builder(args)
            app = builder.build()
            exit_code = yield app.run()

        self.assertEqual(0, exit_code)
        statistics = builder.factory['Statistics']
        self.assertEqual(1, statistics.files)
예제 #21
0
    def test_https_only(self):
        """Pass one URL as-is and one rewritten to ``http://`` with ``--https-only``.

        Only one file is expected to be counted.
        """
        parser = AppArgumentParser()
        secure_url = self.get_url('/?1')
        # Same server, but with the scheme forced to plain HTTP.
        plain_url = self.get_url('/?2').replace('https://', 'http://')
        argv = [
            secure_url,
            plain_url,
            '--https-only',
            '--no-robots',
            '--no-check-certificate',
        ]
        builder = Builder(parser.parse_args(argv))

        with cd_tempdir():
            app = builder.build()
            exit_code = yield app.run()

        self.assertEqual(0, exit_code)
        statistics = builder.factory['Statistics']
        self.assertEqual(1, statistics.files)
예제 #22
0
    def test_bad_utf8(self):
        """Fetch four ``/utf8_then_binary`` documents; expect four files."""
        parser = AppArgumentParser()
        argv = [
            self.get_url('/utf8_then_binary/doc.html'),
            self.get_url('/utf8_then_binary/doc.xml'),
            self.get_url('/utf8_then_binary/doc.css'),
            self.get_url('/utf8_then_binary/doc.js'),
            '--no-robots',
        ]
        builder = Builder(parser.parse_args(argv))

        with cd_tempdir():
            app = builder.build()
            exit_code = yield app.run()

        self.assertEqual(0, exit_code)
        statistics = builder.factory['Statistics']
        self.assertEqual(4, statistics.files)
예제 #23
0
    def test_app_args_warc_with_cdx(self):
        """Fetch ``/`` with ``--warc-file`` and ``--warc-cdx``; expect success."""
        parser = AppArgumentParser()
        argv = [
            self.get_url('/'),
            '--no-parent',
            '--warc-file', 'test',
            '-4',
            '--no-robots',
            '--warc-cdx',
        ]
        builder = Builder(parser.parse_args(argv))

        with cd_tempdir():
            app = builder.build()
            exit_code = yield app.run()

        self.assertEqual(0, exit_code)
        self.assertGreaterEqual(builder.factory['Statistics'].files, 1)
예제 #24
0
    def test_many_page_with_some_fail(self):
        """Crawl ``/blog/`` recursively and expect a server-error exit status.

        Also requires more than one file and a recorded duration above 3.
        """
        parser = AppArgumentParser()
        argv = [
            self.get_url('/blog/'),
            '--no-parent',
            '--recursive',
            '--page-requisites',
            '-4',
        ]
        builder = Builder(parser.parse_args(argv))

        with cd_tempdir():
            app = builder.build()
            exit_code = yield app.run()

        self.assertEqual(ExitStatus.server_error, exit_code)
        self.assertGreater(builder.factory['Statistics'].files, 1)
        self.assertGreater(builder.factory['Statistics'].duration, 3)
예제 #25
0
    def test_content_on_error(self):
        """Fetch ``/always_error`` with ``--content-on-error``; expect a saved file."""
        parser = AppArgumentParser()
        argv = [
            self.get_url('/always_error'),
            '--content-on-error',
        ]
        args = parser.parse_args(argv)

        with cd_tempdir():
            builder = Builder(args)
            app = builder.build()
            exit_code = yield app.run()

            # Dump the directory tree to help diagnose a failing path check.
            print(list(os.walk('.')))
            self.assertTrue(os.path.exists('always_error'))

        self.assertEqual(0, exit_code)
        statistics = builder.factory['Statistics']
        self.assertEqual(1, statistics.files)
예제 #26
0
    def test_app_input_file_arg(self):
        """Read two URLs from an ``--input-file`` and expect two files.

        The second URL carries non-ASCII characters in its query string and
        is written to the file as UTF-8.
        """
        parser = AppArgumentParser(real_exit=False)

        with tempfile.NamedTemporaryFile() as url_file:
            url_file.write(self.get_url('/').encode('utf-8'))
            url_file.write(b'\n')
            url_file.write(self.get_url('/blog/?ðfßðfëéå').encode('utf-8'))
            url_file.flush()

            argv = [
                '--input-file', url_file.name
            ]
            args = parser.parse_args(argv)

            with cd_tempdir():
                builder = Builder(args)
                app = builder.build()
                exit_code = yield app.run()

        self.assertEqual(0, exit_code)
        self.assertEqual(builder.factory['Statistics'].files, 2)
예제 #27
0
    def test_redirect_diff_host_recursive(self):
        """Follow a redirect to another host using a mock DNS resolver.

        Expects one file and that the resolver recorded
        ``somewhereelse.invalid`` among the hosts it touched.
        """
        parser = AppArgumentParser()
        redirect_path = '/redirect?where=diff-host&port={0}'.format(
            self.get_http_port())
        argv = [
            self.get_url(redirect_path),
            '--recursive',
            '--no-robots',
        ]
        builder = Builder(parser.parse_args(argv))
        # Swap in the mock resolver class before the application is built.
        builder.factory.class_map['Resolver'] = MockDNSResolver

        with cd_tempdir():
            app = builder.build()
            exit_code = yield app.run()

        self.assertEqual(0, exit_code)
        statistics = builder.factory['Statistics']
        self.assertEqual(1, statistics.files)

        touched_hosts = builder.factory['Resolver'].hosts_touched
        self.assertIn('somewhereelse.invalid', touched_hosts)
예제 #28
0
    def test_immediate_robots_fail(self):
        """Preload a disallow-all robots.txt and expect no files fetched."""
        parser = AppArgumentParser()
        argv = [
            self.get_url('/'),
            '--recursive',
        ]
        builder = Builder(parser.parse_args(argv))

        with cd_tempdir():
            app = builder.build()

            # Seed the pool for the target URL before the run starts.
            root_url_info = URLInfo.parse(self.get_url('/'))
            builder.factory['RobotsTxtPool'].load_robots_txt(
                root_url_info,
                'User-Agent: *\nDisallow: *\n'
            )

            exit_code = yield app.run()

        self.assertEqual(0, exit_code)
        statistics = builder.factory['Statistics']
        self.assertEqual(0, statistics.files)
예제 #29
0
    def test_app_phantomjs_scroll(self):
        """Run PhantomJS with ``--phantomjs-scroll 20`` and check the snapshot.

        The saved HTML snapshot must contain ``Count: 10``.
        """
        parser = AppArgumentParser()
        argv = [
            self.get_url('/static/DEUUEAUGH.html'),
            '-4',
            '--no-robots',
            '--phantomjs',
            '--phantomjs-wait', '0.1',
            '--phantomjs-scroll', '20',
        ]
        builder = Builder(parser.parse_args(argv))

        with cd_tempdir():
            app = builder.build()
            exit_code = yield app.run()

            with open('DEUUEAUGH.html.snapshot.html', 'rb') as snapshot:
                snapshot_data = snapshot.read()

            self.assertIn(b'Count: 10', snapshot_data)

        self.assertEqual(0, exit_code)
예제 #30
0
    def test_redirect_span_hosts_page_requisites(self):
        """Allow host spanning for page requisites using a mock DNS resolver.

        Expects two files and that the resolver recorded ``pagereq.test``
        among the hosts it touched.
        """
        parser = AppArgumentParser()
        span_path = '/span_hosts?port={0}'.format(self.get_http_port())
        argv = [
            self.get_url(span_path),
            '--span-hosts-allow', 'page-requisites',
            '--no-robots',
            '--page-requisites',
        ]
        builder = Builder(parser.parse_args(argv))
        # Swap in the mock resolver class before the application is built.
        builder.factory.class_map['Resolver'] = MockDNSResolver

        with cd_tempdir():
            app = builder.build()
            exit_code = yield app.run()

        self.assertEqual(0, exit_code)
        statistics = builder.factory['Statistics']
        self.assertEqual(2, statistics.files)

        touched_hosts = builder.factory['Resolver'].hosts_touched
        self.assertIn('pagereq.test', touched_hosts)