def test_app_args_warc_dedup(self):
    """Fetch a file with ``--warc-dedup`` and inspect the resulting WARC.

    A small CDX file is written first; it holds a header line and one entry
    made of the target URL, a digest and a record ID.  After the run, the
    uncompressed ``test.warc`` must contain the digest, a ``Type: revisit``
    marker and the record ID from the CDX file.
    """
    parser = AppArgumentParser()

    # The CDX file must be closed (flushed) before the application reads it.
    with open('dedup.cdx', 'wb') as cdx_file:
        cdx_file.write(b' CDX a k u\n')
        cdx_file.write(self.get_url('/static/my_file.txt').encode('ascii'))
        cdx_file.write(b' KQ4IUKATKL63FT5GMAE2YDRV3WERNL34')
        cdx_file.write(b' <under-the-deer>\n')

    argv = [
        self.get_url('/static/my_file.txt'),
        '--no-parent',
        '--warc-file', 'test',
        '--no-warc-compression',
        '-4',
        '--no-robots',
        '--warc-dedup', 'dedup.cdx',
    ]
    app_builder = Builder(parser.parse_args(argv), unit_test=True)
    application = app_builder.build()
    status = yield from application.run()

    with open('test.warc', 'rb') as warc_file:
        warc_bytes = warc_file.read()

    self.assertIn(b'KQ4IUKATKL63FT5GMAE2YDRV3WERNL34', warc_bytes)
    self.assertIn(b'Type: revisit', warc_bytes)
    self.assertIn(b'<under-the-deer>', warc_bytes)

    self.assertEqual(0, status)
    self.assertGreaterEqual(app_builder.factory['Statistics'].files, 1)
def test_app_args_warc(self):
    """Run a recursive fetch with ``--warc-file`` and check the gzip WARC.

    The compressed ``test.warc.gz`` must exist and, once decompressed,
    contain the bytes ``FINISHED``.
    """
    argv = [
        self.get_url('/'),
        '--no-parent',
        '--recursive',
        '--page-requisites',
        '--warc-file', 'test',
        '-4',
        '--no-robots',
        '--no-warc-digests',
    ]
    parsed = AppArgumentParser().parse_args(argv)
    app_builder = Builder(parsed, unit_test=True)
    application = app_builder.build()
    status = yield from application.run()

    self.assertTrue(os.path.exists('test.warc.gz'))

    with gzip.GzipFile('test.warc.gz') as warc_file:
        warc_bytes = warc_file.read()

    self.assertIn(b'FINISHED', warc_bytes)

    self.assertEqual(0, status)
    self.assertGreaterEqual(app_builder.factory['Statistics'].files, 1)
def test_ssl_bad_certificate(self):
    """Simulate an SSL verification failure and expect exit code 7.

    The builder's ``WebClient`` class is swapped for a subclass whose
    sessions always raise ``SSLVerificationError`` from ``start()``.
    """
    argv = [
        self.get_url('/'),
        '--no-robots',
        '--no-check-certificate',
        '--tries', '1',
    ]
    builder = Builder(AppArgumentParser().parse_args(argv), unit_test=True)

    class MockWebSession(WebSession):
        # Session whose start() fails with a certificate error.
        @asyncio.coroutine
        def start(self):
            raise SSLVerificationError('A very bad certificate!')

    # Derive from whichever WebClient class the builder currently maps to.
    original_client_class = builder.factory.class_map['WebClient']

    class MockWebClient(original_client_class):
        # Client that hands out the always-failing session above.
        def session(self, request):
            return MockWebSession(
                request,
                self._http_client,
                self._redirect_tracker_factory(),
                Request,
            )

    builder.factory.class_map['WebClient'] = MockWebClient

    application = builder.build()
    status = yield from application.run()

    self.assertEqual(7, status)
    self.assertEqual(0, builder.factory['Statistics'].files)
def test_app_sanity(self):
    """Check that each listed option pair makes the parser exit with status 2.

    The parser's ``exit`` hook is replaced with a function that raises
    ``ValueError(status, message)`` so the exit status can be inspected
    instead of terminating the test process.  ``print_help`` and
    ``print_usage`` are redirected to plain ``print``.
    """
    arg_items = [
        ('--verbose', '--quiet'),
        ('--timestamp', '--no-clobber'),
        ('--inet4-only', '--inet6-only'),
        ('--warc-file=test', '--no-clobber'),
        ('--warc-file=test', '--timestamping'),
        ('--warc-file=test', '--continue'),
        ('--no-iri', '--local-encoding=shiftjis'),
        ('--no-iri', '--remote-encoding=shiftjis'),
    ]

    # The hooks do not depend on the loop variable, so define them once.
    def print_(message=None):
        print(message)

    def test_exit(status=0, message=None):
        # Turn the parser's exit into an exception carrying the status.
        raise ValueError(status, message)

    for arg_item in arg_items:
        arg_parser = AppArgumentParser()
        arg_parser.exit = test_exit
        arg_parser.print_help = print_
        arg_parser.print_usage = print_

        print(arg_item)

        try:
            arg_parser.parse_args(['http://example.invalid'] + list(arg_item))
        except ValueError as error:
            self.assertEqual(2, error.args[0], msg=repr(arg_item))
        else:
            # Name the offending pair; assertTrue(False) only reported
            # "False is not true".
            self.fail(
                'Argument parser accepted option pair {0!r}'.format(arg_item)
            )
def test_app_phantomjs_scroll(self):
    """Run with ``--phantomjs-scroll 20`` and check the saved snapshot.

    The HTML snapshot written next to the download must contain
    ``Count: 10`` and the run must exit with code 0.
    """
    # Change localhost into something else to test proxy
    page_url = self.get_url('/static/DEUUEAUGH.html').replace(
        'localhost', 'example.invalid')
    argv = [
        page_url,
        '-4',
        '--no-robots',
        '--phantomjs',
        '--phantomjs-wait', '0.4',
        '--phantomjs-scroll', '20',
        '--no-check-certificate',
    ]
    app_builder = Builder(AppArgumentParser().parse_args(argv), unit_test=True)
    # Swap in the mock resolver before the application is built.
    app_builder.factory.class_map['Resolver'] = MockDNSResolver

    application = app_builder.build()
    status = yield from application.run()

    with open('DEUUEAUGH.html.snapshot.html', 'rb') as snapshot_file:
        snapshot = snapshot_file.read()

    self.assertIn(b'Count: 10', snapshot)
    self.assertEqual(0, status)
def test_big_payload(self):
    """Download ``/big_payload`` and compare it with a locally built payload.

    The expected payload is a chain of SHA-1 digests: starting from the seed
    ``b'foxfoxfox'``, each digest is appended and then fed back into the
    hasher, 10000 times, followed by one final digest.
    """
    hasher = hashlib.sha1(b'foxfoxfox')
    chunks = []

    for _ in range(10000):
        chunk = hasher.digest()
        hasher.update(chunk)
        chunks.append(chunk)

    # One last digest after the final update.
    chunks.append(hasher.digest())
    expected_payload = b''.join(chunks)

    parsed = AppArgumentParser().parse_args([self.get_url('/big_payload')])
    app_builder = Builder(parsed, unit_test=True)
    application = app_builder.build()
    status = yield from application.run()

    self.assertTrue(os.path.exists('big_payload'))

    with open('big_payload', 'rb') as downloaded:
        self.assertEqual(expected_payload, downloaded.read())

    self.assertEqual(0, status)
    self.assertEqual(1, app_builder.factory['Statistics'].files)
def test_save_cookie(self):
    """Load cookies from a temp file and check what ``--save-cookies`` writes.

    The saved file must contain the ``isloggedin`` cookie (written with an
    expiry of 9999999999) and must not contain the ``admin`` cookie (written
    with an empty expiry field).
    """
    parser = AppArgumentParser()

    with tempfile.NamedTemporaryFile() as cookie_source:
        cookie_source.write(b'# Kittens\n')
        cookie_source.write(b'localhost.local')
        cookie_source.write(b'\tFALSE\t/\tFALSE\t9999999999\tisloggedin\t1\n')
        cookie_source.write(b'\tFALSE\t/\tFALSE\t\tadmin\t1\n')
        # Make the content visible to readers that open the file by name.
        cookie_source.flush()

        argv = [
            self.get_url('/some_page/'),
            '--load-cookies', cookie_source.name,
            '--tries', '1',
            '--save-cookies', 'wpull_test_cookies.txt',
        ]
        app_builder = Builder(parser.parse_args(argv), unit_test=True)
        application = app_builder.build()
        # Run while the temporary cookie file still exists.
        status = yield from application.run()

    self.assertEqual(0, status)
    self.assertEqual(1, app_builder.factory['Statistics'].files)

    with open('wpull_test_cookies.txt', 'rb') as saved_file:
        saved_cookies = saved_file.read()

    self.assertIn(b'isloggedin\t1', saved_cookies)
    self.assertNotIn(b'admin\t1', saved_cookies)
def test_timestamping_hit_orig(self):
    """Run ``--timestamping`` against a pre-existing, back-dated local file.

    The local ``lastmod`` file is created with content ``HI`` and its access
    and modification times are set to 631152000.  After the run the content
    must still be ``HI`` and the exit code must be 0.
    """
    argv = [self.get_url('/lastmod'), '--timestamping']
    parsed = AppArgumentParser().parse_args(argv)

    filename = os.path.join(self.temp_dir.name, 'lastmod')
    # NOTE(review): this is the same path as ``filename``; given the test
    # name, a separate ``.orig`` file may have been intended -- confirm.
    filename_orig = os.path.join(self.temp_dir.name, 'lastmod')

    for path in (filename, filename_orig):
        with open(path, 'wb') as out_file:
            out_file.write(b'HI')

    back_dated = 631152000
    os.utime(filename_orig, (back_dated, back_dated))

    application = Builder(parsed, unit_test=True).build()
    status = yield from application.run()

    self.assertEqual(0, status)

    for path in (filename, filename_orig):
        with open(path, 'rb') as in_file:
            self.assertEqual(b'HI', in_file.read())
def test_app_python_plugin_script(self):
    """Run with the ``extensive.plugin.py`` sample plugin and check its effects.

    Expects exit code 42, a pipeline concurrency of 2, three downloaded files
    and a recorded duration under 10 seconds.
    """
    parser = AppArgumentParser()
    plugin_path = os.path.join(
        os.path.dirname(__file__),
        'sample_user_scripts',
        'extensive.plugin.py',
    )
    argv = [
        self.get_url('/'),
        self.get_url('/some_page'),
        self.get_url('/mordor'),
        'localhost:1/wolf',
        '--plugin-script', plugin_path,
        '--page-requisites',
        '--reject-regex', '/post/',
        '--wait', '12',
        '--retry-connrefused',
        '--tries', '1',
    ]
    app_builder = Builder(parser.parse_args(argv), unit_test=True)
    application = app_builder.build()
    status = yield from application.run()

    # Dump the working tree to aid debugging of failures.
    print(list(os.walk('.')))

    self.assertEqual(42, status)

    pipeline_series = app_builder.factory['PipelineSeries']
    self.assertEqual(2, pipeline_series.concurrency)

    statistics = app_builder.factory['Statistics']
    self.assertEqual(3, statistics.files)

    # duration should be virtually 0 but account for slowness on travis ci
    self.assertGreater(10.0, statistics.duration)
def main(exit=True, install_tornado_bridge=True, use_signals=True):
    """Parse command-line arguments, build the application and run it.

    Args:
        exit: If true, terminate the process with ``sys.exit`` using the
            application's exit code; otherwise return the exit code.
        install_tornado_bridge: If true, install Tornado's
            ``AsyncIOMainLoop`` before anything else.
        use_signals: If true, call ``setup_signal_handlers()`` on the
            built application.

    Returns:
        The exit code from ``run_sync()`` when ``exit`` is false.
    """
    if install_tornado_bridge:
        tornado.platform.asyncio.AsyncIOMainLoop().install()

    parsed_args = AppArgumentParser().parse_args()
    builder = Builder(parsed_args)
    application = builder.build()

    if use_signals:
        application.setup_signal_handlers()

    if parsed_args.debug_manhole:
        # Imported lazily: only needed when the debug manhole is requested.
        import manhole
        import wpull

        # Expose the builder on the package before installing the manhole.
        wpull.wpull_builder = builder
        manhole.install()

    exit_code = application.run_sync()

    if not exit:
        return exit_code

    sys.exit(exit_code)
def test_propagate_ipv4_only_and_no_cert_check_to_youtube_dl(self):
    """Check that IPv4-only and no-cert-check options reach youtube-dl.

    The run writes a debug log to ``test.log``; the log must contain a
    ``Starting process ['youtube-dl ...`` entry that includes
    ``--force-ipv4`` and one that includes ``--no-check-certificate``.

    NOTE(review): the target is a public youtube.com URL, so this
    presumably needs network access and a youtube-dl executable -- confirm.
    """
    arg_parser = AppArgumentParser()
    args = arg_parser.parse_args([
        'https://www.youtube.com/watch?v=tPEE9ZwTmy0',
        '--warc-file', 'test',
        '--debug',  # to capture youtube-dl arguments in the log
        '--no-warc-compression',
        '--youtube-dl',
        '--inet4-only',
        '--no-check-certificate',
        '--output-file', 'test.log',
    ])
    builder = Builder(args, unit_test=True)

    app = builder.build()
    exit_code = yield from app.run()

    self.assertEqual(0, exit_code)

    with open('test.log', 'rb') as test_log:
        data = test_log.read()

    # Raw byte literals: the patterns are unchanged, but ``\[`` is no longer
    # an invalid escape sequence in a non-raw literal.
    self.assertTrue(
        re.search(br"Starting process \['youtube-dl.*--force-ipv4", data))
    self.assertTrue(
        re.search(
            br"Starting process \['youtube-dl.*--no-check-certificate",
            data))
def test_app_phantomjs(self):
    """Run with PhantomJS enabled and check the snapshots and WARC content.

    Verifies that the WARC file and the HTML/PDF snapshots exist, that the
    HTML snapshot contains ``Hello world!``, and that the WARC holds the
    snapshot URN, the expected content types, the scroll action and the
    custom ``Accept-Language`` header.
    """
    parser = AppArgumentParser()
    plugin_path = os.path.join(
        os.path.dirname(__file__),
        'sample_user_scripts',
        'boring.plugin.py',
    )

    # Change localhost into something else to test proxy
    page_url = self.get_url('/static/simple_javascript.html').replace(
        'localhost', 'example.invalid')
    argv = [
        page_url,
        '--warc-file', 'test',
        '--no-warc-compression',
        '-4',
        '--no-robots',
        '--phantomjs',
        '--phantomjs-exe', 'phantomjs',
        '--phantomjs-wait', '0.1',
        '--phantomjs-scroll', '2',
        '--header', 'accept-language: dragon',
        '--plugin-script', plugin_path,
        '--no-check-certificate',
    ]
    app_builder = Builder(parser.parse_args(argv), unit_test=True)
    app_builder.factory.class_map['Resolver'] = MockDNSResolver

    application = app_builder.build()
    status = yield from application.run()

    for expected_path in (
            'test.warc',
            'simple_javascript.html.snapshot.html',
            'simple_javascript.html.snapshot.pdf'):
        self.assertTrue(os.path.exists(expected_path))

    with open('simple_javascript.html.snapshot.html', 'rb') as html_file:
        snapshot_html = html_file.read()

    self.assertIn(b'Hello world!', snapshot_html)

    with open('test.warc', 'rb') as warc_file:
        warc_bytes = warc_file.read()

    self.assertIn(b'urn:X-wpull:snapshot?url=', warc_bytes)
    self.assertIn(b'text/html', warc_bytes)
    self.assertIn(b'application/pdf', warc_bytes)
    self.assertIn(b'application/json', warc_bytes)
    self.assertIn(b'"set_scroll_top"', warc_bytes)

    # Either the identity encoding was requested explicitly, or at least
    # gzip must not have been requested.
    if b'Accept-Encoding: identity' not in warc_bytes:
        # webkit treats localhost differently
        self.assertNotIn(b'Accept-Encoding: gzip', warc_bytes)

    self.assertIn(b'Accept-Language: dragon', warc_bytes)

    self.assertEqual(0, status)
    self.assertGreaterEqual(app_builder.factory['Statistics'].files, 1)
def test_no_iri(self):
    """Run with ``--no-iri`` and expect exit code 0 and one downloaded file."""
    argv = [self.get_url('/'), '--no-iri', '--no-robots']
    parsed = AppArgumentParser().parse_args(argv)
    app_builder = Builder(parsed, unit_test=True)

    application = app_builder.build()
    status = yield from application.run()

    self.assertEqual(0, status)
    self.assertEqual(1, app_builder.factory['Statistics'].files)
def test_app_args_post_data(self):
    """Run against ``/post/`` with ``--post-data text=hi``; expect exit code 0."""
    argv = [
        self.get_url('/post/'),
        '--post-data', 'text=hi',
    ]
    parsed = AppArgumentParser().parse_args(argv)

    application = Builder(parsed, unit_test=True).build()
    status = yield from application.run()

    self.assertEqual(0, status)
def test_database_uri(self):
    """Run with ``--database-uri sqlite:///test.db``; expect exit code 0."""
    argv = [self.get_url('/'), '--database-uri', 'sqlite:///test.db']
    parsed = AppArgumentParser().parse_args(argv)

    application = Builder(parsed, unit_test=True).build()
    status = yield from application.run()

    self.assertEqual(0, status)
def test_check_certificate(self):
    """Run without ``--no-check-certificate`` and expect exit code 5.

    NOTE(review): presumably the test server presents a certificate that
    fails verification -- confirm against the test fixture.
    """
    argv = [
        self.get_url('/'),
        '--no-robots',
    ]
    parsed = AppArgumentParser().parse_args(argv)

    application = Builder(parsed, unit_test=True).build()
    status = yield from application.run()

    self.assertEqual(5, status)
def test_session_timeout(self):
    """Fetch ``/sleep_long`` with a 0.1 s session timeout; expect exit code 4.

    No files may be counted as downloaded.
    """
    argv = [
        self.get_url('/sleep_long'),
        '--tries=1',
        '--session-timeout=0.1',
    ]
    parsed = AppArgumentParser().parse_args(argv)
    app_builder = Builder(parsed, unit_test=True)

    application = app_builder.build()
    status = yield from application.run()

    self.assertEqual(4, status)
    self.assertEqual(0, app_builder.factory['Statistics'].files)
def test_referer_option(self):
    """Recurse from ``/referrer/`` with ``--referer http://left.shark/``.

    Expects exit code 0 and two downloaded files.
    """
    argv = [
        self.get_url('/referrer/'),
        '-r',
        '--referer', 'http://left.shark/',
    ]
    parsed = AppArgumentParser().parse_args(argv)
    app_builder = Builder(parsed, unit_test=True)

    application = app_builder.build()
    status = yield from application.run()

    self.assertEqual(0, status)
    self.assertEqual(2, app_builder.factory['Statistics'].files)
def test_escaped_fragment_recursive(self):
    """Recurse from ``/escape_from_fragments/`` with ``--escaped-fragment``.

    Expects exit code 0 and two downloaded files.
    """
    argv = [
        self.get_url('/escape_from_fragments/'),
        '-r',
        '--escaped-fragment',
    ]
    parsed = AppArgumentParser().parse_args(argv)
    app_builder = Builder(parsed, unit_test=True)

    application = app_builder.build()
    status = yield from application.run()

    self.assertEqual(0, status)
    self.assertEqual(2, app_builder.factory['Statistics'].files)
def test_database_path_question_mark(self):
    """Run with ``--database test?.db`` and expect ``test_.db`` on disk.

    The run must exit with code 0 and the database file must exist under
    the name with ``?`` replaced by ``_``.
    """
    argv = [self.get_url('/'), '--database', 'test?.db']
    parsed = AppArgumentParser().parse_args(argv)

    application = Builder(parsed, unit_test=True).build()
    status = yield from application.run()

    self.assertEqual(0, status)
    self.assertTrue(os.path.exists('test_.db'))
def test_login_fail(self):
    """Run with user ``smaug`` / password ``hunter2`` and expect exit code 6.

    No files may be counted as downloaded.
    """
    argv = [
        self.get_url('/example (copy).txt'),
        '--user', 'smaug',
        '--password', 'hunter2',
        '--tries', '1',
    ]
    parsed = AppArgumentParser().parse_args(argv)
    app_builder = Builder(parsed, unit_test=True)

    application = app_builder.build()
    status = yield from application.run()

    self.assertEqual(6, status)
    self.assertEqual(0, app_builder.factory['Statistics'].files)
def test_globbing(self):
    """Fetch the wildcard URL ``/read*.txt``; expect exit 0 and one file."""
    argv = [
        self.get_url('/read*.txt'),
    ]
    parsed = AppArgumentParser().parse_args(argv)
    app_builder = Builder(parsed, unit_test=True)

    application = app_builder.build()
    status = yield from application.run()

    # Dump the working tree to aid debugging of failures.
    print(list(os.walk('.')))

    self.assertEqual(0, status)
    self.assertEqual(1, app_builder.factory['Statistics'].files)
def test_immediate_robots_forbidden(self):
    """Recurse from ``/forbidden``; expect exit code 0 and no files."""
    argv = [
        self.get_url('/forbidden'),
        '--recursive',
    ]
    parsed = AppArgumentParser().parse_args(argv)
    app_builder = Builder(parsed, unit_test=True)

    application = app_builder.build()
    status = yield from application.run()

    self.assertEqual(0, status)
    self.assertEqual(0, app_builder.factory['Statistics'].files)
def test_strip_session_id(self):
    """Recurse from ``/forum/`` with ``--strip-session-id``.

    Expects exit code 0 and exactly one downloaded file.
    """
    argv = [
        self.get_url('/forum/'),
        '-r',
        '--strip-session-id',
    ]
    parsed = AppArgumentParser().parse_args(argv)
    app_builder = Builder(parsed, unit_test=True)

    application = app_builder.build()
    status = yield from application.run()

    self.assertEqual(0, status)
    self.assertEqual(1, app_builder.factory['Statistics'].files)
def test_file_vs_directory(self):
    """Recurse one level from ``/example2💎`` keeping listing files.

    Expects exit code 0 and a ``.listing`` file inside the ``example2💎``
    directory.
    """
    argv = [
        self.get_url('/example2💎'),
        '--no-host-directories',
        '--no-remove-listing',
        '-r',
        '-l=1',
        '--tries=1',
    ]
    parsed = AppArgumentParser().parse_args(argv)

    application = Builder(parsed, unit_test=True).build()
    status = yield from application.run()

    # Dump the working tree to aid debugging of failures.
    print(list(os.walk('.')))

    self.assertEqual(0, status)
    self.assertTrue(os.path.exists('example2💎/.listing'))
def test_app_python_script_stop(self):
    """Run with the ``stopper.plugin.py`` sample plugin; expect exit code 1."""
    parser = AppArgumentParser()
    plugin_path = os.path.join(
        os.path.dirname(__file__),
        'sample_user_scripts',
        'stopper.plugin.py',
    )
    argv = [
        self.get_url('/'),
        '--plugin-script', plugin_path,
    ]

    application = Builder(parser.parse_args(argv), unit_test=True).build()
    status = yield from application.run()

    self.assertEqual(1, status)
def test_referer_option_negative(self):
    """Recurse from ``/referrer/`` with a different ``--referer`` value.

    With ``http://superinformation.highway/`` as referer the run still exits
    with code 0, but no files are counted as downloaded.
    """
    argv = [
        self.get_url('/referrer/'),
        '-r',
        '--referer', 'http://superinformation.highway/',
        '--tries', '1',
        '--waitretry', '.1',
    ]
    parsed = AppArgumentParser().parse_args(argv)
    app_builder = Builder(parsed, unit_test=True)

    application = app_builder.build()
    status = yield from application.run()

    self.assertEqual(0, status)
    self.assertEqual(0, app_builder.factory['Statistics'].files)
def test_output_document(self):
    """Run with ``--output-document blah.dat`` and check the output file.

    ``blah.dat`` must exist and be non-empty, and the exit code must be 0.
    """
    argv = [self.get_url('/'), '--output-document', 'blah.dat']
    parsed = AppArgumentParser().parse_args(argv)

    application = Builder(parsed, unit_test=True).build()
    status = yield from application.run()

    self.assertTrue(os.path.exists('blah.dat'))
    # A size of zero is falsy, so this asserts the file is non-empty.
    self.assertTrue(os.path.getsize('blah.dat'))
    self.assertEqual(0, status)
def test_invalid_char_dir_list(self):
    """Fetch ``/hidden/invalid_chars/`` keeping the listing file.

    Expects exit code 0 and a ``.listing`` file in the working directory.
    """
    argv = [
        self.get_url('/hidden/invalid_chars/'),
        '--no-host-directories',
        '--no-remove-listing',
    ]
    parsed = AppArgumentParser().parse_args(argv)

    application = Builder(parsed, unit_test=True).build()
    status = yield from application.run()

    # Dump the working tree to aid debugging of failures.
    print(list(os.walk('.')))

    self.assertEqual(0, status)
    self.assertTrue(os.path.exists('.listing'))
def test_no_cache_arg(self):
    """Fetch ``/no-cache`` without and then with the ``--no-cache`` option.

    Without the option the run must exit with code 8 and download nothing;
    with it the run must exit with code 0 and download one file.
    """
    # (extra arguments, expected exit code, expected file count)
    scenarios = (
        (['--tries=1'], 8, 0),
        (['--tries=1', '--no-cache'], 0, 1),
    )

    for extra_argv, expected_status, expected_files in scenarios:
        # A fresh parser, builder and application for every run.
        parser = AppArgumentParser()
        parsed = parser.parse_args([self.get_url('/no-cache')] + extra_argv)
        app_builder = Builder(parsed, unit_test=True)

        application = app_builder.build()
        status = yield from application.run()

        self.assertEqual(expected_status, status)
        self.assertEqual(
            expected_files, app_builder.factory['Statistics'].files)