def test_factory(self):
    factory = Factory()
    factory.set('dict', dict)

    self.assertNotIn('dict', factory)
    self.assertFalse(factory.is_all_initialized())

    my_instance = factory.new('dict', [('hi', 'hello')])

    self.assertIn('dict', factory)
    self.assertEqual(my_instance, factory['dict'])
    self.assertTrue(factory.is_all_initialized())

    self.assertEqual(1, len(factory))
    self.assertEqual(['dict'], list(iter(factory)))
    self.assertEqual(my_instance, factory.instance_map['dict'])

    with self.assertRaises(ValueError):
        factory.new('dict', [('hi', 'hello')])
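# The test above exercises Factory's lazy-construction contract: ``set``
# registers a class under a name without building it, ``new`` constructs the
# instance exactly once, and the instance is afterwards reachable by
# subscript. A minimal sketch of that pattern outside the test (the
# 'URLTable' registration below is only illustrative, not taken from the
# test):
#
#     factory = Factory()
#     factory.set('URLTable', URLTable)     # registered, not yet built
#     url_table = factory.new('URLTable')   # built and cached here
#     assert url_table == factory['URLTable']
#     factory.new('URLTable')               # raises ValueError: already built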
class Builder(object):
    '''Application builder.

    Args:
        args: Options from :class:`argparse.ArgumentParser`
    '''
    UNSAFE_OPTIONS = frozenset(['save_headers', 'no_iri', 'output_document'])

    def __init__(self, args):
        self.default_user_agent = 'Wpull/{0} (gzip)'.format(
            wpull.version.__version__)
        self._args = args
        self._factory = Factory({
            'BatchDocumentConverter': BatchDocumentConverter,
            'Client': Client,
            'CookieJar': CookieJar,
            'CookieJarWrapper': CookieJarWrapper,
            'CookiePolicy': CookieLimitsPolicy,
            'Connection': Connection,
            'ConnectionPool': ConnectionPool,
            'CSSScraper': CSSScraper,
            'DemuxDocumentScraper': DemuxDocumentScraper,
            'DemuxRecorder': DemuxRecorder,
            'DemuxURLFilter': DemuxURLFilter,
            'Engine': Engine,
            'HostConnectionPool': HostConnectionPool,
            'HTTPProxyServer': HTTPProxyServer,
            'HTMLScraper': HTMLScraper,
            'JavaScriptScraper': JavaScriptScraper,
            'OutputDocumentRecorder': OutputDocumentRecorder,
            'PathNamer': PathNamer,
            'PhantomJSClient': PhantomJSClient,
            'PhantomJSController': PhantomJSController,
            'PrintServerResponseRecorder': PrintServerResponseRecorder,
            'ProgressRecorder': ProgressRecorder,
            'RedirectTracker': RedirectTracker,
            'Request': Request,
            'Resolver': Resolver,
            'RichClient': RichClient,
            'RobotsTxtPool': RobotsTxtPool,
            'SitemapScraper': SitemapScraper,
            'Statistics': Statistics,
            'URLInfo': URLInfo,
            'URLTable': URLTable,
            'Waiter': LinearWaiter,
            'WARCRecorder': WARCRecorder,
            'WebProcessor': WebProcessor,
            'WebProcessorFetchParams': WebProcessorFetchParams,
            'WebProcessorInstances': WebProcessorInstances,
        })
        self._url_infos = tuple(self._build_input_urls())
        self._ca_certs_file = None
        self._file_log_handler = None
        self._console_log_handler = None

    @property
    def factory(self):
        '''Return the Factory.

        Returns:
            Factory: An :class:`.factory.Factory` instance.
        '''
        return self._factory

    def build(self):
        '''Put the application together.

        Returns:
            Engine: An instance of :class:`.engine.Engine`.
        '''
        self._setup_logging()
        self._setup_console_logger()
        self._setup_file_logger()
        self._setup_debug_console()
        self._install_script_hooks()
        self._warn_unsafe_options()
        self._warn_silly_options()

        statistics = self._factory.new('Statistics')
        statistics.quota = self._args.quota
        statistics.required_url_infos.update(self._url_infos)

        url_table = self._build_url_table()
        processor = self._build_processor()

        engine = self._factory.new(
            'Engine',
            url_table,
            processor,
            statistics,
            concurrent=self._args.concurrent,
        )

        self._setup_file_logger_close(engine)
        self._setup_console_logger_close(engine)

        return engine

    def build_and_run(self):
        '''Build and run the application.

        Returns:
            int: The exit status.
        '''
        io_loop = tornado.ioloop.IOLoop.current()
        engine = self.build()
        exit_code = io_loop.run_sync(engine)
        return exit_code

    def _new_encoded_stream(self, stream):
        '''Return a stream writer.'''
        if self._args.ascii_print:
            return ASCIIStreamWriter(stream)
        else:
            return stream

    def _setup_logging(self):
        '''Set up the root logger if needed.

        The root logger is set to the appropriate level so the file and WARC
        logs work correctly.
        '''
        assert (
            logging.CRITICAL >
            logging.ERROR >
            logging.WARNING >
            logging.INFO >
            logging.DEBUG >
            logging.NOTSET
        )
        assert self._args.verbosity

        root_logger = logging.getLogger()
        current_level = root_logger.getEffectiveLevel()
        min_level = logging.ERROR

        if self._args.verbosity == logging.WARNING:
            min_level = logging.WARNING

        if self._args.verbosity == logging.INFO \
                or self._args.warc_file \
                or self._args.output_file or self._args.append_output:
            min_level = logging.INFO

        if self._args.verbosity == logging.DEBUG:
            min_level = logging.DEBUG

        if current_level > min_level:
            root_logger.setLevel(min_level)
            root_logger.debug(
                'Wpull needs the root logger level set to {0}.'
                .format(min_level)
            )

    def _setup_console_logger(self):
        '''Set up the console logger.

        A handler with a formatter is added to the root logger.
        '''
        stream = self._new_encoded_stream(sys.stderr)

        logger = logging.getLogger()
        self._console_log_handler = handler = logging.StreamHandler(stream)

        formatter = logging.Formatter('%(levelname)s %(message)s')
        handler.setFormatter(formatter)
        handler.setLevel(self._args.verbosity or logging.INFO)

        logger.addHandler(handler)

    def _setup_console_logger_close(self, engine):
        '''Add routine to remove log handler when the engine stops.'''
        def remove_handler():
            logger = logging.getLogger()
            logger.removeHandler(self._console_log_handler)
            self._console_log_handler = None

        if self._console_log_handler:
            engine.stop_event.handle(remove_handler)

    def _setup_file_logger(self):
        '''Set up the file message logger.

        A file log handler with a formatter is added to the root logger.
        '''
        args = self._args

        if not (args.output_file or args.append_output):
            return

        logger = logging.getLogger()

        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

        if args.output_file:
            filename = args.output_file
            mode = 'w'
        else:
            filename = args.append_output
            mode = 'a'

        self._file_log_handler = handler = logging.FileHandler(
            filename, mode, encoding='utf-8')

        handler.setFormatter(formatter)
        logger.addHandler(handler)

        if args.verbosity == logging.DEBUG:
            handler.setLevel(logging.DEBUG)
        else:
            handler.setLevel(logging.INFO)

    def _setup_file_logger_close(self, engine):
        '''Add routine that removes the file log handler when the engine
        stops.
        '''
        def remove_handler():
            logger = logging.getLogger()
            logger.removeHandler(self._file_log_handler)
            self._file_log_handler = None

        if self._file_log_handler:
            engine.stop_event.handle(remove_handler)

    def _install_script_hooks(self):
        '''Set up the scripts if any.'''
        if self._args.python_script:
            self._install_python_script(self._args.python_script)
        elif self._args.lua_script:
            self._install_lua_script(self._args.lua_script)

    def _install_python_script(self, filename):
        '''Load the Python script into an environment.'''
        _logger.info(_('Using Python hook script {filename}.').format(
            filename=filename))

        hook_environment = HookEnvironment(self._factory)

        self._setup_hook_environment(hook_environment)

        with open(filename, 'rb') as in_file:
            code = compile(in_file.read(), filename, 'exec')
            context = {'wpull_hook': hook_environment}
            exec(code, context, context)

    def _install_lua_script(self, filename):
        '''Load the Lua script into an environment.'''
        _logger.info(_('Using Lua hook script {filename}.').format(
            filename=filename))

        lua = wpull.hook.load_lua()
        hook_environment = HookEnvironment(self._factory, is_lua=True)

        self._setup_hook_environment(hook_environment)

        lua_globals = lua.globals()
        lua_globals.wpull_hook = hook_environment

        with open(filename, 'rb') as in_file:
            lua.execute(in_file.read())

    def _setup_hook_environment(self, hook_environment):
        '''Override the classes needed for script hooks.

        Args:
            hook_environment: A :class:`.hook.HookEnvironment` instance
        '''
        self._factory.set('Engine', hook_environment.engine_factory)
        self._factory.set(
            'WebProcessor', hook_environment.web_processor_factory)
        self._factory.set('Resolver', hook_environment.resolver_factory)

    def _setup_debug_console(self):
        if not self._args.debug_console_port:
            return

        _logger.warning(
            _('Opened a debug console at localhost:{port}.')
            .format(port=self._args.debug_console_port)
        )

        application = tornado.web.Application(
            [(r'/', DebugConsoleHandler)],
            builder=self
        )
        http_server = tornado.httpserver.HTTPServer(application)
        http_server.listen(self._args.debug_console_port, address='localhost')

    def _build_input_urls(self, default_scheme='http'):
        '''Read the URLs provided by the user.'''
        url_string_iter = self._args.urls or ()

        if self._args.input_file:
            if self._args.force_html:
                urls = self._read_input_file_as_html()
            else:
                urls = self._read_input_file_as_lines()

            url_string_iter = itertools.chain(url_string_iter, urls)

        sitemap_url_infos = set()
        base_url = self._args.base

        for url_string in url_string_iter:
            _logger.debug('Parsing URL {0}'.format(url_string))

            if base_url:
                url_string = wpull.url.urljoin(base_url, url_string)

            url_info = self._factory.class_map['URLInfo'].parse(
                url_string, default_scheme=default_scheme)

            _logger.debug('Parsed URL {0}'.format(url_info))
            yield url_info

            if self._args.sitemaps:
                sitemap_url_infos.update((
                    URLInfo.parse(
                        '{0}://{1}/robots.txt'.format(
                            url_info.scheme, url_info.hostname_with_port)
                    ),
                    URLInfo.parse(
                        '{0}://{1}/sitemap.xml'.format(
                            url_info.scheme, url_info.hostname_with_port)
                    )
                ))

        for url_info in sitemap_url_infos:
            yield url_info

    def _read_input_file_as_lines(self):
        '''Read lines from input file and return them.'''
        input_file = codecs.getreader(
            self._args.local_encoding or 'utf-8')(self._args.input_file)

        urls = [line.strip() for line in input_file if line.strip()]

        if not urls:
            raise ValueError(_('No URLs found in input file.'))

        return urls

    def _read_input_file_as_html(self):
        '''Read input file as HTML and return the links.'''
        scrape_info = HTMLScraper.scrape_file(
            self._args.input_file,
            encoding=self._args.local_encoding or 'utf-8'
        )
        links = itertools.chain(
            scrape_info['inline_urls'], scrape_info['linked_urls']
        )

        return links

    def _build_url_filters(self):
        '''Create the URL filter instances.

        Returns:
            A list of URL filter instances
        '''
        args = self._args

        filters = [
            HTTPSOnlyFilter() if args.https_only else HTTPFilter(),
            RecursiveFilter(
                enabled=args.recursive, page_requisites=args.page_requisites
            ),
            SpanHostsFilter(
                self._url_infos,
                enabled=args.span_hosts,
                page_requisites='page-requisites' in args.span_hosts_allow,
                linked_pages='linked-pages' in args.span_hosts_allow,
            ),
        ]

        if args.no_parent:
            filters.append(ParentFilter())

        if args.domains or args.exclude_domains:
            filters.append(
                BackwardDomainFilter(args.domains, args.exclude_domains)
            )

        if args.hostnames or args.exclude_hostnames:
            filters.append(
                HostnameFilter(args.hostnames, args.exclude_hostnames)
            )

        if args.tries:
            filters.append(TriesFilter(args.tries))

        if args.level and args.recursive:
            filters.append(LevelFilter(args.level))

        if args.accept_regex or args.reject_regex:
            filters.append(RegexFilter(args.accept_regex, args.reject_regex))

        if args.include_directories or args.exclude_directories:
            filters.append(
                DirectoryFilter(
                    args.include_directories, args.exclude_directories
                )
            )

        if args.accept or args.reject:
            filters.append(BackwardFilenameFilter(args.accept, args.reject))

        return filters

    def _build_document_scrapers(self):
        '''Create the document scrapers.

        Returns:
            A list of document scrapers
        '''
        scrapers = [
            self._factory.new(
                'HTMLScraper',
                followed_tags=self._args.follow_tags,
                ignored_tags=self._args.ignore_tags,
                only_relative=self._args.relative,
                robots=self._args.robots,
                encoding_override=self._args.remote_encoding,
            ),
            self._factory.new(
                'CSSScraper',
                encoding_override=self._args.remote_encoding,
            ),
            self._factory.new(
                'JavaScriptScraper',
                encoding_override=self._args.remote_encoding,
            ),
        ]

        if self._args.sitemaps:
            scrapers.append(self._factory.new(
                'SitemapScraper',
                encoding_override=self._args.remote_encoding,
            ))

        return scrapers

    def _build_url_table(self):
        '''Create the URL table.

        Returns:
            URLTable: An instance of :class:`.database.BaseURLTable`.
        '''
        url_table = self._factory.new('URLTable', path=self._args.database)
        url_table.add([url_info.url for url_info in self._url_infos])
        return url_table

    def _build_recorder(self):
        '''Create the Recorder.

        Returns:
            DemuxRecorder: An instance of :class:`.recorder.DemuxRecorder`.
        '''
        args = self._args
        recorders = []

        if args.warc_file:
            extra_fields = [
                ('robots', 'on' if args.robots else 'off'),
                ('wpull-arguments', str(args))
            ]

            for header_string in args.warc_header:
                name, value = header_string.split(':', 1)
                name = name.strip()
                value = value.strip()
                extra_fields.append((name, value))

            software_string = WARCRecorder.DEFAULT_SOFTWARE_STRING

            if args.phantomjs:
                software_string += ' PhantomJS/{0}'.format(
                    wpull.phantomjs.get_version()
                )

            recorders.append(
                self._factory.new(
                    'WARCRecorder',
                    args.warc_file,
                    params=WARCRecorderParams(
                        compress=not args.no_warc_compression,
                        extra_fields=extra_fields,
                        temp_dir=args.warc_tempdir,
                        log=args.warc_log,
                        appending=args.warc_append,
                        digests=args.warc_digests,
                        cdx=args.warc_cdx,
                        max_size=args.warc_max_size,
                        url_table=self._factory['URLTable']
                        if args.warc_dedup else None,
                        software_string=software_string,
                    ),
                )
            )

        if args.server_response:
            recorders.append(self._factory.new('PrintServerResponseRecorder'))

        assert args.verbosity

        if args.verbosity in (logging.INFO, logging.DEBUG, logging.WARNING):
            stream = self._new_encoded_stream(sys.stderr)

            bar_style = args.progress == 'bar'

            if not stream.isatty():
                bar_style = False

            recorders.append(self._factory.new(
                'ProgressRecorder', bar_style=bar_style, stream=stream))

        if args.warc_dedup:
            self._populate_visits()

        if args.output_document:
            recorders.append(self._factory.new(
                'OutputDocumentRecorder',
                args.output_document,
                with_headers=args.save_headers,
            ))

        return self._factory.new('DemuxRecorder', recorders)

    def _populate_visits(self):
        '''Populate the visits from the CDX into the URL table.'''
        iterable = wpull.warc.read_cdx(
            self._args.warc_dedup,
            encoding=self._args.local_encoding or 'utf-8'
        )

        missing_url_msg = _('The URL ("a") is missing from the CDX file.')
        missing_id_msg = _('The record ID ("u") is missing from the CDX file.')
        missing_checksum_msg = \
            _('The SHA1 checksum ("k") is missing from the CDX file.')

        nonlocal_var = {'counter': 0}

        def visits():
            checked_fields = False

            for record in iterable:
                if not checked_fields:
                    if 'a' not in record:
                        raise ValueError(missing_url_msg)
                    if 'u' not in record:
                        raise ValueError(missing_id_msg)
                    if 'k' not in record:
                        raise ValueError(missing_checksum_msg)

                    checked_fields = True

                yield record['a'], record['u'], record['k']
                nonlocal_var['counter'] += 1

        url_table = self.factory['URLTable']
        url_table.add_visits(visits())

        _logger.info(
            gettext.ngettext(
                'Loaded {num} record from CDX file.',
                'Loaded {num} records from CDX file.',
                nonlocal_var['counter']
            ).format(num=nonlocal_var['counter'])
        )

    def _build_processor(self):
        '''Create the Processor.

        Returns:
            Processor: An instance of :class:`.processor.BaseProcessor`.
        '''
        args = self._args
        url_filter = self._factory.new(
            'DemuxURLFilter', self._build_url_filters())
        document_scraper = self._factory.new(
            'DemuxDocumentScraper', self._build_document_scrapers())
        file_writer = self._build_file_writer()
        post_data = self._get_post_data()
        converter = self._build_document_converter()
        rich_http_client = self._build_rich_http_client()
        phantomjs_controller = self._build_phantomjs_controller()

        waiter = self._factory.new(
            'Waiter',
            wait=args.wait,
            random_wait=args.random_wait,
            max_wait=args.waitretry
        )

        web_processor_instances = self._factory.new(
            'WebProcessorInstances',
            url_filter=url_filter,
            document_scraper=document_scraper,
            file_writer=file_writer,
            waiter=waiter,
            statistics=self._factory['Statistics'],
            converter=converter,
            phantomjs_controller=phantomjs_controller,
        )

        web_processor_fetch_params = self._factory.new(
            'WebProcessorFetchParams',
            retry_connrefused=args.retry_connrefused,
            retry_dns_error=args.retry_dns_error,
            post_data=post_data,
            strong_redirects=args.strong_redirects,
            content_on_error=args.content_on_error,
        )

        processor = self._factory.new(
            'WebProcessor',
            rich_http_client,
            args.directory_prefix,
            web_processor_fetch_params,
            web_processor_instances
        )

        return processor

    def _build_file_writer(self):
        '''Create the File Writer.

        Returns:
            FileWriter: An instance of :class:`.writer.BaseFileWriter`.
        '''
        args = self._args

        if args.delete_after or args.output_document:
            return NullWriter()

        use_dir = (len(args.urls) != 1 or args.page_requisites
                   or args.recursive)

        if args.use_directories == 'force':
            use_dir = True
        elif args.use_directories == 'no':
            use_dir = False

        os_type = 'windows' if 'windows' in args.restrict_file_names \
            else 'unix'
        ascii_only = 'ascii' in args.restrict_file_names
        no_control = 'nocontrol' not in args.restrict_file_names

        if 'lower' in args.restrict_file_names:
            case = 'lower'
        elif 'upper' in args.restrict_file_names:
            case = 'upper'
        else:
            case = None

        path_namer = self._factory.new(
            'PathNamer',
            args.directory_prefix,
            index=args.default_page,
            use_dir=use_dir,
            cut=args.cut_dirs,
            protocol=args.protocol_directories,
            hostname=args.host_directories,
            os_type=os_type,
            ascii_only=ascii_only,
            no_control=no_control,
            case=case,
            max_filename_length=args.max_filename_length,
        )

        if args.recursive or args.page_requisites or args.continue_download:
            if args.clobber_method == 'disable':
                file_class = OverwriteFileWriter
            else:
                file_class = IgnoreFileWriter
        elif args.timestamping:
            file_class = TimestampingFileWriter
        else:
            file_class = AntiClobberFileWriter

        return file_class(
            path_namer,
            file_continuing=args.continue_download,
            headers_included=args.save_headers,
            local_timestamping=args.use_server_timestamps
        )

    def _get_post_data(self):
        '''Return the post data.'''
        if self._args.post_data:
            return self._args.post_data
        elif self._args.post_file:
            return self._args.post_file.read()

    def _build_request_factory(self):
        '''Create the request factory.

        A request factory is any callable object that returns a
        :class:`.http.Request`. The callable must accept the same arguments
        as Request.

        Returns:
            A callable object
        '''
        def request_factory(*args, **kwargs):
            request = self._factory.class_map['Request'].new(*args, **kwargs)

            user_agent = self._args.user_agent or self.default_user_agent
            request.fields['User-Agent'] = user_agent

            if self._args.referer:
                request.fields['Referer'] = self._args.referer

            for header_string in self._args.header:
                request.fields.parse(header_string)

            if self._args.http_compression:
                request.fields['Accept-Encoding'] = 'gzip, deflate'

            return request

        return request_factory

    def _build_http_client(self):
        '''Create the HTTP client.

        Returns:
            Client: An instance of :class:`.http.Client`.
        '''
        args = self._args
        dns_timeout = args.dns_timeout
        connect_timeout = args.connect_timeout
        read_timeout = args.read_timeout

        if args.timeout:
            dns_timeout = connect_timeout = read_timeout = args.timeout

        if args.inet_family == 'IPv4':
            families = [Resolver.IPv4]
        elif args.inet_family == 'IPv6':
            families = [Resolver.IPv6]
        elif args.prefer_family == 'IPv6':
            families = [Resolver.IPv6, Resolver.IPv4]
        else:
            families = [Resolver.IPv4, Resolver.IPv6]

        resolver = self._factory.new(
            'Resolver',
            families=families,
            timeout=dns_timeout,
            rotate=args.rotate_dns,
            cache_enabled=args.dns_cache,
        )

        if self._args.bind_address:
            bind_address = (self._args.bind_address, 0)
        else:
            bind_address = None

        def connection_factory(*args, **kwargs):
            return self._factory.new(
                'Connection',
                *args,
                resolver=resolver,
                params=ConnectionParams(
                    connect_timeout=connect_timeout,
                    read_timeout=read_timeout,
                    keep_alive=(
                        self._args.http_keep_alive
                        and not self._args.ignore_length
                    ),
                    ssl_options=self._build_ssl_options(),
                    ignore_length=self._args.ignore_length,
                    bind_address=bind_address,
                ),
                **kwargs)

        def host_connection_pool_factory(*args, **kwargs):
            return self._factory.new(
                'HostConnectionPool',
                *args,
                connection_factory=connection_factory,
                **kwargs)

        connection_pool = self._factory.new(
            'ConnectionPool',
            host_connection_pool_factory=host_connection_pool_factory)

        recorder = self._build_recorder()

        return self._factory.new(
            'Client', connection_pool=connection_pool, recorder=recorder)

    def _build_rich_http_client(self):
        '''Build Rich Client.'''
        cookie_jar = self._build_cookie_jar()
        http_client = self._build_http_client()

        if self._args.robots:
            robots_txt_pool = self._factory.new('RobotsTxtPool')
        else:
            robots_txt_pool = None

        redirect_factory = functools.partial(
            self._factory.class_map['RedirectTracker'],
            max_redirects=self._args.max_redirect
        )

        return self._factory.new(
            'RichClient',
            http_client,
            robots_txt_pool=robots_txt_pool,
            redirect_tracker_factory=redirect_factory,
            cookie_jar=cookie_jar,
            request_factory=self._build_request_factory(),
        )

    def _build_cookie_jar(self):
        '''Build the cookie jar.'''
        if not self._args.cookies:
            return

        if self._args.load_cookies or self._args.save_cookies:
            self._factory.set('CookieJar', RelaxedMozillaCookieJar)

            cookie_jar = self._factory.new('CookieJar')

            if self._args.load_cookies:
                cookie_jar.load(self._args.load_cookies, ignore_discard=True)
        else:
            cookie_jar = self._factory.new('CookieJar')

        policy = self._factory.new('CookiePolicy', cookie_jar=cookie_jar)
        cookie_jar.set_policy(policy)

        _logger.debug('Loaded cookies: {0}'.format(list(cookie_jar)))

        cookie_jar_wrapper = self._factory.new(
            'CookieJarWrapper',
            cookie_jar,
            save_filename=self._args.save_cookies,
            keep_session_cookies=True,
        )

        return cookie_jar_wrapper

    def _build_document_converter(self):
        '''Build the Document Converter.'''
        if not self._args.convert_links:
            return

        converter = self._factory.new(
            'BatchDocumentConverter',
            self._factory['URLTable'],
            backup=self._args.backup_converted
        )

        return converter

    def _build_phantomjs_controller(self):
        '''Build proxy server and PhantomJS client and controller.'''
        if not self._args.phantomjs:
            return

        proxy_server = self._factory.new(
            'HTTPProxyServer',
            self.factory['Client']
        )
        proxy_socket, proxy_port = tornado.testing.bind_unused_port()
        proxy_server.add_socket(proxy_socket)

        page_settings = {}
        default_headers = NameValueRecord()

        for header_string in self._args.header:
            default_headers.parse(header_string)

        # Since we can only pass a one-to-one mapping to PhantomJS,
        # we put these last since NameValueRecord.items() will use only the
        # first value added for each key.
        default_headers.add('Accept-Language', '*')

        if not self._args.http_compression:
            default_headers.add('Accept-Encoding', 'identity')

        default_headers = dict(default_headers.items())

        if self._args.read_timeout:
            page_settings['resourceTimeout'] = self._args.read_timeout * 1000

        page_settings['userAgent'] = self._args.user_agent \
            or self.default_user_agent

        phantomjs_client = self._factory.new(
            'PhantomJSClient',
            'localhost:{0}'.format(proxy_port),
            page_settings=page_settings,
            default_headers=default_headers,
        )
        phantomjs_client.test_client_exe()

        phantomjs_controller = self._factory.new(
            'PhantomJSController',
            phantomjs_client,
            wait_time=self._args.phantomjs_wait,
            num_scrolls=self._args.phantomjs_scroll,
            warc_recorder=self.factory.get('WARCRecorder'),
            smart_scroll=self._args.phantomjs_smart_scroll,
            snapshot=self._args.phantomjs_snapshot,
        )

        return phantomjs_controller

    def _build_ssl_options(self):
        '''Create the SSL options.

        The options must be accepted by the `ssl` module.

        Returns:
            dict
        '''
        ssl_options = {}

        if self._args.check_certificate:
            ssl_options['cert_reqs'] = ssl.CERT_REQUIRED
            ssl_options['ca_certs'] = self._load_ca_certs()
        else:
            ssl_options['cert_reqs'] = ssl.CERT_NONE

        ssl_options['ssl_version'] = self._args.secure_protocol

        if self._args.certificate:
            ssl_options['certfile'] = self._args.certificate
            ssl_options['keyfile'] = self._args.private_key

        if self._args.edg_file:
            ssl.RAND_egd(self._args.edg_file)

        if self._args.random_file:
            with open(self._args.random_file, 'rb') as in_file:
                # Use 16KB because Wget
                ssl.RAND_add(in_file.read(15360), 0.0)

        return ssl_options

    def _load_ca_certs(self):
        '''Load the Certificate Authority certificates.

        Returns:
            A filename to the bundled CA certs.
        '''
        if self._ca_certs_file:
            return self._ca_certs_file

        certs = set()

        if self._args.use_internal_ca_certs:
            pem_filename = os.path.join(
                os.path.dirname(__file__), 'cert', 'ca-bundle.pem'
            )
            certs.update(self._read_pem_file(pem_filename, from_package=True))

        if self._args.ca_directory:
            for filename in os.listdir(self._args.ca_directory):
                if os.path.isfile(filename):
                    certs.update(self._read_pem_file(filename))

        if self._args.ca_certificate:
            certs.update(self._read_pem_file(self._args.ca_certificate))

        self._ca_certs_file = certs_filename = tempfile.mkstemp()[1]

        def clean_certs_file():
            os.remove(certs_filename)

        atexit.register(clean_certs_file)

        with open(certs_filename, 'w+b') as certs_file:
            for cert in certs:
                certs_file.write(cert)

        _logger.debug('CA certs loaded.')

        return certs_filename

    def _read_pem_file(self, filename, from_package=False):
        '''Read the PEM file.

        Returns:
            iterable: An iterable of certificates. The certificate data is
            :class:`byte`.
        '''
        _logger.debug('Reading PEM {0}.'.format(filename))

        if from_package:
            return wpull.util.filter_pem(
                wpull.util.get_package_data(filename))

        with open(filename, 'rb') as in_file:
            return wpull.util.filter_pem(in_file.read())

    def _warn_silly_options(self):
        '''Print warnings about any options that may be silly.'''
        if 'page-requisites' in self._args.span_hosts_allow \
                and not self._args.page_requisites:
            _logger.warning(
                _('Spanning hosts is allowed for page requisites, '
                  'but the page requisites option is not on.')
            )

        if 'linked-pages' in self._args.span_hosts_allow \
                and not self._args.recursive:
            _logger.warning(
                _('Spanning hosts is allowed for linked pages, '
                  'but the recursive option is not on.')
            )

    def _warn_unsafe_options(self):
        '''Print warnings about any enabled hazardous options.

        This function will print messages complaining about:

        * ``--save-headers``
        * ``--no-iri``
        * ``--output-document``
        '''
        enabled_options = []

        for option_name in self.UNSAFE_OPTIONS:
            if getattr(self._args, option_name):
                enabled_options.append(option_name)

        if enabled_options:
            _logger.warning(
                _('The following unsafe options are enabled: {list}.')
                .format(list=enabled_options)
            )
            _logger.warning(
                _('The use of unsafe options may lead to unexpected behavior '
                  'or file corruption.'))
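# A minimal usage sketch for the builder. Assumptions not shown in this
# module: ``args`` is the argparse namespace produced by wpull's command-line
# option parser (named ``AppArgumentParser`` here for illustration; the
# actual parser class may differ):
#
#     args = AppArgumentParser().parse_args()
#     builder = Builder(args)
#     exit_status = builder.build_and_run()
#     sys.exit(exit_status)
#
# ``build()`` alone returns the Engine without running the Tornado IOLoop,
# which is useful when the caller manages the IOLoop itself.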
class Builder(object): '''Application builder. Args: args: Options from :class:`argparse.ArgumentParser` ''' UNSAFE_OPTIONS = frozenset(['save_headers']) def __init__(self, args): self.default_user_agent = 'Mozilla/5.0 (compatible) Wpull/{0}'.format( wpull.version.__version__) self._args = args self._factory = Factory({ 'BatchDocumentConverter': BatchDocumentConverter, 'Client': Client, 'CookieJar': CookieJar, 'CookieJarWrapper': CookieJarWrapper, 'Connection': Connection, 'ConnectionPool': ConnectionPool, 'CSSScraper': CSSScraper, 'DemuxDocumentScraper': DemuxDocumentScraper, 'DemuxRecorder': DemuxRecorder, 'DemuxURLFilter': DemuxURLFilter, 'Engine': Engine, 'HostConnectionPool': HostConnectionPool, 'HTTPProxyServer': HTTPProxyServer, 'HTMLScraper': HTMLScraper, 'PathNamer': PathNamer, 'PhantomJSClient': PhantomJSClient, 'PhantomJSController': PhantomJSController, 'PrintServerResponseRecorder': PrintServerResponseRecorder, 'ProgressRecorder': ProgressRecorder, 'RedirectTracker': RedirectTracker, 'Request': Request, 'Resolver': Resolver, 'RichClient': RichClient, 'RobotsTxtPool': RobotsTxtPool, 'Statistics': Statistics, 'URLInfo': URLInfo, 'URLTable': URLTable, 'Waiter': LinearWaiter, 'WARCRecorder': WARCRecorder, 'WebProcessor': WebProcessor, }) self._url_infos = tuple(self._build_input_urls()) self._ca_certs_file = None self._file_log_handler = None self._console_log_handler = None @property def factory(self): '''Return the Factory. Returns: Factory: An :class:`.factory.Factory` instance. ''' return self._factory def build(self): '''Put the application together. Returns: Engine: An instance of :class:`.engine.Engine`. ''' self._setup_logging() self._setup_console_logger() self._setup_file_logger() self._install_script_hooks() self._warn_unsafe_options() statisics = self._factory.new('Statistics') url_table = self._build_url_table() processor = self._build_processor() engine = self._factory.new('Engine', url_table, processor, statisics, concurrent=self._args.concurrent, ) self._setup_file_logger_close(engine) self._setup_console_logger_close(engine) return engine def build_and_run(self): '''Build and run the application. Returns: int: The exit status. ''' io_loop = tornado.ioloop.IOLoop.current() engine = self.build() exit_code = io_loop.run_sync(engine) return exit_code def _new_encoded_stream(self, stream): '''Return a stream writer.''' if self._args.ascii_print: return ASCIIStreamWriter(stream) else: return stream def _setup_logging(self): '''Set up the root logger if needed. The root logger is set to DEBUG level so the file and WARC logs work correctly. ''' root_logger = logging.getLogger() root_logger.setLevel(logging.DEBUG) root_logger.debug('Wpull needs the root logger level set to DEBUG.') def _setup_console_logger(self): '''Set up the console logger. A handler and with a formatter is added to the root logger. 
''' if self._args.verbosity == logging.DEBUG: tornado.ioloop.IOLoop.current().set_blocking_log_threshold(5) stream = self._new_encoded_stream(sys.stderr) logger = logging.getLogger() self._console_log_handler = handler = logging.StreamHandler(stream) formatter = logging.Formatter('%(levelname)s %(message)s') handler.setFormatter(formatter) handler.setLevel(self._args.verbosity or logging.INFO) logger.addHandler(handler) def _setup_console_logger_close(self, engine): '''Add routine to remove log handler when the engine stops.''' def remove_handler(): logger = logging.getLogger() logger.removeHandler(self._console_log_handler) self._console_log_handler = None if self._console_log_handler: engine.stop_event.handle(remove_handler) def _setup_file_logger(self): '''Set up the file message logger. A file log handler and with a formatter is added to the root logger. ''' args = self._args if not (args.output_file or args.append_output): return logger = logging.getLogger() formatter = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s - %(message)s') if args.output_file: filename = args.output_file mode = 'w' else: filename = args.append_output mode = 'a' self._file_log_handler = handler = logging.FileHandler( filename, mode, encoding='utf-8') handler.setFormatter(formatter) logger.addHandler(handler) if args.verbosity == logging.DEBUG: handler.setLevel(logging.DEBUG) else: handler.setLevel(logging.INFO) def _setup_file_logger_close(self, engine): '''Add routine that removes the file log handler when the engine stops. ''' def remove_handler(): logger = logging.getLogger() logger.removeHandler(self._file_log_handler) self._file_log_handler = None if self._file_log_handler: engine.stop_event.handle(remove_handler) def _install_script_hooks(self): '''Set up the scripts if any.''' if self._args.python_script: self._install_python_script(self._args.python_script) elif self._args.lua_script: self._install_lua_script(self._args.lua_script) def _install_python_script(self, filename): '''Load the Python script into an environment.''' _logger.info(_('Using Python hook script {filename}.').format( filename=filename)) hook_environment = HookEnvironment() self._setup_hook_environment(hook_environment) with open(filename, 'rb') as in_file: code = compile(in_file.read(), filename, 'exec') context = {'wpull_hook': hook_environment} exec(code, context, context) def _install_lua_script(self, filename): '''Load the Lua script into an environment.''' _logger.info(_('Using Lua hook script {filename}.').format( filename=filename)) lua = wpull.hook.load_lua() hook_environment = HookEnvironment(is_lua=True) self._setup_hook_environment(hook_environment) lua_globals = lua.globals() lua_globals.wpull_hook = hook_environment with open(filename, 'rb') as in_file: lua.execute(in_file.read()) def _setup_hook_environment(self, hook_environment): '''Override the classes needed for script hooks. 
Args: hook_environment: A :class:`.hook.HookEnvironment` instance ''' self._factory.set('Engine', hook_environment.engine_factory) self._factory.set('WebProcessor', hook_environment.web_processor_factory) self._factory.set('Resolver', hook_environment.resolver_factory) def _build_input_urls(self, default_scheme='http'): '''Read the URLs provided by the user.''' if self._args.input_file: urls = wpull.util.to_str(tuple([ line.strip() for line in self._args.input_file if line.strip() ])) if not urls: raise ValueError(_('No URLs found in input file.')) url_string_iter = itertools.chain( urls, self._args.input_file) else: url_string_iter = self._args.urls for url_string in url_string_iter: url_info = self._factory.class_map['URLInfo'].parse( url_string, default_scheme=default_scheme) _logger.debug('Parsed URL {0}'.format(url_info)) yield url_info def _build_url_filters(self): '''Create the URL filter instances. Returns: A list of URL filter instances ''' args = self._args filters = [ HTTPFilter(), BackwardDomainFilter(args.domains, args.exclude_domains), HostnameFilter(args.hostnames, args.exclude_hostnames), TriesFilter(args.tries), RecursiveFilter(args.recursive, args.page_requisites), LevelFilter(args.level), SpanHostsFilter( self._url_infos, enabled=not args.recursive or args.span_hosts ), RegexFilter(args.accept_regex, args.reject_regex), DirectoryFilter(args.include_directories, args.exclude_directories), BackwardFilenameFilter(args.accept, args.reject), ] if args.no_parent: filters.append(ParentFilter()) return filters def _build_document_scrapers(self): '''Create the document scrapers. Returns: A list of document scrapers ''' scrapers = [ self._factory.new( 'HTMLScraper', followed_tags=self._args.follow_tags, ignored_tags=self._args.ignore_tags, only_relative=self._args.relative, robots=self._args.robots, ), self._factory.new('CSSScraper'), ] return scrapers def _build_url_table(self): '''Create the URL table. Returns: URLTable: An instance of :class:`.database.BaseURLTable`. ''' url_table = self._factory.new('URLTable', path=self._args.database) url_table.add([url_info.url for url_info in self._url_infos]) return url_table def _build_recorder(self): '''Create the Recorder. Returns: DemuxRecorder: An instance of :class:`.recorder.DemuxRecorder`. 
''' args = self._args recorders = [] if args.warc_file: if args.no_warc_compression: warc_path = args.warc_file + '.warc' else: warc_path = args.warc_file + '.warc.gz' if args.warc_cdx: cdx_path = args.warc_file + '.cdx' else: cdx_path = None extra_fields = [ ('robots', 'on' if args.robots else 'off'), ('wpull-arguments', str(args)) ] for header_string in args.warc_header: name, value = header_string.split(':', 1) name = name.strip() value = value.strip() extra_fields.append((name, value)) recorders.append( self._factory.new('WARCRecorder', warc_path, compress=not args.no_warc_compression, extra_fields=extra_fields, temp_dir=args.warc_tempdir, log=args.warc_log, appending=args.warc_append, digests=args.warc_digests, cdx_filename=cdx_path, ) ) if args.server_response: recorders.append(self._factory.new('PrintServerResponseRecorder')) if args.verbosity in (logging.INFO, logging.DEBUG, logging.WARN, None): stream = self._new_encoded_stream(sys.stderr) bar_style = args.progress == 'bar' if not stream.isatty(): bar_style = False recorders.append(self._factory.new('ProgressRecorder', bar_style=bar_style, stream=stream)) return self._factory.new('DemuxRecorder', recorders) def _build_processor(self): '''Create the Processor Returns: Processor: An instance of :class:`.processor.BaseProcessor`. ''' args = self._args url_filter = self._factory.new('DemuxURLFilter', self._build_url_filters()) document_scraper = self._factory.new('DemuxDocumentScraper', self._build_document_scrapers()) file_writer = self._build_file_writer() post_data = self._get_post_data() converter = self._build_document_converter() rich_http_client = self._build_rich_http_client() phantomjs_controller = self._build_phantomjs_controller() waiter = self._factory.new('Waiter', wait=args.wait, random_wait=args.random_wait, max_wait=args.waitretry ) processor = self._factory.new('WebProcessor', rich_http_client, url_filter=url_filter, document_scraper=document_scraper, file_writer=file_writer, waiter=waiter, retry_connrefused=args.retry_connrefused, retry_dns_error=args.retry_dns_error, statistics=self._factory['Statistics'], post_data=post_data, converter=converter, phantomjs_controller=phantomjs_controller, ) return processor def _build_file_writer(self): '''Create the File Writer. Returns: FileWriter: An instance of :class:`.writer.BaseFileWriter`. ''' args = self._args if args.delete_after: return NullWriter() use_dir = (len(args.urls) != 1 or args.page_requisites \ or args.recursive) if args.use_directories == 'force': use_dir = True elif args.use_directories == 'no': use_dir = False path_namer = self._factory.new('PathNamer', args.directory_prefix, index=args.default_page, use_dir=use_dir, cut=args.cut_dirs, protocol=args.protocol_directories, hostname=args.host_directories, ) if args.recursive or args.page_requisites or args.continue_download: if args.clobber_method == 'disable': file_class = OverwriteFileWriter else: file_class = IgnoreFileWriter elif args.timestamping: file_class = TimestampingFileWriter else: file_class = AntiClobberFileWriter return file_class( path_namer, file_continuing=args.continue_download, headers_included=args.save_headers, local_timestamping=args.use_server_timestamps ) def _get_post_data(self): '''Return the post data.''' if self._args.post_data: return self._args.post_data elif self._args.post_file: return self._args.post_file.read() def _build_request_factory(self): '''Create the request factory. A request factory is any callable object that returns a :class:`.http.Request`. 
The callable must accept the same arguments to Request. Returns: A callable object ''' def request_factory(*args, **kwargs): request = self._factory.class_map['Request'].new(*args, **kwargs) user_agent = self._args.user_agent or self.default_user_agent request.fields['User-Agent'] = user_agent if self._args.referer: request.fields['Referer'] = self._args.referer for header_string in self._args.header: request.fields.parse(header_string) return request return request_factory def _build_http_client(self): '''Create the HTTP client. Returns: Client: An instance of :class:`.http.Client`. ''' args = self._args dns_timeout = args.dns_timeout connect_timeout = args.connect_timeout read_timeout = args.read_timeout if args.timeout: dns_timeout = connect_timeout = read_timeout = args.timeout if args.inet_family == 'IPv4': families = [Resolver.IPv4] elif args.inet_family == 'IPv6': families = [Resolver.IPv6] elif args.prefer_family == 'IPv6': families = [Resolver.IPv6, Resolver.IPv4] else: families = [Resolver.IPv4, Resolver.IPv6] resolver = self._factory.new('Resolver', families=families, timeout=dns_timeout, rotate=args.rotate_dns, cache_enabled=args.dns_cache, ) def connection_factory(*args, **kwargs): return self._factory.new('Connection', *args, resolver=resolver, connect_timeout=connect_timeout, read_timeout=read_timeout, keep_alive=self._args.http_keep_alive, ssl_options=self._build_ssl_options(), **kwargs) def host_connection_pool_factory(*args, **kwargs): return self._factory.new('HostConnectionPool', *args, connection_factory=connection_factory, **kwargs) connection_pool = self._factory.new('ConnectionPool', host_connection_pool_factory=host_connection_pool_factory) recorder = self._build_recorder() return self._factory.new('Client', connection_pool=connection_pool, recorder=recorder) def _build_rich_http_client(self): '''Build Rich Client.''' cookie_jar = self._build_cookie_jar() http_client = self._build_http_client() if self._args.robots: robots_txt_pool = self._factory.new('RobotsTxtPool') else: robots_txt_pool = None redirect_factory = functools.partial( self._factory.class_map['RedirectTracker'], max_redirects=self._args.max_redirect ) return self._factory.new( 'RichClient', http_client, robots_txt_pool=robots_txt_pool, redirect_tracker_factory=redirect_factory, cookie_jar=cookie_jar, request_factory=self._build_request_factory(), ) def _build_cookie_jar(self): '''Build the cookie jar''' if not self._args.cookies: return if self._args.load_cookies or self._args.save_cookies: self._factory.set('CookieJar', MozillaCookieJar) cookie_jar = self._factory.new('CookieJar') if self._args.load_cookies: cookie_jar.load(self._args.load_cookies, ignore_discard=True) else: cookie_jar = self._factory.new('CookieJar') _logger.debug('Loaded cookies: {0}'.format(list(cookie_jar))) cookie_jar_wrapper = self._factory.new( 'CookieJarWrapper', cookie_jar, save_filename=self._args.save_cookies, keep_session_cookies=True, ) return cookie_jar_wrapper def _build_document_converter(self): '''Build the Document Converter.''' if not self._args.convert_links: return converter = self._factory.new( 'BatchDocumentConverter', self._factory['PathNamer'], self._factory['URLTable'], backup=self._args.backup_converted ) return converter def _build_phantomjs_controller(self): '''Build proxy server and PhantomJS client and controller.''' if not self._args.phantomjs: return proxy_server = self._factory.new( 'HTTPProxyServer', self.factory['Client'] ) proxy_socket, proxy_port = tornado.testing.bind_unused_port() 
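# bind_unused_port() asks the OS for a free port and returns the bound
# listening socket together with its port number.  The proxy server is
# attached to that socket below and the PhantomJS client is pointed at
# localhost:<port>, so PhantomJS traffic is funnelled through wpull's own
# HTTP client and therefore through the recorders.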
proxy_server.add_socket(proxy_socket) page_settings = {} default_headers = {'Accept-Encoding': 'identity'} if self._args.read_timeout: page_settings['resourceTimeout'] = self._args.read_timeout * 1000 page_settings['userAgent'] = self._args.user_agent \ or self.default_user_agent phantomjs_client = self._factory.new( 'PhantomJSClient', 'localhost:{0}'.format(proxy_port), page_settings=page_settings, default_headers=default_headers, ) phantomjs_client.test_client_exe() phantomjs_controller = self._factory.new( 'PhantomJSController', phantomjs_client, wait_time=self._args.phantomjs_wait, num_scrolls=self._args.phantomjs_scroll, warc_recorder=self.factory.get('WARCRecorder'), ) return phantomjs_controller def _build_ssl_options(self): '''Create the SSL options. The options must be accepted by the `ssl` module. Returns: dict ''' ssl_options = {} if self._args.check_certificate: ssl_options['cert_reqs'] = ssl.CERT_REQUIRED ssl_options['ca_certs'] = self._load_ca_certs() else: ssl_options['cert_reqs'] = ssl.CERT_NONE ssl_options['ssl_version'] = self._args.secure_protocol if self._args.certificate: ssl_options['certfile'] = self._args.certificate ssl_options['keyfile'] = self._args.private_key if self._args.edg_file: ssl.RAND_egd(self._args.edg_file) if self._args.random_file: with open(self._args.random_file, 'rb') as in_file: # Use 16KB because Wget ssl.RAND_add(in_file.read(15360), 0.0) return ssl_options def _load_ca_certs(self): '''Load the Certificate Authority certificates. Returns: The filename of the bundled CA certs. ''' if self._ca_certs_file: return self._ca_certs_file certs = set() if self._args.use_internal_ca_certs: pem_filename = os.path.join(os.path.dirname(__file__), 'cert', 'ca-bundle.pem') certs.update(self._read_pem_file(pem_filename)) if self._args.ca_directory: for filename in os.listdir(self._args.ca_directory): filename = os.path.join(self._args.ca_directory, filename) if os.path.isfile(filename): certs.update(self._read_pem_file(filename)) if self._args.ca_certificate: certs.update(self._read_pem_file(self._args.ca_certificate)) self._ca_certs_file = certs_filename = tempfile.mkstemp()[1] def clean_certs_file(): os.remove(certs_filename) atexit.register(clean_certs_file) with open(certs_filename, 'w+b') as certs_file: for cert in certs: certs_file.write(cert) _logger.debug('CA certs loaded.') return certs_filename def _read_pem_file(self, filename): '''Read the PEM file. Returns: iterable: An iterable of certificates. The certificate data is :class:`bytes`. ''' _logger.debug('Reading PEM {0}.'.format(filename)) with open(filename, 'rb') as in_file: return wpull.util.filter_pem(in_file.read()) def _warn_unsafe_options(self): '''Print warnings about any enabled hazardous options. This function will print messages complaining about: * ``--save-headers`` ''' # TODO: Add output-document once implemented enabled_options = [] for option_name in self.UNSAFE_OPTIONS: if getattr(self._args, option_name): enabled_options.append(option_name) if enabled_options: _logger.warning( _('The following unsafe options are enabled: {list}.')\ .format(list=enabled_options) ) _logger.warning( _('The use of unsafe options may lead to unexpected behavior ' 'or file corruption.'))
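# A rough, dependency-free sketch of the behaviour _read_pem_file() relies on:
# splitting a concatenated PEM bundle into individual certificates.  This only
# approximates wpull.util.filter_pem and is not the real implementation.
def _split_pem_bundle(data):
    '''Return each BEGIN/END CERTIFICATE block found in ``data`` (bytes).'''
    certs = []
    current = []
    in_cert = False
    for line in data.splitlines(True):
        if line.startswith(b'-----BEGIN CERTIFICATE-----'):
            in_cert = True
            current = [line]
        elif line.startswith(b'-----END CERTIFICATE-----') and in_cert:
            current.append(line)
            certs.append(b''.join(current))
            in_cert = False
        elif in_cert:
            current.append(line)
    return certs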
class Builder(object): '''Application builder. Args: args: Options from :class:`argparse.ArgumentParser` ''' UNSAFE_OPTIONS = frozenset(['save_headers', 'no_iri', 'output_document']) def __init__(self, args): self.default_user_agent = 'Wpull/{0} (gzip)'.format( wpull.version.__version__) self._args = args self._factory = Factory({ 'BatchDocumentConverter': BatchDocumentConverter, 'Client': Client, 'CookieJar': CookieJar, 'CookieJarWrapper': CookieJarWrapper, 'CookiePolicy': CookieLimitsPolicy, 'Connection': Connection, 'ConnectionPool': ConnectionPool, 'CSSScraper': CSSScraper, 'DemuxDocumentScraper': DemuxDocumentScraper, 'DemuxRecorder': DemuxRecorder, 'DemuxURLFilter': DemuxURLFilter, 'Engine': Engine, 'HostConnectionPool': HostConnectionPool, 'HTTPProxyServer': HTTPProxyServer, 'HTMLScraper': HTMLScraper, 'JavaScriptScraper': JavaScriptScraper, 'OutputDocumentRecorder': OutputDocumentRecorder, 'PathNamer': PathNamer, 'PhantomJSClient': PhantomJSClient, 'PhantomJSController': PhantomJSController, 'PrintServerResponseRecorder': PrintServerResponseRecorder, 'ProgressRecorder': ProgressRecorder, 'RedirectTracker': RedirectTracker, 'Request': Request, 'Resolver': Resolver, 'RichClient': RichClient, 'RobotsTxtPool': RobotsTxtPool, 'SitemapScraper': SitemapScraper, 'Statistics': Statistics, 'URLInfo': URLInfo, 'URLTable': URLTable, 'Waiter': LinearWaiter, 'WARCRecorder': WARCRecorder, 'WebProcessor': WebProcessor, 'WebProcessorFetchParams': WebProcessorFetchParams, 'WebProcessorInstances': WebProcessorInstances, }) self._url_infos = tuple(self._build_input_urls()) self._ca_certs_file = None self._file_log_handler = None self._console_log_handler = None @property def factory(self): '''Return the Factory. Returns: Factory: An :class:`.factory.Factory` instance. ''' return self._factory def build(self): '''Put the application together. Returns: Engine: An instance of :class:`.engine.Engine`. ''' self._setup_logging() self._setup_console_logger() self._setup_file_logger() self._install_script_hooks() self._warn_unsafe_options() self._warn_silly_options() statistics = self._factory.new('Statistics') statistics.quota = self._args.quota statistics.required_url_infos.update(self._url_infos) url_table = self._build_url_table() processor = self._build_processor() engine = self._factory.new( 'Engine', url_table, processor, statistics, concurrent=self._args.concurrent, ) self._setup_file_logger_close(engine) self._setup_console_logger_close(engine) return engine def build_and_run(self): '''Build and run the application. Returns: int: The exit status. ''' io_loop = tornado.ioloop.IOLoop.current() engine = self.build() exit_code = io_loop.run_sync(engine) return exit_code def _new_encoded_stream(self, stream): '''Return a stream writer.''' if self._args.ascii_print: return ASCIIStreamWriter(stream) else: return stream def _setup_logging(self): '''Set up the root logger if needed. The root logger is set the appropriate level so the file and WARC logs work correctly. 
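The adjustment only ever lowers the level (that is, increases verbosity); schematically::

    if root_logger.getEffectiveLevel() > min_level:
        root_logger.setLevel(min_level)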
''' assert (logging.CRITICAL > logging.ERROR > logging.WARNING > logging.INFO > logging.DEBUG > logging.NOTSET) assert self._args.verbosity root_logger = logging.getLogger() current_level = root_logger.getEffectiveLevel() min_level = logging.ERROR if self._args.verbosity == logging.WARNING: min_level = logging.WARNING if self._args.verbosity == logging.INFO \ or self._args.warc_file \ or self._args.output_file or self._args.append_output: min_level = logging.INFO if self._args.verbosity == logging.DEBUG: min_level = logging.DEBUG if current_level > min_level: root_logger.setLevel(min_level) root_logger.debug( 'Wpull needs the root logger level set to {0}.'\ .format(min_level) ) def _setup_console_logger(self): '''Set up the console logger. A handler with a formatter is added to the root logger. ''' stream = self._new_encoded_stream(sys.stderr) logger = logging.getLogger() self._console_log_handler = handler = logging.StreamHandler(stream) formatter = logging.Formatter('%(levelname)s %(message)s') handler.setFormatter(formatter) handler.setLevel(self._args.verbosity or logging.INFO) logger.addHandler(handler) def _setup_console_logger_close(self, engine): '''Add routine to remove log handler when the engine stops.''' def remove_handler(): logger = logging.getLogger() logger.removeHandler(self._console_log_handler) self._console_log_handler = None if self._console_log_handler: engine.stop_event.handle(remove_handler) def _setup_file_logger(self): '''Set up the file message logger. A file log handler with a formatter is added to the root logger. ''' args = self._args if not (args.output_file or args.append_output): return logger = logging.getLogger() formatter = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s - %(message)s') if args.output_file: filename = args.output_file mode = 'w' else: filename = args.append_output mode = 'a' self._file_log_handler = handler = logging.FileHandler( filename, mode, encoding='utf-8') handler.setFormatter(formatter) logger.addHandler(handler) if args.verbosity == logging.DEBUG: handler.setLevel(logging.DEBUG) else: handler.setLevel(logging.INFO) def _setup_file_logger_close(self, engine): '''Add routine that removes the file log handler when the engine stops.
''' def remove_handler(): logger = logging.getLogger() logger.removeHandler(self._file_log_handler) self._file_log_handler = None if self._file_log_handler: engine.stop_event.handle(remove_handler) def _install_script_hooks(self): '''Set up the scripts if any.''' if self._args.python_script: self._install_python_script(self._args.python_script) elif self._args.lua_script: self._install_lua_script(self._args.lua_script) def _install_python_script(self, filename): '''Load the Python script into an environment.''' _logger.info( _('Using Python hook script {filename}.').format( filename=filename)) hook_environment = HookEnvironment(self._factory) self._setup_hook_environment(hook_environment) with open(filename, 'rb') as in_file: code = compile(in_file.read(), filename, 'exec') context = {'wpull_hook': hook_environment} exec(code, context, context) def _install_lua_script(self, filename): '''Load the Lua script into an environment.''' _logger.info( _('Using Lua hook script {filename}.').format(filename=filename)) lua = wpull.hook.load_lua() hook_environment = HookEnvironment(self._factory, is_lua=True) self._setup_hook_environment(hook_environment) lua_globals = lua.globals() lua_globals.wpull_hook = hook_environment with open(filename, 'rb') as in_file: lua.execute(in_file.read()) def _setup_hook_environment(self, hook_environment): '''Override the classes needed for script hooks. Args: hook_environment: A :class:`.hook.HookEnvironment` instance ''' self._factory.set('Engine', hook_environment.engine_factory) self._factory.set('WebProcessor', hook_environment.web_processor_factory) self._factory.set('Resolver', hook_environment.resolver_factory) def _build_input_urls(self, default_scheme='http'): '''Read the URLs provided by the user.''' url_string_iter = self._args.urls or () if self._args.input_file: if self._args.force_html: urls = self._read_input_file_as_html() else: urls = self._read_input_file_as_lines() url_string_iter = itertools.chain(url_string_iter, urls) sitemap_url_infos = set() base_url = self._args.base for url_string in url_string_iter: _logger.debug('Parsing URL {0}'.format(url_string)) if base_url: url_string = wpull.url.urljoin(base_url, url_string) url_info = self._factory.class_map['URLInfo'].parse( url_string, default_scheme=default_scheme) _logger.debug('Parsed URL {0}'.format(url_info)) yield url_info if self._args.sitemaps: sitemap_url_infos.update( (URLInfo.parse('{0}://{1}/robots.txt'.format( url_info.scheme, url_info.hostname_with_port)), URLInfo.parse('{0}://{1}/sitemap.xml'.format( url_info.scheme, url_info.hostname_with_port)))) for url_info in sitemap_url_infos: yield url_info def _read_input_file_as_lines(self): '''Read lines from input file and return them.''' input_file = codecs.getreader(self._args.local_encoding or 'utf-8')(self._args.input_file) urls = [line.strip() for line in input_file if line.strip()] if not urls: raise ValueError(_('No URLs found in input file.')) return urls def _read_input_file_as_html(self): '''Read input file as HTML and return the links.''' scrape_info = HTMLScraper.scrape_file( self._args.input_file, encoding=self._args.local_encoding or 'utf-8') links = itertools.chain(scrape_info['inline_urls'], scrape_info['linked_urls']) return links def _build_url_filters(self): '''Create the URL filter instances. 
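The filters returned here are combined by the demux URL filter, which accepts a URL only when every filter in the list accepts it; schematically (simplified, not the real classes)::

    def test_url(url_info, url_record, url_filters):
        return all(f.test(url_info, url_record) for f in url_filters)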
Returns: A list of URL filter instances ''' args = self._args filters = [ HTTPSOnlyFilter() if args.https_only else HTTPFilter(), RecursiveFilter(enabled=args.recursive, page_requisites=args.page_requisites), SpanHostsFilter( self._url_infos, enabled=args.span_hosts, page_requisites='page-requisites' in args.span_hosts_allow, linked_pages='linked-pages' in args.span_hosts_allow, ), ] if args.no_parent: filters.append(ParentFilter()) if args.domains or args.exclude_domains: filters.append( BackwardDomainFilter(args.domains, args.exclude_domains)) if args.hostnames or args.exclude_hostnames: filters.append( HostnameFilter(args.hostnames, args.exclude_hostnames)) if args.tries: filters.append(TriesFilter(args.tries)) if args.level and args.recursive: filters.append(LevelFilter(args.level)) if args.accept_regex or args.reject_regex: filters.append(RegexFilter(args.accept_regex, args.reject_regex)) if args.include_directories or args.exclude_directories: filters.append( DirectoryFilter(args.include_directories, args.exclude_directories)) if args.accept or args.reject: filters.append(BackwardFilenameFilter(args.accept, args.reject)) return filters def _build_document_scrapers(self): '''Create the document scrapers. Returns: A list of document scrapers ''' scrapers = [ self._factory.new( 'HTMLScraper', followed_tags=self._args.follow_tags, ignored_tags=self._args.ignore_tags, only_relative=self._args.relative, robots=self._args.robots, encoding_override=self._args.remote_encoding, ), self._factory.new( 'CSSScraper', encoding_override=self._args.remote_encoding, ), self._factory.new( 'JavaScriptScraper', encoding_override=self._args.remote_encoding, ), ] if self._args.sitemaps: scrapers.append( self._factory.new( 'SitemapScraper', encoding_override=self._args.remote_encoding, )) return scrapers def _build_url_table(self): '''Create the URL table. Returns: URLTable: An instance of :class:`.database.BaseURLTable`. ''' url_table = self._factory.new('URLTable', path=self._args.database) url_table.add([url_info.url for url_info in self._url_infos]) return url_table def _build_recorder(self): '''Create the Recorder. Returns: DemuxRecorder: An instance of :class:`.recorder.DemuxRecorder`. 
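Roughly one recorder is added per enabled feature; an illustrative (not exhaustive) summary::

    recorder_for_option = {
        '--warc-file': 'WARCRecorder',
        '--server-response': 'PrintServerResponseRecorder',
        '--output-document': 'OutputDocumentRecorder',
        'interactive verbosity': 'ProgressRecorder',
    }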
''' args = self._args recorders = [] if args.warc_file: extra_fields = [('robots', 'on' if args.robots else 'off'), ('wpull-arguments', str(args))] for header_string in args.warc_header: name, value = header_string.split(':', 1) name = name.strip() value = value.strip() extra_fields.append((name, value)) software_string = WARCRecorder.DEFAULT_SOFTWARE_STRING if args.phantomjs: software_string += ' PhantomJS/{0}'.format( wpull.phantomjs.get_version()) recorders.append( self._factory.new( 'WARCRecorder', args.warc_file, params=WARCRecorderParams( compress=not args.no_warc_compression, extra_fields=extra_fields, temp_dir=args.warc_tempdir, log=args.warc_log, appending=args.warc_append, digests=args.warc_digests, cdx=args.warc_cdx, max_size=args.warc_max_size, url_table=self._factory['URLTable'] if args.warc_dedup else None, software_string=software_string, ), )) if args.server_response: recorders.append(self._factory.new('PrintServerResponseRecorder')) assert args.verbosity if args.verbosity in (logging.INFO, logging.DEBUG, logging.WARNING): stream = self._new_encoded_stream(sys.stderr) bar_style = args.progress == 'bar' if not stream.isatty(): bar_style = False recorders.append( self._factory.new('ProgressRecorder', bar_style=bar_style, stream=stream)) if args.warc_dedup: self._populate_visits() if args.output_document: recorders.append( self._factory.new( 'OutputDocumentRecorder', args.output_document, with_headers=args.save_headers, )) return self._factory.new('DemuxRecorder', recorders) def _populate_visits(self): '''Populate the visits from the CDX into the URL table.''' iterable = wpull.warc.read_cdx(self._args.warc_dedup, encoding=self._args.local_encoding or 'utf-8') missing_url_msg = _('The URL ("a") is missing from the CDX file.') missing_id_msg = _('The record ID ("u") is missing from the CDX file.') missing_checksum_msg = \ _('The SHA1 checksum ("k") is missing from the CDX file.') nonlocal_var = {'counter': 0} def visits(): checked_fields = False for record in iterable: if not checked_fields: if 'a' not in record: raise ValueError(missing_url_msg) if 'u' not in record: raise ValueError(missing_id_msg) if 'k' not in record: raise ValueError(missing_checksum_msg) checked_fields = True yield record['a'], record['u'], record['k'] nonlocal_var['counter'] += 1 url_table = self.factory['URLTable'] url_table.add_visits(visits()) _logger.info( gettext.ngettext( 'Loaded {num} record from CDX file.', 'Loaded {num} records from CDX file.', nonlocal_var['counter']).format(num=nonlocal_var['counter'])) def _build_processor(self): '''Create the Processor Returns: Processor: An instance of :class:`.processor.BaseProcessor`. 
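The processor's collaborators are grouped into two bundles built below; they behave like plain value objects, for example (hypothetical, trimmed field lists)::

    from collections import namedtuple

    ExampleFetchParams = namedtuple('ExampleFetchParams', ['retry_connrefused', 'post_data'])
    ExampleInstances = namedtuple('ExampleInstances', ['url_filter', 'file_writer', 'waiter'])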
''' args = self._args url_filter = self._factory.new('DemuxURLFilter', self._build_url_filters()) document_scraper = self._factory.new('DemuxDocumentScraper', self._build_document_scrapers()) file_writer = self._build_file_writer() post_data = self._get_post_data() converter = self._build_document_converter() rich_http_client = self._build_rich_http_client() phantomjs_controller = self._build_phantomjs_controller() waiter = self._factory.new('Waiter', wait=args.wait, random_wait=args.random_wait, max_wait=args.waitretry) web_processor_instances = self._factory.new( 'WebProcessorInstances', url_filter=url_filter, document_scraper=document_scraper, file_writer=file_writer, waiter=waiter, statistics=self._factory['Statistics'], converter=converter, phantomjs_controller=phantomjs_controller, ) web_processor_fetch_params = self._factory.new( 'WebProcessorFetchParams', retry_connrefused=args.retry_connrefused, retry_dns_error=args.retry_dns_error, post_data=post_data, strong_robots=args.strong_robots, strong_redirects=args.strong_redirects, content_on_error=args.content_on_error, ) processor = self._factory.new('WebProcessor', rich_http_client, args.directory_prefix, web_processor_fetch_params, web_processor_instances) return processor def _build_file_writer(self): '''Create the File Writer. Returns: FileWriter: An instance of :class:`.writer.BaseFileWriter`. ''' args = self._args if args.delete_after or args.output_document: return NullWriter() use_dir = (len(args.urls) != 1 or args.page_requisites \ or args.recursive) if args.use_directories == 'force': use_dir = True elif args.use_directories == 'no': use_dir = False os_type = 'windows' if 'windows' in args.restrict_file_names \ else 'unix' ascii_only = 'ascii' in args.restrict_file_names no_control = 'nocontrol' not in args.restrict_file_names if 'lower' in args.restrict_file_names: case = 'lower' elif 'upper' in args.restrict_file_names: case = 'upper'