def __init__(self, source, **kwargs):
    """Set up the server around a remote cdx source.

    ``source`` may be an existing ``RemoteCDXSource`` instance, which is
    used as-is, or an http(s) url string, which is wrapped in a new
    ``RemoteCDXSource`` created with ``remote_processing=True``.
    All other keyword arguments are forwarded to the parent initializer.

    Raises ``Exception`` if ``source`` is neither of the above.
    """
    super(RemoteCDXServer, self).__init__(**kwargs)

    # Already a remote source object: nothing to wrap
    if isinstance(source, RemoteCDXSource):
        self.source = source
        return

    # Anything that is not an http url string is rejected
    if not (isinstance(source, str) and is_http(source)):
        raise Exception('Invalid remote cdx source: ' + str(source))

    self.source = RemoteCDXSource(source, remote_processing=True)
def fetch_request(self, url, urlrewriter,
                  head_insert_func=None,
                  urlkey=None,
                  env=None,
                  req_headers=None,
                  timestamp=None,
                  follow_redirects=False,
                  proxies=None):
    """Fetch ``url`` live and pass the response through the rewriter.

    The url is first normalized (see inline comments), then loaded via
    ``self.fetch_http`` when it is an http url or ``self.fetch_local_file``
    otherwise. A cdx-like dict describing the fetch is built and handed,
    together with the response, to ``self.rewriter.rewrite_content``.

    :param url: url (or local file path/uri) to fetch
    :param urlrewriter: rewriter object forwarded to ``rewrite_content``
    :param head_insert_func: optional callback forwarded to
        ``rewrite_content``
    :param urlkey: explicit canonical key; computed with ``canonicalize``
        when not supplied
    :param env: optional mapping; when truthy, the cdx dict is stored
        under ``env['pywb.cdx']``
    :param req_headers: optional dict of request headers forwarded to
        ``fetch_http``; a new empty dict is created per call when omitted
    :param timestamp: explicit timestamp; defaults to the current UTC time
    :param follow_redirects: forwarded to ``fetch_http``
    :param proxies: forwarded to ``fetch_http``
    :return: the result of ``self.rewriter.rewrite_content``
    """
    # A literal ``{}`` default would be a single dict shared by every call,
    # so any mutation further down would leak headers between requests.
    if req_headers is None:
        req_headers = {}

    ts_err = url.split('///')

    # fixup for accidental erroneous rewrite which has ///
    # (unless file:///)
    if len(ts_err) > 1 and ts_err[0] != 'file:':
        url = 'http://' + ts_err[1]

    # scheme-relative url: assume http
    if url.startswith('//'):
        url = 'http:' + url

    if is_http(url):
        (status_headers, stream) = self.fetch_http(url, env, req_headers,
                                                   follow_redirects,
                                                   proxies)
    else:
        (status_headers, stream) = self.fetch_local_file(url)

    # explicit urlkey may be passed in (say for testing)
    if not urlkey:
        urlkey = canonicalize(url)

    if timestamp is None:
        timestamp = datetime_to_timestamp(datetime.datetime.utcnow())

    cdx = {
        'urlkey': urlkey,
        'timestamp': timestamp,
        'original': url,
        'statuscode': status_headers.get_statuscode(),
        'mimetype': status_headers.get_header('Content-Type'),
        'is_live': True,
    }

    result = self.rewriter.rewrite_content(
        urlrewriter,
        status_headers,
        stream,
        head_insert_func=head_insert_func,
        urlkey=urlkey,
        cdx=cdx)

    # expose the generated cdx to the caller through the environ
    if env:
        env['pywb.cdx'] = cdx

    return result
def fetch_request(self, url, urlrewriter,
                  head_insert_func=None,
                  urlkey=None,
                  env=None,
                  req_headers=None,
                  timestamp=None,
                  follow_redirects=False,
                  proxies=None):
    """Load ``url`` from the live web (or a local file) and rewrite it.

    After normalizing the url, the response is obtained from
    ``self.fetch_http`` for http urls and from ``self.fetch_local_file``
    for everything else. The response and a cdx-like dict describing it
    are then passed to ``self.rewriter.rewrite_content``.

    :param url: url (or local file path/uri) to fetch
    :param urlrewriter: rewriter object forwarded to ``rewrite_content``
    :param head_insert_func: optional callback forwarded to
        ``rewrite_content``
    :param urlkey: explicit canonical key; derived via ``canonicalize``
        when falsy
    :param env: optional mapping; when truthy it receives the cdx dict
        under the ``'pywb.cdx'`` key
    :param req_headers: optional request-header dict passed on to
        ``fetch_http``; when omitted a fresh empty dict is used
    :param timestamp: explicit timestamp; current UTC time when ``None``
    :param follow_redirects: passed on to ``fetch_http``
    :param proxies: passed on to ``fetch_http``
    :return: whatever ``self.rewriter.rewrite_content`` returns
    """
    # Never use ``{}`` as the default itself: that dict would be created
    # once and shared (and possibly mutated) across all calls.
    if req_headers is None:
        req_headers = {}

    ts_err = url.split('///')

    # fixup for accidental erroneous rewrite which has ///
    # (unless file:///)
    if len(ts_err) > 1 and ts_err[0] != 'file:':
        url = 'http://' + ts_err[1]

    # scheme-relative url: default to http
    if url.startswith('//'):
        url = 'http:' + url

    if is_http(url):
        (status_headers, stream) = self.fetch_http(url, env, req_headers,
                                                   follow_redirects,
                                                   proxies)
    else:
        (status_headers, stream) = self.fetch_local_file(url)

    # explicit urlkey may be passed in (say for testing)
    if not urlkey:
        urlkey = canonicalize(url)

    if timestamp is None:
        timestamp = datetime_to_timestamp(datetime.datetime.utcnow())

    cdx = {
        'urlkey': urlkey,
        'timestamp': timestamp,
        'original': url,
        'statuscode': status_headers.get_statuscode(),
        'mimetype': status_headers.get_header('Content-Type'),
        'is_live': True,
    }

    result = self.rewriter.rewrite_content(urlrewriter,
                                           status_headers,
                                           stream,
                                           head_insert_func=head_insert_func,
                                           urlkey=urlkey,
                                           cdx=cdx)

    # make the generated cdx available to the caller via the environ
    if env:
        env['pywb.cdx'] = cdx

    return result
def _create_cdx_source(self, filename, config):
    """Create the cdx source object matching ``filename``.

    Dispatch, in order:
      * http url                     -> ``RemoteCDXSource(filename)``
      * ``redis://`` uri             -> ``RedisCDXSource(filename, config)``
      * ``.cdx`` file                -> ``CDXFile(filename)``
      * ``.summary`` / ``.idx`` file -> ``ZipNumCluster(filename, config)``

    :param filename: url, uri or path identifying the cdx source
    :param config: configuration passed to the sources that accept one
    :return: the created source, or ``None`` (after logging a warning)
        when ``filename`` matches none of the above
    """
    if is_http(filename):
        return RemoteCDXSource(filename)

    if filename.startswith('redis://'):
        return RedisCDXSource(filename, config)

    if filename.endswith('.cdx'):
        return CDXFile(filename)

    if filename.endswith(('.summary', '.idx')):
        return ZipNumCluster(filename, config)

    # logging.warn() is a deprecated alias that no longer exists on
    # Python 3.13+; logging.warning() is the supported spelling.
    logging.warning('skipping unrecognized URI:%s', filename)
    return None
def _create_cdx_source(self, filename, config):
    """Create the cdx source object matching ``filename``.

    Dispatch, in order:
      * http url                     -> ``RemoteCDXSource(filename)``
      * ``redis://`` uri             -> ``RedisCDXSource(filename, config)``
      * ``.cdx`` / ``.cdxj`` file    -> ``CDXFile(filename)``
      * ``.summary`` / ``.idx`` file -> ``ZipNumCluster(filename, config)``

    :param filename: url, uri or path identifying the cdx source
    :param config: configuration passed to the sources that accept one
    :return: the created source, or ``None`` (after logging a warning)
        when ``filename`` matches none of the above
    """
    if is_http(filename):
        return RemoteCDXSource(filename)

    if filename.startswith('redis://'):
        return RedisCDXSource(filename, config)

    if filename.endswith(('.cdx', '.cdxj')):
        return CDXFile(filename)

    if filename.endswith(('.summary', '.idx')):
        return ZipNumCluster(filename, config)

    # logging.warn() is a deprecated alias that no longer exists on
    # Python 3.13+; logging.warning() is the supported spelling.
    logging.warning('skipping unrecognized URI:%s', filename)
    return None
def _add_dir_if_exists(self, coll, root_dir, dir_key, required=False):
    """Resolve the directory stored under ``dir_key`` in ``coll``.

    If ``coll`` already holds a truthy value for ``dir_key``, that value
    is normalized against ``root_dir`` (unless it is an http url) and
    ``False`` is returned.

    Otherwise the default name from ``self.config['paths'][dir_key]`` is
    joined onto ``root_dir``. When that directory exists, its absolute
    path (with a trailing separator) is stored in ``coll`` and ``True``
    is returned. When it does not exist, ``False`` is returned, unless
    ``required`` is set, in which case an ``Exception`` is raised.
    """
    configured = coll.get(dir_key)
    if configured:
        # explicit setting: normalize local paths, leave remote urls alone
        if not is_http(configured):
            normalized = self._norm_path(root_dir, configured)
            coll[dir_key] = normalized + os.path.sep
        return False

    # nothing configured explicitly: fall back to the default dir name
    default_name = self.config.get('paths')[dir_key]
    candidate = os.path.join(root_dir, default_name)

    if not os.path.isdir(candidate):
        if required:
            raise Exception(
                'Dir "{0}" does not exist for "{1}"'.format(candidate,
                                                            dir_key))
        return False

    coll[dir_key] = os.path.abspath(candidate) + os.path.sep
    return True
def create_cdx_server(config, ds_rules_file=None, server_cls=None):
    """Build a cdx server from ``config``.

    ``config`` is either a mapping-like object (anything with a ``get``
    attribute), from which ``index_paths`` and ``surt_ordered`` (default
    ``True``) are read, or the index path(s) themselves, in which case
    surt ordering is enabled and no config is passed on.

    Unless ``server_cls`` is given, ``RemoteCDXServer`` is used when the
    paths value is an http url string or a ``RemoteCDXSource``, and
    ``CDXServer`` otherwise.

    :return: ``server_cls(paths, config=..., surt_ordered=...,
        ds_rules_file=...)``
    """
    is_mapping = hasattr(config, 'get')

    # a bare (non-mapping) config is taken to be the index path(s)
    paths = config.get('index_paths') if is_mapping else config
    surt_ordered = config.get('surt_ordered', True) if is_mapping else True
    pass_config = config if is_mapping else None

    message = 'CDX Surt-Ordered? ' + str(surt_ordered)
    logging.debug(message)

    if not server_cls:
        # choose the implementation based on where the index lives
        remote_url = isinstance(paths, str) and is_http(paths)
        if remote_url or isinstance(paths, RemoteCDXSource):
            server_cls = RemoteCDXServer
        else:
            server_cls = CDXServer

    server_kwargs = {
        'config': pass_config,
        'surt_ordered': surt_ordered,
        'ds_rules_file': ds_rules_file,
    }
    return server_cls(paths, **server_kwargs)