def handle_exception(self, env, exc, print_trace):
    """Render an error response for *exc*.

    Uses the router's ``error_view`` when one is configured, otherwise
    falls back to a plain-text response.  *print_trace* switches between
    printing a full traceback and logging just the message.

    Fixes:
    - ``err_msg``/``err_details`` were only bound when ``exc.args`` was
      non-empty, causing an UnboundLocalError further down
    - ``isinstance`` checks instead of ``type(x) ==``
    - ``status[:1]`` avoids IndexError on an empty status string
    """
    error_view = None
    if hasattr(self.wb_router, 'error_view'):
        error_view = self.wb_router.error_view

    if hasattr(exc, 'status'):
        # status may be a callable or a plain attribute
        status = exc.status() if callable(exc.status) else exc.status

        # wsgi requires status
        # - to have at least 4 characters and
        # - to start with a number / integer
        if isinstance(status, int):
            status = '{} Exception {}'.format(status, type(exc).__name__)
        elif not (isinstance(status, str) and status[:1].isdigit()):
            status = '500 Internal Server Error'
    else:
        status = '500 Internal Server Error'

    err_url = exc.url if hasattr(exc, 'url') else None

    # bind both up-front so the branches below can never hit an
    # UnboundLocalError when exc.args is empty
    err_msg = None
    err_details = None

    if len(exc.args):
        err_msg = str(exc.args[0])

    if print_trace:
        import traceback
        err_details = traceback.format_exc()
        print(err_details)
    else:
        logging.info(err_msg)

    if error_view:
        if err_url and isinstance(err_url, str):
            err_url = to_native_str(err_url, 'utf-8')

        if err_msg and isinstance(err_msg, str):
            err_msg = to_native_str(err_msg, 'utf-8')

        return error_view.render_response(exc_type=type(exc).__name__,
                                          err_msg=err_msg,
                                          err_details=err_details,
                                          status=status,
                                          env=env,
                                          err_url=err_url)
    else:
        msg = status + ' Error: '
        if err_msg:
            msg += err_msg

        return WbResponse.text_response(msg, status=status)
def __init__(self, cdxline=b''):
    """Parse a single CDX line (bytes) into an ordered field mapping.

    Three layouts are supported:
    - empty line: leaves the record blank so fields can be filled later
    - CDX-JSON: ``urlkey timestamp {json...}`` -- the trailing JSON
      object supplies the remaining fields
    - classic space-delimited CDX: the field names come from the entry
      in ``CDX_FORMATS`` whose length matches the field count

    :raises CDXException: when no known format matches the field count
    """
    OrderedDict.__init__(self)
    cdxline = cdxline.rstrip()
    self._from_json = False
    # lazily-populated cache slot for a JSON view of this record
    self._cached_json = None

    # Allows for filling the fields later or in a custom way
    if not cdxline:
        self.cdxline = cdxline
        return

    # split off at most the first two fields; the remainder may be JSON
    fields = cdxline.split(b' ', 2)
    # Check for CDX JSON
    if fields[-1].startswith(b'{'):
        self[URLKEY] = to_native_str(fields[0], 'utf-8')
        self[TIMESTAMP] = to_native_str(fields[1], 'utf-8')
        json_fields = json_decode(to_native_str(fields[-1], 'utf-8'))
        for n, v in six.iteritems(json_fields):
            n = to_native_str(n, 'utf-8')
            # map alternate field names to their canonical names
            n = self.CDX_ALT_FIELDS.get(n, n)

            if n == 'url':
                try:
                    v.encode('ascii')
                except UnicodeEncodeError:
                    # %-encode non-ascii urls, keeping ':' and '/' intact
                    v = quote(v.encode('utf-8'), safe=':/')

            if n != 'filename':
                v = to_native_str(v, 'utf-8')

            self[n] = v

        self.cdxline = cdxline
        self._from_json = True
        return

    # classic CDX: finish splitting the remainder on single spaces
    more_fields = fields.pop().split(b' ')
    fields.extend(more_fields)

    cdxformat = None
    # choose a known format by field count (last matching format wins)
    for i in self.CDX_FORMATS:
        if len(i) == len(fields):
            cdxformat = i

    if not cdxformat:
        msg = 'unknown {0}-field cdx format'.format(len(fields))
        raise CDXException(msg)

    for header, field in zip(cdxformat, fields):
        self[header] = field.decode('utf-8')

    self.cdxline = cdxline
def __init__(self, cdxline=b''):
    """Build the ordered field mapping for one CDX record from raw bytes.

    Handles an empty line (blank record for later population), the
    CDX-JSON layout, and the classic space-delimited layout.
    """
    OrderedDict.__init__(self)
    cdxline = cdxline.rstrip()
    self._from_json = False
    self._cached_json = None

    # empty input: leave the record blank for later population
    if not cdxline:
        self.cdxline = cdxline
        return

    parts = cdxline.split(b' ', 2)

    if parts[-1].startswith(b'{'):
        # CDX-JSON layout: urlkey, timestamp, then a JSON object
        self[URLKEY] = to_native_str(parts[0], 'utf-8')
        self[TIMESTAMP] = to_native_str(parts[1], 'utf-8')

        decoded = json_decode(to_native_str(parts[-1], 'utf-8'))
        for raw_key, val in six.iteritems(decoded):
            key = to_native_str(raw_key, 'utf-8')
            # normalize alternate field names to canonical ones
            key = self.CDX_ALT_FIELDS.get(key, key)

            if key == 'url':
                try:
                    val.encode('ascii')
                except UnicodeEncodeError:
                    # %-encode non-ascii urls, preserving ':' and '/'
                    val = quote(val.encode('utf-8'), safe=':/')

            if key != 'filename':
                val = to_native_str(val, 'utf-8')

            self[key] = val

        self.cdxline = cdxline
        self._from_json = True
        return

    # classic layout: finish splitting the remainder on single spaces
    parts.extend(parts.pop().split(b' '))

    chosen = None
    # select by field count; keep scanning so the last match wins
    for candidate in self.CDX_FORMATS:
        if len(candidate) == len(parts):
            chosen = candidate

    if not chosen:
        raise CDXException(
            'unknown {0}-field cdx format'.format(len(parts)))

    for name, raw in zip(chosen, parts):
        self[name] = raw.decode('utf-8')

    self.cdxline = cdxline
def create_renew_sesh_id(self, sesh_id, force=False):
    """Reuse *sesh_id* if it is still present in the cache, otherwise
    (or when *force* is set) mint and return a fresh random id."""
    # a live session is marked by a '<id>:c' entry in the cache
    if sesh_id and ((sesh_id + ':c') in self.cache) and not force:
        return sesh_id

    new_id = base64.b32encode(os.urandom(5)).lower()
    return to_native_str(new_id)
def create_renew_sesh_id(self, sesh_id, force=False):
    """Return the existing session id when still valid; else a new one."""
    # the cache stores an "<id>:c" marker for each active session
    if sesh_id and ((sesh_id + ":c") in self.cache) and not force:
        return sesh_id

    random_bytes = os.urandom(5)
    fresh = base64.b32encode(random_bytes).lower()
    return to_native_str(fresh)
def to_uri(url):
    """
    Converts a url to an ascii %-encoded form where:
    - scheme is ascii,
    - host is punycode,
    - and remainder is %-encoded

    Not using urlsplit to also decode partially encoded scheme urls
    """
    # split into (scheme+host, first-path-separator, rest)
    parts = WbUrl.FIRST_PATH.split(url, 1)
    sep = url[len(parts[0])] if len(parts) > 1 else None

    scheme_dom = unquote_plus(parts[0])

    # py2 only: bytes result means the input needed no decoding
    if six.PY2 and isinstance(scheme_dom, six.binary_type):
        if scheme_dom == parts[0]:
            return url

        scheme_dom = scheme_dom.decode('utf-8', 'ignore')

    scheme_dom = scheme_dom.rsplit('/', 1)
    domain = scheme_dom[-1]

    try:
        # punycode-encode the host
        domain = to_native_str(domain.encode('idna'), 'utf-8')
    except UnicodeError:
        # the url is invalid and this is probably not a domain
        pass

    if len(scheme_dom) > 1:
        url = to_native_str(scheme_dom[0], 'utf-8') + '/' + domain
    else:
        url = domain

    if len(parts) > 1:
        url += sep

        rest = parts[1]
        try:
            rest.encode('ascii')
        except UnicodeEncodeError:
            # %-encode any non-ascii remainder (path/query/fragment)
            rest = quote(to_native_str(rest, 'utf-8'))

        url += rest

    return url
def handle_exception(self, env, exc, print_trace):
    """Render an error response for *exc*, using the router's
    error_view when available, else a plain-text response.

    Fixes:
    - ``exc.status`` may be a plain attribute rather than a callable;
      calling it unconditionally raised TypeError
    - ``err_msg``/``err_details`` were only bound when ``exc.args`` was
      non-empty, causing an UnboundLocalError below
    """
    error_view = None
    if hasattr(self.wb_router, 'error_view'):
        error_view = self.wb_router.error_view

    if hasattr(exc, 'status'):
        status = exc.status() if callable(exc.status) else exc.status

        # wsgi requires the status to be a string starting with a
        # numeric code, e.g. '500 Internal Server Error'
        if isinstance(status, int):
            status = '{} Exception {}'.format(status, type(exc).__name__)
        elif not (isinstance(status, str) and status[:1].isdigit()):
            status = '500 Internal Server Error'
    else:
        status = '500 Internal Server Error'

    err_url = exc.url if hasattr(exc, 'url') else None

    # always bind these so no branch below can hit an UnboundLocalError
    err_msg = None
    err_details = None

    if len(exc.args):
        err_msg = exc.args[0]

    if print_trace:
        import traceback
        err_details = traceback.format_exc()
        print(err_details)
    else:
        logging.info(err_msg)

    if error_view:
        if err_url and isinstance(err_url, str):
            err_url = to_native_str(err_url, 'utf-8')

        if err_msg and isinstance(err_msg, str):
            err_msg = to_native_str(err_msg, 'utf-8')

        return error_view.render_response(exc_type=type(exc).__name__,
                                          err_msg=err_msg,
                                          err_details=err_details,
                                          status=status,
                                          env=env,
                                          err_url=err_url)
    else:
        msg = status + ' Error: '
        if err_msg:
            msg += err_msg

        return WbResponse.text_response(msg, status=status)
def __call__(self, filename, cdx=None):
    """Yield the archive paths mapped to *filename* in the path index file."""
    target = filename.encode('utf-8')
    with open(self.pathindex_file, 'rb') as reader:
        for pathline in iter_exact(reader, target, b'\t'):
            # first tab-separated column is the key; the rest are paths
            for path in pathline.split(b'\t')[1:]:
                yield to_native_str(path, 'utf-8')
def __str__(self):
    """Render the record: the original line when kept, otherwise
    re-serialize as JSON or as space-joined field values."""
    if self.cdxline:
        return to_native_str(self.cdxline, 'utf-8')

    if self._from_json:
        return json_encode(self)

    return ' '.join(str(val) for val in six.itervalues(self))
def _extract_html_charset(buff, status_headers):
    """Return the charset declared in the HTML buffer, or None when
    no charset declaration is found."""
    match = RewriteContent.CHARSET_REGEX.search(buff)
    if not match:
        return None

    return to_native_str(match.group(1))
def read_basic_auth_coll(value):
    """Return the username from a ``Basic`` authorization header value,
    or '' when the header is not a well-formed Basic credential."""
    tokens = value.split(' ')
    if tokens[0].lower() != 'basic' or len(tokens) != 2:
        return ''

    user_pass = base64.b64decode(tokens[1].encode('utf-8'))
    # username is everything before the first ':'
    return to_native_str(user_pass.split(b':')[0])
def read_basic_auth_coll(value):
    """Decode a Basic authorization header value and return the
    username portion, or '' for malformed input."""
    pieces = value.split(" ")
    if len(pieces) != 2:
        return ""
    if pieces[0].lower() != "basic":
        return ""

    decoded = base64.b64decode(pieces[1].encode("utf-8"))
    # the credential is 'user:pass'; keep only the user part
    return to_native_str(decoded.split(b":")[0])
def __call__(self, query):
    """Build fuzzy-match query params from the first rule matching the
    query's url key.

    Returns an updated params dict (url truncated at the rule's replace
    token, matchType and expanded filters set), or None when no rule
    matches.
    """
    matched_rule = None

    urlkey = to_native_str(query.key, 'utf-8')
    url = query.url
    filter_ = query.filters
    output = query.output

    for rule in self.rules.iter_matching(urlkey):
        m = rule.regex.search(urlkey)
        if not m:
            continue

        matched_rule = rule

        # each captured group expands every filter template of the rule
        groups = m.groups()
        for g in groups:
            for f in matched_rule.filter:
                filter_.append(f.format(g))

        break

    if not matched_rule:
        return None

    # truncate the url just past the replace token (default '?')
    repl = '?'
    if matched_rule.replace:
        repl = matched_rule.replace

    inx = url.find(repl)
    if inx > 0:
        url = url[:inx + len(repl)]

    if matched_rule.match_type == 'domain':
        host = urlsplit(url).netloc
        # remove the subdomain
        url = host.split('.', 1)[1]

    params = query.params
    params.update({'url': url,
                   'matchType': matched_rule.match_type,
                   'filter': filter_})

    # drop params that do not apply to a fuzzy re-query
    if 'reverse' in params:
        del params['reverse']

    if 'closest' in params:
        del params['closest']

    if 'end_key' in params:
        del params['end_key']

    return params
def __call__(self, query):
    """Translate a cdx query into fuzzy-match params via the first
    matching rule; return None when no rule applies."""
    urlkey = to_native_str(query.key, 'utf-8')
    url = query.url
    filter_ = query.filters
    output = query.output

    matched_rule = None
    for rule in self.rules.iter_matching(urlkey):
        match = rule.regex.search(urlkey)
        if not match:
            continue

        matched_rule = rule
        # every captured group expands each of the rule's filter templates
        for group in match.groups():
            for template in matched_rule.filter:
                filter_.append(template.format(group))

        break

    if not matched_rule:
        return None

    # cut the url just past the replace token (default '?')
    repl = matched_rule.replace if matched_rule.replace else '?'
    idx = url.find(repl)
    if idx > 0:
        url = url[:idx + len(repl)]

    if matched_rule.match_type == 'domain':
        # strip the subdomain, keeping only the parent host
        url = urlsplit(url).netloc.split('.', 1)[1]

    params = query.params
    params.update({'url': url,
                   'matchType': matched_rule.match_type,
                   'filter': filter_})

    # these params do not apply to a fuzzy re-query
    for obsolete in ('reverse', 'closest', 'end_key'):
        params.pop(obsolete, None)

    return params
def parse(self, stream, headerline=None):
    """Parse an ARC record header line from *stream*.

    Zips the preset ARC header names against the space-separated values
    of the header line.  For the ARC file header (``filedesc://``), the
    two following version/spec lines are consumed (and counted toward
    total_len) but otherwise ignored.

    :param stream: byte stream positioned at the header line
    :param headerline: optional pre-read header line; read from the
        stream when None
    :raises EOFError: when the stream is exhausted
    :raises StatusAndHeadersParserException: on a field-count mismatch
    """
    total_read = 0

    def readline():
        return to_native_str(stream.readline())

    # if headerline passed in, use that
    if headerline is None:
        headerline = readline()
    else:
        headerline = to_native_str(headerline)

    header_len = len(headerline)

    if header_len == 0:
        raise EOFError()

    headerline = headerline.rstrip()

    headernames = self.headernames

    # if arc header, consume next two lines
    if headerline.startswith('filedesc://'):
        version = readline()  # skip version
        spec = readline()  # skip header spec, use preset one
        total_read += len(version)
        total_read += len(spec)

    parts = headerline.split(' ')

    if len(parts) != len(headernames):
        msg = 'Wrong # of headers, expected arc headers {0}, Found {1}'
        msg = msg.format(headernames, parts)
        raise StatusAndHeadersParserException(msg, parts)

    headers = []

    for name, value in zip(headernames, parts):
        headers.append((name, value))

    # NOTE(review): total_read counts only the skipped version/spec
    # lines, not the header line itself -- confirm this is intended
    return StatusAndHeaders(statusline='',
                            headers=headers,
                            protocol='ARC/1.0',
                            total_len=total_read)
def __init__(self, idxline):
    """Parse a tab-delimited idx line into named fields, converting
    offset/length (and lineno when present) to ints."""
    OrderedDict.__init__(self)
    idxline = idxline.rstrip()
    columns = idxline.split(b'\t')

    if len(columns) < self.NUM_REQ_FIELDS:
        msg = 'invalid idx format: {0} fields found, {1} required'
        raise CDXException(msg.format(len(columns), self.NUM_REQ_FIELDS))

    for name, value in zip(self.FORMAT, columns):
        self[name] = to_native_str(value, 'utf-8')

    # numeric fields
    self['offset'] = int(self['offset'])
    self['length'] = int(self['length'])

    lineno = self.get('lineno')
    if lineno:
        self['lineno'] = int(lineno)

    self.idxline = idxline
def handle_connect(self, env):
    """Handle an HTTPS CONNECT request.

    Acknowledges the tunnel, wraps the raw socket in TLS using a
    per-host (or wildcard) certificate, reads the tunneled request
    line and headers, and rewrites the WSGI environ so the request
    looks like a normal https request to the rest of the app.

    Fix: ``se.message`` does not exist on Python 3 exceptions, so the
    error path raised AttributeError instead of BadRequestException;
    use ``str(se)`` which works on both 2 and 3.
    """
    sock = self.get_request_socket(env)
    if not sock:
        return WbResponse.text_response('HTTPS Proxy Not Supported',
                                        '405 HTTPS Proxy Not Supported')

    # acknowledge the CONNECT before starting the TLS handshake
    sock.send(b'HTTP/1.0 200 Connection Established\r\n')
    sock.send(b'Proxy-Connection: close\r\n')
    sock.send(b'Server: pywb proxy\r\n')
    sock.send(b'\r\n')

    hostname, port = env['REL_REQUEST_URI'].split(':')

    if not self.use_wildcard:
        certfile = self.ca.cert_for_host(hostname)
    else:
        certfile = self.ca.get_wildcard_cert(hostname)

    try:
        ssl_sock = ssl.wrap_socket(sock,
                                   server_side=True,
                                   certfile=certfile,
                                   suppress_ragged_eofs=False,
                                   ssl_version=ssl.PROTOCOL_SSLv23)
        env['pywb.proxy_ssl_sock'] = ssl_sock

        buffreader = BufferedReader(ssl_sock, block_size=self.BLOCK_SIZE)

        statusline = to_native_str(buffreader.readline().rstrip())

    except Exception as se:
        # str(se) instead of se.message (removed in Python 3)
        raise BadRequestException(str(se))

    statusparts = statusline.split(' ')

    if len(statusparts) < 3:
        raise BadRequestException('Invalid Proxy Request: ' + statusline)

    env['REQUEST_METHOD'] = statusparts[0]
    env['REL_REQUEST_URI'] = ('https://' +
                              env['REL_REQUEST_URI'].replace(':443', '') +
                              statusparts[1])

    env['SERVER_PROTOCOL'] = statusparts[2].strip()

    env['pywb.proxy_scheme'] = 'https'
    env['pywb.proxy_host'] = hostname
    env['pywb.proxy_port'] = port
    env['pywb.proxy_req_uri'] = statusparts[1]

    queryparts = env['REL_REQUEST_URI'].split('?', 1)
    env['PATH_INFO'] = queryparts[0]
    env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else ''
    env['pywb.proxy_query'] = env['QUERY_STRING']

    # read the tunneled request headers into CGI-style environ keys
    while True:
        line = to_native_str(buffreader.readline())
        if line:
            line = line.rstrip()

        # blank line terminates the header block
        if not line:
            break

        parts = line.split(':', 1)
        if len(parts) < 2:
            continue

        name = parts[0].strip()
        value = parts[1].strip()

        name = name.replace('-', '_').upper()
        if name not in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
            name = 'HTTP_' + name

        env[name] = value

    env['wsgi.input'] = buffreader
def __getitem__(self, item):
    """Fetch *item* from the redis hash and return it as a native str."""
    raw = self.redis.hget(self.key, item)
    return to_native_str(raw, 'utf-8')
def get_rewritten(*args, **kwargs):
    """Run a LiveRewriter fetch (remote_only disabled) and return
    (status_headers, buffer-as-native-str)."""
    rewriter = LiveRewriter()
    status_headers, buff = rewriter.get_rewritten(remote_only=False,
                                                  *args, **kwargs)

    return status_headers, to_native_str(buff)
def b64encode(self, string):
    """Base64-encode a text string, returning a native str."""
    encoded = base64.b64encode(string.encode('utf-8'))
    return to_native_str(encoded)
def __call__(self, filename, cdx=None):
    """Look up the archive path for *filename* in redis.

    Returns a one-element list with the path, or [] when not found.
    """
    path = self.redis.hget(self.key_prefix + filename, 'path')
    if not path:
        return []

    return [to_native_str(path, 'utf-8')]
def _fuzzy_query_call(self, query):
    """Build fuzzy-match query params from the first rule matching the
    query's url key (brozzler variant).

    Unlike the stock pywb version, domain-type matches rebuild the url
    with urlunsplit so path/query/fragment are dropped along with the
    subdomain.  Returns the updated params dict, or None if no rule
    matches.
    """
    # imports added here for brozzler
    from pywb.utils.loaders import to_native_str
    from six.moves.urllib.parse import urlsplit, urlunsplit

    matched_rule = None

    urlkey = to_native_str(query.key, 'utf-8')
    url = query.url
    filter_ = query.filters
    output = query.output

    for rule in self.rules.iter_matching(urlkey):
        m = rule.regex.search(urlkey)
        if not m:
            continue

        matched_rule = rule

        # each captured group expands every filter template of the rule
        groups = m.groups()
        for g in groups:
            for f in matched_rule.filter:
                filter_.append(f.format(g))

        break

    if not matched_rule:
        return None

    # truncate the url just past the replace token (default '?')
    repl = '?'
    if matched_rule.replace:
        repl = matched_rule.replace

    inx = url.find(repl)
    if inx > 0:
        url = url[:inx + len(repl)]

    # begin brozzler changes
    if matched_rule.match_type == 'domain':
        orig_split_url = urlsplit(url)
        # remove the subdomain, path, query and fragment
        host = orig_split_url.netloc.split('.', 1)[1]
        new_split_url = (orig_split_url.scheme, host, '', '', '')
        url = urlunsplit(new_split_url)
    # end brozzler changes

    params = query.params
    params.update({'url': url,
                   'matchType': matched_rule.match_type,
                   'filter': filter_})

    # drop params that do not apply to a fuzzy re-query
    if 'reverse' in params:
        del params['reverse']

    if 'closest' in params:
        del params['closest']

    if 'end_key' in params:
        del params['end_key']

    return params
def __str__(self):
    """Render the original raw idx line as a native string."""
    return to_native_str(self.idxline, 'utf-8')
def parse(self, stream, full_statusline=None):
    """
    parse stream for status line and headers
    return a StatusAndHeaders object

    support continuation headers starting with space or tab

    :param stream: byte stream positioned at the status line
    :param full_statusline: optional pre-read status line; read from
        the stream when None
    :raises EOFError: when the stream is already exhausted
    :raises StatusAndHeadersParserException: when verify is set and the
        status line does not start with an accepted protocol prefix
    """

    def readline():
        return to_native_str(stream.readline())

    # status line w newlines intact
    if full_statusline is None:
        full_statusline = readline()
    else:
        full_statusline = to_native_str(full_statusline)

    statusline, total_read = _strip_count(full_statusline, 0)

    headers = []

    # at end of stream
    if total_read == 0:
        raise EOFError()
    elif not statusline:
        # blank status line: empty protocol and no headers
        return StatusAndHeaders(statusline=statusline,
                                headers=headers,
                                protocol='',
                                total_len=total_read)

    # validate only if verify is set
    if self.verify:
        protocol_status = self.split_prefix(statusline, self.statuslist)

        if not protocol_status:
            msg = 'Expected Status Line starting with {0} - Found: {1}'
            msg = msg.format(self.statuslist, statusline)
            raise StatusAndHeadersParserException(msg, full_statusline)
    else:
        protocol_status = statusline.split(' ', 1)

    line, total_read = _strip_count(readline(), total_read)
    while line:
        result = line.split(':', 1)
        if len(result) == 2:
            # 'Name: value' -- trim whitespace around the separator
            name = result[0].rstrip(' \t')
            value = result[1].lstrip()
        else:
            # line without ':' -- keep the name, no value (dropped below)
            name = result[0]
            value = None

        next_line, total_read = _strip_count(readline(), total_read)

        # append continuation lines, if any
        while next_line and next_line.startswith((' ', '\t')):
            if value is not None:
                value += next_line
            next_line, total_read = _strip_count(readline(), total_read)

        if value is not None:
            header = (name, value)
            headers.append(header)

        line = next_line

    # separate the protocol prefix from the remainder of the status line
    if len(protocol_status) > 1:
        statusline = protocol_status[1].strip()
    else:
        statusline = ''

    return StatusAndHeaders(statusline=statusline,
                            headers=headers,
                            protocol=protocol_status[0],
                            total_len=total_read)
def handle_connect(self, env):
    """Handle an HTTPS CONNECT request.

    Acknowledges the tunnel, wraps the raw socket in TLS with a
    per-host (or wildcard) certificate, reads the tunneled request
    line and headers, and rewrites the WSGI environ so the request
    looks like a normal https request to the rest of the app.
    """
    sock = self.get_request_socket(env)
    if not sock:
        return WbResponse.text_response('HTTPS Proxy Not Supported',
                                        '405 HTTPS Proxy Not Supported')

    # acknowledge the CONNECT before starting the TLS handshake
    sock.send(b'HTTP/1.0 200 Connection Established\r\n')
    sock.send(b'Proxy-Connection: close\r\n')
    sock.send(b'Server: pywb proxy\r\n')
    sock.send(b'\r\n')

    hostname, port = env['REL_REQUEST_URI'].split(':')

    if not self.use_wildcard:
        certfile = self.ca.cert_for_host(hostname)
    else:
        certfile = self.ca.get_wildcard_cert(hostname)

    try:
        ssl_sock = ssl.wrap_socket(sock,
                                   server_side=True,
                                   certfile=certfile,
                                   #ciphers="ALL",
                                   suppress_ragged_eofs=False,
                                   ssl_version=ssl.PROTOCOL_SSLv23)
        env['pywb.proxy_ssl_sock'] = ssl_sock

        buffreader = BufferedReader(ssl_sock, block_size=self.BLOCK_SIZE)

        statusline = to_native_str(buffreader.readline().rstrip())

    except Exception as se:
        # NOTE(review): se.message does not exist on Python 3 -- this
        # would raise AttributeError instead; should be str(se)
        raise BadRequestException(se.message)

    statusparts = statusline.split(' ')

    if len(statusparts) < 3:
        raise BadRequestException('Invalid Proxy Request: ' + statusline)

    env['REQUEST_METHOD'] = statusparts[0]
    env['REL_REQUEST_URI'] = ('https://' +
                              env['REL_REQUEST_URI'].replace(':443', '') +
                              statusparts[1])

    env['SERVER_PROTOCOL'] = statusparts[2].strip()

    env['pywb.proxy_scheme'] = 'https'
    env['pywb.proxy_host'] = hostname
    env['pywb.proxy_port'] = port
    env['pywb.proxy_req_uri'] = statusparts[1]

    queryparts = env['REL_REQUEST_URI'].split('?', 1)
    env['PATH_INFO'] = queryparts[0]
    env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else ''
    env['pywb.proxy_query'] = env['QUERY_STRING']

    # read the tunneled request headers into CGI-style environ keys
    while True:
        line = to_native_str(buffreader.readline())
        if line:
            line = line.rstrip()

        # blank line terminates the header block
        if not line:
            break

        parts = line.split(':', 1)
        if len(parts) < 2:
            continue

        name = parts[0].strip()
        value = parts[1].strip()

        name = name.replace('-', '_').upper()
        if name not in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
            name = 'HTTP_' + name

        env[name] = value

    env['wsgi.input'] = buffreader
def readline():
    # read one line from the enclosing scope's stream and decode it
    # to a native str
    raw = stream.readline()
    return to_native_str(raw)
def __str__(self):
    """Render as '<type>:<base32-digest>'."""
    digest_b32 = base64.b32encode(self.digester.digest())
    return self.type_ + ':' + to_native_str(digest_b32)
def _fuzzy_query_call(self, query):
    """Translate a cdx query into fuzzy-match params via the first
    matching rule (brozzler variant: domain matches drop the subdomain
    and the path/query/fragment); None when no rule applies."""
    # imports added here for brozzler
    from pywb.utils.loaders import to_native_str
    from six.moves.urllib.parse import urlsplit, urlunsplit

    urlkey = to_native_str(query.key, 'utf-8')
    url = query.url
    filter_ = query.filters
    output = query.output

    matched_rule = None
    for rule in self.rules.iter_matching(urlkey):
        match = rule.regex.search(urlkey)
        if not match:
            continue

        matched_rule = rule
        # every captured group expands each of the rule's filter templates
        for group in match.groups():
            for template in matched_rule.filter:
                filter_.append(template.format(group))

        break

    if not matched_rule:
        return None

    # cut the url just past the replace token (default '?')
    repl = matched_rule.replace if matched_rule.replace else '?'
    idx = url.find(repl)
    if idx > 0:
        url = url[:idx + len(repl)]

    # begin brozzler changes
    if matched_rule.match_type == 'domain':
        split_url = urlsplit(url)
        # remove the subdomain, path, query and fragment
        parent_host = split_url.netloc.split('.', 1)[1]
        url = urlunsplit((split_url.scheme, parent_host, '', '', ''))
    # end brozzler changes

    params = query.params
    params.update({'url': url,
                   'matchType': matched_rule.match_type,
                   'filter': filter_})

    # these params do not apply to a fuzzy re-query
    for obsolete in ('reverse', 'closest', 'end_key'):
        params.pop(obsolete, None)

    return params