def __call__(self, params):
    """Handle an index query described by *params*.

    :param dict params: query parameters ('mode', 'output', 'fields', ...)
    :return: tuple of (headers dict, iterator of byte lines, errors dict);
             on error, (None, None, errs) with errs['last_exc'] set
    """
    mode = params.get('mode', 'index')
    if mode == 'list_sources':
        return {}, self.index_source.get_source_list(params), {}

    if mode != 'index':
        return {}, self.get_supported_modes(), {}

    output = params.get('output', self.DEF_OUTPUT)
    fields = params.get('fields')
    if fields and isinstance(fields, str):
        fields = fields.split(',')

    # BUG FIX: 'fields' was previously passed as the dict.get() default,
    # so an unsupported output value silently fell through to treating the
    # fields list as the handler instead of reporting a bad request
    handler = self.OUTPUTS.get(output)
    if not handler:
        errs = dict(last_exc=BadRequestException(
            'output={0} not supported'.format(output)))
        return None, None, errs

    cdx_iter, errs = self._load_index_source(params)
    if not cdx_iter:
        return None, None, errs

    content_type, res = handler(cdx_iter, fields, params)
    out_headers = {'Content-Type': content_type}

    def check_str(lines):
        # ensure every emitted line is bytes, encoding text as utf-8
        for line in lines:
            if isinstance(line, six.text_type):
                line = line.encode('utf-8')
            yield line

    return out_headers, check_str(res), errs
def load_cdx(self, query):
    """Run *query* against a remote cdx server and return an iterator
    over the raw response lines.

    :raises AccessException: remote returned 403
    :raises BadRequestException: remote returned 400
    :raises WbException: any other remote HTTP error
    """
    if self.remote_processing:
        remote_query = query
    else:
        # Strip the query down to url + matchType before sending remotely
        remote_query = CDXQuery(url=query.url, match_type=query.match_type)

    full_url = self.remote_url + '?' + remote_query.urlencode()

    try:
        req = urllib2.Request(full_url)
        if self.cookie:
            req.add_header('Cookie', self.cookie)
        resp = urllib2.urlopen(req)
    except urllib2.HTTPError as http_err:
        status = http_err.code
        if status == 403:
            raise AccessException('Access Denied')
        if status == 404:
            # return empty list for consistency with other cdx sources
            # will be converted to 404 if no other retry
            return []
        if status == 400:
            raise BadRequestException()
        raise WbException('Invalid response from remote cdx server')

    return iter(resp)
def load_index(self, params):
    """Loads the xml query index based on the supplied params

    :param dict[str, str] params: The query params
    :return: A list or generator of cdx objects
    :raises NotFoundException: If the query url is not found
    or the results of the query returns no cdx entries
    :raises BadRequestException: If the match type is not exact or prefix
    """
    closest = params.get('closest')
    url = params.get('url', '')
    matchType = params.get('matchType', 'exact')

    if matchType == 'exact':
        query = self.EXACT_QUERY
    elif matchType == 'prefix':
        query = self.PREFIX_QUERY
    else:
        # BUG FIX: original used .format(matchType=matchType) with a
        # positional '{0}' field, which raised IndexError instead of the
        # intended BadRequestException
        raise BadRequestException('matchType={0} is not supported'.format(matchType))

    try:
        limit = params.get('limit')
        if limit:
            query = 'limit:{0} '.format(limit) + query

        # OpenSearch API requires double-escaping
        # TODO: add option to not double escape if needed
        query_url = self.query_api_url + '?q=' + quote_plus(query + quote_plus(url))
        self.logger.debug("Running query: %s" % query_url)
        response = self.session.get(query_url)
        response.raise_for_status()

        results = etree.fromstring(response.content)
        items = results.find('results')
    except Exception:
        if self.logger.getEffectiveLevel() == logging.DEBUG:
            import traceback
            traceback.print_exc()
        raise NotFoundException('url {0} not found'.format(url))

    if not items:
        raise NotFoundException('url {0} not found'.format(url))

    items = items.findall('result')

    if matchType == 'exact':
        cdx_iter = [self.convert_to_cdx(item) for item in items]
        if closest:
            cdx_iter = cdx_sort_closest(closest, cdx_iter, limit=10000)
    else:
        cdx_iter = self.prefix_query_iter(items)

    return cdx_iter
def load_cdx(self, **params):
    """Query a warcbase server for cdx lines for the given url, optionally
    scoped to a collection via a 'prefix:' filter; returns an iterator of
    cdx entries (or text lines when output == 'text').
    """
    text_output = (params.get('output') == 'text')

    # lookup collection prefix from any 'prefix:' filter
    coll_prefix = ''
    for filt in (params.get('filter') or []):
        if filt.startswith('prefix:'):
            coll_prefix = filt[7:]

    # special path for list all
    if params.get('listColls') and text_output:
        return '\n'.join(self._load_colls())

    url = params['url']

    # force http prefix
    if url.startswith(self.HTTPS_PREFIX):
        url = self.HTTP_PREFIX + url[len(self.HTTPS_PREFIX):]
    elif not url.startswith(self.HTTP_PREFIX):
        url = self.HTTP_PREFIX + url

    request_uri = self.warcbase_path + coll_prefix + '*/' + url

    try:
        response = requests.get(request_uri)
    except Exception:
        raise WbException('Error reading from: ' + request_uri)

    status = response.status_code
    if status != 200:
        if status == 500:
            self._invalid_collection(coll_prefix)
        else:
            raise BadRequestException('Invalid status code: {0}'.format(status))

    if len(response.content) == 0:
        msg = ('No captures found for <b>{0}</b> in collection <i>{1}</i>'.
               format(url, coll_prefix.strip('/')))
        raise NotFoundException(msg, url=url)

    cdx_lines = response.content.rstrip().split('\n')

    # a well-formed warcbase response has 3 tab-separated columns
    if len(cdx_lines[0].split('\t')) != 3:
        self._invalid_collection(coll_prefix)

    cdx_iter = self.iter_cdx(cdx_lines, url)
    if text_output:
        cdx_iter = self.iter_text(cdx_iter)
    return cdx_iter
def __call__(self, params):
    """Handle an index query described by *params*.

    :param dict params: query parameters ('mode', 'output', 'fields'/'fl', ...)
    :return: tuple of (headers dict, iterator of byte lines, errors dict);
             on error, (None, None, errs) with errs['last_exc'] set
    """
    mode = params.get('mode', 'index')
    if mode == 'list_sources':
        return {}, self.index_source.get_source_list(params), {}

    if mode != 'index':
        return {}, self.get_supported_modes(), {}

    output = params.get('output', self.DEF_OUTPUT)

    # accept either 'fields' or its 'fl' alias, as comma-string or list
    fields = params.get('fields') or params.get('fl')
    if fields and isinstance(fields, str):
        fields = fields.split(',')

    handler = self.OUTPUTS.get(output)
    if not handler:
        bad_output = BadRequestException(
            'output={0} not supported'.format(output))
        return None, None, dict(last_exc=bad_output)

    cdx_iter = None
    try:
        cdx_iter, errs = self._load_index_source(params)
    except BadRequestException as bad_req:
        errs = dict(last_exc=bad_req)

    if not cdx_iter:
        return None, None, errs

    content_type, res = handler(cdx_iter, fields, params)
    out_headers = {'Content-Type': content_type}

    # pull the first line eagerly so that cdx exceptions surface here
    # and can be handled properly instead of mid-stream
    first_line = None
    try:
        first_line = next(res)
    except StopIteration:
        pass
    except CDXException as cdx_exc:
        return None, None, dict(last_exc=cdx_exc)

    def as_bytes(head, rest):
        # yield the eagerly-read first line, then the remainder,
        # encoding any text lines as utf-8
        if head is not None:
            if isinstance(head, six.text_type):
                head = head.encode('utf-8')
            yield head
        for line in rest:
            if isinstance(line, six.text_type):
                line = line.encode('utf-8')
            yield line

    return out_headers, as_bytes(first_line, res), errs
def _load_index_source(self, params):
    """Validate the required 'url' param and delegate to the fuzzy
    index-source lookup.

    :return: (cdx_iter, errs) tuple; (None, errs) when 'url' is missing
    """
    url = params.get('url')
    if not url:
        missing_url = BadRequestException('The "url" param is required')
        return None, dict(last_exc=missing_url)

    input_req = params.get('_input_req')
    if input_req:
        params['alt_url'] = input_req.include_method_query(url)

    return self.fuzzy(self.index_source, params)
def _load_index_source(self, params):
    """Validate the required 'url' param, run the fuzzy index-source
    lookup, and apply access-control filtering when configured.

    :return: cdx iterator, or (None, errs) tuple when 'url' is missing
    """
    url = params.get('url')
    if not url:
        errs = dict(last_exc=BadRequestException('The "url" param is required'))
        return None, errs

    input_req = params.get('_input_req')
    if input_req:
        params['alt_url'] = input_req.include_method_query(url)

    cdx_iter = self.fuzzy(self.index_source, params)

    if self.access_checker:
        # BUG FIX: reuse input_req obtained via params.get() above;
        # the original indexed params['_input_req'] directly and raised
        # KeyError whenever no request object was supplied
        acl_user = input_req.env.get('HTTP_X_PYWB_ACL_USER') if input_req else None
        cdx_iter = self.access_checker(cdx_iter, acl_user)

    return cdx_iter
def _parse_extra(self):
    """Mark latest-replay requests as timegates and, when the client sent
    an Accept-Datetime header, pin the replay timestamp to it.

    :raises BadRequestException: if Accept-Datetime cannot be parsed
    """
    wb_url = self.wb_url
    if not wb_url or wb_url.type != wb_url.LATEST_REPLAY:
        return

    self.options['is_timegate'] = True

    accept_dt = self.env.get('HTTP_ACCEPT_DATETIME')
    if not accept_dt:
        return

    try:
        timestamp = http_date_to_timestamp(accept_dt)
    except Exception:
        raise BadRequestException('Invalid Accept-Datetime: ' + accept_dt)

    wb_url.set_replay_timestamp(timestamp)
def handle_connect(self, env):
    """Handle an HTTPS proxy CONNECT: acknowledge with 200, wrap the raw
    socket in TLS using a per-host (or wildcard) certificate, then parse
    the tunneled request line and headers into the WSGI environ in place.

    :param dict env: WSGI environ, updated in place
    :return: an error WbResponse if no raw socket is available, else None
    :raises BadRequestException: on TLS/read failure or a malformed
        tunneled request line
    """
    sock = self.get_request_socket(env)
    if not sock:
        return WbResponse.text_response('HTTPS Proxy Not Supported',
                                        '405 HTTPS Proxy Not Supported')

    sock.send(b'HTTP/1.0 200 Connection Established\r\n')
    sock.send(b'Proxy-Connection: close\r\n')
    sock.send(b'Server: pywb proxy\r\n')
    sock.send(b'\r\n')

    hostname, port = env['REL_REQUEST_URI'].split(':')

    if not self.use_wildcard:
        certfile = self.ca.cert_for_host(hostname)
    else:
        certfile = self.ca.get_wildcard_cert(hostname)

    try:
        ssl_sock = ssl.wrap_socket(sock,
                                   server_side=True,
                                   certfile=certfile,
                                   #ciphers="ALL",
                                   suppress_ragged_eofs=False,
                                   ssl_version=ssl.PROTOCOL_SSLv23)
        env['pywb.proxy_ssl_sock'] = ssl_sock

        buffreader = BufferedReader(ssl_sock, block_size=self.BLOCK_SIZE)

        statusline = to_native_str(buffreader.readline().rstrip())

    except Exception as se:
        # BUG FIX: se.message does not exist on Python 3 and raised
        # AttributeError, masking the real error; str(se) works on 2 and 3
        raise BadRequestException(str(se))

    statusparts = statusline.split(' ')
    if len(statusparts) < 3:
        raise BadRequestException('Invalid Proxy Request: ' + statusline)

    env['REQUEST_METHOD'] = statusparts[0]
    env['REL_REQUEST_URI'] = ('https://' +
                              env['REL_REQUEST_URI'].replace(':443', '') +
                              statusparts[1])

    env['SERVER_PROTOCOL'] = statusparts[2].strip()

    env['pywb.proxy_scheme'] = 'https'
    env['pywb.proxy_host'] = hostname
    env['pywb.proxy_port'] = port
    env['pywb.proxy_req_uri'] = statusparts[1]

    queryparts = env['REL_REQUEST_URI'].split('?', 1)
    env['PATH_INFO'] = queryparts[0]
    env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else ''
    env['pywb.proxy_query'] = env['QUERY_STRING']

    # read tunneled request headers until the blank line
    while True:
        line = to_native_str(buffreader.readline())
        if line:
            line = line.rstrip()

        if not line:
            break

        parts = line.split(':', 1)
        if len(parts) < 2:
            continue

        name = parts[0].strip()
        value = parts[1].strip()

        name = name.replace('-', '_').upper()
        if name not in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
            name = 'HTTP_' + name

        env[name] = value

    env['wsgi.input'] = buffreader