def preprocess_url(self, referrer, url):
    """Normalize a candidate link and decide whether it should be scraped.

    Returns the cleaned absolute URL, or None when the link is empty,
    off-domain, matches an ignored pattern, or was already collected.
    """
    skip_patterns = ('.pdf', '.jpg', 'tel:', '.dmg')
    if not url:
        return None
    # Resolve relative links against the referrer, then normalize the parts.
    parts = urlsplit(urljoin(referrer, url))._asdict()
    parts['path'] = re.sub(r'/$', '', parts['path'])  # drop trailing slash
    parts['fragment'] = ''  # in-page anchors point at the same document
    parts = SplitResult(**parts)
    if parts.netloc != self.domain:  # only crawl the current domain
        return None
    # Track both scheme variants so the same page isn't queued twice.
    if parts.scheme == 'http':
        http_variant = clean = parts.geturl()
        https_variant = http_variant.replace('http:', 'https:', 1)
    else:
        https_variant = clean = parts.geturl()
        http_variant = https_variant.replace('https:', 'http:', 1)
    if any(pat in https_variant or pat in http_variant for pat in skip_patterns):
        return None
    if http_variant in self.urls or https_variant in self.urls:
        return None
    return clean
def normalize(uristr):
    """Translate the given URI into a normalized form.

    :type uristr: unicode
    :rtype: unicode
    """
    # Proxied URLs carry the via-prefix; strip it so the real URI is seen.
    for scheme in URL_SCHEMES:
        if uristr.startswith(VIA_PREFIX + scheme + ":"):
            uristr = uristr[len(VIA_PREFIX):]
            break
    parts = urlsplit(uristr)
    # Only genuine http(s)-style URLs with a hostname are normalized;
    # anything else is returned untouched.
    if parts.scheme.lower() not in URL_SCHEMES or parts.hostname is None:
        return uristr
    normalized = SplitResult(
        _normalize_scheme(parts),
        _normalize_netloc(parts),
        _normalize_path(parts),
        _normalize_query(parts),
        None,  # fragments are dropped entirely
    )
    return normalized.geturl()
def _add_params_to_url(url, params): """Adds parameters as a query part of the URL :param url: URL :type url: string :param params: Dictionary containing parameters :type params: Dict :return: URL with parameters formatted as a query string :rtype: string """ url_parts = urlsplit(url) # Extract the existing parameters specified in the redirection URI existing_params = parse_qs(url_parts.query) # Enrich our custom parameters with the existing ones params.update(existing_params) new_query = urlencode(params, True) url_parts = SplitResult( url_parts.scheme, url_parts.netloc, url_parts.path, new_query, url_parts.fragment, ) url = url_parts.geturl() return url
def url(self):
    """Reconstruct the request URL, honouring X-Forwarded-* headers."""
    # Headers set by a reverse proxy take precedence over plain WSGI values.
    scheme = self.get('HTTP_X_FORWARDED_PROTO') or self.get('wsgi.url_scheme', 'http')
    authority = self.get('HTTP_X_FORWARDED_HOST') or self.get('HTTP_HOST')
    return SplitResult(scheme, authority, self.path,
                       self.get("QUERY_STRING"), '').geturl()
class UrlBuilder:
    """Mutable builder for URLs, wrapping urllib's ``SplitResult``.

    Query parameters are kept as a list of ``(name, value)`` tuples so
    repeated names and their ordering are preserved.

    Fix: the ``set`` parameters defaulted to ``None`` but were annotated
    as plain ``str``/``List``; they are now correctly ``Optional``.
    """

    def __init__(self, url: typing.Optional[str] = None) -> None:
        if url is None:
            self.splitted = SplitResult("", "", "", "", "")
        else:
            self.splitted = urlsplit(url)
        # Parsed query pairs; the authoritative source for the query string.
        self.query = parse_qsl(self.splitted.query)

    def set(self,
            scheme: typing.Optional[str] = None,
            netloc: typing.Optional[str] = None,
            path: typing.Optional[str] = None,
            query: typing.Optional[typing.List[typing.Tuple[str, str]]] = None,
            fragment: typing.Optional[str] = None) -> "UrlBuilder":
        """Replace any subset of the URL's components; returns self."""
        kwargs = dict()
        if scheme is not None:
            kwargs['scheme'] = scheme
        if netloc is not None:
            kwargs['netloc'] = netloc
        if path is not None:
            kwargs['path'] = path
        if query is not None:
            # Query pairs live on self.query, not on the SplitResult.
            self.query = query
        if fragment is not None:
            kwargs['fragment'] = fragment
        self.splitted = self.splitted._replace(**kwargs)
        return self

    def has_query(self, needle_query_name: str) -> bool:
        """Returns True iff the URL being constructed has a query field
        with name `needle_query_name`."""
        return any(name == needle_query_name for name, _ in self.query)

    def add_query(self, query_name: str, query_value: str) -> "UrlBuilder":
        """Append a query parameter; returns self."""
        self.query.append((query_name, query_value))
        return self

    def replace_query(self, query_name: str, query_value: str) -> "UrlBuilder":
        """
        Given a query name, remove all instances of that query name in this
        UrlBuilder. Then append an instance with the name set to
        `query_name` and the value set to `query_value`.
        """
        self.query = [(name, value) for name, value in self.query
                      if name != query_name]
        self.query.append((query_name, query_value))
        return self

    def __str__(self) -> str:
        result = self.splitted._replace(
            query=urlencode(self.query, doseq=True))
        return urlunsplit(result)
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    allow_fragments = bool(allow_fragments)
    # Cache key includes the argument *types* so str and bytes results are
    # never mixed for the same textual input.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http':  # optimize the common case
            scheme = url[:i].lower()
            url = url[i + 1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                # A bracket without its partner means a malformed IPv6 host.
                if (('[' in netloc and ']' not in netloc) or
                        (']' in netloc and '[' not in netloc)):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        # Non-http scheme: accept it only if every character before the
        # colon is a valid scheme character.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            rest = url[i + 1:]
            if not rest or any(c not in '0123456789' for c in rest):
                # not a port number
                scheme, url = url[:i].lower(), rest
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v
def _urlsplit(url: str, scheme: str = '', allow_fragments: bool = True):
    """Templating safe version of urllib.parse.urlsplit

    Ignores '?' and '#' inside {{}} templating tags. Caching disabled.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        scheme, url = _urlsplit_scheme(url, colon)
    if url[:2] == '//':
        netloc, url = _urlsplit_netloc(url)
    # The helpers below skip separators that appear inside {{ }} tags.
    if allow_fragments and '#' in url:
        fragment, url = _urlsplit_fragment(url)
    if '?' in url:
        query, url = _urlsplit_query(url)
    return _coerce_result(SplitResult(scheme, netloc, url, query, fragment))
def parse_element(platform: str, element: str):
    """Build a ``{'href', 'text'}`` link descriptor for a sponsorship platform.

    Known platforms map ``element`` (a user name) onto the platform's URL;
    ``'custom'`` treats ``element`` as a URL, defaulting the scheme to https.
    Unknown platforms yield None.
    """
    simple_platforms = {
        'github': ('https://github.com/sponsors/{0}', '{0} on Github'),
        'patreon': ('https://www.patreon.com/{0}', '{0} on Patreon'),
        'ko_fi': ('https://ko-fi.com/{0}', '{0} on Ko-fi'),
    }
    if platform in simple_platforms:
        href_tpl, text_tpl = simple_platforms[platform]
        return {'href': href_tpl.format(element),
                'text': text_tpl.format(element)}
    if platform != 'custom':
        return None
    comps = urlsplit(element)
    if not comps.scheme:
        # Scheme-less input: the first path segment is really the host.
        netloc, path = (comps.path.split('/', 1) + [''])[:2]
        comps = SplitResult(scheme='https', netloc=netloc, path=path,
                            query=comps.query, fragment=comps.fragment)
    return {'href': urlunsplit(comps),
            'text': f'{comps.netloc}/{comps.path}'.rstrip('/')}
def test_customer_missing_name(self):
    """Should return error message if there is not enough information
    about customer name"""
    url_parts = SplitResult("http", "fsecure.com", "/", "", "")
    outcome = signer._process_signed_url(url_parts, "Lebron", "aaa")
    self.assertEqual(outcome, "Not enough customer's name information")
def get_local_referrer(request):
    """Get the referrer URL if it is not external to this application"""
    if "HTTP_REFERER" not in request.META:
        return None
    # Adopted from here:
    # https://github.com/django/django/blob/7fc317ae736e8fda1aaf4d4ede84d95fffaf5281/django/http/request.py#L124-L130
    allowed_hosts = settings.ALLOWED_HOSTS
    if settings.DEBUG and not allowed_hosts:
        allowed_hosts = [".localhost", "127.0.0.1", "[::1]"]
    referrer = urlsplit(request.META["HTTP_REFERER"])
    if not validate_host(referrer.hostname, allowed_hosts):
        return None
    # The referrer points at "ourselves": return a "safe copy" keeping
    # only the path + query + fragment.
    return SplitResult(
        scheme="",
        netloc="",
        path=referrer.path,
        query=referrer.query,
        fragment=referrer.fragment,
    ).geturl()
def list_files(self, first_date: dt.date, last_date: dt.date,
               test_type: str, country: str) -> Iterable[FileEntry]:
    """Yield FileEntry objects for legacy OONI data between the two dates.

    Legacy data ends on 2020-10-21; ranges starting later yield nothing.
    Stops as soon as a listed date directory passes ``last_date``.
    """
    if first_date > dt.date(2020, 10, 21):
        return
    paginator = self._s3_client.get_paginator('list_objects_v2')
    pages = paginator.paginate(
        Bucket=_LegacyOoniClient._BUCKET,
        Delimiter='/',
        Prefix=_LegacyOoniClient._PREFIX,
        StartAfter=
        f'{_LegacyOoniClient._PREFIX}{first_date.strftime("%Y-%m-%d")}')
    for page in pages:
        self.num_list_requests += 1
        for entry in page.get('CommonPrefixes', []):
            date_dir = entry['Prefix']
            # Directory names look like ".../YYYY-MM-DD/".
            date_str = posixpath.basename(posixpath.dirname(date_dir))
            date = dt.datetime.strptime(date_str, "%Y-%m-%d").date()
            if date > last_date:
                return
            for file_entry in self._list_files_with_index(
                    date_dir, test_type, country):
                url = SplitResult(
                    's3', _LegacyOoniClient._BUCKET,
                    f'{_LegacyOoniClient._PREFIX}{file_entry["filename"]}',
                    None, None)
                # BUG FIX: bind file_entry as a default argument.  A plain
                # closure (`lambda: self._get_measurements(file_entry)`)
                # captures the loop *variable*, so every callback yielded
                # here would read whatever file_entry holds when it is
                # eventually called, not the entry it was created for.
                yield FileEntry(
                    lambda fe=file_entry: self._get_measurements(fe),
                    test_type, country, date, url,
                    _LegacyOoniClient._frame_bytes(file_entry['frames']))
def build_url(netloc, port=None, scheme=None, path="", query=None, fragment=""):
    """
    Builds a url given all its parts

    :netloc: string
    :port: int
    :scheme: string
    :path: string
    :query: A dictionary with any GET parameters
    :fragment: string
    :return: URL string
    """
    if query:
        # BUG FIX: urlencode already percent-escapes keys and values.
        # Wrapping it in quote_plus (as before) double-encoded the
        # separators ('=' -> %3D, '&' -> %26), producing an unparseable
        # query string.
        query = urlencode(query)
    if scheme is None:
        # Fall back to the scheme of the request currently being served.
        scheme = GlobalRequestMiddleware.get_current_request().scheme
    if port is not None:
        netloc = replace_netloc_port(netloc, port)
    return SplitResult(
        scheme=scheme,
        netloc=netloc,
        path=path,
        query=query or "",
        fragment=fragment,
    ).geturl()
def build(cls, *, scheme='', user='', password='', host='', port=None,
          path='', query=None, query_string='', fragment='', strict=False):
    """Creates and returns a new URL"""
    # scheme and host must be supplied together (or not at all).
    if host and not scheme:
        raise ValueError(
            'Can\'t build URL with "host" but without "scheme".')
    if scheme and not host:
        raise ValueError(
            'Can\'t build URL with "scheme" but without "host".')
    if query and query_string:
        raise ValueError(
            "Only one of \"query\" or \"query_string\" should be passed")
    netloc = cls._make_netloc(user, password, host, port)
    url = cls(SplitResult(scheme,
                          netloc,
                          _quote(path, safe='@:', protected='/'),
                          _quote(query_string),
                          fragment),
              strict=strict, encoded=True)
    if query:
        return url.with_query(query)
    return url
def __init__(
        self, url, token=None, username=None, password=None,
        timeout=DEFAULT_TIMEOUT, verify=True, user_agent=ZMON_USER_AGENT):
    """Initialize ZMON client."""
    self.timeout = timeout
    parts = urlsplit(url)
    # Keep only scheme + host; the API path is rebuilt from scratch below.
    self.base_url = urlunsplit(
        SplitResult(parts.scheme, parts.netloc, '', '', ''))
    self.url = urljoin(self.base_url,
                       self._join_path(['api', API_VERSION, '']))
    self._session = requests.Session()
    self._timeout = timeout
    self.user_agent = user_agent
    # Basic auth is only configured when no token was supplied.
    if username and password and token is None:
        self._session.auth = (username, password)
    self._session.headers.update(
        {'User-Agent': user_agent, 'Content-Type': 'application/json'})
    if token:
        self._session.headers.update(
            {'Authorization': 'Bearer {}'.format(token)})
    if not verify:
        logger.warning('ZMON client will skip SSL verification!')
        requests.packages.urllib3.disable_warnings()
        self._session.verify = False
def __init__(self, schema, netloc, port, path, query, fragment, userinfo):
    """Assemble a quoted SplitResult from pre-split URL components."""
    self._strict = False
    if port:
        netloc = '{}:{}'.format(netloc, port)
    if userinfo:
        # Credentials are prepended as "user[:password]@host".
        quoted_user = yarl.quote(userinfo, safe='@:',
                                 protected=':', strict=False)
        netloc = quoted_user + '@' + netloc
    if path:
        path = yarl.quote(path, safe='@:', protected='/', strict=False)
    if query:
        query = yarl.quote(query, safe='=+&?/:@',
                           protected=yarl.PROTECT_CHARS,
                           qs=True, strict=False)
    if fragment:
        fragment = yarl.quote(fragment, safe='?/:@', strict=False)
    self._val = SplitResult(
        schema or '',  # scheme
        netloc=netloc,
        path=path,
        query=query,
        fragment=fragment)
    self._cache = {}
def human_repr(self):
    """Return a human-readable string form of this URL."""
    netloc = self._make_netloc(self.user, self.password,
                               self.host, self._val.port)
    return url_unsplit(
        SplitResult(self.scheme, netloc, self.path,
                    self.query_string, self.fragment))
def human_repr(self):
    """Return decoded human readable string for URL representation."""
    user = _human_quote(self.user, "#/:?@")
    password = _human_quote(self.password, "#/:?@")
    host = self.host
    if host:
        host = self._encode_host(self.host, human=True)
    path = _human_quote(self.path, "#?")
    # Re-assemble the query from decoded key/value pairs.
    pairs = [
        "{}={}".format(_human_quote(k, "#&+;="), _human_quote(v, "#&+;="))
        for k, v in self.query.items()
    ]
    query_string = "&".join(pairs)
    fragment = _human_quote(self.fragment, "")
    netloc = self._make_netloc(
        user, password, host, self._val.port, encode_host=False)
    return urlunsplit(
        SplitResult(self.scheme, netloc, path, query_string, fragment))
def delete(url):
    """Remove the cached response for ``url`` and related cache state.

    Besides the file for ``url`` itself, this may also drop the whole
    project cache (when the URL names a request being accepted) and —
    recursively — the cached variant of the same URL without its query
    string.
    """
    url = unquote(url)
    match, project = Cache.match(url)
    if match:
        path = Cache.path(url, project, include_file=True)
        # Rather then wait for last updated statistics to expire, remove the
        # project cache if applicable.
        if project:
            apiurl, _ = Cache.spliturl(url)
            if project.isdigit():
                # Clear target project cache upon request acceptance.
                project = osc.core.get_request(
                    apiurl, project).actions[0].tgt_project
            Cache.delete_project(apiurl, project)
        if os.path.exists(path):
            if conf.config['debug']:
                print('CACHE_DELETE', url, file=sys.stderr)
            os.remove(path)
    # Also delete version without query. This does not handle other
    # variations using different query strings. Handy for PUT with ?force=1.
    # (The recursion terminates because the rebuilt URL has an empty query.)
    o = urlsplit(url)
    if o.query != '':
        url_plain = SplitResult(o.scheme, o.netloc, o.path, '',
                                o.fragment).geturl()
        Cache.delete(url_plain)
def presign_v4(
    method,
    url,
    region,
    credentials,
    date,
    expires,
):
    """Do signature V4 of given presign request."""
    scope = _get_scope(date, region, "s3")
    canonical_request_hash, url = _get_presign_canonical_request_hash(
        method, url, credentials.access_key, scope, date, expires,
    )
    string_to_sign = _get_string_to_sign(date, scope, canonical_request_hash)
    signing_key = _get_signing_key(credentials.secret_key, date, region, "s3")
    signature = _get_signature(signing_key, string_to_sign)
    # Append the computed signature to the existing query string and
    # rebuild the SplitResult.
    components = list(url)
    components[3] = url.query + "&X-Amz-Signature=" + queryencode(signature)
    return SplitResult(*components)
def _urlsplit(url, scheme="", allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    netloc = query = fragment = ""
    i = url.find(":")
    if i > 0:
        # A scheme is recognised only when every character before the
        # colon is a valid scheme character.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i + 1:]
    if url[:2] == "//":
        netloc, url = _splitnetloc(url, 2)
        # A bracket without its partner means a malformed IPv6 host.
        if ("[" in netloc and "]" not in netloc) or ("]" in netloc and "[" not in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and "#" in url:
        url, fragment = url.split("#", 1)
    if "?" in url:
        url, query = url.split("?", 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    return _coerce_result(v)
def __init__(self, arg1051, arg1000, arg682, arg2280, arg192, arg1897, arg551):
    """Build a quoted SplitResult from pre-split URL components.

    arg1051: scheme, arg1000: netloc, arg682: port, arg2280: path,
    arg192: query, arg1897: fragment, arg551: userinfo.
    """
    self.attribute1287 = False
    if arg682:
        arg1000 += ':{}'.format(arg682)
    if arg551:
        arg1000 = (
            (yarl.quote(arg551, safe='@:', protected=':', strict=False) + '@')
            + arg1000)
    if arg2280:
        arg2280 = yarl.quote(arg2280, safe='@:', protected='/', strict=False)
    if arg192:
        arg192 = yarl.quote(arg192, safe='=+&?/:@',
                            protected=yarl.PROTECT_CHARS,
                            qs=True, strict=False)
    if arg1897:
        arg1897 = yarl.quote(arg1897, safe='?/:@', strict=False)
    # BUG FIX: the original referenced an undefined name `schema` here,
    # which raised NameError at runtime; the scheme parameter is arg1051.
    self.attribute914 = SplitResult((arg1051 or ''), netloc=arg1000,
                                    path=arg2280, query=arg192,
                                    fragment=arg1897)
    self.attribute233 = {}
def search_pagination_link_url(context, start):
    """
    This simple tag analyses the full path of the request (which caused that
    the template with this search_pagination_url tag was rendered). This tag
    returns the link of the search request with the given start parameter
    (the old one will be overridden, if the start parameter already exists).
    Except of the start parameter the returned link is the same as the
    search request url. This tag can be used to generate the links of the
    search pagination.

    :param context: the used context.
    :param start: the value, which the start parameter of the search request
        shall have.
    :return: the given url with the new given start parameter.
    """
    split = urlsplit(context['request'].get_full_path())
    params = parse_qs(split.query, strict_parsing=False)
    if 'search_phrase' not in params:
        raise ValueError(
            'The pagination link can only be computed for search result requests !'
        )
    # parse_qs wraps every value in a list; unwrap the single-valued ones.
    params['search_for'] = params['search_for'][0]
    params['search_phrase'] = params['search_phrase'][0]
    params['start'] = start
    return urlunsplit(
        SplitResult(scheme=split.scheme, netloc=split.netloc,
                    path=split.path, query=urlencode(params),
                    fragment=split.fragment))
def add_query(self, pair_list):
    """Merge (name, value) pairs into self.url's query string.

    Pairs whose name already exists replace the existing value in place;
    brand-new names are appended at the end, in pair_list order.
    """
    parts = urlsplit(self.url)
    existing_names = parse_qs(parts[3]).keys()
    current_pairs = parse_qsl(parts[3])
    # Names from pair_list that the URL doesn't have yet.
    appended = {name for name, _ in pair_list if name not in existing_names}
    merged = []
    for name, value in current_pairs:
        for new_name, new_value in pair_list:
            if new_name == name:
                value = new_value
        merged.append((name, value))
    merged.extend((name, value) for name, value in pair_list
                  if name in appended)
    self.url = urlunsplit(
        SplitResult(scheme=parts[0], netloc=parts[1], path=parts[2],
                    query=urlencode(merged), fragment=parts[4]))
def _urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    # NOTE(review): unlike the stdlib version, allow_fragments is not
    # coerced with bool() here; plain truthiness is relied upon below.
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        # A scheme is recognised only when every character before the
        # colon is a valid scheme character.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i + 1:]
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # A bracket without its partner means a malformed IPv6 host.
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    return _coerce_result(v)
def __new__(cls, val='', encoded=False):
    """Construct a URL from a string or an already-encoded SplitResult.

    Raises ValueError when a SplitResult is passed without encoded=True,
    and TypeError for any other input type.
    """
    if isinstance(val, cls):
        return val
    self = object.__new__(cls)
    if isinstance(val, str):
        val = url_split(val)
    elif isinstance(val, SplitResult):
        if not encoded:
            raise ValueError('Cannot apply decoding to SplitResult')
    else:
        raise TypeError(
            f'Constructor parameter should be str, got {val.__class__.__name__}'
        )
    if not encoded:
        if not val[1]:
            netloc = ''
        else:
            netloc = val.hostname
            if netloc is None:
                # Also fixes the "abolute" typo in the original message.
                raise ValueError(
                    'Invalid URL: host is required for absolute urls.')
            try:
                netloc.encode('ascii')
            except UnicodeEncodeError:
                # Non-ASCII hostname: encode with IDNA.
                netloc = netloc.encode('idna').decode('ascii')
            else:
                # BUG FIX: only ValueError from ip_address means "not an
                # IP literal"; the original bare `except:` also swallowed
                # KeyboardInterrupt/SystemExit and hid real bugs.
                try:
                    ip = ip_address(netloc)
                except ValueError:
                    pass
                else:
                    if ip.version == 6:
                        # Bare IPv6 hosts must be bracketed in the netloc.
                        netloc = f'[{netloc}]'
            val_port = val.port
            if val_port:
                netloc = f'{netloc}:{val_port}'
            val_username = val.username
            if val_username:
                user = quote(val_username)
                val_password = val.password
                if val_password:
                    user = f'{user}:{quote(val_password)}'
                netloc = f'{user}@{netloc}'
        val = SplitResult(val[0], netloc,
                          quote(val[2], safe='@:', protected='/'),
                          query=quote(val[3], safe='=+&?/:@',
                                      protected='=+&', query_string=True),
                          fragment=quote(val[4], safe='?/:@'))
    self._val = val
    self._cache = {}
    return self
def report_error(request: HttpRequest, user_profile: UserProfile,
                 message: str=REQ(), stacktrace: str=REQ(),
                 ui_message: bool=REQ(validator=check_bool),
                 user_agent: str=REQ(), href: str=REQ(), log: str=REQ(),
                 more_info: Mapping[str, Any]=REQ(validator=check_dict([]), default={}),
                 ) -> HttpResponse:
    """Accepts an error report and stores in a queue for processing.
    The actual error reports are later handled by do_report_error"""
    if not settings.BROWSER_ERROR_REPORTING:
        return json_success()
    # Copy so the shared REQ default dict is never mutated below.
    more_info = dict(more_info)

    js_source_map = get_js_source_map()
    if js_source_map:
        stacktrace = js_source_map.annotate_stacktrace(stacktrace)

    try:
        version: Optional[str] = subprocess.check_output(
            ["git", "show", "-s", "--oneline"],
            universal_newlines=True,
        )
    except (FileNotFoundError, subprocess.CalledProcessError):
        # Not a git checkout (or git unavailable); report without a version.
        version = None

    # Get the IP address of the request
    remote_ip = request.META['REMOTE_ADDR']

    # For the privacy of our users, we remove any actual text content
    # in draft_content (from drafts rendering exceptions). See the
    # comment on privacy_clean_markdown for more details.
    if more_info.get('draft_content'):
        more_info['draft_content'] = privacy_clean_markdown(more_info['draft_content'])

    if user_profile.is_authenticated:
        email = user_profile.delivery_email
        full_name = user_profile.full_name
    else:
        email = "*****@*****.**"
        full_name = "Anonymous User"

    queue_json_publish('error_reports', dict(
        type = "browser",
        report = dict(
            # SplitResult parses "host[:port]"; .hostname strips any port.
            host = SplitResult("", request.get_host(), "", "", "").hostname,
            ip_address = remote_ip,
            user_email = email,
            user_full_name = full_name,
            user_visible = ui_message,
            server_path = settings.DEPLOY_ROOT,
            version = version,
            user_agent = user_agent,
            href = href,
            message = message,
            stacktrace = stacktrace,
            log = log,
            more_info = more_info,
        ),
    ))

    return json_success()
def __init__(self, val='', *, encoded=False, strict=None):
    """Create a URL from ``val`` (str, URL, or pre-encoded SplitResult).

    When ``encoded`` is false the components are IDNA/percent-quoted;
    a SplitResult is only accepted together with ``encoded=True``.
    """
    if strict is not None:  # pragma: no cover
        warnings.warn("strict parameter is ignored")
    if isinstance(val, URL):
        # Copy construction: share the already-validated parts and cache.
        self._val = val._val
        self._cache = val._cache
        return
    if isinstance(val, str):
        val = urlsplit(val)
    elif isinstance(val, SplitResult):
        if not encoded:
            raise ValueError("Cannot apply decoding to SplitResult")
    else:
        raise TypeError("Constructor parameter should be str")
    if not encoded:
        if not val[1]:  # netloc
            netloc = ''
        else:
            netloc = val.hostname
            if netloc is None:
                # NOTE(review): "abolute" typo in this user-facing message.
                raise ValueError(
                    "Invalid URL: host is required for abolute urls.")
            try:
                netloc.encode('ascii')
            except UnicodeEncodeError:
                # Non-ASCII hostname: encode with IDNA.
                netloc = netloc.encode('idna').decode('ascii')
            else:
                try:
                    ip = ip_address(netloc)
                except ValueError:
                    pass  # not an IP literal; keep the hostname as-is
                else:
                    if ip.version == 6:
                        # Bare IPv6 hosts must be bracketed in the netloc.
                        netloc = '[' + netloc + ']'
            if val.port:
                netloc += ':{}'.format(val.port)
            if val.username:
                user = _quote(val.username)
            else:
                user = ''
            if val.password:
                user += ':' + _quote(val.password)
            if user:
                netloc = user + '@' + netloc
        path = _quote(val[2], safe='+@:', protected='/+')
        if netloc:
            # Absolute URLs get their path dot-segments resolved.
            path = _normalize_path(path)
        query = _quote(val[3], safe='=+&?/:@', protected=PROTECT_CHARS,
                       qs=True)
        fragment = _quote(val[4], safe='?/:@')
        val = SplitResult(val[0], netloc, path, query, fragment)
    self._val = val
    self._cache = {}
def process_url_query(text):
    """URL query filtering: normalize URLs whose query parameters differ
    but that point to the same page (drops the 'p' and 'ret' parameters,
    keeping only the first value of every remaining parameter)."""
    from urllib.parse import urlsplit, urlunsplit, SplitResult, parse_qs, urlencode
    parts = urlsplit(text)
    kept = {name: values[0]
            for name, values in parse_qs(parts.query).items()
            if name not in ["p", "ret"]}
    return urlunsplit(SplitResult(*parts[:3], urlencode(kept), *parts[4:]))
def origin(url):
    """
    Return a copy of ``url`` with the path, query string and fragment
    removed.

    ``url`` is assumed to be an HTTP(S) URL.
    """
    parts = urlsplit(url)
    # Keep only scheme + authority.
    return SplitResult(parts.scheme, parts.netloc, "", "", "").geturl()
def get_ui_url(path_extention):
    """Return the configured app URL with ``path_extention`` appended
    to its path; all other URL components are preserved."""
    base = urlsplit(conf.app.url)
    return urlunsplit(base._replace(path=base.path + path_extention))
def url(self):
    """Rebuild the full request URL from the (WSGI-style) environment."""
    forwarded_proto = self.get('HTTP_X_FORWARDED_PROTO')
    if forwarded_proto:
        protocol = forwarded_proto
    else:
        protocol = self.get('wsgi.url_scheme', 'http')
    # A proxy-supplied host wins over the plain Host header.
    host = self.get('HTTP_X_FORWARDED_HOST') or self.get('HTTP_HOST')
    parts = SplitResult(protocol, host, self.path,
                        self.get("QUERY_STRING"), '')
    return parts.geturl()