def filter_query_params(url: str, parsed: up.ParseResult) -> up.ParseResult:
    """
    Remove the following query params from an URL:

    * ``sid=\w+``: SIDs are mostly used by magento to track the users when cookies are disabled
    * ``s=\w{32}``: same as SID, but from vBulletin sites
    * ``replytocom=\d+``: used by wordpress when clicking on "answer" from a comment

    :param url: the url
    :return: the url without sid and the likes
    """
    def _is_tracking_param(key: str, value: str) -> bool:
        # magento SIDs, 32-char vBulletin session ids, wordpress replytocom
        return (key == 'sid'
                or (key == 's' and len(value) == 32)
                or key == 'replytocom')

    # Only touch the query when one of the markers is present: the
    # parse_qsl/urlencode round-trip is not always the identity and can
    # change escaping, e.g. 'a=%7E_%7E%3B' becomes 'a=~_~%3B'.
    if not any(marker in parsed.query for marker in ['s=', 'sid=', 'replytocom=']):
        return parsed
    kept = [(key, value)
            for key, value in up.parse_qsl(parsed.query)
            if not _is_tracking_param(key, value)]
    return parsed._replace(query=up.urlencode(kept))
def apply_transformation(cls, parsed_url: ParseResult) -> ParseResult:
    """Apply the actual transformation process to the url."""
    # Drop Facebook's click-tracking parameter; keep everything else,
    # including parameters that carry a blank value.
    params = parse_qs(parsed_url.query, keep_blank_values=True)
    params.pop("fbclid", None)
    return parsed_url._replace(query=urlencode(params, doseq=True))
def apply_transformation(cls, parsed_url: ParseResult) -> ParseResult:
    """Apply the actual transformation process to the url."""
    host = parsed_url.hostname
    assert host is not None  # mypy workaround
    # NOTE(review): .hostname is lowercased and drops any port/userinfo
    # present in the netloc — presumably never the case for wikipedia
    # links; confirm against callers.
    desktop_host = host.replace(".m.wikipedia.org", ".wikipedia.org")
    return parsed_url._replace(netloc=desktop_host)
def __pre_url(parsed_url: parse.ParseResult, page: int):
    """Set the page parameter of a url to the previous page"""
    # On the first page there is no previous page: stay where we are.
    target_page = page if page == 1 else page - 1
    new_query = APIPagination.__set_page(parsed_url.query, target_page)
    return parsed_url._replace(query=new_query)
def __next_url(parsed_url: parse.ParseResult, page: int, total_pages: int):
    """Set the page parameter of a url to the next page.

    If already on the last page, the page parameter is kept at the
    current (last) page instead of advancing past the end.
    """
    query = parsed_url.query
    if page == total_pages:
        # already at the end: keep the current page number
        new_query = APIPagination.__set_page(query, page)
    else:
        new_query = APIPagination.__set_page(query, page + 1)
    return parsed_url._replace(query=new_query)
def canonical_url(url):
    """Converts a string to a Cargo Canonical URL, as per
    https://github.com/rust-lang/cargo/blob/35c55a93200c84a4de4627f1770f76a8ad268a39/src/cargo/util/canonical_url.rs#L19
    """
    # Upstream cargo does not rewrite this prefix, but matching here does
    # not work well without it.
    url = url.replace('git+https://', 'https://')
    parts = urlparse(url)
    # Cargo appears to drop params, query and fragment entirely.
    parts = ParseResult(parts.scheme, parts.netloc, parts.path, None, None, None)
    parts = parts._replace(path=parts.path.rstrip('/'))
    if parts.netloc == 'github.com':
        # GitHub URLs are normalised to https and a lower-case path.
        parts = parts._replace(scheme='https', path=parts.path.lower())
    if parts.path.endswith('.git'):
        parts = parts._replace(path=parts.path[:-len('.git')])
    return parts
def canonical_url(url):
    """Converts a string to a Cargo Canonical URL, as per
    https://github.com/rust-lang/cargo/blob/35c55a93200c84a4de4627f1770f76a8ad268a39/src/cargo/util/canonical_url.rs#L19

    :param url: the URL string to canonicalise
    :return: a ``ParseResult`` holding the canonical form
    """
    logging.debug("canonicalising %s", url)
    # Hrm. The upstream cargo does not replace those URLs, but if we don't
    # then it doesn't work too well :(
    url = url.replace("git+https://", "https://")
    u = urlparse(url)
    # It seems cargo drops query and fragment
    u = ParseResult(u.scheme, u.netloc, u.path, None, None, None)
    u = u._replace(path=u.path.rstrip('/'))
    if u.netloc == "github.com":
        u = u._replace(scheme="https")
        u = u._replace(path=u.path.lower())
    if u.path.endswith(".git"):
        # BUG FIX: ParseResult is an immutable namedtuple — the previous
        # `u.path = ...` assignment raised AttributeError at runtime.
        u = u._replace(path=u.path[:-len(".git")])
    return u
def fix_freedesktop_org_url(parsed: ParseResult, branch: Optional[str], subpath: Optional[str]):
    """Rewrite anongit.freedesktop.org URLs to their gitlab.freedesktop.org home."""
    if parsed.netloc != "anongit.freedesktop.org":
        # Not a freedesktop anongit URL: nothing to fix.
        return None, None, None
    new_path = parsed.path
    if new_path.startswith("/git/"):
        # Drop the "/git" prefix but keep the leading slash.
        new_path = new_path[len("/git"):]
    fixed = parsed._replace(
        netloc="gitlab.freedesktop.org", scheme="https", path=new_path)
    return fixed, branch, subpath
def apply_transformation(cls, parsed_url: ParseResult) -> ParseResult:
    """Apply the actual transformation process to the url."""
    # Strip every Google-Analytics style "utm_*" tracking parameter,
    # keeping all other parameters (blank values included).
    params = parse_qs(parsed_url.query, keep_blank_values=True)
    kept = {name: values for name, values in params.items()
            if not name.startswith("utm_")}
    return parsed_url._replace(query=urlencode(kept, doseq=True))
def fix_twitter_url(url: str, parsed: up.ParseResult) -> Optional[up.ParseResult]:
    """
    (this method does nothing on URLs outside of the twitter.com domain).

    [Potentially] modify the twitter.com URLs by applying the following:

    - ignore sharing intents (e.g. ``/intent``, ``/share``);
    - replace http by https;
    - strip specific subdomains (e.g. ``mobile``, ``www``);
    - strip specific query parameters (e.g. ``lang``);

    :param parsed: a parsed URL
    :return: the fixed URL or None if the URL should be ignored
    """
    if 'twitter.com' not in parsed.netloc:
        return parsed

    # twitter.com/share and twitter.com/intent are both sharing endpoints
    if parsed.path.startswith(('/intent', '/share')):
        return None

    # force https
    if parsed.scheme != 'https':
        parsed = parsed._replace(scheme='https')

    # subdomains: delegate to the registered remap handler, if any
    if '.twitter.com' in parsed.netloc:
        sub = parsed.netloc.replace('.twitter.com', '')
        if sub in _twitter_remap:
            parsed = _twitter_remap[sub](parsed, sub)
    if parsed is None:
        return None

    # drop blacklisted query parameters
    pairs = up.parse_qsl(parsed.query)
    if pairs:
        kept = [(key, value) for key, value in pairs
                if key not in _twitter_qs_blacklist]
        parsed = parsed._replace(query=up.urlencode(kept))
    return parsed
def fix_path_in_port(parsed: ParseResult, branch: Optional[str], subpath: Optional[str]):
    """Fix URLs where the first path segment was glued on as a fake "port".

    e.g. ``host:project/repo`` — the non-numeric "port" is really the
    start of the path and is moved there.
    """
    netloc = parsed.netloc
    # No colon, or an IPv6 literal like [::1] — nothing to fix.
    if ":" not in netloc or netloc.endswith("]"):
        return None, None, None
    host, _, port = netloc.rpartition(":")
    # Only rewrite for known hosting sites (strip any user@ prefix first).
    if host.split("@")[-1] not in (KNOWN_GITLAB_SITES + ["github.com"]):
        return None, None, None
    # A real (numeric) or empty port is left alone.
    if not port or port.isdigit():
        return None, None, None
    merged_path = "%s/%s" % (port, parsed.path.lstrip("/"))
    return parsed._replace(path=merged_path, netloc=host), branch, subpath
def apply_transformation(cls, parsed_url: ParseResult) -> ParseResult:
    """Apply the actual transformation process to the url.

    This converts a url like https://youtu.be/asdf to
    https://www.youtube.com/watch?v=asdf (and retains any other query params).
    """
    video_id = parsed_url.path.strip("/")
    # Build the query with v= first, then any pre-existing parameters
    # (parse_qsl keeps blanks and preserves order).
    params = [("v", video_id)]
    params.extend(parse_qsl(parsed_url.query, keep_blank_values=True))
    return parsed_url._replace(
        netloc="www.youtube.com", path="/watch", query=urlencode(params))
def fix_fb_url(url: str, parsed: up.ParseResult) -> Optional[up.ParseResult]:
    """
    (this method does nothing on URLs outside of the facebook.com domain).

    [Potentially] modify the facebook.com URLs by applying the following:

    - ignore specific subdomains (e.g. ``login``, ``graph``);
    - replace http by https;
    - remap specific subdomains to www (e.g. languages such as ``de-de``,
      other such as ``m``, ``touch``);
    - extract and return the redirect URL targeted by ``l.facebook.com``;
    - strip off all query parameters (TODO)

    :param parsed: a parsed URL
    :return: the fixed URL or None if the URL should be ignored
    """
    if 'facebook.com' not in parsed.netloc:
        return parsed

    # force https
    if parsed.scheme != 'https':
        parsed = parsed._replace(scheme='https')

    # extract the subdomain, trimming the trailing dot if present
    subdomain = parsed.netloc.replace('facebook.com', '')
    if subdomain.endswith('.'):
        subdomain = subdomain[:-1]

    # dispatch to the registered handler, or the generic language remap
    if subdomain in _facebook_remap:
        parsed = _facebook_remap[subdomain](parsed, subdomain)
    elif re.match('[a-z]{2}-[a-z]{2}', subdomain):
        parsed = _fb_remap(parsed, subdomain)
    if parsed is None:
        return None

    # strip off all query parameters (TODO: really that clever ?)
    return parsed._replace(query='')
def path_as_href(path: str, into_url: Optional[ParseResult] = None) -> str:
    """Returns the string to use for referring to the given path in a file.

    This percent-encodes characters as necessary to make the path a valid
    URL. If into_url is provided, it copies every part of that URL except
    the path into the resulting URL. Note that if into_url contains a
    scheme or netloc, the given path must be absolute.

    :raises ValueError: if into_url has a scheme or netloc but path is relative.
    """
    urlpath = quote(path)
    if into_url:
        if (into_url.scheme or into_url.netloc) and not os.path.isabs(path):
            # FIX: the two f-string fragments previously joined without a
            # space ("...[path]into a URL...").
            raise ValueError(
                f'Cannot put a relative path [{path}] '
                f'into a URL with scheme or host/port [{into_url}]')
        return urlunparse(into_url._replace(path=urlpath))
    return urlpath
def build_return_url(redirect_uri: ParseResult, **params: Optional[str]) -> str:
    """Construct a return URL for a redirect.

    Parameters
    ----------
    redirect_uri : `urllib.parse.ParseResult`
        The parsed return URI from the client.
    **params : `str` or `None`
        Additional parameters to add to that URI to create the return URL.
        Any parameters set to `None` will be ignored.

    Returns
    -------
    return_url : `str`
        The return URL to which the user should be redirected.
    """
    # Start from the query the client already supplied, then append the
    # extra parameters, skipping any that are None.
    pairs = parse_qsl(redirect_uri.query) if redirect_uri.query else []
    pairs += [(name, value) for name, value in params.items() if value is not None]
    return redirect_uri._replace(query=urlencode(pairs)).geturl()
def to_pf_url(url: ParseResult):
    """
    Returns *P*ath and *F*ile as defined here:
    https://gist.github.com/andrewdotn/eebeaa60d48c3c0f6f9fc75f0ede8d03#proposal
    """
    # Blank out scheme and host so only path, query and fragment remain.
    stripped = url._replace(scheme="", netloc="")
    return urlunparse(stripped)
def apply_transformation(cls, parsed_url: ParseResult) -> ParseResult:
    """Apply the actual transformation process to the url."""
    # Canonicalise the host; path, query and fragment are left untouched.
    return parsed_url._replace(netloc="twitter.com")
def remove_target_url(url: ParseResult) -> ParseResult:
    """Return *url* with its ``targetUrl`` query parameter removed.

    All other query parameters are preserved.
    """
    params = parse_qs(url.query)
    # Default so a URL without targetUrl comes back unchanged instead of
    # raising KeyError (makes removal idempotent).
    params.pop('targetUrl', None)
    # FIX: doseq=True is required — parse_qs maps each key to a *list* of
    # values, and without it urlencode emits the literal list repr
    # (e.g. a=%5B%271%27%5D instead of a=1), mangling the remaining params.
    return url._replace(query=urlencode(params, doseq=True))
def set_target_url(url: ParseResult, targetUrl: str) -> ParseResult:
    """Return *url* with its ``targetUrl`` query parameter set to *targetUrl*.

    Any existing targetUrl value is replaced; other query parameters are
    preserved.
    """
    params = parse_qs(url.query)
    params['targetUrl'] = targetUrl
    # FIX: doseq=True is required — parse_qs maps each key to a *list* of
    # values, and without it urlencode emits the literal list repr
    # (e.g. a=%5B%271%27%5D instead of a=1), mangling the existing params.
    # doseq also encodes the plain-string targetUrl value correctly.
    return url._replace(query=urlencode(params, doseq=True))
def __last_url(parsed_url: parse.ParseResult, total_pages: int) -> parse.ParseResult:
    """Set the page parameter of a url to the last page"""
    updated_query = APIPagination.__set_page(parsed_url.query, total_pages)
    return parsed_url._replace(query=updated_query)
def __first_url(parsed_url: parse.ParseResult, first_page: int = 1) -> parse.ParseResult:
    """Set the page parameter of a url to the first page"""
    updated_query = APIPagination.__set_page(parsed_url.query, first_page)
    return parsed_url._replace(query=updated_query)