def clean_url(url):
    """Return *url* with the tracking query parameters 'ixid' and 's' removed.

    All other URL components (scheme, host, path, params, fragment) and the
    order of the remaining query parameters are preserved.
    """
    parts = urlparse(url)
    kept_params = [
        (name, value)
        for name, value in parse_qsl(parts.query)
        if name not in ['ixid', 's']
    ]
    return urlunparse((
        parts.scheme,
        parts.netloc,
        parts.path,
        parts.params,
        urlencode(kept_params),
        parts.fragment,
    ))
def extract_doi(url):
    """Scan a parsed URL for a DOI using the module-level pattern ``regex``.

    The path is checked first, then each query-string value in order.
    Returns the first match as a string, or None if nothing matches.
    """
    candidates = [url.path]
    candidates.extend(value for _, value in parse_qsl(url.query))
    for text in candidates:
        found = regex.search(text)
        if found:
            return found.group(0)
    return None
def extract_doi(url):
    """Look for a DOI in a parsed URL via the module-level compiled ``regex``.

    Tries the URL path before falling back to the query-string values;
    returns the matched text, or None when no component contains a DOI.
    """
    hit = regex.search(url.path)
    if hit is not None:
        return hit.group(0)
    for _key, value in parse_qsl(url.query):
        hit = regex.search(value)
        if hit is not None:
            return hit.group(0)
    return None
def clean_url(url):
    """Drop the 'ixid' and 's' tracking keys from the query string of *url*."""
    p = urlparse(url)
    filtered = []
    for key, value in parse_qsl(p.query):
        # skip known tracking parameters, keep everything else in order
        if key in ['ixid', 's']:
            continue
        filtered.append((key, value))
    new_query = urlencode(filtered)
    return urlunparse((p.scheme, p.netloc, p.path, p.params, new_query, p.fragment))
def parse_url(url_string, google_hostname):
    """Resolve a Google redirect link to its target URL.

    If *url_string* points at the redirect endpoint (module-level
    ``redirect_path``) on *google_hostname* — or is host-relative — the real
    destination is read from the ``q`` query parameter.  Any other URL is
    returned untouched.  Returns None when *url_string* is None.
    """
    # sanity check
    if url_string is None:
        return url_string

    # normal case
    parsed_url = urlparse(url_string)
    if (parsed_url.netloc in [google_hostname, '']
            and parsed_url.path == redirect_path):
        query = dict(parse_qsl(parsed_url.query))
        # BUGFIX: a malformed redirect link without a 'q' parameter used to
        # raise KeyError; fall back to the original URL instead.
        return query.get('q', url_string)
    return url_string
def parse_url(url_string, google_hostname):
    """Unwrap a Google redirect URL, returning the destination it points to.

    A URL is treated as a redirect when its host is *google_hostname* (or
    empty, i.e. host-relative) and its path equals the module-level
    ``redirect_path``; the destination then comes from the ``q`` query
    parameter.  Non-redirect URLs pass through unchanged, and a None input
    yields None.
    """
    # sanity check
    if url_string is None:
        return url_string

    # normal case
    parsed_url = urlparse(url_string)
    if (parsed_url.netloc in [google_hostname, '']
            and parsed_url.path == redirect_path):
        query = dict(parse_qsl(parsed_url.query))
        # BUGFIX: query['q'] raised KeyError on redirect links missing the
        # 'q' parameter; return the original URL in that case instead.
        return query.get('q', url_string)
    return url_string
def on_result(request, search, result):
    """Remove tracker query parameters from a search result's URL.

    Parameters whose names match any pattern in the module-level ``regexes``
    are dropped from ``result['parsed_url']``; ``result['url']`` is rebuilt
    only when at least one parameter was removed.  Always returns True so
    the result stays in the result list.
    """
    if 'parsed_url' not in result:
        return True

    query = result['parsed_url'].query
    if query == "":
        return True

    original_pairs = parse_qsl(query)
    kept_pairs = [
        (name, value)
        for name, value in original_pairs
        if not any(pattern.match(name) for pattern in regexes)
    ]
    if len(kept_pairs) != len(original_pairs):
        result['parsed_url'] = result['parsed_url']._replace(query=urlencode(kept_pairs))
        result['url'] = urlunparse(result['parsed_url'])
    return True
def image_url_cleanup(url_string):
    """Normalize a Bing image-proxy URL to a canonical www.bing.com form.

    URLs on a ``*.bing4.com`` host with path ``/th`` are rewritten to
    ``https://www.bing.com/th?id=<id>``; every other URL is returned as-is.
    """
    parsed_url = urlparse(url_string)
    if parsed_url.netloc.endswith('bing4.com') and parsed_url.path == '/th':
        query = dict(parse_qsl(parsed_url.query))
        image_id = query.get('id')
        # BUGFIX: concatenating a missing 'id' (None) raised TypeError;
        # leave the URL untouched when there is no id parameter.
        if image_id is not None:
            return "https://www.bing.com/th?id=" + image_id
    return url_string
def url_cleanup(url_string):
    """Unwrap Bing news 'apiclick' redirect links to their real target.

    Returns the value of the ``url`` query parameter for redirect links
    (None if that parameter is absent); any other URL passes through
    unchanged.
    """
    parsed = urlparse(url_string)
    is_redirect = (
        parsed.netloc == 'www.bing.com'
        and parsed.path == '/news/apiclick.aspx'
    )
    if not is_redirect:
        return url_string
    params = dict(parse_qsl(parsed.query))
    return params.get('url', None)
def image_url_cleanup(url_string):
    """Rewrite a Bing image thumbnail URL to the canonical bing.com form.

    A URL whose host ends in ``bing4.com`` with path ``/th`` becomes
    ``https://www.bing.com/th?id=<id>``; other URLs are returned unchanged.
    """
    parsed_url = urlparse(url_string)
    if parsed_url.netloc.endswith('bing4.com') and parsed_url.path == '/th':
        query = dict(parse_qsl(parsed_url.query))
        thumb_id = query.get('id')
        # BUGFIX: "..." + None raised TypeError when the 'id' parameter was
        # missing; fall back to returning the input URL untouched.
        if thumb_id is not None:
            return "https://www.bing.com/th?id=" + thumb_id
    return url_string
def url_cleanup(url_string):
    """Resolve a Bing news redirect link to the URL it forwards to.

    Non-redirect URLs are returned untouched; a redirect link without a
    ``url`` query parameter yields None.
    """
    parsed_url = urlparse(url_string)
    if parsed_url.netloc != 'www.bing.com':
        return url_string
    if parsed_url.path != '/news/apiclick.aspx':
        return url_string
    query = dict(parse_qsl(parsed_url.query))
    return query.get('url', None)