def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http':  # optimize the common case
            scheme = url[:i].lower()
            url = url[i + 1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                if (('[' in netloc and ']' not in netloc) or
                        (']' in netloc and '[' not in netloc)):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            rest = url[i + 1:]
            if not rest or any(c not in '0123456789' for c in rest):
                # not a port number
                scheme, url = url[:i].lower(), rest

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v
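# A quick sanity check of the 5-tuple shape urlsplit() produces. This is a
# minimal sketch, assuming the copy above behaves like the standard library's
# urllib.parse.urlsplit(); the sample URL is illustrative only.
if __name__ == "__main__":
    from urllib.parse import urlsplit as stdlib_urlsplit
    parts = stdlib_urlsplit('http://user@www.example.com:80/path?q=1#frag')
    # Expect: scheme='http', netloc='user@www.example.com:80',
    #         path='/path', query='q=1', fragment='frag'
    print(parts)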
def url_parse(url):
    clear_cache()
    urlparse(url)
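# Why url_parse() clears the cache first: urllib.parse memoizes results in
# _parse_cache (see urlsplit() above), so a repeated call with the same URL is
# answered from the cache and the tracer would observe almost none of the
# parsing code running. A minimal sketch, assuming the standard library's
# clear_cache() helper:
if __name__ == "__main__":
    import urllib.parse
    urllib.parse.clear_cache()
    first = urllib.parse.urlparse('http://www.example.com/')
    second = urllib.parse.urlparse('http://www.example.com/')  # likely a cache hit
    print(first == second)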
URLS_X = URLS + ['ftp://freebsd.org/releases/5.8']

if __name__ == "__main__":
    url_grammar = recover_grammar(url_parse, URLS_X, files=['urllib/parse.py'])

if __name__ == "__main__":
    syntax_diagram(url_grammar)

if __name__ == "__main__":
    clear_cache()
    with Tracer(URLS_X[0]) as tracer:
        urlparse(tracer.my_input)
    for i, t in enumerate(tracer.trace):
        if t[0] in {'call', 'line'} and 'parse.py' in str(t[2]) and t[3]:
            print(i, t[2]._t()[1], t[3:])

# ## Grammar Miner with Reassignment

if __name__ == "__main__":
    print('\n## Grammar Miner with Reassignment')
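# Why reassignment matters for the miner: inside urlsplit(), the variable url
# is overwritten several times as the scheme, netloc, fragment, and query are
# stripped off it, so its final value alone cannot explain every input
# fragment it held. A minimal sketch of those successive values, using plain
# string operations that only mimic urlsplit() (illustrative, not the real
# parsing logic):
if __name__ == "__main__":
    url = 'http://www.example.com/path#frag'
    scheme, url = url.split(':', 1)      # url is now '//www.example.com/path#frag'
    netloc, url = url[2:].split('/', 1)  # url is now 'path#frag'
    url, fragment = url.split('#', 1)    # url is now 'path'
    print(scheme, netloc, url, fragment)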
if len(sys.argv) == 1:
    print('Usage: time_urlparse_file <filename>')
    exit(0)

filename = sys.argv[1]

total_url_count = 0
total_urllib = 0
total_f = 0
total_fc = 0
total_fcb = 0

curlparse.clear_cache()
urlparse_fast.clear_cache()
urlparse_urllib.clear_cache()

start_all = time.time()
for url in open(filename, 'r'):
    url_bytes = url.encode('utf-8')
    total_url_count += 1

    start = time.time()
    urlparse_fast.urlparse(url)
    total_f += time.time() - start

    start = time.time()
    curlparse.urlparse(url)
    total_fc += time.time() - start

    start = time.time()
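# The loop above accumulates per-implementation wall-clock totals by timing
# each parse call individually. A minimal, self-contained sketch of the same
# measurement pattern, assuming only the standard library's urllib.parse (the
# curlparse / urlparse_fast / urlparse_urllib modules compared in the script
# are not needed for the sketch):
if __name__ == "__main__":
    import time
    import urllib.parse

    urls = ['http://www.example.com/', 'ftp://freebsd.org/releases/5.8']
    total = 0.0
    for u in urls:
        start = time.perf_counter()
        urllib.parse.urlparse(u)
        total += time.perf_counter() - start
    print('%d URLs parsed in %.6f seconds' % (len(urls), total))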