def dash_R_cleanup(fs, ps, pic):
    import gc, copy_reg
    import _strptime, linecache, dircache
    import urlparse, urllib, urllib2, mimetypes, doctest
    import struct, filecmp
    from distutils.dir_util import _path_created

    # Restore some original values.
    warnings.filters[:] = fs
    copy_reg.dispatch_table.clear()
    copy_reg.dispatch_table.update(ps)
    sys.path_importer_cache.clear()
    sys.path_importer_cache.update(pic)

    # Clear assorted module caches.
    _path_created.clear()
    re.purge()
    _strptime._regex_cache.clear()
    urlparse.clear_cache()
    urllib.urlcleanup()
    urllib2.install_opener(None)
    dircache.reset()
    linecache.clearcache()
    mimetypes._default_mime_types()
    struct._cache.clear()
    filecmp._cache.clear()
    doctest.master = None

    # Collect cyclic trash.
    gc.collect()

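# Hedged usage sketch (not from the snippet above): dash_R_cleanup expects snapshots
# of interpreter state taken before the test run; the restore logic above implies the
# arguments look roughly like this.
import sys, copy_reg, warnings

fs = warnings.filters[:]                # saved warning filters
ps = copy_reg.dispatch_table.copy()     # saved copy_reg/pickle dispatch table
pic = sys.path_importer_cache.copy()    # saved import path cache

# ... run the test here ...

dash_R_cleanup(fs, ps, pic)             # restore the snapshots and purge module caches
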
def custom_scheme_redirect(url_redirect):
    # urlparse.urlsplit doesn't currently handle custom schemes,
    # which we want our callback URLs to support so mobile apps can register
    # their own callback scheme handlers.
    # See http://bugs.python.org/issue9374
    # and http://stackoverflow.com/questions/1417958/parse-custom-uris-with-urlparse-python
    scheme = urlparse.urlsplit(url_redirect)[0]
    scheme_lists = [urlparse.uses_netloc, urlparse.uses_query, urlparse.uses_fragment,
                    urlparse.uses_params, urlparse.uses_relative]
    scheme_lists_modified = []

    # Modify urlparse's internal scheme lists so it properly handles custom schemes
    if scheme:
        for scheme_list in scheme_lists:
            if scheme not in scheme_list:
                scheme_list.append(scheme)
                scheme_lists_modified.append(scheme_list)

    # Clear cache before re-parsing url_redirect
    urlparse.clear_cache()

    # Grab flask/werkzeug redirect result
    redirect_result = redirect(url_redirect)

    # Restore previous urlparse scheme list
    for scheme_list in scheme_lists_modified:
        scheme_list.remove(scheme)

    return redirect_result

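# Illustrative sketch (assumptions: a Flask app object and the custom_scheme_redirect
# helper above are importable; 'myapp://' is a made-up scheme registered by a mobile
# client). Shows where the helper would typically be called.
from flask import Flask

app = Flask(__name__)

@app.route('/oauth/callback')
def oauth_callback():
    # The helper temporarily registers 'myapp' in urlparse's scheme lists so the
    # redirect target is parsed correctly (see the comments in the function above).
    return custom_scheme_redirect('myapp://auth-done?code=abc123')
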
def url_is_acceptable(self, url):
    parsed = urlparse.urlparse(url)
    # Work-around a nasty bug. urlparse() caches parsed results and returns them on future calls,
    # and if the cache isn't cleared here, then a unicode string gets added to the cache, which
    # freaks out cherrypy when it independently calls urlparse() with the same URL later.
    urlparse.clear_cache()
    return parsed[0] in self.allowed_schemes

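# Minimal hedged sketch of how url_is_acceptable might be wired up; the class name and
# the allowed_schemes value are assumptions, only the method above comes from the source.
import urlparse

class RedirectValidator(object):
    allowed_schemes = ('http', 'https')
    url_is_acceptable = url_is_acceptable   # reuse the function defined above as a method

validator = RedirectValidator()
print validator.url_is_acceptable('http://example.com/next')   # True
print validator.url_is_acceptable('javascript:alert(1)')       # False
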
def urlsplit(url, scheme="", allow_fragments=True): """Parse a URL into 5 components: <scheme>://<netloc>/<path>?<query>#<fragment> Return a 5-tuple: (scheme, netloc, path, query, fragment). Note that we don't break the components up in smaller bits (e.g. netloc is a single string) and we don't expand % escapes.""" allow_fragments = bool(allow_fragments) key = url, scheme, allow_fragments, type(url), type(scheme) cached = _parse_cache.get(key, None) if cached: return cached if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth clear_cache() netloc = query = fragment = "" i = url.find(":") if i > 0: if url[:i] == "http": # optimize the common case scheme = url[:i].lower() url = url[i + 1 :] if url[:2] == "//": netloc, url = _splitnetloc(url, 2) if ("[" in netloc and "]" not in netloc) or ( "]" in netloc and "[" not in netloc ): raise ValueError("Invalid IPv6 URL") if allow_fragments and "#" in url: url, fragment = url.split("#", 1) if "?" in url: url, query = url.split("?", 1) v = SplitResult(scheme, netloc, url, query, fragment) _parse_cache[key] = v return v for c in url[:i]: if c not in scheme_chars: break else: # make sure "url" is not actually a port number (in which case # "scheme" is really part of the path) rest = url[i + 1 :] if not rest or any(c not in "0123456789" for c in rest): # not a port number scheme, url = url[:i].lower(), rest if url[:2] == "//": netloc, url = _splitnetloc(url, 2) if ("[" in netloc and "]" not in netloc) or ( "]" in netloc and "[" not in netloc ): raise ValueError("Invalid IPv6 URL") if allow_fragments and "#" in url: url, fragment = url.split("#", 1) if "?" in url: url, query = url.split("?", 1) v = SplitResult(scheme, netloc, url, query, fragment) _parse_cache[key] = v return v
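# Hedged illustration using the stdlib urlparse module (assumed to match the urlsplit
# shown above): repeated splits of the same URL are served from _parse_cache until
# clear_cache() empties it or MAX_CACHE_SIZE forces a reset.
import urlparse

first = urlparse.urlsplit('http://example.com/a?x=1#frag')
again = urlparse.urlsplit('http://example.com/a?x=1#frag')
assert again is first                            # cache hit: the very same SplitResult object
urlparse.clear_cache()
fresh = urlparse.urlsplit('http://example.com/a?x=1#frag')
assert fresh == first and fresh is not first     # equal value, but a new object after the purge
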
def dash_R_cleanup(fs, ps, pic, zdc, abcs):
    import gc, copy_reg
    import _strptime, linecache
    dircache = test_support.import_module('dircache', deprecated=True)
    import urlparse, urllib, urllib2, mimetypes, doctest
    import struct, filecmp
    from distutils.dir_util import _path_created

    # Clear the warnings registry, so they can be displayed again
    for mod in sys.modules.values():
        if hasattr(mod, '__warningregistry__'):
            del mod.__warningregistry__

    # Restore some original values.
    warnings.filters[:] = fs
    copy_reg.dispatch_table.clear()
    copy_reg.dispatch_table.update(ps)
    sys.path_importer_cache.clear()
    sys.path_importer_cache.update(pic)
    try:
        import zipimport
    except ImportError:
        pass  # Run unmodified on platforms without zipimport support
    else:
        zipimport._zip_directory_cache.clear()
        zipimport._zip_directory_cache.update(zdc)

    # clear type cache
    sys._clear_type_cache()

    # Clear ABC registries, restoring previously saved ABC registries.
    for abc, registry in abcs.items():
        abc._abc_registry = registry.copy()
        abc._abc_cache.clear()
        abc._abc_negative_cache.clear()

    # Clear assorted module caches.
    _path_created.clear()
    re.purge()
    _strptime._regex_cache.clear()
    urlparse.clear_cache()
    urllib.urlcleanup()
    urllib2.install_opener(None)
    dircache.reset()
    linecache.clearcache()
    mimetypes._default_mime_types()
    filecmp._cache.clear()
    struct._clearcache()
    doctest.master = None

    # Collect cyclic trash.
    gc.collect()

def dash_R_cleanup(fs, ps, pic, abcs):
    import gc, copy_reg
    import _strptime, linecache
    dircache = test_support.import_module('dircache', deprecated=True)
    import urlparse, urllib, urllib2, mimetypes, doctest
    import struct, filecmp
    from distutils.dir_util import _path_created

    # Clear the warnings registry, so they can be displayed again
    for mod in sys.modules.values():
        if hasattr(mod, '__warningregistry__'):
            del mod.__warningregistry__

    # Restore some original values.
    warnings.filters[:] = fs
    copy_reg.dispatch_table.clear()
    copy_reg.dispatch_table.update(ps)
    sys.path_importer_cache.clear()
    sys.path_importer_cache.update(pic)

    # clear type cache
    sys._clear_type_cache()

    # Clear ABC registries, restoring previously saved ABC registries.
    for abc, registry in abcs.items():
        abc._abc_registry = registry.copy()
        abc._abc_cache.clear()
        abc._abc_negative_cache.clear()

    # Clear assorted module caches.
    _path_created.clear()
    re.purge()
    _strptime._regex_cache.clear()
    urlparse.clear_cache()
    urllib.urlcleanup()
    urllib2.install_opener(None)
    dircache.reset()
    linecache.clearcache()
    mimetypes._default_mime_types()
    filecmp._cache.clear()
    struct._clearcache()
    doctest.master = None

    if _llvm:
        code_types = (types.CodeType, types.FunctionType, types.MethodType)
        for obj in gc.get_objects():
            if isinstance(obj, code_types):
                _llvm.clear_feedback(obj)

    # Collect cyclic trash.
    gc.collect()

def _safe_urlsplit(s):
    """the urlparse.urlsplit cache breaks if it contains unicode and
    we cannot control that.  So we force type cast that thing back
    to what we think it is.
    """
    rv = urlparse.urlsplit(s)
    # we have to check rv[2] here and not rv[1] as rv[1] will be
    # an empty bytestring in case no domain was given.
    if type(rv[2]) is not type(s):
        assert hasattr(urlparse, 'clear_cache')
        urlparse.clear_cache()
        rv = urlparse.urlsplit(s)
        assert type(rv[2]) is type(s)
    return rv

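# Hedged usage sketch (assumes _safe_urlsplit above and `import urlparse` are in scope):
# whichever string type goes in, the components come back as that same type, even if the
# other type was parsed (and cached) first.
str_parts = _safe_urlsplit('http://example.com/index.html')    # native byte string
uni_parts = _safe_urlsplit(u'http://example.com/index.html')   # unicode string
assert type(str_parts[2]) is str                               # path matches the input type
assert type(uni_parts[2]) is unicode
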
def cleanup():
    import _strptime, urlparse, warnings, dircache
    from distutils.dir_util import _path_created
    _path_created.clear()
    warnings.filters[:] = fs
    gc.collect()
    sre.purge()
    _strptime._regex_cache.clear()
    urlparse.clear_cache()
    copy_reg.dispatch_table.clear()
    copy_reg.dispatch_table.update(ps)
    sys.path_importer_cache.clear()
    sys.path_importer_cache.update(pic)
    dircache.reset()

def trace_memory_clean_caches(self):
    """ Avoid polluting results with some builtin python caches """

    urlparse.clear_cache()
    re.purge()
    linecache.clearcache()
    copy_reg.clear_extension_cache()

    if hasattr(fnmatch, "purge"):
        fnmatch.purge()  # pylint: disable=no-member
    elif hasattr(fnmatch, "_purge"):
        fnmatch._purge()

    if hasattr(encodings, "_cache") and len(encodings._cache) > 0:
        encodings._cache = {}

    context.log.handler.flush()

def test_urlparse(self):
    """
    For a given URL, L{http.urlparse} should behave the same as L{urlparse},
    except it should always return C{str}, never C{unicode}.
    """
    def urls():
        for scheme in ('http', 'https'):
            for host in ('example.com',):
                for port in (None, 100):
                    for path in ('', 'path'):
                        if port is not None:
                            host = host + ':' + str(port)
                        yield urlunsplit((scheme, host, path, '', ''))

    def assertSameParsing(url, decode):
        """
        Verify that C{url} is parsed into the same objects by both
        L{http.urlparse} and L{urlparse}.
        """
        urlToStandardImplementation = url
        if decode:
            urlToStandardImplementation = url.decode('ascii')
        standardResult = urlparse(urlToStandardImplementation)
        scheme, netloc, path, params, query, fragment = http.urlparse(url)
        self.assertEqual(
            (scheme, netloc, path, params, query, fragment),
            standardResult)
        self.assertTrue(isinstance(scheme, str))
        self.assertTrue(isinstance(netloc, str))
        self.assertTrue(isinstance(path, str))
        self.assertTrue(isinstance(params, str))
        self.assertTrue(isinstance(query, str))
        self.assertTrue(isinstance(fragment, str))

    # With caching, unicode then str
    clear_cache()
    for url in urls():
        assertSameParsing(url, True)
        assertSameParsing(url, False)

    # With caching, str then unicode
    clear_cache()
    for url in urls():
        assertSameParsing(url, False)
        assertSameParsing(url, True)

    # Without caching
    for url in urls():
        clear_cache()
        assertSameParsing(url, True)
        clear_cache()
        assertSameParsing(url, False)

def processURL(hypeIndex, urlToGet, domains, id):

    def isExtensionOkay(u):
        if u.endswith('.htm') or u.endswith('.html'):
            return True
        else:
            return False

    def getIndexableContent(soup):
        contents = []
        allTags = soup.findAll(id='body')
        soup = BeautifulSoup(str(allTags[0]))
        allTags = soup.findAll()
        # Try and find the indexable contents
        for tag in allTags:
            for item in tag.contents:
                # Looking for leaf nodes
                if not hasattr(item, 'contents'):
                    if item.__class__ == NavigableString:
                        content = str(item).strip()
                        if content:
                            contents.append(content)
        contents = " ".join([str(s) for s in contents])
        contents = re.sub(entityRE, "", contents)
        return contents

    def getTitle(soup):
        title = soup.find('title')
        if title:
            return title.string
        else:
            return ''

    def getLinkedPages(soup, u, domains):
        newPaths = []
        anchors = soup.findAll('a')
        for a in anchors:
            try:
                href = a['href']
            except KeyError:
                continue
            scheme, host, port, path = my_parse(href)
            if scheme in ('http', 'https', '') and host in domains:
                if path == '' or path[0] != '/':
                    # relative path
                    pathList = u.pathList()[:-1]
                    currpath = '/'.join(pathList)
                    if currpath:
                        currpath = '/' + currpath
                    path = currpath + '/' + path
                    path = n_url.normURLPath(path)
                args = n_url.URL.fromString(path).queryList()
                path = '/' + '/'.join(n_url.URL.fromString(path).pathList())
                query = ''
                for arg in args:
                    if arg[0] in ['page']:
                        query = '?page=%s' % arg[1]
                path = path.encode('ascii')
                path = urllib.quote(path) + query.encode('ascii')
                newPaths.append(path)
            else:
                # print '** Ignore', href
                pass
        return newPaths

    def getSectionAndSummary(soup):
        if id is None:
            return 'any', ''
        summary = soup.findAll('div', attrs={'id': id})
        text = summary[0].findAll(lambda tag: hasattr(tag, 'string') and tag.string is not None)
        #for t in text:
            #if t.name in ['h1','h2','h3','h4','strong']:
                #print '***',t.string
            #else:
                #print '---',t.string
        if text:
            summary = ' .'.join([t.string for t in text])
            section = 'any'
            summary = re.sub('\s+', ' ', summary)
            #print 'storing', section, ',',summary
            return section, summary[:300]
        return 'any', ''

    def gotPage(page, factory):
        u = n_url.URL.fromString(factory.url)
        if not page.startswith('<!DOCTYPE'):
            # Don't like the look of this url so I won't try and process it
            return factory.url, []
        soup = BeautifulSoup(page)
        title = getTitle(soup)
        content = getIndexableContent(soup)
        newPaths = getLinkedPages(soup, u, domains)
        section, summary = getSectionAndSummary(soup)
        #print '****'
        #print '>> URL', factory.url
        #print '>> content', content
        args = u.queryList()
        query = ''
        for arg in args:
            if arg[0] in ['page']:
                query = '?page=%s' % arg[1]
        key = '/' + '/'.join(u.pathList()) + query
        if query == '':
            hypeIndex.addDocument(key, title, section, summary, content)
        return key, newPaths

    urlparse.clear_cache()
    factory = getPage(urlToGet)
    d = factory.deferred
    d.addCallback(gotPage, factory)
    return d

import os
import time
import urlparse

import jsonrpclib
jsonrpclib.config.version = 1.0
import jsonrpclib.SimpleJSONRPCServer

import web

from decoder import Mark4, Mark5B
import scp
import config

urlparse.uses_relative.append('scp')
urlparse.uses_netloc.append('scp')
urlparse.uses_params.append('scp')
urlparse.clear_cache()

def vex2time(str):
    tupletime = time.strptime(str, "%Yy%jd%Hh%Mm%Ss")
    return time.mktime(tupletime)

def time2vex(secs):
    tupletime = time.gmtime(secs)
    return time.strftime("%Yy%jd%Hh%Mm%Ss", tupletime)

os.environ['TZ'] = 'UTC'
time.tzset()

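# Illustrative sketch (not from the source; the host and file names are made up):
# registering 'scp' in uses_relative/uses_netloc above is what lets urljoin resolve
# relative references against scp:// URLs; clear_cache() drops results cached before
# the lists were patched.
base = 'scp://recorder.example/data/scan001.m5b'
print urlparse.urljoin(base, 'scan002.m5b')
# -> 'scp://recorder.example/data/scan002.m5b' once 'scp' is registered;
#    with an unregistered scheme urljoin would return just 'scan002.m5b'.
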
if len(sys.argv) == 1:
    print('Usage: time_urlparse_file <filename>')
    exit(0)

filename = sys.argv[1]

total_url_count = 0
total_urllib = 0
total_f = 0
total_fc = 0
total_fcb = 0

curlparse.clear_cache()
urlparse_fast.clear_cache()
urlparse_urllib.clear_cache()

start_all = time.time()
for url in open(filename, 'r'):
    url_bytes = url.encode('utf-8')
    total_url_count += 1

    start = time.time()
    urlparse_fast.urlparse(url)
    total_f += time.time() - start

    start = time.time()
    curlparse.urlparse(url)
    total_fc += time.time() - start