def urlretrieve(self, url, filename, reporthook, ssl_ignore_cert=False):
    """Download *url* into *filename*, reporting progress on the console.

    Parameters:
        url: the URL to retrieve.
        filename: path of the file the downloaded bytes are written to.
        reporthook: callable invoked as reporthook(blocknum, block_size,
            total_size) before the first read and after every block
            written; total_size is -1 when the server sent no
            Content-Length.
        ssl_ignore_cert: when True, TLS certificate verification is
            skipped.

    Returns:
        A (filename, headers) tuple, where headers is the HTTPMessage
        returned by the server.

    Raises:
        ContentTooShortError: when fewer bytes than the advertised
            Content-Length were received.
    """
    if ssl_ignore_cert:
        # Deliberately disable certificate checking (CPython's private
        # but documented escape hatch).
        ssl_ctx = ssl._create_unverified_context()
    else:
        # Let the library do the certificate validation work.
        ssl_ctx = None
    msg = 'Opening %s ...' % (url, )
    print(msg, end='\r')
    with contextlib.closing(urlopen(url, None, context=ssl_ctx)) as fp:
        # Blank out the "Opening ..." status line once connected.
        print('%*s' % (len(msg), '', ), end='\r')
        headers = fp.info()
        with open(filename, 'wb') as tfp:
            result = filename, headers
            bs = 1024 * 8
            size = -1
            read = 0
            blocknum = 0
            # HTTPMessage lookups are case-insensitive, so either
            # spelling of the header name resolves.
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            reporthook(blocknum, bs, size)
            while True:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                reporthook(blocknum, bs, size)
    if size >= 0 and read < size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (read, size), result)
    return result
def safe_retrieve(url, filename=None, reporthook=None, data=None, maxtries=5, r_range=None):
    """Retrieve *url* into *filename* (or a new temp file), retrying on
    short reads.

    Parameters:
        url: the URL to retrieve.
        filename: destination path; when None a NamedTemporaryFile is
            created and registered in _url_tempfiles.
        reporthook: optional callable reporthook(blocknum, block_size,
            total_size), called before the first read and after every
            block written.
        data: optional POST data forwarded to the request.
        maxtries: number of retries on a short read; -1 means retry
            forever, 0 means fail immediately.
        r_range: optional (start, end) byte range to request.

    Returns:
        A (filename, headers) tuple; for file:// URLs with no explicit
        filename, (local_path, headers) without copying.

    Raises:
        ValueError: on invalid maxtries or r_range.
        ContentTooShortError: when the download stays incomplete after
            all retries are exhausted.
    """
    from urllib.request import Request  # local: only needed for Range support

    if maxtries < -1:
        raise ValueError('maxtries must be at least equal with -1')
    url_type, path = splittype(url)
    # BUG FIX: a Range header is only honoured when sent with the
    # *request*; the previous code set it on the response headers after
    # urlopen() had already completed, which did nothing.
    request = Request(url, data)
    if r_range is not None:
        try:
            request.add_header("Range", "bytes=%d-%d" % r_range)
        except TypeError:
            raise ValueError('r_range argument must be a tuple of two int : (start, end)')
    with contextlib.closing(urlopen(request)) as fp:
        headers = fp.info()
        # Just return the local path and the "headers" for file://
        # URLs. No sense in performing a copy unless requested.
        if url_type == "file" and not filename:
            return os.path.normpath(path), headers
        # Handle temporary file setup.
        if filename:
            # BUG FIX: append when resuming a partial download so the
            # bytes already on disk are preserved; truncate otherwise.
            mode = 'ab' if r_range is not None and r_range[0] > 0 else 'wb'
            tfp = open(filename, mode)
        else:
            tfp = tempfile.NamedTemporaryFile(delete=False)
            filename = tfp.name
            _url_tempfiles.append(filename)
        with tfp:
            result = filename, headers
            bs = 1024 * 8
            size = -1
            read = 0
            blocknum = 0
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            elif r_range is not None:
                # No Content-Length: fall back on the requested range end.
                size = r_range[1]
            if reporthook:
                reporthook(blocknum, bs, size)
            while True:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)
    if size >= 0 and read < size:
        if maxtries > 0 or maxtries == -1:
            # BUG FIX: propagate the retry's result instead of
            # discarding it and returning the stale, short one.
            return safe_retrieve(
                url, filename, reporthook, data,
                maxtries if maxtries == -1 else maxtries - 1,
                r_range=(read, size))
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (read, size), result)
    return result
def urlretrieve(self, url, filename, reporthook, ssl_ignore_cert=False):
    """Download the resource at *url* into *filename*.

    *reporthook* is called as reporthook(block_number, block_size,
    total_size) once before the first read and again after each block
    is written; total_size is -1 when the server sent no
    Content-Length.  When *ssl_ignore_cert* is true, TLS certificate
    validation is disabled.  Returns a (filename, headers) tuple and
    raises ContentTooShortError if fewer bytes than the advertised
    Content-Length arrive.
    """
    url_type, path = splittype(url)
    # Either skip certificate checks or defer entirely to the library.
    ssl_ctx = ssl._create_unverified_context() if ssl_ignore_cert else None
    msg = 'Opening %s ...' % (url, )
    print(msg, end='\r')
    with contextlib.closing(urlopen(url, None, context=ssl_ctx)) as fp:
        # Overwrite the status line with spaces once the connection is up.
        print(' ' * len(msg), end='\r')
        headers = fp.info()
        with open(filename, 'wb') as out:
            result = filename, headers
            chunk_size = 8192
            # HTTPMessage lookups are case-insensitive.
            total = int(headers["Content-Length"]) if "content-length" in headers else -1
            received = 0
            chunks = 0
            reporthook(chunks, chunk_size, total)
            while True:
                chunk = fp.read(chunk_size)
                if not chunk:
                    break
                received += len(chunk)
                out.write(chunk)
                chunks += 1
                reporthook(chunks, chunk_size, total)
            if total >= 0 and received < total:
                raise ContentTooShortError(
                    "retrieval incomplete: got only %i out of %i bytes"
                    % (received, total), result)
            return result
def extractReqTarget(full_link):
    """Return the request target encoded in *full_link*, or None.

    Links that do not mention "qunar" yield None.  A "qrt=" query
    parameter wins and its value is returned; "html.ng" pages map to
    the literal 'qde'; otherwise the host part of the URL is returned
    (None when no host can be extracted).
    """
    if "qunar" not in str(full_link):
        return None
    if "qrt=" in str(full_link):
        # Everything after the first "qrt=" marker.
        return full_link.partition('qrt=')[2]
    if "html.ng" in str(full_link):
        return 'qde'
    _, remainder = ur.splittype(full_link)
    host, _ = ur.splithost(remainder)
    if host:
        return host
    return None
def load_timestream(file_path):
    '''Load a time stream from either a text file, HDF5 file, or URL

    The argument "file_path" can be one of the following:
    1. A path to a text file;
    2. A path to an HDF5 file;
    3. An URL pointing to the JSON record of a test;
    4. An URL pointing to an HDF5 file.

    Return a pair consisting of a dictionary containing the metadata
    (None when the source provides no JSON metadata) and a Timestream
    object.'''
    if not urlreq.splittype(file_path)[0]:
        # Local path: dispatch on the file extension.
        ext = os.path.splitext(file_path)[1]
        if ext.lower() == '.txt':
            return None, load_text_file(file_path)
        else:
            return load_hdf5_file(file_path)
    else:
        # URL: make sure the response object is always closed.
        req = urlreq.urlopen(file_path)
        try:
            content_type = req.info().get_content_type()
            # We are *forced* to create a named temporary file and close it
            # before reading, because h5py does not support reading from
            # file-like objects like BytesIO or an already opened TemporaryFile
            with NamedTemporaryFile(suffix='h5', delete=False) as h5_file:
                h5_file_name = h5_file.name
                # BUG FIX: "metadata" used to be referenced while unbound
                # when the server returned HDF5 directly (NameError);
                # default it to None for that branch.
                metadata = None
                if content_type == 'application/json':
                    metadata = json.loads(req.read().decode('utf-8'))
                    download_test(file_path, metadata, h5_file)
                elif content_type == 'application/hdf5':
                    copyfileobj(req, h5_file)
                else:
                    raise ValueError(
                        'unknown content type: "{0}"'.format(content_type))
        finally:
            req.close()
        result = load_hdf5_file(h5_file_name)[1]
        os.remove(h5_file_name)
        return metadata, result
print(url)
# Exercise every urllib split helper against the same URL; each stanza
# prints the helper's name followed by its result.
for helper in ('splitattr', 'splithost', 'splitpasswd', 'splitport',
               'splittype', 'splituser', 'splitvalue'):
    print(helper)
    x = getattr(urllib2, helper)(url)
    print(x)

from urllib.parse import urlparse
#from urllib.urlparse import urlparse
print('dir(urllib2.urlparse)')
#x = dir(urllib2.urlparse)
x = dir(urllib.parse.urlparse)