def download_url(url, filename, headers, args):
    """Download the given url into filename.

    YouTube URLs are delegated to download_youtube_url(); everything else
    goes through urlretrieve(). SSL/HTTP/URL/IO errors are reported via
    compat_print and re-raised for the caller to handle.
    """
    if is_youtube_url(url):
        download_youtube_url(url, filename, headers, args)
    else:
        import ssl

        # FIXME: Ugly hack for coping with broken SSL sites:
        # https://www.cs.duke.edu/~angl/papers/imc10-cloudcmp.pdf
        #
        # We should really ask the user if they want to stop the downloads
        # or if they are OK proceeding without verification.
        #
        # Note that skipping verification by default could be a problem for
        # people's lives if they happen to live in dictatorial countries.
        #
        # Note: The mess with various exceptions being caught (and their
        # order) is due to different behaviors in different Python versions
        # (e.g., 2.7 vs. 3.4).
        try:
            urlretrieve(url, filename)
        except ssl.SSLError as e:
            compat_print('[warning] Got SSL error: %s' % e)
            raise  # bare raise keeps the original traceback ('raise e' reset it)
        except HTTPError as e:
            compat_print('[warning] Got HTTP error: %s' % e)
            raise
        except URLError as e:
            compat_print('[warning] Got URL error: %s' % e)
            raise
        except IOError as e:
            compat_print('[warning] Got a connection error: %s' % e)
            raise
def _find_jar(url=None):
    """Locate loci_tools.jar, downloading it to a writeable location when it
    cannot be found, and return its full path."""
    jar_name = 'loci_tools.jar'

    # Fast path: the jar is already present in one of the known locations.
    for loc in _gen_jar_locations():
        candidate = os.path.join(loc, jar_name)
        if os.path.isfile(candidate):
            return candidate

    warn('loci_tools.jar not found, downloading')

    # Find somewhere we are allowed to write the jar to.
    for loc in _gen_jar_locations():
        # An existing, writable directory wins immediately.
        if os.path.exists(loc) and os.access(loc, os.W_OK):
            break
        # The 'pims' directory may not exist yet; create it when permitted.
        if os.path.basename(loc) == 'pims' and \
                os.access(os.path.dirname(loc), os.W_OK):
            os.mkdir(loc)
            break
    else:
        raise IOError('No writeable location found. In order to use the '
                      'Bioformats reader, please download '
                      'loci_tools.jar to the pims program folder or one of '
                      'the locations provided by _gen_jar_locations().')

    from six.moves.urllib.request import urlretrieve
    if url is None:
        url = ('http://downloads.openmicroscopy.org/bio-formats/5.1.0/' +
               'artifacts/loci_tools.jar')
    urlretrieve(url, os.path.join(loc, jar_name))
    return os.path.join(loc, jar_name)
def fetch_data(dest_dir='.', clobber=False, url=DATA_URL):
    """Download data from NCBI required to generate the local taxonomy
    database. Default url is ncbi.DATA_URL.

    * dest_dir - directory in which to save output files (created if
      necessary).
    * clobber - don't download if False and target of url exists in dest_dir
    * url - url to archive; default is ncbi.DATA_URL

    Returns (fname, downloaded), where fname is the name of the downloaded
    zip archive, and downloaded is True if a new file was downloaded,
    False otherwise.

    see ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump_readme.txt
    """
    dest_dir = os.path.abspath(dest_dir)
    try:
        os.mkdir(dest_dir)
    except OSError:
        pass  # directory already exists

    fout = os.path.join(dest_dir, os.path.split(url)[-1])

    # Reuse an existing archive unless the caller asked us to clobber it.
    if os.access(fout, os.F_OK) and not clobber:
        logging.info(fout + ' exists; not downloading')
        return (fout, False)

    logging.info('downloading {} to {}'.format(url, fout))
    request.urlretrieve(url, fout)
    return (fout, True)
def download_file(url, download_path):
    """Download a file from a resource URL to the given location.

    :param url: URL of the file to download
    :param download_path: directory where the file should be saved
    """
    # The target filename is the last path component of the URL.
    filename = basename(urlparse(url).path)

    # Create the destination directory when it is missing.
    if not os.path.exists(download_path):
        os.makedirs(download_path)

    # Fetch into the temp directory first, then move into place.
    tmp_file = os.path.join(tempfile.gettempdir(), filename)
    urlretrieve(url, tmp_file, reporthook=progress_bar_wrapper)
    move(tmp_file, os.path.join(download_path, filename))
def download_and_decompress(url, download_path, verbose=True):
    """Download an archive from a resource URL and decompress/unarchive it
    to the given location.

    :param url: URL of the compressed file to download
    :param download_path: location where the archive is extracted
    :param verbose: when True, report download progress through
        progress_bar_wrapper (was undocumented in the original)
    """
    # Extract the filename from the URL
    parsed = urlparse(url)
    filename = basename(parsed.path)

    # Ensure the output directory exists
    if not os.path.exists(download_path):
        os.makedirs(download_path)

    # Get a temporary file path for the compressed file download
    downloaded_file = os.path.join(tempfile.gettempdir(), filename)

    # Download the file
    if verbose:
        urlretrieve(url, downloaded_file, reporthook=progress_bar_wrapper)
    else:
        urlretrieve(url, downloaded_file)

    # Use a context manager so the archive handle is closed even when
    # extraction raises (the original leaked it on error).
    with tarfile.open(downloaded_file, "r") as tar:
        tar.extractall(download_path)

    # Remove the downloaded archive now that it is extracted.
    os.remove(downloaded_file)
def get_rdr_some_label(kind, obsid):
    """Download `some` PRODUCT_ID label for `obsid`.

    Note
    ----
    The RED channel is also called the B&W channel on the HiRISE website.

    Parameters
    ----------
    kind : {'RED', 'COLOR'}
        String that determines the kind of color looking for.
    obsid : str
        HiRISE obsid in the standard form of ESP_012345_1234

    Returns
    -------
    None
        Storing the label file in the `labels_root` folder.
    """
    product = PRODUCT_ID(obsid)
    product.kind = kind

    target = labels_root() / Path(product.label_fname)
    target.parent.mkdir(exist_ok=True)

    print("Downloading\n", product.label_url, 'to\n', target)
    try:
        urlretrieve(product.label_url, str(target))
    except HTTPError as err:
        # Report the failure but do not propagate it.
        print(err)
def _download_database_template(
    galaxy_root, database_location, latest=False, galaxy_sqlite_database=None
):
    """Populate ``database_location`` with a Galaxy SQLite template database.

    Returns True when a template was copied or downloaded, False when no
    suitable downloadable migration version exists.
    """
    # A locally supplied database file wins over any download.
    if galaxy_sqlite_database is not None:
        shutil.copyfile(galaxy_sqlite_database, database_location)
        return True
    if latest or not galaxy_root:
        # NOTE(review): urlopen(...).read() returns bytes on Python 3, so
        # this concatenation assumes a Python 2 str — confirm runtime.
        template_url = DOWNLOADS_URL + urlopen(LATEST_URL).read()
        urlretrieve(template_url, database_location)
        return True
    newest_migration = _newest_migration_version(galaxy_root)
    download_migration = None
    # Pick the first listed version older than the target migration
    # (presumably DOWNLOADABLE_MIGRATION_VERSIONS is ordered newest-first).
    for migration in DOWNLOADABLE_MIGRATION_VERSIONS:
        if newest_migration > migration:
            download_migration = migration
            break
    if download_migration:
        # Template files follow the db_gx_rev_0NN.sqlite naming scheme.
        download_name = "db_gx_rev_0%d.sqlite" % download_migration
        download_url = DOWNLOADS_URL + download_name
        urlretrieve(download_url, database_location)
        return True
    else:
        return False
def download_zip(url, name=None, check_dir=None):
    """Download and unzip zip file from url to $XTAS_DATA.

    Does nothing if $XTAS_DATA/check_dir exists.

    Parameters
    ----------
    url : string
        URL of resource.
    name : string
        Used by the logger, to display "Downloading [name]".
    check_dir : string
        Name of directory to which the resource is unzipped. Derived from
        the URL by default.
    """
    if check_dir is None:
        # Derive the marker directory from the URL, dropping any '.zip'.
        check_dir = os.path.basename(url)
        if check_dir.endswith('.zip'):
            check_dir = check_dir[:-4]
    if name is None:
        name = url

    home = make_data_home()
    check_dir = os.path.join(home, check_dir)

    # XXX race condition with multiple workers
    if not os.path.exists(check_dir):
        # Download to a temp file, then extract into the data home.
        with NamedTemporaryFile() as temp:
            logger.info("Downloading %s" % name)
            urlretrieve(url, temp.name, reporthook=progress)
            with ZipFile(temp.name) as z:
                z.extractall(path=home)

    return check_dir
def download_onnx_model(model_name, zoo_dir, use_cache=True, only_local=False):
    """Download an ONNX model tarball from S3 and extract it into zoo_dir.

    When a cached model directory already exists and use_cache is set, the
    cached copy is (re-)uploaded/backed up instead of being refetched.
    """
    model_dir = os.path.join(zoo_dir, model_name)
    if os.path.exists(model_dir):
        if use_cache:
            # Cached copy exists: back it up / upload it instead of refetching.
            upload_onnx_model(model_name, zoo_dir, backup=True, only_local=only_local)
            return
        else:
            # Cache disabled: drop the stale copy before downloading anew.
            shutil.rmtree(model_dir)
    url = 'https://s3.amazonaws.com/download.onnx/models/latest/{}.tar.gz'.format(model_name)

    download_file = tempfile.NamedTemporaryFile(delete=False)
    try:
        # Close the handle so urlretrieve can reopen the file by name
        # (required on Windows).
        download_file.close()
        print('Downloading ONNX model {} from {} and save in {} ...\n'.format(
            model_name, url, download_file.name))
        urlretrieve(url, download_file.name)
        with tarfile.open(download_file.name) as t:
            print('Extracting ONNX model {} to {} ...\n'.format(model_name, zoo_dir))
            t.extractall(zoo_dir)
    except Exception as e:
        # Best effort: report the failure and leave an empty model directory
        # so later steps have something to operate on.
        print('Failed to download/backup data for ONNX model {}: {}'.format(model_name, e))
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
    finally:
        # Always remove the temporary tarball.
        os.remove(download_file.name)

    if not only_local:
        upload_onnx_model(model_name, zoo_dir, backup=True, only_local=only_local)
def download_attachments(output_path, urls):
    """Download WordPress attachments and return a list of relative paths
    (to the output directory) for files that downloaded successfully.
    Failed downloads are logged and omitted from the result."""
    locations = []
    for url in urls:
        # Tear the URL path apart and rebuild it locally so leading '/'
        # segments cannot confuse os.path.join.
        segments = urlparse(url).path.split('/')
        filename = segments.pop(-1)
        localpath = ''
        for segment in segments:
            # On Windows a ':' inside a component is not a valid filename.
            if sys.platform != 'win32' or ':' not in segment:
                localpath = os.path.join(localpath, segment)
        full_path = os.path.join(output_path, localpath)
        if not os.path.exists(full_path):
            os.makedirs(full_path)
        print('downloading {}'.format(filename))
        try:
            urlretrieve(url, os.path.join(full_path, filename))
            locations.append(os.path.join(localpath, filename))
        except (URLError, IOError) as e:
            # Python 2.7 throws an IOError rather than URLError.
            logger.warning("No file could be downloaded from %s\n%s",
                           url, e)
    return locations
def download_url(url, filename, headers, args):
    """Download the given url into filename.

    YouTube URLs go through download_youtube_url(); everything else uses
    urlretrieve(). Errors are logged and re-raised unless
    args.ignore_errors is set.
    """
    if is_youtube_url(url):
        download_youtube_url(url, filename, headers, args)
    else:
        import ssl

        # FIXME: Ugly hack for coping with broken SSL sites:
        # https://www.cs.duke.edu/~angl/papers/imc10-cloudcmp.pdf
        #
        # We should really ask the user if they want to stop the downloads
        # or if they are OK proceeding without verification.
        #
        # Note that skipping verification by default could be a problem for
        # people's lives if they happen to live in dictatorial countries.
        #
        # Note: The mess with various exceptions being caught (and their
        # order) is due to different behaviors in different Python versions
        # (e.g., 2.7 vs. 3.4).
        try:
            urlretrieve(url, filename)
        except Exception as e:
            # logging.warn is a deprecated alias of logging.warning.
            logging.warning('Got SSL/Connection error: %s', e)
            if not args.ignore_errors:
                logging.warning('Hint: if you want to ignore this error, add '
                                '--ignore-errors option to the command line')
                raise  # bare raise preserves the original traceback
            else:
                logging.warning('SSL/Connection error ignored: %s', e)
def maybe_download(filename, expected_bytes, force=False):
    """Download ``filename`` into ``data_root`` unless it is already there.

    @param: filename: name of the file to download.
            expected_bytes: expected size of the downloaded file.
            force: download even when the file already exists.
    Returns the full local path of the verified file; raises Exception when
    the size on disk does not match ``expected_bytes``.
    """
    dest_filename = os.path.join(data_root, filename)

    # Fetch the file when forced or when it is not on disk yet.
    if force or not os.path.exists(dest_filename):
        print('Attempting to download:{}'.format(filename))
        urlretrieve(url + filename, dest_filename,
                    reporthook=download_progress_hook)
        print('\nDownload Complete!')
    else:
        print('File {} exists!'.format(filename))

    # Verify the on-disk size against the expectation.
    actual_size = os.stat(dest_filename).st_size
    if expected_bytes == actual_size:
        print('File {} is downloaded completely!'.format(filename))
    else:
        raise Exception(
            'File {} is not downloaded completely!'.format(filename)
        )
    return dest_filename
def download_one(url, output_file, skip_existing=True):
    """Download a single URL.

    Parameters
    ----------
    url : str
        URL to download.
    output_file : str
        Path to save the downloaded file.
    skip_existing : bool, default=True
        If True, don't download URLs whose output file already exists.

    Returns
    -------
    success : bool
        True if the file exists after the call (downloaded or skipped).
    """
    if os.path.exists(output_file) and skip_existing:
        print(" Skipping (exists): {}".format(url))
        # The original fell through with an implicit None here even though
        # the docstring promises a bool; the file is present, so True.
        return True

    print("[{}] Fetching: {}".format(time.asctime(), url))
    surl = urlparse.quote(url, safe=':./')
    try:
        urlrequest.urlretrieve(surl, output_file)
    except urlerror.HTTPError:
        logger.warning("FAILED to download file at: {}".format(surl))
        logger.warning("\nOriginal link: {}\nOutput file:{}\n".format(
            url, output_file))
        logger.warning("Skipping...")
    # NOTE: the original returned from inside ``finally``, which silently
    # swallows every in-flight exception (including KeyboardInterrupt).
    # Returning normally keeps the same value without that hazard.
    return os.path.exists(output_file)
def k8s_install_cli(client_version='latest', install_location=None):
    """Download the kubectl command line client from the Kubernetes release
    bucket and make it executable at ``install_location``."""
    if client_version == 'latest':
        # Resolve 'latest' to the concrete stable version string.
        version = urlopen('https://storage.googleapis.com/kubernetes-release/release/stable.txt').read()
        client_version = version.decode('UTF-8').strip()

    file_url = ''
    system = platform.system()
    base_url = 'https://storage.googleapis.com/kubernetes-release/release/{}/bin/{}/amd64/{}'
    if system == 'Windows':
        file_url = base_url.format(client_version, 'windows', 'kubectl.exe')
    elif system == 'Linux':
        # TODO: Support ARM CPU here
        file_url = base_url.format(client_version, 'linux', 'kubectl')
    elif system == 'Darwin':
        file_url = base_url.format(client_version, 'darwin', 'kubectl')
    else:
        # The original message ("Proxy server ... does not exist on the
        # cluster") was copy-pasted from unrelated code; report the actual
        # problem: the OS is unsupported.
        raise CLIError('Unsupported platform: {}'.format(system))

    logger.warning('Downloading client to %s from %s', install_location, file_url)
    try:
        urlretrieve(file_url, install_location)
        # Mark the downloaded binary executable for user, group and other.
        os.chmod(install_location,
                 os.stat(install_location).st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
    except IOError as err:
        raise CLIError('Connection error while attempting to download client ({})'.format(err))
def load_cifar10(datadir="cifar-10-batches-py"):
    """Load the CIFAR-10 dataset, downloading and extracting it first when
    the data directory is missing.

    Returns (train_data, test_data, train_target, test_target): pixel data
    as float32 scaled to [0, 1], labels as int32.
    """
    # Download CIFAR-10 when the data directory is absent.
    if not os.path.exists(datadir):  # '== False' replaced by 'not'
        print("Downloading cifar-10...")
        request.urlretrieve(
            "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz",
            "cifar10.tar.gz")
        # Context manager guarantees the archive handle is closed.
        with tarfile.open("cifar10.tar.gz") as tar:
            tar.extractall()

    train_data = []
    train_target = []

    # Load the five training batches.
    for i in range(1, 6):
        d = unpickle("%s/data_batch_%d" % (datadir, i))
        train_data.extend(d["data"])
        train_target.extend(d["labels"])

    # Load the test batch.
    d = unpickle("%s/test_batch" % (datadir))
    test_data = d["data"]
    test_target = d["labels"]

    # Convert to float32 data and int32 label ndarrays.
    train_data = np.array(train_data, dtype=np.float32)
    train_target = np.array(train_target, dtype=np.int32)
    test_data = np.array(test_data, dtype=np.float32)
    test_target = np.array(test_target, dtype=np.int32)

    # Normalize pixel values to the [0, 1] range.
    train_data /= 255.0
    test_data /= 255.0

    return train_data, test_data, train_target, test_target
def _cache_download(url, filename, sha256sum=None):
    """Returns local path to cached copy of URL using given filename.

    Downloads into the DOWNLOAD_CACHE directory (default ./download_cache/)
    when not already cached; optionally verifies a SHA-256 checksum on
    fresh downloads only.
    """
    cache = os.environ.get("DOWNLOAD_CACHE", "./download_cache/")
    # TODO - expose this as a command line option
    if not os.path.isdir(cache):
        os.mkdir(cache)
    local = os.path.join(cache, filename)
    if not os.path.isfile(local):
        # Must download it...
        try:
            # TODO - log this nicely...
            sys.stderr.write("Downloading %s to %r\n" % (url, local))
            urlretrieve(url, local)
        except (URLError, FTPErrors):
            # Most likely the server is down; could be a bad URL in the XML
            # action. (The two handlers had identical bodies - merged.)
            raise RuntimeError("Unable to download %s" % url)
        # Verifying the checksum is slow, only do this on a fresh
        # download. Assume locally cached files are already OK.
        if sha256sum:
            # TODO - log this nicely...
            sys.stderr.write("Verifying checksum for %s\n" % filename)
            # check_output returns bytes on Python 3; decode before
            # comparing with the hex-digest string (the original compared
            # bytes to str, which can never match on Python 3).
            filehash = subprocess.check_output(
                ['shasum', '-a', '256', local]).decode('ascii')[0:64].strip()
            if filehash != sha256sum:
                raise RuntimeError("Checksum failure for %s, got %r but wanted %r"
                                   % (local, filehash, sha256sum))
    return local
def download_dbsnp_vcf(dbsnp_build=None, genome_build=None, url=None, outpath=None):
    """
    Download the NCBI dbSNP VCF for a given human genome build and dbSNP build.

    Args:
        dbsnp_build: e.g. b147
        genome_build: e.g. GRCh37p13
        url: Direct URL to file, e.g.
            ftp://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606_b147_GRCh37p13/VCF/00-All.vcf.gz
            When given, the two build arguments are not consulted.
        outpath: Output filename; constructed from genome_build and
            dbsnp_build when omitted, or a generic name when those are
            missing too.

    Returns:
        Name of file into which we saved the data.
    """
    if url is None:
        # Build the NCBI URL from the build identifiers, validating their
        # expected prefixes first.
        if not genome_build.startswith("GRC"):
            raise ValueError("Genome build should begin with GRC")
        if not dbsnp_build.startswith("b"):
            raise ValueError("dbSNP build should look like b147, b148, etc.")
        url = NCBI_VCF_TEMPLATE_URL.format(dbsnp_build, genome_build)
    if outpath is None:
        if genome_build is None or dbsnp_build is None:
            outpath = "dbsnp.vcf.gz"
        else:
            outpath = "human_9606_{}_{}_All.vcf.gz".format(dbsnp_build, genome_build)
    # Progress bar driven by the urlretrieve reporthook.
    with tqdm(unit='B', unit_scale=True, miniters=1, desc=url.split('/')[-1]) as t:
        # urlcleanup clears urlretrieve's cache before downloading.
        urlcleanup()
        urlretrieve(url, filename=outpath, reporthook=tqdm_hook(t), data=None)
    return outpath
def get_mnist_file(fpath, origin):
    """Return ``fpath``, downloading and un-gzipping it from ``origin``
    first when it does not exist yet."""
    datadir = os.path.dirname(fpath)
    if not os.path.exists(datadir):
        os.makedirs(datadir)

    # The original probed the file by opening it inside a bare ``except:``
    # (which swallows everything) and leaked the handle on success; an
    # explicit existence check is the intended behavior.
    if not os.path.exists(fpath):
        print('Downloading data from', origin)

        global progbar
        progbar = None

        def dl_progress(count, block_size, total_size):
            # Create the progress bar on the first callback, then advance.
            global progbar
            if progbar is None:
                progbar = Progbar(total_size)
            else:
                progbar.update(count * block_size)

        urlretrieve(origin, fpath + '.gz', dl_progress)
        progbar = None

        # Decompress next to the target path; context managers close both
        # handles (the original leaked neither-closed-on-error handles).
        with gzip.open(fpath + '.gz', 'rb') as fin, open(fpath, 'wb') as fout:
            fout.write(fin.read())

    return fpath
def _download_log_files(
        client,
        resource_group_name,
        server_name,
        file_name):
    """Download log file(s) of a given server to the current directory.

    :param resource_group_name: The name of the resource group that
        contains the resource. You can obtain this value from the Azure
        Resource Manager API or the portal.
    :type resource_group_name: str
    :param server_name: Name of the server.
    :type server_name: str
    :param file_name: Space separated list of log filenames on the server
        to download.
    :type filename_contains: str
    """
    from six.moves.urllib.request import urlretrieve  # pylint: disable=import-error

    # Fetch every server log whose name appears in the requested list.
    for log_file in client.list_by_server(resource_group_name, server_name):
        if log_file.name in file_name:
            urlretrieve(log_file.url, log_file.name)
def dcos_install_cli(install_location=None, client_version='1.8'):
    """Download the dcos command line client from Mesosphere and make it
    executable at ``install_location``."""
    system = platform.system()

    if not install_location:
        raise CLIError(
            "No install location specified and it could not be determined from the current platform '{}'".format(
                system))
    base_url = 'https://downloads.dcos.io/binaries/cli/{}/x86-64/dcos-{}/{}'
    if system == 'Windows':
        file_url = base_url.format('windows', client_version, 'dcos.exe')
    elif system == 'Linux':
        # TODO Support ARM CPU here
        file_url = base_url.format('linux', client_version, 'dcos')
    elif system == 'Darwin':
        file_url = base_url.format('darwin', client_version, 'dcos')
    else:
        # The original message ("Proxy server ... does not exist on the
        # cluster") was copy-pasted from unrelated code; report the actual
        # problem: the OS is unsupported.
        raise CLIError('Unsupported platform: {}'.format(system))

    logger.info('Downloading client to %s', install_location)
    try:
        urlretrieve(file_url, install_location)
        # Mark the downloaded binary executable for user, group and other.
        os.chmod(install_location,
                 os.stat(install_location).st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
    except IOError as err:
        raise CLIError('Connection error while attempting to download client ({})'.format(err))
def retrieve_file_from_url(self, url, force=False):
    """Retrieve a file from FTP server.

    .. note:: urlretrieve has a better API for error handling than
       ftp.retrbinary

    :param bool force: overwrite local files
    :param str url: file url
    :return bool: whether retrieved
    """
    remote_path, basename = url.rsplit('/', 1)
    filename = path.join(self.local_data, self.assembly, basename)
    # Guard clause: skip the download when a local copy is already there.
    if not force and self.check_local(basename.replace('.gz', '')):
        log.info('{} available, aborting retrieval.'.format(filename))
        return False
    try:
        urlretrieve('ftp://' + self.base_url + url, filename)
        log.info('{} retrieval complete.'.format(filename))
        return True
    except URLError as e:
        log.error('Error retrieving {}: \n{}'.format(
            'ftp://' + self.base_url + url, e))
        # Bare raise preserves the original traceback; the original's
        # 'raise e' re-raised from here and reset it.
        raise
def dcos_install_cli(install_location=None, client_version="1.8"):
    """Download the dcos command line client from Mesosphere into
    ``install_location``."""
    system = platform.system()

    if not install_location:
        raise CLIError(
            "No install location specified and it could not be determined from the current platform '{}'".format(system)
        )
    base_url = "https://downloads.dcos.io/binaries/cli/{}/x86-64/dcos-{}/{}"
    if system == "Windows":
        file_url = base_url.format("windows", client_version, "dcos.exe")
    elif system == "Linux":
        # TODO Support ARM CPU here
        file_url = base_url.format("linux", client_version, "dcos")
    elif system == "Darwin":
        file_url = base_url.format("darwin", client_version, "dcos")
    else:
        # The original message ("Proxy server ... does not exist on the
        # cluster") was copy-pasted from unrelated code; report the actual
        # problem: the OS is unsupported.
        raise CLIError("Unsupported platform: {}".format(system))

    logger.info("Downloading client to %s", install_location)
    try:
        urlretrieve(file_url, install_location)
    except IOError as err:
        raise CLIError("Connection error while attempting to download client ({})".format(err))
def get_file(fpath, origin, untar=False):
    # Download `origin` to `fpath` when absent; optionally extract the
    # gzipped tarball next to it and return the extracted directory path.
    datadir = os.path.dirname(fpath)
    if not os.path.exists(datadir):
        os.makedirs(datadir)
    if not os.path.exists(fpath):
        print('Downloading data from', origin)
        global progbar
        progbar = None

        def dl_progress(count, block_size, total_size):
            # Create the progress bar on the first callback, then advance.
            global progbar
            if progbar is None:
                progbar = Progbar(total_size)
            else:
                progbar.update(count*block_size)

        urlretrieve(origin, fpath, dl_progress)
        progbar = None
    if untar:
        tfile = tarfile.open(fpath, 'r:gz')
        names = tfile.getnames()
        # First archive member is assumed to be the top-level directory.
        dirname = names[0]
        not_exists = [int(not os.path.exists("{}/{}".format(datadir, fname)))
                      for fname in names]
        if sum(not_exists) > 0:
            print('Untaring file...')
            tfile.extractall(path=datadir)
        else:
            print('Files already untarred')
        tfile.close()
    # NOTE(review): `dirname` is only assigned inside the `untar` branch,
    # so calling with untar=False raises NameError here — confirm callers
    # always pass untar=True.
    return "{}/{}".format(datadir, dirname)
def download_numpy_wheel():
    """Download the MKL NumPy wheel matching this interpreter/architecture.

    Returns the local wheel path, or None when no wheel is published for
    the running Python version.

    Raises ValueError when NUMPY_URL is unset or the pointer size is not
    32/64-bit.
    """
    base_url = os.getenv('NUMPY_URL')
    if base_url is None:
        raise ValueError('NUMPY_URL environment variable is missing.')

    version = '1.10.4+mkl'

    py = 'cp{0[0]}{0[1]}'.format(sys.version_info)
    if py not in {'cp27', 'cp34', 'cp35'}:
        print('NumPy wheel not available for {}'.format(py))
        return None

    bits = struct.calcsize('P') * 8
    if bits == 32:
        arch = 'win32'
    elif bits == 64:
        arch = 'win_amd64'
    else:
        raise ValueError("Couldn't determine 32/64 bits.")

    filename = 'numpy-{}-{}-none-{}.whl'.format(version, py, arch)
    directory = 'astrodynamics-numpy-wheels'
    # The original called os.mkdir unconditionally, which raises OSError
    # when the directory is left over from an earlier run.
    if not os.path.isdir(directory):
        os.mkdir(directory)
    filepath = os.path.join(directory, filename)

    url = base_url + filename

    # Disable SSL. Shouldn't do this ever. This is just a script.
    ssl._create_default_https_context = ssl._create_unverified_context
    urlretrieve(url, filepath)
    return filepath
def build_image_factory():
    """Download the delusionalinsanity.images archive, unpack the images
    into MEDIA_ROOT and return a list of relative paths to the images.

    :return list: List of relative paths to images; empty list on failure.
    """
    # Remove any leftovers from a previous run (best effort).
    try:
        shutil.rmtree(
            os.path.join(settings.MEDIA_ROOT,
                         'delusionalinsanity.images-latest')
        )
    except Exception as err:
        logger.debug(err)

    try:
        download_local = os.path.join(
            settings.MEDIA_ROOT,
            'delusionalinsanity_images_latest.zip'
        )
        request.urlretrieve(
            'https://github.com/barseghyanartur/delusionalinsanity.images'
            '/archive/latest.zip',
            download_local
        )
        zfile = zipfile.ZipFile(download_local)
        names = zfile.namelist()
        for name in names:
            try:
                dirname, filename = os.path.split(name)
                # Skip directory entries.
                if not filename:
                    continue
                dirname = os.path.join(settings.MEDIA_ROOT, dirname)
                if not os.path.exists(dirname):
                    os.mkdir(dirname)
                # Images are binary data: the original opened the target in
                # text mode ("w"), which corrupts the payload (and fails on
                # Python 3); write bytes and close via the context manager.
                with open(os.path.join(settings.MEDIA_ROOT, name), "wb") as fd:
                    fd.write(zfile.read(name))
            except Exception as e:
                logger.debug(e)

        source_dir = os.path.join(
            settings.MEDIA_ROOT,
            'delusionalinsanity.images-latest',
            'images'
        )
        images_dir = os.path.join(settings.MEDIA_ROOT,
                                  NEWS_IMAGES_STORAGE_PATH)
        shutil.move(source_dir, images_dir)
        images = [os.path.join(images_dir, f)
                  for f in os.listdir(images_dir)]
        return [fix_image(i) for i in images]
    except Exception as err:
        logger.debug(err)
        return []
def get_bioformats_file(filename, filepath='', url=''):
    """Download a LOCI sample-data zip and extract it into ``filepath``.

    When ``url`` is empty the archive is fetched from the LOCI software
    data site; the downloaded zip is deleted after extraction.
    """
    if url == '':
        url = 'http://www.loci.wisc.edu/files/software/data/' + filename
    target = os.path.join(filepath, filename)
    urlretrieve(url, target)
    with ZipFile(target) as archive:
        archive.extractall(filepath)
    os.remove(target)
def download(filename):
    """Download ``filename`` from the module-level ``url`` into data/
    unless it is already present; return the local path."""
    destination_file = "data/" + filename
    if not os.path.exists(destination_file):
        print("Dowloading ", filename, "into ", destination_file)
        urlretrieve(url + filename, destination_file)
    else:
        # The original used a Python 2 print *statement* here while the
        # other branch used the function form - a SyntaxError on Python 3;
        # use the function form consistently.
        print("File already exists: %s" % filename)
    return destination_file
def fetch_data(data_url, cache_path=None, download_if_missing=True):
    """ Fetch data and return local filename.

    Parameters
    ----------
    data_url : str
        Path to the remote data file.
    cache_path : str (optional)
        Specify a path to cache the datasets. If not specified, this will
        cache the downloaded data to the current working directory.
    download_if_missing : bool (optional)
        If False, raise a IOError if the data is not locally available
        instead of trying to download the data from the source site.

    Returns
    -------
    filename : str
        The path to the local HDF5 data file.
    """
    # NOTE(review): download_if_missing is accepted but never consulted
    # below — the documented "raise IOError instead of downloading"
    # behavior is not implemented; confirm intent.
    if cache_path is None:
        cache_path = os.getcwd()
    else:
        cache_path = os.path.expanduser(os.path.abspath(cache_path))
    cache_file = os.path.join(cache_path, os.path.basename(data_url))
    try:
        # how many bytes are we expecting
        url = urlopen(data_url)
        meta = url.info()
        expected_bytes = int(meta['Content-Length'])
    except URLError as e:
        # Offline: fall back to a cached copy when one exists.
        if os.path.exists(cache_file):
            print("Data file exists but unable to verify against remote file.")
            return cache_file
        else:
            print("Local file not found and unable to connect to remote file! Do "
                  "you have an internet connection?")
            raise e
    # (Re-)download when the file is missing, not a regular file, or its
    # size differs from the remote Content-Length.
    if (os.path.exists(cache_file) and os.stat(cache_file).st_size != expected_bytes) \
            or not os.path.exists(cache_file) or not os.path.isfile(cache_file):
        urlretrieve(data_url, cache_file)
        # Verify the freshly downloaded size against the expectation.
        received_bytes = os.stat(cache_file).st_size
        if received_bytes != expected_bytes:
            raise IOError("Download error: size expected = {} bytes, size received = {} bytes"
                          .format(expected_bytes, received_bytes))
        print("Data downloaded and verified.")
    else:
        print("Data file already exists and is verified.")
    return cache_file
def fetch(args):
    """Resolve the newest source release for ``args.name`` and download it.

    Exits the process with status 1 when no source release can be found.
    """
    check_or_set_version(args)
    release = newest_download_url(args)
    if not release:
        print("unable to find a source release for {0}!".format(args.name))
        sys.exit(1)
    print('downloading package {0}-{1}...'.format(args.name, args.version))
    print('from {0}'.format(release['url']))
    urlretrieve(release['url'], release['filename'])
def get_file(fname, origin, untar=False):
    """Download `origin` into the ~/.keras/datasets cache (falling back to
    /tmp/.keras/datasets when not writable), optionally extracting a
    .tar.gz archive, and return the local path."""
    datadir_base = os.path.expanduser(os.path.join('~', '.keras'))
    if not os.access(datadir_base, os.W_OK):
        # Fall back to /tmp when the per-user cache is not writable.
        datadir_base = os.path.join('/tmp', '.keras')
    datadir = os.path.join(datadir_base, 'datasets')
    if not os.path.exists(datadir):
        os.makedirs(datadir)
    if untar:
        untar_fpath = os.path.join(datadir, fname)
        fpath = untar_fpath + '.tar.gz'
    else:
        fpath = os.path.join(datadir, fname)
    if not os.path.exists(fpath):
        print('Downloading data from', origin)
        global progbar
        progbar = None

        def dl_progress(count, block_size, total_size):
            # Create the progress bar on the first callback, then advance.
            global progbar
            if progbar is None:
                progbar = Progbar(total_size)
            else:
                progbar.update(count*block_size)

        error_msg = 'URL fetch failure on {}: {} -- {}'
        try:
            try:
                urlretrieve(origin, fpath, dl_progress)
            except URLError as e:
                raise Exception(error_msg.format(origin, e.errno, e.reason))
            # NOTE(review): HTTPError subclasses URLError, so this branch
            # appears unreachable after the one above — confirm ordering.
            except HTTPError as e:
                raise Exception(error_msg.format(origin, e.code, e.msg))
        except (Exception, KeyboardInterrupt) as e:
            # Remove the partial download before propagating.
            if os.path.exists(fpath):
                os.remove(fpath)
            raise
        progbar = None
    if untar:
        if not os.path.exists(untar_fpath):
            print('Untaring file...')
            tfile = tarfile.open(fpath, 'r:gz')
            try:
                tfile.extractall(path=datadir)
            except (Exception, KeyboardInterrupt) as e:
                # Remove the partially extracted tree before propagating.
                if os.path.exists(untar_fpath):
                    if os.path.isfile(untar_fpath):
                        os.remove(untar_fpath)
                    else:
                        shutil.rmtree(untar_fpath)
                raise
            tfile.close()
        return untar_fpath
    return fpath
def download_com(self):
    """Fetch skin converter and renderer components from the Sirius0103
    enigma2-components repository into /tmp, then trigger installation.

    Replaces 41 copy-pasted urlretrieve calls with data-driven loops;
    every module is downloaded to /tmp/<name> in the original order.
    """
    base = ("https://raw.githubusercontent.com/Sirius0103/"
            "enigma2-components/master/python/Components/")

    # download converter (original order preserved)
    converters = [
        "AC3DownMixStatus.py",
        "AlwaysTrue.py",
        "Bitrate2.py",
        "CaidBar.py",
        "CaidInfo2.py",
        "CamdInfo3.py",
        "ConverterRotator.py",
        "CpuUsage.py",
        "DiskInfo.py",
        "EcmInfoLine.py",
        "EmuName.py",
        "EventName2.py",
        "ExtraNumText.py",
        "FanTempInfo.py",
        "FlashingDotClock.py",
        "FrontendInfo2.py",
        "IsNet.py",
        "MemoryInfo.py",
        "ModuleControl.py",
        "MovieInfo2.py",
        "ProgressDiskSpaceInfo.py",
        "RefString.py",
        "RouteInfo.py",
        "ServiceInfo2.py",
        "ServiceInfoEX.py",
        "ServiceName2.py",
        "ServiceName2.ref",
        "ServiceOrbitalPosition2.py",
        "TestConnection.py",
        "TunerBar.py",
        "WiFiInfo.py",
    ]
    for name in converters:
        urlretrieve(base + "Converter/" + name, "/tmp/" + name)

    # download renderer (original order preserved)
    renderers = [
        "AnimatedWeatherPixmap.py",
        "AnimatedMoonPixmap.py",
        "LabelDuoColors.py",
        "MovieCover.py",
        "MovieRating.py",
        "PiconUni.py",
        "RendVolumeText.py",
        "RendVolumeTextP.py",
        "RunningText.py",
        "Watches.py",
    ]
    for name in renderers:
        urlretrieve(base + "Renderer/" + name, "/tmp/" + name)

    # end
    self.install_com()
import tarfile
import os
from six.moves.urllib import request

# Source location and local layout for the CIFAR-10 python archive.
url_dir = 'https://www.cs.toronto.edu/~kriz/'
file_name = 'cifar-10-python.tar.gz'
save_dir = 'dataset'
tar_path = os.path.join(save_dir, file_name)


def _fetch_and_extract():
    """Download the CIFAR-10 archive into ``save_dir`` (skipping the
    download when the tarball is already present) and unpack it there."""
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    if os.path.exists(tar_path):
        print('{:s} already downloaded.'.format(file_name))
    else:
        print('Downloading {:s}...'.format(file_name))
        request.urlretrieve('{:s}{:s}'.format(url_dir, file_name), tar_path)
    print('Extracting files...')
    with tarfile.open(tar_path, 'r:gz') as archive:
        archive.extractall(save_dir)


if __name__ == '__main__':
    _fetch_and_extract()
import pickle from six.moves.urllib.request import urlretrieve import numpy as np import h5py import os import sys bs_data_dir = os.environ.get('BRAINSTORM_DATA_DIR', '.') url = 'http://deeplearning.net/data/mnist/mnist.pkl.gz' mnist_file = os.path.join(bs_data_dir, 'mnist.pkl.gz') hdf_file = os.path.join(bs_data_dir, 'MNIST.hdf5') print("Using data directory:", bs_data_dir) if not os.path.exists(mnist_file): print("Downloading MNIST data ...") urlretrieve(url, mnist_file) print("Done.") print("Extracting MNIST data ...") with gzip.open(mnist_file, 'rb') as f: if sys.version_info < (3, ): ds = pickle.load(f) else: ds = pickle.load(f, encoding='latin1') print("Done.") train_inputs, train_targets = \ ds[0][0].reshape((1, 50000, 28, 28, 1)), ds[0][1].reshape((1, 50000, 1)) valid_inputs, valid_targets = \ ds[1][0].reshape((1, 10000, 28, 28, 1)), ds[1][1].reshape((1, 10000, 1)) test_inputs, test_targets = \
if not os.path.exists(os.environ['temp'] + "\itchiotempdir"): os.makedirs(os.environ['temp'] + "\itchiotempdir") downloadDir = os.environ['temp'] + "\itchiotempdir" chromeOptions = webdriver.ChromeOptions() prefs = {"download.default_directory": downloadDir} chromeOptions.add_experimental_option("prefs", prefs) if os.path.isfile('chrome.ini'): ini = open('chrome.ini', 'r') locationString = ini.read() elif os.path.isfile('chromedriver.exe'): locationString = 'chromedriver.exe' else: response = urlretrieve( 'https://chromedriver.storage.googleapis.com/2.33/chromedriver_win32.zip', 'chromedriver.zip') zip_ref = zipfile.ZipFile("chromedriver.zip", 'r') zip_ref.extractall(owd) zip_ref.close locationString = 'chromedriver.exe' driver = webdriver.Chrome(executable_path=(locationString), chrome_options=chromeOptions) driver.set_window_position(4000, 651) driver.set_page_load_timeout(600) if os.path.isfile("repo.ini"): with open("repo.ini", "r") as myfile:
def get_file(fname,
             origin,
             untar=False,
             md5_hash=None,
             file_hash=None,
             cache_subdir='datasets',
             hash_algorithm='auto',
             archive_format='auto',
             cache_dir=None):
    """Download a file from ``origin`` into the local cache if not present.

    The file is stored under ``<cache_dir>/<cache_subdir>/<fname>`` (default
    cache_dir is ``~/.conda``, falling back to ``/tmp/.conda`` when the home
    cache is not writable). An existing file is re-downloaded when the
    provided hash does not match.

    Args:
        fname: Name of the file on disk.
        origin: URL to download from.
        untar: If True, extract the ``.tar.gz`` archive after download.
        md5_hash: Deprecated; md5 hash for verification (implies
            ``hash_algorithm='md5'``).
        file_hash: Expected hash of the file after download.
        cache_subdir: Subdirectory of the cache to store the file in.
        hash_algorithm: 'md5', 'sha256' or 'auto'.
        archive_format: Accepted for interface compatibility; extraction
            always uses the 'tar' format here.
        cache_dir: Cache root; defaults to ``~/.conda``.

    Returns:
        Path to the downloaded (and possibly extracted) file.
    """
    if cache_dir is None:
        cache_dir = os.path.join(os.path.expanduser('~'), '.conda')
    if md5_hash is not None and file_hash is None:
        # Honor the deprecated md5 argument.
        file_hash = md5_hash
        hash_algorithm = 'md5'
    datadir_base = os.path.expanduser(cache_dir)
    if not os.access(datadir_base, os.W_OK):
        # Fall back to a world-writable location when the cache dir isn't.
        datadir_base = os.path.join('/tmp', '.conda')
    datadir = os.path.join(datadir_base, cache_subdir)
    os.makedirs(datadir, exist_ok=True)

    # BUGFIX: the original assigned fpath here and then unconditionally
    # overwrote it in the if/else below; the dead assignment is removed.
    if untar:
        untar_fpath = os.path.join(datadir, fname)
        fpath = untar_fpath + '.tar.gz'
    else:
        fpath = os.path.join(datadir, fname)

    download = False
    if os.path.exists(fpath):
        # File found; verify integrity if a hash was provided.
        if file_hash is not None:
            if not validate_file(fpath, file_hash, algorithm=hash_algorithm):
                print('A local file was found, but it seems to be '
                      'incomplete or outdated because the ' + hash_algorithm +
                      ' file hash does not match the original value of ' +
                      file_hash + ' so we will re-download the data.')
                download = True
    else:
        download = True

    if download:
        print('Downloading data from', origin)

        class ProgressTracker(object):
            # Maintain progbar for the lifetime of download.
            # This design was chosen for Python 2.7 compatibility.
            progbar = None

        def dl_progress(count, block_size, total_size):
            if ProgressTracker.progbar is None:
                if total_size == -1:
                    total_size = None  # unknown length -> indeterminate bar
                ProgressTracker.progbar = Progbar(total_size)
            else:
                ProgressTracker.progbar.update(count * block_size)

        error_msg = 'URL fetch failure on {}: {} -- {}'
        try:
            try:
                urlretrieve(origin, fpath, dl_progress)
            except HTTPError as e:
                raise Exception(error_msg.format(origin, e.code, e.msg))
            except URLError as e:
                raise Exception(error_msg.format(origin, e.errno, e.reason))
        except (Exception, KeyboardInterrupt):
            # Never leave a partial download behind.
            if os.path.exists(fpath):
                os.remove(fpath)
            raise
        ProgressTracker.progbar = None

    if untar:
        if not os.path.exists(untar_fpath):
            _extract_archive(fpath, datadir, archive_format='tar')
        return untar_fpath

    return fpath
# Download `filename` from `base_url` into `fuel_data_path` with a progress
# bar, then verify the size on disk matches the server-reported size.
# NOTE(review): `base_url`, `filename`, `fuel_data_path` and the progressbar
# widgets are assumed to be defined earlier in this file — confirm.
url = base_url + filename

# Ask the server for the expected size up front (Content-Length header).
with contextlib.closing(request.urlopen(url)) as f:
    expected_filesize = int(f.headers["content-length"])
print(expected_filesize)
time.sleep(5)  # presumably a politeness delay before the download — TODO confirm why 5s

# Progress bar layout: "name: NN% |####| ETA hh:mm:ss  speed".
widgets = [
    '{}: '.format(filename), Percentage(), ' ', Bar(), ' ', ETA(), ' ',
    FileTransferSpeed()
]
progress_bar = ProgressBar(widgets=widgets,
                           maxval=expected_filesize).start()

def reporthook(count, blockSize, totalSize):
    # Called by urlretrieve once per block; clamp so the bar never
    # exceeds its maxval on the final (partial) block.
    progress_bar.update(min(count * blockSize, totalSize))

filepath = os.path.join(fuel_data_path, filename)
request.urlretrieve(url, filepath, reporthook=reporthook)
progress_bar.finish()

# Guard against truncated downloads: on-disk size must equal the
# size the server advertised.
downloaded_filesize = os.path.getsize(filepath)
assert expected_filesize == downloaded_filesize, " ".join(
    ("expected file size is {}, but the actual size of the downloaded file",
     "is {}.")).format(expected_filesize, downloaded_filesize)
def download_structure(pdb_id, file_type, outdir='', outfile='', only_header=False, force_rerun=False):
    """Download a structure from the RCSB PDB by ID. Specify the file type desired.

    Args:
        pdb_id: PDB ID
        file_type: pdb, pdb.gz, mmcif, cif, cif.gz, xml.gz, mmtf, mmtf.gz
        outdir: Optional output directory
        outfile: Optional output name
        only_header: If only the header file should be downloaded
        force_rerun: If the file should be downloaded again even if it exists

    Returns:
        str: Path to outfile

    """
    # TODO: keep an eye on https://github.com/biopython/biopython/pull/943 Biopython PR#493 for functionality of this
    # method in biopython. extra file types have not been added to biopython download yet
    pdb_id = pdb_id.lower()
    file_type = file_type.lower()
    file_types = [
        'pdb', 'pdb.gz', 'mmcif', 'cif', 'cif.gz', 'xml.gz', 'mmtf', 'mmtf.gz'
    ]
    if file_type not in file_types:
        raise ValueError(
            'Invalid file type, must be either: pdb, pdb.gz, cif, cif.gz, xml.gz, mmtf, mmtf.gz'
        )

    # The mmtf service only serves gzipped files.
    if file_type == 'mmtf':
        file_type = 'mmtf.gz'

    # Track whether the downloaded file will need gunzipping afterwards.
    gzipped = file_type.endswith('.gz')

    # RCSB uses the 'cif' extension for mmCIF files.
    if file_type == 'mmcif':
        file_type = 'cif'

    if only_header:
        folder = 'header'
        if outfile:
            outfile = op.join(outdir, outfile)
        else:
            outfile = op.join(outdir, '{}.header.{}'.format(pdb_id, file_type))
    else:
        folder = 'download'
        if outfile:
            outfile = op.join(outdir, outfile)
        else:
            outfile = op.join(outdir, '{}.{}'.format(pdb_id, file_type))

    if ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile):
        if file_type == 'mmtf.gz':
            mmtf_api = '1.0'
            download_link = 'http://mmtf.rcsb.org/v{}/full/{}.mmtf.gz'.format(
                mmtf_api, pdb_id)
        else:
            download_link = 'http://files.rcsb.org/{}/{}.{}'.format(
                folder, pdb_id, file_type)
        urlretrieve(download_link, outfile)
        if gzipped:
            # BUGFIX: str.strip('.gz') removes ANY of the characters '.', 'g',
            # 'z' from BOTH ends of the path (a path starting with 'g'/'z'
            # would be mangled); slice off the literal '.gz' suffix instead.
            outfile = ssbio.utils.gunzip_file(infile=outfile,
                                              outfile=outfile[:-len('.gz')],
                                              outdir=outdir,
                                              delete_original=True,
                                              force_rerun_flag=force_rerun)
        log.debug('{}: saved structure file'.format(outfile))
    else:
        log.debug('{}: structure file already saved'.format(outfile))

    return outfile
def get_file(fname,
             origin,
             untar=False,
             md5_hash=None,
             file_hash=None,
             cache_subdir='datasets',
             hash_algorithm='auto',
             extract=False,
             archive_format='auto',
             cache_dir=None):
    """Downloads a file from a URL if it not already in the cache.

    By default the file at the url `origin` is downloaded to the
    cache_dir `~/.keras`, placed in the cache_subdir `datasets`,
    and given the filename `fname`. The final location of a file
    `example.txt` would therefore be `~/.keras/datasets/example.txt`.

    Files in tar, tar.gz, tar.bz, and zip formats can also be extracted.
    Passing a hash will verify the file after download. The command line
    programs `shasum` and `sha256sum` can compute the hash.

    Arguments:
        fname: Name of the file. If an absolute path `/path/to/file.txt` is
            specified the file will be saved at that location.
        origin: Original URL of the file.
        untar: Deprecated in favor of 'extract'.
            boolean, whether the file should be decompressed
        md5_hash: Deprecated in favor of 'file_hash'.
            md5 hash of the file for verification
        file_hash: The expected hash string of the file after download.
            The sha256 and md5 hash algorithms are both supported.
        cache_subdir: Subdirectory under the Keras cache dir where the file is
            saved. If an absolute path `/path/to/folder` is
            specified the file will be saved at that location.
        hash_algorithm: Select the hash algorithm to verify the file.
            options are 'md5', 'sha256', and 'auto'.
            The default 'auto' detects the hash algorithm in use.
        extract: True tries extracting the file as an Archive, like tar or zip.
        archive_format: Archive format to try for extracting the file.
            Options are 'auto', 'tar', 'zip', and None.
            'tar' includes tar, tar.gz, and tar.bz files.
            The default 'auto' is ['tar', 'zip'].
            None or an empty list will return no matches found.
        cache_dir: Location to store cached files, when None it
            defaults to the [Keras
            Directory](/faq/#where-is-the-keras-configuration-filed-stored).

    Returns:
        Path to the downloaded file
    """
    if cache_dir is None:
        cache_dir = os.path.expanduser(os.path.join('~', '.keras'))
    if md5_hash is not None and file_hash is None:
        # Honor the deprecated md5 argument.
        file_hash = md5_hash
        hash_algorithm = 'md5'
    datadir_base = os.path.expanduser(cache_dir)
    if not os.access(datadir_base, os.W_OK):
        # Fall back to /tmp when the home cache is not writable.
        datadir_base = os.path.join('/tmp', '.keras')
    datadir = os.path.join(datadir_base, cache_subdir)
    if not os.path.exists(datadir):
        os.makedirs(datadir)

    if untar:
        untar_fpath = os.path.join(datadir, fname)
        fpath = untar_fpath + '.tar.gz'
    else:
        fpath = os.path.join(datadir, fname)

    download = False
    if os.path.exists(fpath):
        # File found; verify integrity if a hash was provided.
        if file_hash is not None:
            if not validate_file(fpath, file_hash, algorithm=hash_algorithm):
                print('A local file was found, but it seems to be '
                      'incomplete or outdated because the ' + hash_algorithm +
                      ' file hash does not match the original value of ' +
                      file_hash + ' so we will re-download the data.')
                download = True
    else:
        download = True

    if download:
        print('Downloading data from', origin)

        class ProgressTracker(object):
            # Maintain progbar for the lifetime of download.
            # This design was chosen for Python 2.7 compatibility.
            progbar = None

        def dl_progress(count, block_size, total_size):
            if ProgressTracker.progbar is None:
                # BUGFIX: was `total_size is -1` — identity comparison with an
                # int literal is implementation-defined; use equality.
                if total_size == -1:
                    total_size = None
                ProgressTracker.progbar = Progbar(total_size)
            else:
                ProgressTracker.progbar.update(count * block_size)

        error_msg = 'URL fetch failure on {}: {} -- {}'
        try:
            try:
                urlretrieve(origin, fpath, dl_progress)
            # BUGFIX: HTTPError is a subclass of URLError, so it must be
            # caught FIRST; the original order made this branch unreachable.
            except HTTPError as e:
                raise Exception(error_msg.format(origin, e.code, e.msg))
            except URLError as e:
                raise Exception(error_msg.format(origin, e.errno, e.reason))
        except (Exception, KeyboardInterrupt):
            # Never leave a partial download behind.
            if os.path.exists(fpath):
                os.remove(fpath)
            raise
        ProgressTracker.progbar = None

    if untar:
        if not os.path.exists(untar_fpath):
            _extract_archive(fpath, datadir, archive_format='tar')
        return untar_fpath

    if extract:
        _extract_archive(fpath, datadir, archive_format)

    return fpath
def _read_from_url(url): filename, _ = urlretrieve(url) return open(filename, 'rb')
def get_file(fname, origin, untar=False, md5_hash=None, cache_subdir='common'):
    '''Downloads a file from a URL if it not already in the cache.

    Passing the MD5 hash will verify the file after download
    as well as if it is already present in the cache.

    # Arguments
        fname: name of the file
        origin: original URL of the file
        untar: boolean, whether the file should be decompressed
        md5_hash: MD5 hash of the file for verification
        cache_subdir: directory being used as the cache

    # Returns
        Path to the downloaded file
    '''
    # Cache lives in <this package>/../Data/<cache_subdir>.
    file_path = os.path.dirname(os.path.realpath(__file__))
    datadir_base = os.path.expanduser(os.path.join(file_path, '..', 'Data'))
    datadir = os.path.join(datadir_base, cache_subdir)
    if not os.path.exists(datadir):
        os.makedirs(datadir)

    # Derive the extraction target from the archive name; also force
    # untarring for recognized archive suffixes.
    # BUGFIX: untar_fpath is now initialized so that an explicit untar=True
    # with a non-archive fname no longer raises NameError below.
    untar_fpath = None
    if fname.endswith('.tar.gz'):
        fnamesplit = fname.split('.tar.gz')
        untar_fpath = os.path.join(datadir, fnamesplit[0])
        untar = True
    elif fname.endswith('.tgz'):
        fnamesplit = fname.split('.tgz')
        untar_fpath = os.path.join(datadir, fnamesplit[0])
        untar = True

    fpath = os.path.join(datadir, fname)

    download = False
    if os.path.exists(fpath):
        # file found; verify integrity if a hash was provided
        if md5_hash is not None:
            if not validate_file(fpath, md5_hash):
                print('A local file was found, but it seems to be '
                      'incomplete or outdated.')
                download = True
    else:
        download = True

    if download:
        print('Downloading data from', origin)
        global progbar
        progbar = None

        def dl_progress(count, block_size, total_size):
            global progbar
            if progbar is None:
                progbar = Progbar(total_size)
            else:
                progbar.update(count * block_size)

        error_msg = 'URL fetch failure on {}: {} -- {}'
        try:
            try:
                urlretrieve(origin, fpath, dl_progress)
            # BUGFIX: HTTPError is a subclass of URLError, so it must be
            # caught FIRST; the original order made this branch unreachable.
            except HTTPError as e:
                raise Exception(error_msg.format(origin, e.code, e.msg))
            except URLError as e:
                raise Exception(error_msg.format(origin, e.errno, e.reason))
        except (Exception, KeyboardInterrupt):
            # Never leave a partial download behind.
            if os.path.exists(fpath):
                os.remove(fpath)
            raise
        progbar = None
        print()

    if untar and untar_fpath is not None:
        if not os.path.exists(untar_fpath):
            print('Untarring file...')
            # BUGFIX: context manager guarantees the tar handle is closed
            # even when extraction raises (the original leaked it).
            with tarfile.open(fpath, 'r:gz') as tfile:
                try:
                    tfile.extractall(path=datadir)
                except (Exception, KeyboardInterrupt):
                    # Remove a partially-extracted result before re-raising.
                    if os.path.exists(untar_fpath):
                        if os.path.isfile(untar_fpath):
                            os.remove(untar_fpath)
                        else:
                            shutil.rmtree(untar_fpath)
                    raise
        return untar_fpath

    print()
    return fpath
def download(dataset):
    """Fetch the source spreadsheet into a temp dir and convert it to CSV
    under ``dataset.raw``.

    BUGFIX: ``TemporaryDirectory.__enter__`` yields a plain ``str``, so the
    original ``tmpdir.joinpath(...)`` raised ``AttributeError``; the name is
    wrapped in a ``Path`` first.
    """
    from pathlib import Path

    with TemporaryDirectory() as tmp:
        tmpdir = Path(tmp)
        urlretrieve(URL, tmpdir.joinpath(FILENAME).as_posix())
        xls2csv(tmpdir.joinpath(FILENAME), outdir=dataset.raw)
def __init__(self, intervals_file, fasta_file, dnase_file, cell_line=None,
             RNAseq_PC_file=None, mappability_file=None, GENCODE_dir=None,
             use_linecache=True):
    """Set up the dataloader: interval/fasta/DNase sources, mappability
    track (auto-downloaded if absent), GENCODE annotation beds and their
    overlaps with the intervals, plus RNA-seq metadata features.

    NOTE(review): relies on module-level `this_dir`, `BedTool`,
    `BedToolLinecache`, `urlretrieve` and `pd` — confirm against the
    enclosing module.
    """
    # intervals: optionally use a linecache-backed BedTool for fast
    # random line access on large interval files.
    if use_linecache:
        linecache.clearcache()
        BT = BedToolLinecache
    else:
        BT = BedTool
    self.bt = BT(intervals_file)
    # Fasta
    self.fasta_file = fasta_file
    self.fasta_extractor = None  # initialize later
    # DNase
    self.dnase_file = dnase_file
    self.dnase_extractor = None
    # mappability
    if mappability_file is None:
        # download the mappability file if not existing
        mappability_file = os.path.join(
            this_dir, "../../template/dataloader_files",
            "wgEncodeDukeMapabilityUniqueness35bp.bigWig")
        if not os.path.exists(mappability_file):
            print("Downloading the mappability file")
            urlretrieve(
                "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDukeMapabilityUniqueness35bp.bigWig",
                mappability_file)
            print("Download complete")
    self.mappability_file = mappability_file
    self.mappability_extractor = None
    # Gencode features: fall back to the packaged annotation dir.
    if GENCODE_dir is None:
        gp = os.path.join(this_dir, "dataloader_files/gencode_features/")
    else:
        gp = GENCODE_dir
    # (name, BedTool) pairs for each annotation class used as features.
    self.gencode_beds = [
        ("cpg", BedTool(gp + '/cpgisland.bed.gz')),
        ("cds",
         BedTool(gp + '/wgEncodeGencodeBasicV19.cds.merged.bed.gz')),
        ("intron",
         BedTool(gp + '/wgEncodeGencodeBasicV19.intron.merged.bed.gz')),
        ("promoter",
         BedTool(gp + '/wgEncodeGencodeBasicV19.promoter.merged.bed.gz')),
        ("utr5",
         BedTool(gp + '/wgEncodeGencodeBasicV19.utr5.merged.bed.gz')),
        ("utr3",
         BedTool(gp + '/wgEncodeGencodeBasicV19.utr3.merged.bed.gz')),
    ]
    # Overlap beds - could be done incrementally
    print("Overlapping all the bed-files")
    # The BT() and .fn are there in order to leverage BedToolLinecache
    self.overlap_beds = [(b, BT(self.bt.intersect(v, wa=True, c=True).fn))
                         for b, v in self.gencode_beds]
    print("Assesing the file")
    # Sanity check: the overlap result must align 1:1 with the intervals.
    assert len(self.overlap_beds[1][1]) == len(self.bt)
    # Get the metadata features
    if cell_line is None:
        if RNAseq_PC_file is None:
            raise ValueError("RNAseq_PC_file has to be specified when cell_line=None")
        assert os.path.exists(RNAseq_PC_file)
    else:
        # Using the pre-defined cell-line
        rp = os.path.join(this_dir, "dataloader_files/RNAseq_features/")
        RNAseq_PC_file = os.path.join(rp, cell_line, "meta.txt")
    # First column of the tab-separated metadata file becomes the feature
    # vector for this cell line.
    self.meta_feat = pd.read_csv(RNAseq_PC_file,
                                 sep="\t", header=None)[0].values
def download(filename):
    """Fetch *filename* from the configured base ``url`` unless a local copy
    exists, then return the whitespace-split tokens of the first member of
    the zip archive."""
    if not os.path.exists(filename):
        filename, _ = urlretrieve(url + filename, filename)
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        words = tf.compat.as_str(archive.read(first_member)).split()
    return words
from six.moves.urllib.request import urlretrieve

# Fetch the small notMNIST archive into the current directory.
url = 'http://yaroslavvb.com/upload/notMNIST/'
filename = 'notMNIST_small.tar.gz'
# urlretrieve returns (local_filename, headers); rebind `filename` to the
# local path so later code in this file can open it.
filename, _ = urlretrieve(url + filename, filename)
def test_SlicerRadiomics1(self):
    """ Ideally you should have several levels of tests.  At the lowest level
    tests should exercise the functionality of the logic with different inputs
    (both valid and invalid).  At higher levels your tests should emulate the
    way the user would interact with your code and confirm that it still works
    the way you intended.
    One of the most important features of the tests is that it should alert other
    developers when their changes will have an impact on the behavior of your
    module.  For example, if a developer removes a feature that you depend on,
    your test should break so they know that the feature is needed.
    """
    self.delayDisplay('Starting the test')
    #
    # first, get some data
    # https://github.com/Radiomics/SlicerRadiomics/releases/download/TestData-v1.0.0/lung1_binary.seg.nrrd
    from six.moves.urllib.request import urlretrieve

    # Test fixtures published as GitHub release assets; each entry pairs a
    # file name with the Slicer loader used to bring it into the scene
    # (None = download only, used indirectly by other fixtures).
    dataRelease = 'v1.0.0'
    dataURLPrefix = 'https://github.com/Radiomics/SlicerRadiomics/releases/download/TestData'
    dataItems = (('lung1_image.nrrd', slicer.util.loadVolume),
                 ('lung1_label.nrrd', slicer.util.loadLabelVolume),
                 ('lung1_binary.seg.nrrd', slicer.util.loadSegmentation),
                 ('lung1.seg_0.vtp', None),
                 ('lung1.seg_1.vtp', None),
                 ('lung1_surface.seg.vtm', slicer.util.loadSegmentation),
                 ('Params.yaml', None))

    for item, loader in dataItems:
        url = dataURLPrefix + '-' + dataRelease + '/' + item
        filePath = os.path.join(slicer.app.temporaryPath, item)
        # Re-download when missing or a previous download left an empty file.
        if not os.path.exists(filePath) or os.stat(filePath).st_size == 0:
            self.logger.info('Requesting download %s from %s...\n' % (item, url))
            self.assertTrue(urlretrieve(url, filePath),
                            'Failed to download from ' + url)
        if loader:
            self.logger.info('Loading %s from %s...' % (item, filePath))
            self.assertTrue(loader(filePath), 'Failed to load ' + item)

    self.delayDisplay(
        'Finished with download and loading %d volumes' %
        (slicer.mrmlScene.GetNumberOfNodesByClass('vtkMRMLVolumeNode')))

    # Look up the nodes created by the loaders above.
    grayscaleNode = slicer.util.getNode(pattern='lung1_image')
    labelmapNode = slicer.util.getNode(pattern='lung1_label')
    binaryNode = slicer.util.getNode(pattern='lung1_binary')
    surfaceNode = slicer.util.getNode(pattern='lung1_surface')
    parameterFile = os.path.join(slicer.app.temporaryPath, 'Params.yaml')

    logic = SlicerRadiomicsLogic()
    logic.runSync = True  # Block Thread until each extraction is done (i.e. run synchronously)
    self.assertIsNotNone(logic.hasImageData(grayscaleNode))
    self.assertIsNotNone(logic.hasImageData(labelmapNode))

    # Minimal extraction configuration exercised against each mask type.
    featureClasses = ['firstorder']
    settings = {'binWidth': 25, 'symmetricalGLCM': False, 'label': 1}
    enabledImageTypes = {"Original": {}}

    # First pass: extraction driven by explicit settings.
    for maskNode in [labelmapNode, binaryNode, surfaceNode]:
        tableNode = slicer.vtkMRMLTableNode()
        tableNode.SetName('lung1_label and ' + maskNode.GetName())
        slicer.mrmlScene.AddNode(tableNode)
        # No callback needed as tests are run synchronously
        logic.runCLI(grayscaleNode, maskNode, tableNode, featureClasses,
                     settings, enabledImageTypes)
        logic.showTable(tableNode)

    # Second pass: same masks, but configuration comes from Params.yaml.
    for maskNode in [labelmapNode, binaryNode, surfaceNode]:
        tableNode = slicer.vtkMRMLTableNode()
        tableNode.SetName('lung1_label and ' + maskNode.GetName() +
                          ' customized with Params.yaml')
        slicer.mrmlScene.AddNode(tableNode)
        # No callback needed as tests are run synchronously
        logic.runCLIWithParameterFile(grayscaleNode, maskNode, tableNode,
                                      parameterFile)
        logic.showTable(tableNode)

    self.delayDisplay('Test passed!')
def download_dataset(fname, origin, untar=False):
    """Download a dataset, if not already there.

    Parameters
    ----------

    fname: str
        Full filename of dataset, e.g. ``mnist.pkl.gz``.
    origin: str
        Location of dataset, e.g. url
        https://s3.amazonaws.com/img-datasets/mnist.pkl.gz
    untar: Optional[bool]
        If ``True``, untar file.

    Returns
    -------

    fpath: str
        The path to the downloaded dataset. If the user has write access to
        ``home``, the dataset will be stored in ``~/.snntoolbox/datasets/``,
        otherwise in ``/tmp/.snntoolbox/datasets/``.

    Notes
    -----

    Test under python2.
    """
    import tarfile
    import shutil
    from six.moves.urllib.error import URLError, HTTPError
    # Under Python 2, 'urlretrieve' relies on FancyURLopener from legacy
    # urllib module, known to have issues with proxy management
    from six.moves.urllib.request import urlretrieve

    datadir_base = os.path.expanduser(os.path.join('~', '.snntoolbox'))
    if not os.access(datadir_base, os.W_OK):
        # Fall back to /tmp when the home cache is not writable.
        datadir_base = os.path.join('/tmp', '.snntoolbox')
    datadir = os.path.join(datadir_base, 'datasets')
    if not os.path.exists(datadir):
        os.makedirs(datadir)

    untar_fpath = None
    if untar:
        untar_fpath = os.path.join(datadir, fname)
        fpath = untar_fpath + '.tar.gz'
    else:
        fpath = os.path.join(datadir, fname)

    if not os.path.exists(fpath):
        print("Downloading data from {}".format(origin))
        error_msg = 'URL fetch failure on {}: {} -- {}'
        try:
            try:
                urlretrieve(origin, fpath)
            # BUGFIX: HTTPError is a subclass of URLError, so it must be
            # caught FIRST; the original order made this branch unreachable.
            except HTTPError as e:
                raise Exception(error_msg.format(origin, e.code, e.msg))
            except URLError as e:
                raise Exception(error_msg.format(origin, e.errno, e.reason))
        except (Exception, KeyboardInterrupt):
            # Never leave a partial download behind; bare `raise`
            # preserves the original traceback (the old `raise e` did not).
            if os.path.exists(fpath):
                os.remove(fpath)
            raise

    if untar:
        if not os.path.exists(untar_fpath):
            print("Untaring file...\n")
            # BUGFIX: context manager closes the tar handle even when
            # extraction raises (the original leaked it on error).
            with tarfile.open(fpath, 'r:gz') as tfile:
                try:
                    tfile.extractall(path=datadir)
                except (Exception, KeyboardInterrupt):
                    # Remove a partially-extracted result before re-raising.
                    if os.path.exists(untar_fpath):
                        if os.path.isfile(untar_fpath):
                            os.remove(untar_fpath)
                        else:
                            shutil.rmtree(untar_fpath)
                    raise
        return untar_fpath

    return fpath
def load_word_vectors(root, wv_type, dim):
    """ From https://github.com/pytorch/text/

    BSD 3-Clause License

    Copyright (c) James Bradbury and Soumith Chintala 2016,
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice, this
      list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
      this list of conditions and the following disclaimer in the documentation
      and/or other materials provided with the distribution.

    * Neither the name of the copyright holder nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
    DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
    OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    """
    """Load word vectors from a path, trying .pt, .txt, and .zip extensions."""
    # Dimension may be given as an int (e.g. 300) or a string ('300d').
    if isinstance(dim, int):
        dim = str(dim) + 'd'
    fname = os.path.join(root, wv_type + '.' + dim)
    # Fast path: a pre-serialized torch cache exists.
    if os.path.isfile(fname + '.pt'):
        fname_pt = fname + '.pt'
        print('loading word vectors from', fname_pt)
        return torch.load(fname_pt)
    if os.path.isfile(fname + '.txt'):
        # Plain-text vectors: read all lines as bytes for parsing below.
        fname_txt = fname + '.txt'
        cm = open(fname_txt, 'rb')
        cm = [line for line in cm]
    elif os.path.basename(wv_type) in URL:
        # Not on disk but downloadable: fetch the zip, extract, then recurse
        # so the freshly-extracted .txt file is picked up (and cached as .pt).
        url = URL[wv_type]
        print('downloading word vectors from {}'.format(url))
        filename = os.path.basename(fname)
        if not os.path.exists(root):
            os.makedirs(root)
        with tqdm(unit='B', unit_scale=True, miniters=1, desc=filename) as t:
            fname, _ = urlretrieve(url, fname, reporthook=reporthook(t))
        with zipfile.ZipFile(fname, "r") as zf:
            print('extracting word vectors into {}'.format(root))
            zf.extractall(root)
        if not os.path.isfile(fname + '.txt'):
            raise RuntimeError('no word vectors of requested dimension found')
        return load_word_vectors(root, wv_type, dim)
    else:
        raise RuntimeError('unable to load word vectors %s from %s' % (wv_type, root))

    # Parse the text lines into tokens and a flat float array.
    wv_tokens, wv_arr, wv_size = [], array.array('d'), None
    if cm is not None:
        print("Loading word vectors from {}".format(fname_txt))
        for line in trange(len(cm)):
            # Each line: token followed by its vector components.
            entries = cm[line].strip().split(b' ')
            word, entries = entries[0], entries[1:]
            if wv_size is None:
                # Vector dimensionality is taken from the first line.
                wv_size = len(entries)
            try:
                if isinstance(word, six.binary_type):
                    word = word.decode('utf-8')
            except:
                # Skip tokens whose bytes are not valid UTF-8.
                print('non-UTF8 token', repr(word), 'ignored')
                continue
            wv_arr.extend(float(x) for x in entries)
            wv_tokens.append(word)

    # Build the lookup dict and the (num_tokens, wv_size) tensor, then
    # cache the triple as a .pt file for fast reloads.
    wv_dict = {word: i for i, word in enumerate(wv_tokens)}
    wv_arr = torch.Tensor(wv_arr).view(-1, wv_size)
    ret = (wv_dict, wv_arr, wv_size)
    torch.save(ret, fname + '.pt')
    return ret
def download_and_extract_zip_file(url, targetdir='.', verbose=True):
    """Download the zip archive at *url* and extract it into *targetdir*,
    skipping the download when a sibling ``<url>.contents`` manifest shows
    every listed entry already present with the right size.

    NOTE(review): relies on module-level `os`, `Path` and `consume` —
    confirm against the enclosing module.
    """
    import csv
    from six.moves.urllib.request import urlretrieve
    from six.moves.urllib.parse import urlparse
    from zipfile import ZipFile

    res = urlparse(url)
    if res.scheme in ('', 'file'):
        url = Path(res.path).resolve().as_uri()
        # local file, 'urlretrieve' will not make a copy
        # -> don't delete 'downloaded' file
        delete = False
    else:
        delete = True

    # verbosity levels:
    # - 0: no messages
    # - 1: status messages
    # - 2: status messages and list of all files
    if isinstance(verbose, bool):
        verbose *= 2
    log = (print) if verbose else (lambda *a, **k: None)

    targetdir = Path(targetdir)
    if not targetdir.is_dir():
        targetdir.mkdir(parents=True, exist_ok=True)

    # Filled with the relative paths either verified on disk (manifest
    # check) or extracted from the archive; used for the verbose listing.
    provided = []

    def content_is_missing():
        # Fetch the tab-separated "<size>\t<relpath>" manifest; any failure
        # (missing manifest, bad parse) is treated as "content missing" so
        # the full download proceeds.
        try:
            filepath, http_msg = urlretrieve(url + '.contents')
            with open(filepath, 'r') as contents_file:
                contents = list(csv.reader(contents_file, delimiter='\t'))
        except:
            return True
        finally:
            # Best-effort cleanup of the temporary manifest copy.
            if delete:
                try:
                    os.unlink(filepath)
                except:
                    pass
        for size, relpath in contents:
            size, relpath = int(size.strip()), relpath.strip()
            entry = targetdir / relpath
            if not entry.exists():
                return True
            # Entry kind must agree with the manifest (trailing '/' marks a
            # directory) and file sizes must match exactly.
            if entry.is_dir():
                if not relpath.endswith('/'):
                    return True
            elif entry.is_file():
                if relpath.endswith('/') or entry.stat().st_size != size:
                    return True
            else:
                return True
            provided.append(relpath)
        return False

    if content_is_missing():
        try:
            log('Files missing, downloading...', end='')
            filepath, http_msg = urlretrieve(url)
            with ZipFile(filepath, 'r') as zip_file:
                log(' extracting...', end='')
                zip_file.extractall(str(targetdir))
                provided = zip_file.namelist()
            log(' done.')
        finally:
            # Best-effort cleanup of the temporary archive copy.
            if delete:
                try:
                    os.unlink(filepath)
                except:
                    pass
    else:
        log('Files found, nothing to download.')

    if verbose > 1:
        log('\n' + str(targetdir) + ':')
        consume(map(lambda x: log('-', Path(x)), provided))
def cache(self, name, cache, url=None):
    """Load vectors named *name*, using a serialized ``.pt`` cache under
    *cache* when available; otherwise (optionally downloading and
    extracting from *url* first) parse the raw vector file and populate
    ``self.itos`` / ``self.stoi`` / ``self.vectors`` / ``self.dim``,
    saving the parsed result back as ``.pt``.
    """
    if os.path.isfile(name):
        # *name* is already a path to a vector file on disk.
        path = name
        path_pt = os.path.join(cache, os.path.basename(name)) + '.pt'
    else:
        path = os.path.join(cache, name)
        path_pt = path + '.pt'

    if not os.path.isfile(path_pt):
        if not os.path.isfile(path) and url:
            # Raw vectors missing: download the archive, then extract.
            logger.info('Downloading vectors from {}'.format(url))
            if not os.path.exists(cache):
                os.makedirs(cache)
            dest = os.path.join(cache, os.path.basename(url))
            if not os.path.isfile(dest):
                with tqdm(unit='B', unit_scale=True, miniters=1, desc=dest) as t:
                    try:
                        urlretrieve(url, dest, reporthook=reporthook(t))
                    except KeyboardInterrupt as e:
                        # remove the partial zip file
                        os.remove(dest)
                        raise e
            logger.info('Extracting vectors into {}'.format(cache))
            ext = os.path.splitext(dest)[1][1:]
            if ext == 'zip':
                with zipfile.ZipFile(dest, "r") as zf:
                    zf.extractall(cache)
            elif ext == 'gz':
                with tarfile.open(dest, 'r:gz') as tar:
                    tar.extractall(path=cache)
        if not os.path.isfile(path):
            raise RuntimeError('no vectors found at {}'.format(path))

        # str call is necessary for Python 2/3 compatibility, since
        # argument must be Python 2 str (Python 3 bytes) or
        # Python 3 str (Python 2 unicode)
        itos, vectors, dim = [], array.array(str('d')), None

        # Try to read the whole file with utf-8 encoding.
        binary_lines = False
        try:
            with io.open(path, encoding="utf8") as f:
                lines = [line for line in f]
        # If there are malformed lines, read in binary mode
        # and manually decode each word from utf-8
        except:
            logger.warning("Could not read {} as UTF8 file, "
                           "reading file as bytes and skipping "
                           "words with malformed UTF8.".format(path))
            with open(path, 'rb') as f:
                lines = [line for line in f]
            binary_lines = True

        logger.info("Loading vectors from {}".format(path))
        for line in tqdm(lines, total=len(lines)):
            # Explicitly splitting on " " is important, so we don't
            # get rid of Unicode non-breaking spaces in the vectors.
            entries = line.rstrip().split(b" " if binary_lines else " ")
            word, entries = entries[0], entries[1:]
            if dim is None and len(entries) > 1:
                # First real vector line fixes the dimensionality.
                dim = len(entries)
            elif len(entries) == 1:
                logger.warning("Skipping token {} with 1-dimensional "
                               "vector {}; likely a header".format(
                                   word, entries))
                continue
            elif dim != len(entries):
                raise RuntimeError(
                    "Vector for token {} has {} dimensions, but previously "
                    "read vectors have {} dimensions. All vectors must have "
                    "the same number of dimensions.".format(
                        word, len(entries), dim))

            if binary_lines:
                try:
                    if isinstance(word, six.binary_type):
                        word = word.decode('utf-8')
                except:
                    # Skip tokens whose bytes are not valid UTF-8.
                    logger.info("Skipping non-UTF8 token {}".format(
                        repr(word)))
                    continue
            vectors.extend(float(x) for x in entries)
            itos.append(word)

        self.itos = itos
        self.stoi = {word: i for i, word in enumerate(itos)}
        self.vectors = torch.Tensor(vectors).view(-1, dim)
        self.dim = dim
        logger.info('Saving vectors to {}'.format(path_pt))
        if not os.path.exists(cache):
            os.makedirs(cache)
        torch.save((self.itos, self.stoi, self.vectors, self.dim), path_pt)
    else:
        # Fast path: load the previously serialized tuple.
        logger.info('Loading vectors from {}'.format(path_pt))
        self.itos, self.stoi, self.vectors, self.dim = torch.load(path_pt)
import os # make dependency directory if not exists('deps'): mkdir('deps') os.system('pip install cython') from Cython.Build import cythonize # download Eigen if we don't have it in deps eigenurl = 'http://bitbucket.org/eigen/eigen/get/3.2.6.tar.gz' eigentarpath = join('deps', 'Eigen.tar.gz') eigenpath = join('deps', 'Eigen') if not exists(eigenpath): print('Downloading Eigen...') urlretrieve(eigenurl, eigentarpath) with tarfile.open(eigentarpath, 'r') as tar: tar.extractall('deps') thedir = glob(join('deps', 'eigen-eigen-*'))[0] move(join(thedir, 'Eigen'), eigenpath) print('...done!') setup(name='autoregressive', version='0.1.1', description= 'Extension for switching vector autoregressive models with pyhsmm', author='Matthew James Johnson', author_email='*****@*****.**', url='https://github.com/mattjj/pyhsmm-autoregressive', license='GPL', packages=['autoregressive'],
def _download(self, url, options): # pylint: disable=unused-argument return urlretrieve(url)[0] # nocv
def get_file(filename: str, url: str, path: Optional[str] = None, extract: bool = False,
             verbose: bool = False) -> str:
    """
    Downloads a file from a URL if it is not already in the cache. The file indicated by `url` is
    downloaded to the folder `path` (default is `~/.art/data`) and given the name `filename`. Files
    in tar, tar.gz, tar.bz, and zip formats can also be extracted. This is a simplified version of
    the function with the same name in Keras.

    :param filename: Name of the file.
    :param url: Download URL.
    :param path: Folder to store the download. If not specified, `~/.art/data` is used instead.
    :param extract: If true, tries to extract the archive.
    :param verbose: If true, print download progress bar.
    :return: Path to the downloaded file.
    """
    # Resolve the cache directory; fall back to /tmp/.art when the target is not writable.
    if path is None:
        path_ = os.path.expanduser(config.ART_DATA_PATH)
    else:
        path_ = os.path.expanduser(path)
    if not os.access(path_, os.W_OK):
        path_ = os.path.join("/tmp", ".art")
    if not os.path.exists(path_):
        os.makedirs(path_)

    if extract:
        extract_path = os.path.join(path_, filename)
        full_path = extract_path + ".tar.gz"
    else:
        full_path = os.path.join(path_, filename)

    # Determine if dataset needs downloading
    download = not os.path.exists(full_path)

    if download:
        logger.info("Downloading data from %s", url)
        error_msg = "URL fetch failure on {}: {} -- {}"
        try:
            try:
                from six.moves.urllib.error import HTTPError, URLError
                from six.moves.urllib.request import urlretrieve

                # SECURITY NOTE: the next lines disable TLS certificate verification globally to
                # work around occasional [SSL: CERTIFICATE_VERIFY_FAILED] errors. This weakens
                # transport security for every subsequent HTTPS request in this process.
                import ssl

                ssl._create_default_https_context = ssl._create_unverified_context  # pylint: disable=W0212

                if verbose:
                    with tqdm() as t_bar:
                        last_block = [0]

                        def progress_bar(blocks: int = 1, block_size: int = 1,
                                         total_size: Optional[int] = None):
                            """
                            :param blocks: Number of blocks transferred so far [default: 1].
                            :param block_size: Size of each block (in tqdm units) [default: 1].
                            :param total_size: Total size (in tqdm units). If [default: None]
                                               or -1, remains unchanged.
                            """
                            if total_size not in (None, -1):
                                t_bar.total = total_size
                            displayed = t_bar.update((blocks - last_block[0]) * block_size)
                            last_block[0] = blocks
                            return displayed

                        urlretrieve(url, full_path, reporthook=progress_bar)
                else:
                    urlretrieve(url, full_path)

            # FIX: chain from the caught exception *instance*. The original used
            # `from HTTPError` (the class) in both handlers, which discards the
            # actual cause from the resulting traceback.
            except HTTPError as exception:
                raise Exception(error_msg.format(url, exception.code, exception.msg)) from exception  # type: ignore
            except URLError as exception:
                raise Exception(error_msg.format(url, exception.errno, exception.reason)) from exception
        except (Exception, KeyboardInterrupt):
            # Remove a partially written file so a later call retries the download cleanly.
            if os.path.exists(full_path):
                os.remove(full_path)
            raise

    if extract:
        if not os.path.exists(extract_path):
            _extract(full_path, path_)
        return extract_path

    return full_path
def _download_file(file_name, url, local_folder, cache_subdir, file_hash=None, cache_dir=None, verbose=True): """ Downloads the specified file from the Heavily inspired by and lovingly adapted from keras' `get_file` function: https://github.com/fchollet/keras/blob/afbd5d34a3bdbb0916d558f96af197af1e92ce70/keras/utils/data_utils.py#L109 Args: file_name: (String) name of the file located on the server url: (String) url of the file local_folder: (String) alternate folder in which to download the file cache_subdir: (String) subdirectory of folder in which to download flie file_hash: (String) expected hash of downloaded file cache_dir: Returns: (String) local path to downloaded file """ if local_folder not in [None, '']: # local folder provided, let's create it if it doesn't exist and use it as datadir if not os.path.exists(os.path.expanduser(local_folder)): os.makedirs(os.path.expanduser(local_folder)) datadir = os.path.expanduser(local_folder) else: if cache_dir is None: cache_dir = os.path.expanduser(os.path.join('~', '.nussl')) datadir_base = os.path.expanduser(cache_dir) if not os.access(datadir_base, os.W_OK): datadir_base = os.path.join('/tmp', '.nussl') datadir = os.path.join(datadir_base, cache_subdir) if not os.path.exists(datadir): os.makedirs(datadir) file_path = os.path.join(datadir, file_name) download = False if os.path.exists(file_path): if file_hash is not None: # compare the provided hash with the hash of the file currently at file_path current_hash = _hash_file(file_path) # if the hashes are equal, we already have the file we need, so don't download if file_hash != current_hash: if verbose: warnings.warn("Hash for {} does not match known hash. 
" "Downloading {} from servers...".format( file_path, file_name)) download = True elif verbose: print('Matching file found at {}, skipping download.'.format( file_path)) else: download = True else: download = True if download: if verbose: print('Saving file at {}'.format(file_path)) print('Downloading {} from {}'.format(file_name, url)) def _dl_progress(count, block_size, total_size): percent = int(count * block_size * 100 / total_size) if percent <= 100: sys.stdout.write('\r{}...{}%'.format(file_name, percent)) sys.stdout.flush() error_msg = 'URL fetch failure on {}: {} -- {}' try: try: reporthook = _dl_progress if verbose else None urlretrieve(url, file_path, reporthook) if verbose: print() # print a new line after the progress is done. except HTTPError as e: raise FailedDownloadError(error_msg.format(url, e.code, e.msg)) except URLError as e: raise FailedDownloadError( error_msg.format(url, e.errno, e.reason)) except (Exception, KeyboardInterrupt) as e: if os.path.exists(file_path): os.remove(file_path) raise e # check hash of received file to see if it matches the provided hash if file_hash is not None: download_hash = _hash_file(file_path) if file_hash != download_hash: # the downloaded file is not what it should be. Get rid of it. os.remove(file_path) raise MismatchedHashError( "Downloaded file ({}) has been deleted " "because of a hash mismatch.".format(file_path)) return file_path else: return file_path
def readtext(self, image, decoder = 'greedy', beamWidth= 5, batch_size = 1,\
             workers = 0, allowlist = None, blocklist = None, detail = 1,\
             paragraph = False,\
             contrast_ths = 0.1,adjust_contrast = 0.5, filter_ths = 0.003,\
             text_threshold = 0.7, low_text = 0.4, link_threshold = 0.4,\
             canvas_size = 2560, mag_ratio = 1.,\
             slope_ths = 0.1, ycenter_ths = 0.5, height_ths = 0.5,\
             width_ths = 0.5, add_margin = 0.1):
    '''
    Detect and recognize text in ``image``.

    Parameters:
    image: file path or URL string, numpy-array (grayscale or BGR), or a
        byte stream object holding an encoded image
    decoder: decoding scheme passed to get_text; forced to 'greedy' for
        CJK models below
    detail: 0 -> return only the recognized strings; otherwise the full
        result items produced by get_text
    paragraph: if True, post-process the results with get_paragraph

    The threshold/geometry arguments (text_threshold, low_text,
    link_threshold, canvas_size, mag_ratio, slope_ths, ycenter_ths,
    height_ths, width_ths, add_margin) are passed through to the detector
    (get_textbox) and box grouping (group_text_box); allowlist/blocklist
    restrict the recognizable character set.
    '''
    if type(image) == str:
        if image.startswith('http://') or image.startswith('https://'):
            # Remote image: download to a temp file only to build the
            # grayscale copy, then delete it; `image` stays the URL and
            # loadImage below handles it (per the inline note).
            tmp, _ = urlretrieve(image, reporthook=printProgressBar(
                prefix='Progress:', suffix='Complete', length=50))
            img_cv_grey = cv2.imread(tmp, cv2.IMREAD_GRAYSCALE)
            os.remove(tmp)
        else:
            img_cv_grey = cv2.imread(image, cv2.IMREAD_GRAYSCALE)
            image = os.path.expanduser(image)
        img = loadImage(image)  # can accept URL
    elif type(image) == bytes:
        # Encoded image bytes: decode with OpenCV, keep an RGB copy for
        # detection and a grayscale copy for recognition.
        nparr = np.frombuffer(image, np.uint8)
        img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img_cv_grey = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    elif type(image) == np.ndarray:
        if len(image.shape) == 2: # grayscale
            img_cv_grey = image
            img = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
        elif len(image.shape) == 3: # BGRscale
            img = image
            img_cv_grey = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Detection: raw text boxes from the detector model.
    text_box = get_textbox(self.detector, img, canvas_size, mag_ratio, text_threshold,\
                           link_threshold, low_text, False, self.device)
    # Group raw boxes into horizontal lines and free-form regions.
    horizontal_list, free_list = group_text_box(text_box, slope_ths, ycenter_ths,
                                                height_ths, width_ths, add_margin)

    # should add filter to screen small box out
    image_list, max_width = get_image_list(horizontal_list, free_list, img_cv_grey,
                                           model_height=imgH)

    # Build the set of characters the recognizer must ignore: everything
    # outside the allowlist, everything in the blocklist, or (default)
    # everything outside this model's language characters.
    if allowlist:
        ignore_char = ''.join(set(self.character) - set(allowlist))
    elif blocklist:
        ignore_char = ''.join(set(blocklist))
    else:
        ignore_char = ''.join(set(self.character) - set(self.lang_char))

    # CJK models only support greedy decoding here.
    if self.model_lang in [ 'chinese_tra', 'chinese_sim', 'japanese', 'korean' ]: decoder = 'greedy'

    # Recognition over the cropped line images.
    result = get_text(self.character, imgH, int(max_width), self.recognizer, self.converter, image_list,\
                      ignore_char, decoder, beamWidth, batch_size, contrast_ths, adjust_contrast, filter_ths,\
                      workers, self.device)

    if self.model_lang == 'arabic':
        # Right-to-left script: reorder each recognized string for display.
        direction_mode = 'rtl'
        result = [list(item) for item in result]
        for item in result:
            item[1] = get_display(item[1])
    else:
        direction_mode = 'ltr'

    if paragraph:
        result = get_paragraph(result, mode=direction_mode)

    if detail == 0:
        # Text-only output: item[1] is the recognized string.
        return [item[1] for item in result]
    else:
        return result
is_training=False, reuse=reuse) logits = logits[:, 1:] probs = tf.nn.softmax(logits) return logits, probs, end_points logits, probs, end_points = network(image, reuse=False) checkpoint_filename = "./inception_v3.ckpt" if not os.path.exists(checkpoint_filename): inception_tarball, _ = urlretrieve( "http://download.tensorflow.org/models/inception_v3_2016_08_28.tar.gz") tarfile.open(inception_tarball, 'r:gz').extractall("./") restore_vars = [ var for var in tf.global_variables() if var.name.startswith('InceptionV3/') ] saver = tf.train.Saver(restore_vars) saver.restore(sess, "./inception_v3.ckpt") def get_feature(img, feature_layer_name): p, feature_values = sess.run([probs, end_points], feed_dict={image: img})
def cache(self, name, cache, url=None, max_vectors=None):
    """Load word vectors, caching a serialized `.pt` copy for fast reloads.

    On first use the raw vector file `name` (optionally downloaded from
    `url` and extracted) is parsed and the result is saved to a `.pt`
    file inside `cache`; later calls load that file directly.

    Populates `self.itos`, `self.stoi`, `self.vectors` and `self.dim`.
    """
    # Resolve the raw-vector path and the cached `.pt` path. When
    # max_vectors is set, the cache file name carries it as a suffix so
    # different truncations don't collide.
    if os.path.isfile(name):
        path = name
        if max_vectors:
            file_suffix = '_{}.pt'.format(max_vectors)
        else:
            file_suffix = '.pt'
        path_pt = os.path.join(cache, os.path.basename(name)) + file_suffix
    else:
        path = os.path.join(cache, name)
        if max_vectors:
            file_suffix = '_{}.pt'.format(max_vectors)
        else:
            file_suffix = '.pt'
        path_pt = path + file_suffix

    if not os.path.isfile(path_pt):
        # No cached tensor yet: obtain the raw file (downloading and
        # extracting if needed), parse it, and save the cache.
        if not os.path.isfile(path) and url:
            logger.info('Downloading vectors from {}'.format(url))
            if not os.path.exists(cache):
                os.makedirs(cache)
            dest = os.path.join(cache, os.path.basename(url))
            if not os.path.isfile(dest):
                with tqdm(unit='B', unit_scale=True, miniters=1, desc=dest) as t:
                    try:
                        urlretrieve(url, dest, reporthook=reporthook(t))
                    except KeyboardInterrupt as e:
                        # remove the partial zip file so a retry starts clean
                        os.remove(dest)
                        raise e
            logger.info('Extracting vectors into {}'.format(cache))
            ext = os.path.splitext(dest)[1][1:]
            if ext == 'zip':
                with zipfile.ZipFile(dest, "r") as zf:
                    zf.extractall(cache)
            elif ext == 'gz':
                if dest.endswith('.tar.gz'):
                    with tarfile.open(dest, 'r:gz') as tar:
                        tar.extractall(path=cache)
        if not os.path.isfile(path):
            raise RuntimeError('no vectors found at {}'.format(path))

        logger.info("Loading vectors from {}".format(path))
        ext = os.path.splitext(path)[1][1:]
        if ext == 'gz':
            open_file = gzip.open
        else:
            open_file = open

        vectors_loaded = 0
        with open_file(path, 'rb') as f:
            # _infer_shape pre-scans the file for line count and dimension
            # so the result tensor can be allocated up front.
            num_lines, dim = _infer_shape(f)
            if not max_vectors or max_vectors > num_lines:
                max_vectors = num_lines

            # `dim` is reset to None here and re-derived from the first
            # real vector line; the pre-allocated tensor already uses the
            # inferred dimension.
            itos, vectors, dim = [], torch.zeros((max_vectors, dim)), None

            for line in tqdm(f, total=num_lines):
                # Explicitly splitting on " " is important, so we don't
                # get rid of Unicode non-breaking spaces in the vectors.
                entries = line.rstrip().split(b" ")

                word, entries = entries[0], entries[1:]
                if dim is None and len(entries) > 1:
                    dim = len(entries)
                elif len(entries) == 1:
                    # A single-value line is almost certainly the
                    # "<count> <dim>" header some formats carry.
                    logger.warning("Skipping token {} with 1-dimensional "
                                   "vector {}; likely a header".format(word, entries))
                    continue
                elif dim != len(entries):
                    raise RuntimeError(
                        "Vector for token {} has {} dimensions, but previously "
                        "read vectors have {} dimensions. All vectors must have "
                        "the same number of dimensions.".format(word, len(entries),
                                                                dim))

                try:
                    if isinstance(word, six.binary_type):
                        word = word.decode('utf-8')
                except UnicodeDecodeError:
                    logger.info("Skipping non-UTF8 token {}".format(repr(word)))
                    continue

                vectors[vectors_loaded] = torch.tensor([float(x) for x in entries])
                vectors_loaded += 1
                itos.append(word)

                if vectors_loaded == max_vectors:
                    break

        self.itos = itos
        self.stoi = {word: i for i, word in enumerate(itos)}
        # NOTE(review): if any lines were skipped above, the trailing rows
        # of the pre-allocated tensor stay zero and are still included in
        # self.vectors (its row count is max_vectors, not vectors_loaded)
        # — confirm this is the intended behavior.
        self.vectors = torch.Tensor(vectors).view(-1, dim)
        self.dim = dim
        logger.info('Saving vectors to {}'.format(path_pt))
        if not os.path.exists(cache):
            os.makedirs(cache)
        torch.save((self.itos, self.stoi, self.vectors, self.dim), path_pt)
    else:
        logger.info('Loading vectors from {}'.format(path_pt))
        self.itos, self.stoi, self.vectors, self.dim = torch.load(path_pt)
def _download(self, url: Text, options) -> FILENAME:  # pylint: disable=unused-argument
    """Retrieve ``url`` to a local temporary file and return its path.

    ``options`` is accepted for call-signature compatibility but ignored.
    """
    # urlretrieve returns (filename, headers); only the filename is needed.
    return urlretrieve(url)[0]  # nocv
def load_word_vectors(root, wv_type, dim):
    """Load word vectors from a path, trying .pt, .txt, and .zip extensions.

    Args:
        root: directory containing (or receiving) the vector files.
        wv_type: base name of the vector set; also the key into the
            module-level ``URL`` table when a download is required.
        dim: vector dimensionality, as an int (e.g. 300) or string ('300d').

    Returns:
        Tuple ``(wv_dict, wv_arr, wv_size)``: word -> index dict, a
        FloatTensor of shape [n_words, wv_size], and the dimensionality.

    Raises:
        RuntimeError: when no vector source of the requested dimension exists.
    """
    if isinstance(dim, int):
        dim = str(dim) + 'd'
    fname = os.path.join(root, wv_type + '.' + dim)

    # Fast path: a cached tensor saved by a previous call.
    if os.path.isfile(fname + '.pt'):
        fname_pt = fname + '.pt'
        print('loading word vectors from', fname_pt)
        try:
            return torch.load(fname_pt)
        except Exception as e:
            print("""
                Error loading the model from {}

                This could be because this code was previously run with one
                PyTorch version to generate cached data and is now being
                run with another version.
                You can try to delete the cached files on disk (this file
                  and others) and re-running the code

                Error message:
                ---------
                {}
                """.format(fname_pt, str(e)))
            sys.exit(-1)

    if os.path.isfile(fname + '.txt'):
        fname_txt = fname + '.txt'
        cm = open(fname_txt, 'rb')
        cm = [line for line in cm]
    # NOTE(review): membership is tested on basename(wv_type) but the
    # lookup uses wv_type itself — confirm these agree for all keys in URL.
    elif os.path.basename(wv_type) in URL:
        url = URL[wv_type]
        print('downloading word vectors from {}'.format(url))
        filename = os.path.basename(fname)
        if not os.path.exists(root):
            os.makedirs(root)
        with tqdm(unit='B', unit_scale=True, miniters=1, desc=filename) as t:
            fname, _ = urlretrieve(url, fname, reporthook=reporthook(t))
            with zipfile.ZipFile(fname, "r") as zf:
                print('extracting word vectors into {}'.format(root))
                zf.extractall(root)
        if not os.path.isfile(fname + '.txt'):
            raise RuntimeError('no word vectors of requested dimension found')
        # Recurse once: the .txt now exists and the branch above parses it.
        return load_word_vectors(root, wv_type, dim)
    else:
        raise RuntimeError('unable to load word vectors')

    wv_tokens, wv_arr, wv_size = [], array.array('d'), None
    if cm is not None:
        for line in tqdm(range(len(cm)),
                         desc="loading word vectors from {}".format(fname_txt)):
            entries = cm[line].strip().split(b' ')
            word, entries = entries[0], entries[1:]
            if wv_size is None:
                wv_size = len(entries)
            try:
                if isinstance(word, six.binary_type):
                    word = word.decode('utf-8')
            # FIX: the original bare `except:` also swallowed
            # KeyboardInterrupt/SystemExit; only a failed UTF-8 decode
            # should cause a token to be skipped.
            except UnicodeDecodeError:
                print('non-UTF8 token', repr(word), 'ignored')
                continue
            wv_arr.extend(float(x) for x in entries)
            wv_tokens.append(word)

    wv_dict = {word: i for i, word in enumerate(wv_tokens)}
    wv_arr = torch.Tensor(wv_arr).view(-1, wv_size)
    ret = (wv_dict, wv_arr, wv_size)
    # Cache the parsed result so the .pt fast path is hit next time.
    torch.save(ret, fname + '.pt')
    return ret
def get_file(fname, origin, save_path, untar=False, md5_hash=None, cache_subdir='datasets'):
    """Downloads a file from a URL if it not already in the cache.

    Passing the MD5 hash will verify the file after download as well as if it
    is already present in the cache. Usually it downloads the file to
    save_path/cache_subdir/fname

    Arguments
    ---------
        fname: name of the file
        origin: original URL of the file
        save_path: path to create cache_subdir.
        untar: boolean, whether the file should be decompressed
        md5_hash: MD5 hash of the file for verification
        cache_subdir: directory being used as the cache

    Returns
    -------
        Path to the directory containing the downloaded file

    """
    datadir_base = save_path
    if not os.access(datadir_base, os.W_OK):
        datadir_base = os.path.expanduser(os.path.join('~', '.kapre'))
        # FIX: the original printed the literal '{}' placeholder — the
        # message was never .format()ed with the rejected path.
        print('Given path {} is not accessible. Trying to use~/.kapre instead..'.format(save_path))
    if not os.access(datadir_base, os.W_OK):
        print('~/.kapre is not accessible, using /tmp/kapre instead.')
        datadir_base = os.path.join('/tmp', '.kapre')

    datadir = os.path.join(datadir_base, cache_subdir)
    if not os.path.exists(datadir):
        os.makedirs(datadir)

    if untar:
        assert fname.endswith('.tar.gz'), fname
        fpath = os.path.join(datadir, fname)
        # FIX: str.rstrip('.tar.gz') strips *characters* from the set
        # {., t, a, r, g, z} off the end (mangling e.g. 'data.tar.gz'),
        # not the suffix; slice the literal suffix off instead.
        untar_fpath = fpath[:-len('.tar.gz')]
    else:
        fpath = os.path.join(datadir, fname)

    download = False
    if os.path.exists(fpath):
        # File found; verify integrity if a hash was provided.
        if md5_hash is not None:
            if not validate_file(fpath, md5_hash):
                print('A local file was found, but it seems to be '
                      'incomplete or outdated.')
                download = True
    else:
        download = True

    if download:
        print('Downloading data from', origin)

        # FIX: single-element list lets the reporthook keep one Progbar
        # across calls; the original rebound a local parameter each call,
        # so the bar was recreated and never advanced.
        progbar_state = [None]

        def dl_progress(count, block_size, total_size):
            if progbar_state[0] is None:
                progbar_state[0] = Progbar(total_size)
            else:
                progbar_state[0].update(count * block_size)

        error_msg = 'URL fetch failure on {}: {} -- {}'
        try:
            try:
                # FIX: HTTPError subclasses URLError, so it must be caught
                # first — the original's HTTPError branch was unreachable.
                urlretrieve(origin, fpath, dl_progress)
            except HTTPError as e:
                raise Exception(error_msg.format(origin, e.code, e.msg))
            except URLError as e:
                raise Exception(error_msg.format(origin, e.errno, e.reason))
        except (Exception, KeyboardInterrupt):
            # Remove the partial download so a retry starts clean.
            if os.path.exists(fpath):
                os.remove(fpath)
            raise

    if untar:
        if not os.path.exists(untar_fpath):
            print('Untaring file...')
            tfile = tarfile.open(fpath, 'r:gz')
            try:
                tfile.extractall(path=datadir)
            except (Exception, KeyboardInterrupt):
                # Clean up a partially-extracted result before propagating.
                if os.path.exists(untar_fpath):
                    if os.path.isfile(untar_fpath):
                        os.remove(untar_fpath)
                    else:
                        shutil.rmtree(untar_fpath)
                raise
            tfile.close()
        # return untar_fpath
    return datadir
def get_file(fname: str = None,
             origin: str = None,
             untar: bool = False,
             extract: bool = False,
             md5_hash: typing.Any = None,
             file_hash: typing.Any = None,
             hash_algorithm: str = 'auto',
             archive_format: str = 'auto',
             cache_subdir: typing.Union[Path, str] = 'data',
             cache_dir: typing.Union[Path, str] = 'dataset',
             verbose: int = 1) -> str:
    """
    Downloads a file from a URL if it not already in the cache.

    By default the file at the url `origin` is downloaded to the
    cache_dir `~/.project/datasets`, placed in the cache_subdir `data`,
    and given the filename `fname`. The final location of a file
    `example.txt` would therefore be `~/.project/datasets/data/example.txt`.

    Files in tar, tar.gz, tar.bz, and zip formats can also be extracted.
    Passing a hash will verify the file after download. The command line
    programs `shasum` and `sha256sum` can compute the hash.

    :param fname: Name of the file. If an absolute path `/path/to/file.txt` is specified the file
        will be saved at that location.
    :param origin: Original URL of the file.
    :param untar: Deprecated in favor of 'extract'. Boolean, whether the file should be decompressed.
    :param md5_hash: Deprecated in favor of 'file_hash'. md5 hash of the file for verification.
    :param file_hash: The expected hash string of the file after download. The sha256 and md5 hash
        algorithms are both supported.
    :param cache_subdir: Subdirectory under the cache dir where the file is saved. If an absolute
        path `/path/to/folder` is specified the file will be saved at that location.
    :param hash_algorithm: Select the hash algorithm to verify the file. options are 'md5',
        'sha256', and 'auto'. The default 'auto' detects the hash algorithm in use.
    :param extract: True tries extracting the file as an Archive, like tar or zip.
    :param archive_format: Archive format to try for extracting the file. Options are 'auto',
        'tar', 'zip', and None. 'tar' includes tar, tar.gz, and tar.bz files. The default 'auto'
        is ['tar', 'zip']. None or an empty list will return no matches found.
    :param cache_dir: Location to store cached files, when None it defaults to the
        [project.USER_DATA_DIR](~/.project/datasets).
    :param verbose: Verbosity mode, 0 (silent), 1 (verbose), 2 (semi-verbose)
    :return: Path to the downloaded file.
    :raises ValueError: if `fname` is not provided.
    """
    # FIX: fail fast with a clear message instead of an opaque TypeError
    # from os.path.join further down.
    if fname is None:
        raise ValueError('`fname` must be provided.')

    # Legacy argument: md5_hash feeds file_hash with the md5 algorithm.
    if md5_hash is not None and file_hash is None:
        file_hash = md5_hash
        hash_algorithm = 'md5'

    datadir_base = os.path.expanduser(cache_dir)
    if not os.access(datadir_base, os.W_OK):
        datadir_base = os.path.join('/tmp', '.project')
    datadir = os.path.join(datadir_base, cache_subdir)
    if not os.path.exists(datadir):
        os.makedirs(datadir)

    if untar:
        untar_fpath = os.path.join(datadir, fname)
        fpath = untar_fpath + '.tar.gz'
    else:
        fpath = os.path.join(datadir, fname)

    download = False
    if os.path.exists(fpath):
        if file_hash is not None:
            # A cached file exists but fails verification -> re-download.
            if not validate_file(fpath, file_hash, algorithm=hash_algorithm):
                print('A local file was found, but it seems to be '
                      'incomplete or outdated because the file hash '
                      'does not match the original value of file_hash.'
                      ' We will re-download the data.')
                download = True
    else:
        download = True

    if download:
        print('Downloading data from', origin)

        class ProgressTracker(object):
            # Shared mutable slot so the reporthook keeps one Progbar
            # instance alive across calls.
            progbar = None

        def dl_progress(count, block_size, total_size):
            if ProgressTracker.progbar is None:
                if total_size == -1:
                    # Unknown Content-Length: let Progbar run untargeted.
                    total_size = None
                ProgressTracker.progbar = Progbar(target=total_size, verbose=verbose)
            else:
                ProgressTracker.progbar.update(count * block_size)

        error_msg = 'URL fetch failure on {} : {} -- {}'
        try:
            try:
                from six.moves.urllib.request import urlretrieve
                urlretrieve(origin, fpath, dl_progress)
            except HTTPError as e:
                raise Exception(error_msg.format(origin, e.code, e.msg))
            except URLError as e:
                raise Exception(error_msg.format(origin, e.errno, e.reason))
        except (Exception, KeyboardInterrupt):
            # Remove the partial download so a retry starts clean.
            if os.path.exists(fpath):
                os.remove(fpath)
            raise
        ProgressTracker.progbar = None

    if untar:
        if not os.path.exists(untar_fpath):
            _extract_archive(fpath, datadir, archive_format='tar')
        return untar_fpath

    if extract:
        _extract_archive(fpath, datadir, archive_format)

    return fpath