def download_file(url, data_path=BIGDATA_PATH, filename=None, size=None, chunk_size=4096, verbose=True):
    """Stream a remote file to disk in chunks so large (GB) files can be downloaded over https.

    Args:
        url: a single URL string, or an iterable of URL strings. For an
            iterable, each URL is downloaded with the same keyword arguments
            and a list of results is returned.
        data_path: directory that `filename` is joined onto.
        filename: local file name. When None it is derived from the URL with
            `dropbox_basename`, separately for each URL.
        size: expected remote size in bytes. When None the `Content-Length`
            response header is used; anything that cannot be converted to an
            int is treated as unknown (-1).
        chunk_size: number of bytes requested per chunk from `iter_content`.
        verbose: when true, wrap the chunk iterator in `tqdm` and log the
            created data directory; otherwise `no_tqdm` is used.

    Returns:
        The local file path (as produced by `expand_filepath`), or a list of
        such paths when `url` is an iterable of URLs.

    NOTE: an existing local file is kept, without downloading, whenever its
    size is >= the remote size. Because an unknown remote size is -1, any
    existing file is kept in that case.
    """
    # Fan out over a collection of URLs *before* deriving a file name: the
    # basename helper is applied to one URL string at a time, so each URL
    # gets its own name when `filename` is None.
    if not isinstance(url, (basestring, str)):
        return [
            download_file(s, data_path=data_path, filename=filename, size=size,
                          chunk_size=chunk_size, verbose=verbose)
            for s in url
        ]

    if filename is None:
        filename = dropbox_basename(url)
    filepath = expand_filepath(os.path.join(data_path, filename))
    if url.endswith('dl=0'):
        url = url[:-1] + '1'  # noninteractive Dropbox download

    tqdm_prog = tqdm if verbose else no_tqdm
    logger.info('requesting URL: {}'.format(url))

    r = requests.get(url, stream=True, allow_redirects=True)
    try:
        size = r.headers.get('Content-Length', -1) if size is None else size
        try:
            size = int(size)
        except (TypeError, ValueError):
            # Malformed header or unusable caller-supplied value: size unknown.
            size = -1
        logger.info('remote size: {}'.format(size))

        stat = path_status(filepath)
        logger.info('local size: {}'.format(stat.get('size', None)))
        # TODO: check md5 or get the right size of remote file
        if stat['type'] == 'file' and stat['size'] >= size:
            logger.info('retained: {}'.format(filepath))
            return filepath

        filedir = os.path.dirname(filepath)
        created_dir = mkdir_p(filedir)
        if verbose:
            logger.info('data path created: {}'.format(created_dir))
        # Internal sanity checks on the result of mkdir_p.
        assert os.path.isdir(filedir)
        assert created_dir.endswith(filedir)

        # Only give the progress bar a total when the remote size is known;
        # the -1 sentinel would otherwise yield a meaningless total.
        total = int(ceil(size / float(chunk_size))) if size >= 0 else None
        with open(filepath, 'wb') as f:
            for chunk in tqdm_prog(r.iter_content(chunk_size=chunk_size), total=total):
                if chunk:  # filter out keep-alive chunks
                    f.write(chunk)
    finally:
        # Always release the streamed connection, including on errors and
        # on the early "retained" return above.
        r.close()

    # Logged only after the body has been written, so the message is true.
    logger.info('downloaded: {}'.format(filepath))
    logger.debug('nlpia.loaders.download_file: return filepath=' + str(filepath))
    return filepath
def download_file(url, data_path=BIGDATA_PATH, filename=None, size=None, chunk_size=4096, verbose=True):
    """Stream a remote file to disk in chunks so large (GB) files can be downloaded over https.

    Args:
        url: URL string to download. A trailing '?dl=0' is rewritten to
            '?dl=1' so the download is noninteractive.
        data_path: directory that `filename` is joined onto.
        filename: local file name. When None it is derived from the URL with
            `dropbox_basename`.
        size: expected remote size in bytes. When None the `Content-Length`
            response header is used.
        chunk_size: number of bytes requested per chunk from `iter_content`.
        verbose: when true, print progress messages and wrap the chunk
            iterator in `tqdm`; otherwise stay silent and use `no_tqdm`.

    Returns:
        The local file path (`data_path` joined with `filename`).

    NOTE: an existing local file is kept, without downloading, whenever its
    size is >= the remote size. An unknown or unparsable remote size is
    treated as -1, so any existing file is kept in that case.
    """
    if filename is None:
        filename = dropbox_basename(url)
    file_path = os.path.join(data_path, filename)
    if url.endswith('?dl=0'):
        url = url[:-1] + '1'  # noninteractive download

    if verbose:
        tqdm_prog = tqdm
        print('requesting URL: {}'.format(url))
    else:
        tqdm_prog = no_tqdm

    r = requests.get(url, stream=True, allow_redirects=True)
    try:
        size = r.headers.get('Content-Length', None) if size is None else size
        if verbose:
            print('remote size: {}'.format(size))
        # Header values are strings (or None when absent); coerce to int so
        # the comparison with the local size below is well defined.
        try:
            remote_size = int(size)
        except (TypeError, ValueError):
            remote_size = -1

        stat = path_status(file_path)
        if verbose:
            print('local size: {}'.format(stat.get('size', None)))
        # TODO: check md5 or get the right size of remote file
        if stat['type'] == 'file' and stat['size'] >= remote_size:
            return file_path

        if verbose:
            print('Downloading to {}'.format(file_path))
        with open(file_path, 'wb') as f:
            for chunk in tqdm_prog(r.iter_content(chunk_size=chunk_size)):
                if chunk:  # filter out keep-alive chunks
                    f.write(chunk)
    finally:
        # Always release the streamed connection, including on errors and
        # on the early return above.
        r.close()
    return file_path