def _load_datafile(size, local_cache_path):
    """Download the MovieLens archive and extract its data file.

    Args:
        size (str): Key into ``_data_format`` selecting the dataset variant.
        local_cache_path (str): Location of the zip archive; must end with ".zip".

    Returns:
        str: Path of the extracted data file, placed next to the archive.

    Raises:
        ValueError: If ``size`` is not a known key, if ``local_cache_path``
            does not end with ".zip", or if the configured data path has an
            empty file name.
    """
    if size not in _data_format:
        raise ValueError(ERROR_MOVIE_LENS_SIZE)
    if not local_cache_path.endswith(".zip"):
        raise ValueError(ERROR_LOCAL_CACHE_PATH)

    cache_dir, zip_name = os.path.split(os.path.realpath(local_cache_path))

    # Register the cleanup before downloading so the temporary zip file is
    # removed at interpreter exit even if a later step fails.
    atexit.register(_clean_up, local_cache_path)

    archive_url = "http://files.grouplens.org/datasets/movielens/ml-" + size + ".zip"
    maybe_download(archive_url, zip_name, work_directory=cache_dir)

    member_path = _data_format[size].path
    data_name = os.path.basename(member_path)
    if data_name == "":
        # Only reachable if someone changes _data_format to a path without a file name.
        raise ValueError("Invalid data file name.")
    extracted_path = os.path.join(cache_dir, data_name)

    # Stream the single member out of the archive into its destination file.
    with ZipFile(local_cache_path, "r") as archive:
        with archive.open(member_path) as source:
            with open(extracted_path, 'wb') as target:
                shutil.copyfileobj(source, target)

    # The archive is no longer needed once the data file has been extracted.
    _clean_up(local_cache_path)

    # The extracted data file is temporary as well; remove it at interpreter exit.
    atexit.register(_clean_up, extracted_path)

    return extracted_path
def test_maybe_download():
    """Check maybe_download with a matching and a mismatching expected size.

    The first call must leave the file on disk; the second call, made with
    ``expected_bytes=0``, is expected to raise ``IOError``.
    """
    license_url = "https://raw.githubusercontent.com/Microsoft/Recommenders/master/LICENSE"
    target_name = "license.txt"

    # Precondition: no leftover file from an earlier run.
    assert not os.path.exists(target_name)

    downloaded = maybe_download(license_url, target_name, expected_bytes=1162)
    assert os.path.exists(downloaded)

    # Remove the file so the next call has to download it again.
    os.remove(downloaded)

    with pytest.raises(IOError):
        maybe_download(license_url, target_name, expected_bytes=0)
def test_maybe_download():
    """Check maybe_download with a matching and a mismatching expected size.

    The first call must leave the file on disk; the second call, made with
    ``expected_bytes=0``, is expected to raise ``IOError``.
    """
    # TODO: change this file to the repo license when it is public
    license_url = "https://raw.githubusercontent.com/Microsoft/vscode/master/LICENSE.txt"
    target_name = "license.txt"

    # Precondition: no leftover file from an earlier run.
    assert not os.path.exists(target_name)

    downloaded = maybe_download(license_url, target_name, expected_bytes=1110)
    assert os.path.exists(downloaded)

    # TODO: download again and test that the file is already there, grab the log??
    os.remove(downloaded)

    with pytest.raises(IOError):
        maybe_download(license_url, target_name, expected_bytes=0)
def download_movielens(size, dest_path):
    """Download the MovieLens zip archive for the requested dataset size.

    Args:
        size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
        dest_path (str): File path for the downloaded file.

    Raises:
        ValueError: If ``size`` is not a key of ``DATA_FORMAT``.
    """
    if size not in DATA_FORMAT:
        raise ValueError(ERROR_MOVIE_LENS_SIZE)

    archive_url = "http://files.grouplens.org/datasets/movielens/ml-" + size + ".zip"

    # maybe_download takes the directory and the file name separately.
    target_dir = os.path.dirname(dest_path)
    target_name = os.path.basename(dest_path)
    maybe_download(archive_url, target_name, work_directory=target_dir)
def download_deeprec_resources(azure_container_url, data_path, remote_resource_name):
    """Download a zipped resource, extract it into ``data_path`` and delete the archive.

    Args:
        azure_container_url (str): URL of Azure container. The resource name is
            appended to it as-is, so it must already end with a separator.
        data_path (str): Path to download the resources. Created if missing.
        remote_resource_name (str): Name of the resource (a zip archive).
    """
    os.makedirs(data_path, exist_ok=True)
    remote_path = azure_container_url + remote_resource_name
    maybe_download(remote_path, remote_resource_name, data_path)

    archive_path = os.path.join(data_path, remote_resource_name)
    # Use a context manager so the archive handle is closed even when
    # extraction raises; the manual close() was skipped in that case.
    with zipfile.ZipFile(archive_path, "r") as zip_ref:
        zip_ref.extractall(data_path)

    # Only reached after a successful extraction, as before.
    os.remove(archive_path)
def download_criteo(size="sample", work_directory="."):
    """Fetch the compressed Criteo dataset into a working directory.

    Args:
        size (str): Size of criteo dataset. It can be "full" or "sample".
        work_directory (str): Working directory.

    Returns:
        str: Path of the downloaded file.
    """
    # The download URL is looked up in CRITEO_URL by dataset size.
    return maybe_download(CRITEO_URL[size], work_directory=work_directory)