예제 #1
0
def _load_datafile(size, local_cache_path):
    """Download the MovieLens archive and extract its data file.

    The archive is fetched to ``local_cache_path``, the member named by
    ``_data_format[size].path`` is copied into the same directory, and the
    archive is then handed to ``_clean_up``. Both the archive and the
    extracted file are registered with ``atexit`` for clean-up.

    Args:
        size: Key into ``_data_format`` selecting the dataset variant.
        local_cache_path: Destination of the downloaded archive; must end
            with ".zip".

    Returns:
        Path of the extracted data file.

    Raises:
        ValueError: If ``size`` is not in ``_data_format``, if
            ``local_cache_path`` does not end with ".zip", or if the
            configured member path has an empty file name.
    """
    if size not in _data_format:
        raise ValueError(ERROR_MOVIE_LENS_SIZE)
    if not local_cache_path.endswith(".zip"):
        raise ValueError(ERROR_LOCAL_CACHE_PATH)

    cache_dir, zip_name = os.path.split(os.path.realpath(local_cache_path))

    # Registered before downloading so the temporary zip is cleaned up at
    # interpreter exit even if a later step raises.
    atexit.register(_clean_up, local_cache_path)

    archive_url = "http://files.grouplens.org/datasets/movielens/ml-" + size + ".zip"
    maybe_download(archive_url, zip_name, work_directory=cache_dir)

    member_path = _data_format[size].path
    data_name = os.path.basename(member_path)
    if data_name == "":
        # Only reachable if an entry of _data_format ends with a separator.
        raise ValueError("Invalid data file name.")
    extracted_path = os.path.join(cache_dir, data_name)

    # Stream the single member out of the archive into the cache directory.
    with ZipFile(local_cache_path, "r") as archive, \
            archive.open(member_path) as src, \
            open(extracted_path, "wb") as dst:
        shutil.copyfileobj(src, dst)

    _clean_up(local_cache_path)

    # The extracted file is temporary as well; clean it up at exit.
    atexit.register(_clean_up, extracted_path)

    return extracted_path
예제 #2
0
def test_maybe_download():
    """Exercise maybe_download with a matching and a non-matching byte count."""
    license_url = "https://raw.githubusercontent.com/Microsoft/Recommenders/master/LICENSE"
    target_name = "license.txt"

    # Precondition: no leftover file from an earlier run.
    assert not os.path.exists(target_name)

    downloaded = maybe_download(license_url, target_name, expected_bytes=1162)
    assert os.path.exists(downloaded)
    os.remove(downloaded)

    # A mismatching expected size is expected to surface as an IOError.
    with pytest.raises(IOError):
        maybe_download(license_url, target_name, expected_bytes=0)
예제 #3
0
def test_maybe_download():
    """Exercise maybe_download with a matching and a non-matching byte count."""
    # TODO: change this file to the repo license when it is public
    license_url = "https://raw.githubusercontent.com/Microsoft/vscode/master/LICENSE.txt"
    target_name = "license.txt"

    # Precondition: no leftover file from an earlier run.
    assert not os.path.exists(target_name)

    downloaded = maybe_download(license_url, target_name, expected_bytes=1110)
    assert os.path.exists(downloaded)
    # TODO: download again and test that the file is already there, grab the log??
    os.remove(downloaded)

    # A mismatching expected size is expected to surface as an IOError.
    with pytest.raises(IOError):
        maybe_download(license_url, target_name, expected_bytes=0)
예제 #4
0
def download_movielens(size, dest_path):
    """Fetch the MovieLens zip archive for the requested dataset size.

    Args:
        size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
        dest_path (str): File path for the downloaded file. It is split into
            a directory and a file name, which are passed to
            ``maybe_download`` separately.

    Raises:
        ValueError: If ``size`` is not a key of ``DATA_FORMAT``.
    """
    if size not in DATA_FORMAT:
        raise ValueError(ERROR_MOVIE_LENS_SIZE)

    archive_url = "http://files.grouplens.org/datasets/movielens/ml-" + size + ".zip"
    target_dir, target_name = os.path.split(dest_path)
    maybe_download(archive_url, target_name, work_directory=target_dir)
예제 #5
0
def download_deeprec_resources(azure_container_url, data_path,
                               remote_resource_name):
    """Download a zipped resource, extract it into ``data_path`` and delete the zip.

    Args:
        azure_container_url (str): URL of Azure container.
        data_path (str): Path to download the resources. Created if it does
            not exist yet.
        remote_resource_name (str): Name of the resource. It is appended to
            ``azure_container_url`` to form the download URL and is also used
            as the local file name inside ``data_path``.
    """
    os.makedirs(data_path, exist_ok=True)
    remote_path = azure_container_url + remote_resource_name
    maybe_download(remote_path, remote_resource_name, data_path)

    archive_path = os.path.join(data_path, remote_resource_name)
    # The context manager closes the archive even when extraction raises,
    # so the file handle is never leaked.
    with zipfile.ZipFile(archive_path, "r") as zip_ref:
        zip_ref.extractall(data_path)

    # Only the extracted contents are kept; the downloaded zip is removed.
    os.remove(archive_path)
예제 #6
0
def download_criteo(size="sample", work_directory="."):
    """Fetch the compressed criteo dataset of the requested size.

    The download URL is looked up in ``CRITEO_URL`` by ``size`` and passed
    to ``maybe_download``.

    Args:
        size (str): Size of criteo dataset. It can be "full" or "sample".
        work_directory (str): Working directory.

    Returns:
        str: Path of the downloaded file.

    """
    return maybe_download(CRITEO_URL[size], work_directory=work_directory)