示例#1
0
def test_maybe_download_retry(caplog):
    """Check that requesting a non-existing remote file raises and is logged.

    Args:
        caplog: pytest fixture that captures log records emitted during the test.
    """
    caplog.clear()
    caplog.set_level(logging.INFO)
    with pytest.raises(requests.exceptions.HTTPError):
        maybe_download(
            "https://recodatasets.z20.web.core.windows.net/non_existing_file.zip"
        )
    # The log check must live outside the `pytest.raises` block: once the call
    # above raises, the remaining statements of the block are skipped, so an
    # assertion placed inside it would never be evaluated.
    assert "Problem downloading" in caplog.text
示例#2
0
def test_maybe_download_maybe(caplog, files_fixtures):
    """Check that a second download of the same file is reported as skipped.

    Args:
        caplog: pytest fixture that captures log records emitted during the test.
        files_fixtures: Fixture yielding a ``(file_url, filepath)`` pair.
    """
    source_url, local_copy = files_fixtures

    # Start from a clean slate so the first call really has to download.
    if os.path.exists(local_copy):
        os.remove(local_copy)

    caplog.clear()
    caplog.set_level(logging.INFO)

    first_result = maybe_download(source_url, "license.txt")
    assert os.path.exists(first_result)

    # The file is now present, so the repeated call is expected to log a skip.
    maybe_download(source_url, "license.txt")
    assert "File ./license.txt already downloaded" in caplog.text
示例#3
0
def download_movielens(size, dest_path):
    """Fetch the MovieLens archive of the requested size.

    Args:
        size (str): Dataset size identifier; it must be a key of ``DATA_FORMAT``
            (documented as one of "100k", "1m", "10m", "20m").
        dest_path (str): File path for the downloaded file. It is split into a
            directory and a file name that are handed to ``maybe_download``.

    Raises:
        ValueError: If ``size`` is not a key of ``DATA_FORMAT``.
    """
    if size not in DATA_FORMAT:
        raise ValueError(ERROR_MOVIE_LENS_SIZE)

    url_prefix = "http://files.grouplens.org/datasets/movielens/ml-"
    archive_url = url_prefix + size + ".zip"
    target_dir, target_name = os.path.split(dest_path)
    maybe_download(archive_url, target_name, work_directory=target_dir)
def _download_reviews(name, dest_path):
    """Fetch the gzipped Amazon reviews file for one category.

    Args:
        name (str): Category of reviews; used as the remote file stem.
        dest_path (str): File path for the downloaded file. ``.gz`` is appended
            to its file-name part before it is passed to ``maybe_download``.
    """
    base_url = "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/"
    remote_url = base_url + name + ".gz"

    folder, basename = os.path.split(dest_path)
    compressed_name = basename + ".gz"
    maybe_download(remote_url, compressed_name, work_directory=folder)
示例#5
0
def download_deeprec_resources(azure_container_url, data_path, remote_resource_name):
    """Download a zipped resource, extract it into ``data_path`` and delete the zip.

    Args:
        azure_container_url (str): URL of Azure container. The resource name is
            appended to it verbatim, so it should end with a separator.
        data_path (str): Path to download the resources. Created if missing.
        remote_resource_name (str): Name of the resource (a zip archive).
    """
    os.makedirs(data_path, exist_ok=True)
    remote_path = azure_container_url + remote_resource_name
    maybe_download(remote_path, remote_resource_name, data_path)

    archive_path = os.path.join(data_path, remote_resource_name)
    # The context manager guarantees the archive handle is closed even when
    # extraction fails part-way through.
    with zipfile.ZipFile(archive_path, "r") as zip_ref:
        zip_ref.extractall(data_path)
    # Only the extracted contents are kept; the downloaded archive is removed.
    os.remove(archive_path)
示例#6
0
def download_mind(size="small", dest_path=None):
    """Download the train and validation archives of the MIND dataset.

    Args:
        size (str): Dataset size. One of ["small", "large", "demo"].
        dest_path (str): Download path, handed to ``download_path``. The original
            note states that passing None downloads to a temporary path.

    Returns:
        str, str: Path to train and validation sets, as returned by
        ``maybe_download``.

    Raises:
        ValueError: If ``size`` is not one of the supported options.
    """
    size_options = ["small", "large", "demo"]
    if size not in size_options:
        raise ValueError(f"Wrong size option, available options are {size_options}")

    train_link, valid_link = URL_MIND[size]
    with download_path(dest_path) as workdir:
        # Train archive first, then validation, both into the same directory.
        train_path, valid_path = (
            maybe_download(url=link, work_directory=workdir)
            for link in (train_link, valid_link)
        )
    return train_path, valid_path
示例#7
0
def test_maybe_download(files_fixtures):
    """Check that a file of the expected size is downloaded under the given name.

    Args:
        files_fixtures: Fixture yielding a ``(file_url, filepath)`` pair.
    """
    source_url, local_copy = files_fixtures

    # Remove any leftover copy so the download path is actually exercised.
    if os.path.exists(local_copy):
        os.remove(local_copy)

    result_path = maybe_download(source_url, "license.txt", expected_bytes=1162)

    assert os.path.exists(result_path)
    # The last "/"-separated component must be the requested file name.
    assert result_path.rpartition("/")[2] == "license.txt"
示例#8
0
def test_maybe_download_wrong_bytes(caplog, files_fixtures):
    """Check that a size mismatch raises ``IOError`` and reports a verify failure.

    Args:
        caplog: pytest fixture that captures log records emitted during the test.
        files_fixtures: Fixture yielding a ``(file_url, filepath)`` pair.
    """
    caplog.clear()
    caplog.set_level(logging.INFO)

    file_url, filepath = files_fixtures
    if os.path.exists(filepath):
        os.remove(filepath)

    with pytest.raises(IOError) as excinfo:
        maybe_download(file_url, "license.txt", expected_bytes=0)

    # The check lives outside the `pytest.raises` block: statements following
    # the raising call inside that block are never executed.
    # NOTE(review): `maybe_download` is not visible here, so the failure text is
    # accepted from either the captured log or the exception message, and the
    # file name is matched separately because the reported path may carry a
    # directory prefix (e.g. "./license.txt") - confirm against the helper.
    reported = caplog.text + str(excinfo.value)
    assert "Failed to verify" in reported
    assert "license.txt" in reported
示例#9
0
def download_criteo(size="sample", work_directory="."):
    """Fetch the compressed Criteo dataset.

    Args:
        size (str): Size of criteo dataset, used as the key into ``CRITEO_URL``.
            It can be "full" or "sample".
        work_directory (str): Working directory passed on to ``maybe_download``.

    Returns:
        str: Path of the downloaded file, as returned by ``maybe_download``.

    """
    return maybe_download(CRITEO_URL[size], work_directory=work_directory)
示例#10
0
def download_and_extract_glove(dest_path):
    """Fetch the GloVe 6B archive and unpack it into a ``glove`` subdirectory.

    Args:
        dest_path (str): Destination directory path for the downloaded file.

    Returns:
        str: File path where Glove was extracted (``<dest_path>/glove``).
    """
    archive = maybe_download(
        url="http://nlp.stanford.edu/data/glove.6B.zip",
        work_directory=dest_path,
    )
    extract_dir = os.path.join(dest_path, "glove")
    # clean_zip_file=False is passed through unchanged from the original call.
    unzip_file(archive, extract_dir, clean_zip_file=False)
    return extract_dir