Example No. 1
import zipfile
from pathlib import Path

import requests

from pandas_profiling.utils.paths import get_data_path


def cache_zipped_file(file_name: str, url: str) -> Path:
    """Check if file_name already is in the data path, otherwise download it from url.

    Args:
        file_name: the file name
        url: the URL of the dataset

    Returns:
        The relative path to the dataset
    """

    data_path = get_data_path()
    data_path.mkdir(exist_ok=True)

    file_path = data_path / file_name

    # If the file doesn't exist yet, download the archive and extract it
    if not file_path.exists():
        response = requests.get(url)
        if response.status_code != 200:
            raise FileNotFoundError("Could not download resource")

        tmp_path = data_path / "tmp.zip"
        tmp_path.write_bytes(response.content)

        with zipfile.ZipFile(tmp_path, "r") as zip_file:
            zip_file.extract(file_path.name, data_path)

        tmp_path.unlink()

    return file_path
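
A minimal usage sketch for the function above; the file name and URL are hypothetical placeholders, not part of the original example:

# Hypothetical call: the first run downloads example.zip and extracts data.csv
# into the data path; subsequent runs return the cached copy immediately.
csv_path = cache_zipped_file("data.csv", "https://example.com/archives/example.zip")
print(csv_path)  # <data_path>/data.csv
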
Example No. 2
import requests

from pandas_profiling.utils.paths import get_data_path


def cache_file(file_name, url):
    # Cache all datasets here
    data_path = get_data_path()
    data_path.mkdir(exist_ok=True)

    # If the file doesn't exist yet, download and create it
    if not (data_path / file_name).exists():
        data = requests.get(url)
        (data_path / file_name).write_bytes(data.content)
    return data_path / file_name
Example No. 3
from pathlib import Path

import requests

from pandas_profiling.utils.paths import get_data_path


def cache_file(file_name: str, url: str) -> Path:
    """Check if file_name already is in the data path, otherwise download it from url.

    Args:
        file_name: the file name
        url: the URL of the dataset

    Returns:
        The relative path to the dataset
    """

    data_path = get_data_path()
    data_path.mkdir(exist_ok=True)

    # If the file doesn't exist yet, download and create it
    if not (data_path / file_name).exists():
        data = requests.get(url)
        (data_path / file_name).write_bytes(data.content)

    return data_path / file_name
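
A quick usage sketch for cache_file; the dataset name and URL here are hypothetical:

# Hypothetical call: downloads the CSV once, then serves it from the local cache
dataset_path = cache_file("titanic.csv", "https://example.com/datasets/titanic.csv")
print(dataset_path.exists())  # True: the file is now cached locally
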
Example No. 4
import kaggle
import pandas as pd

import pandas_profiling
from pandas_profiling.utils.paths import get_data_path

# The dataset in this example is obtained using the `kaggle` API.
# If you haven't done so already, you should set up the API credentials:
# https://github.com/Kaggle/kaggle-api#api-credentials
kaggle.api.authenticate()


# Download the dataset. Note that we use a small dataset because this example is automated.
# However, popular larger datasets (LFW, CelebA) shouldn't be a problem either.
data_path = get_data_path() / "cat-and-dog"
kaggle.api.dataset_download_files(
    "tongpython/cat-and-dog",
    path=str(data_path),
    quiet=False,
    unzip=True,
)

# On the first run, we find that the dataset does not contain only images:
# "_DS_Store" and "cat-and-dog.zip" are present too.
# We filter these out by narrowing our glob search to *.jpg files.
files = [f for f in data_path.rglob("*.jpg") if f.is_file()]
series = pd.Series(files, name="files")

# pandas_profiling only accepts absolute paths
series = series.apply(lambda x: x.absolute()).apply(str)

df = pd.DataFrame(series)
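
From here, the DataFrame of image paths can be fed to pandas_profiling. A minimal sketch using the public ProfileReport API (the title and output file name are arbitrary; how much image-specific analysis is performed depends on the pandas_profiling version and settings):

# Build a report over the file-path column; with the appropriate settings,
# pandas_profiling treats a column of absolute paths as files/images.
report = pandas_profiling.ProfileReport(df, title="Cat and Dog images")
report.to_file("cat-and-dog.html")
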
Example No. 5
import kaggle
import pandas as pd

import pandas_profiling
from pandas_profiling.utils.paths import get_data_path

# The dataset in this example is obtained using the `kaggle` API.
# If you haven't done so already, you should set up the API credentials:
# https://github.com/Kaggle/kaggle-api#api-credentials
kaggle.api.authenticate()

# Download the dataset. Note that we use a small dataset because this example is automated.
# However, popular larger datasets (LFW, CelebA) shouldn't be a problem either.
data_path = get_data_path() / "celebrity-faces"
kaggle.api.dataset_download_files(
    "dansbecker/5-celebrity-faces-dataset",
    path=str(data_path),
    quiet=False,
    unzip=True,
)

p = data_path / "data/train/"

# As not all datasets have an index of files, we generate that ourselves.
files = [f for f in p.rglob("*") if f.is_file()]
series = pd.Series(files, name="files")
# pandas_profiling only accepts absolute paths
series = series.apply(lambda x: x.absolute()).apply(str)

df = pd.DataFrame(series)
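
The `import pandas_profiling` line above also registers the DataFrame accessor, so the report can equally be generated straight from df; a minimal sketch (the title and output file name are arbitrary):

# The pandas_profiling import registers .profile_report() on DataFrames
profile = df.profile_report(title="5 Celebrity Faces")
profile.to_file("celebrity-faces.html")
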