import zipfile
from pathlib import Path

import requests

from pandas_profiling.utils.paths import get_data_path


def cache_zipped_file(file_name: str, url: str) -> Path:
    """Check if file_name already is in the data path, otherwise download it from url.

    Args:
        file_name: the file name
        url: the URL of the dataset

    Returns:
        The relative path to the dataset
    """
    data_path = get_data_path()
    data_path.mkdir(exist_ok=True)

    file_path = data_path / file_name

    # If not exists, download and create file
    if not file_path.exists():
        response = requests.get(url)
        if response.status_code != 200:
            raise FileNotFoundError("Could not download resource")

        # Write the archive to a temporary file, extract only the requested
        # member, then clean up
        tmp_path = data_path / "tmp.zip"
        tmp_path.write_bytes(response.content)

        with zipfile.ZipFile(tmp_path, "r") as zip_file:
            zip_file.extract(file_path.name, data_path)

        tmp_path.unlink()

    return file_path
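A hypothetical call, for reference: the file name and URL below are placeholders, and the archive is assumed to contain a member named exactly "train.csv", since the function extracts `file_path.name` from the zip.

# Hypothetical usage: the URL is a placeholder, and the zip is assumed
# to contain a top-level member named "train.csv".
dataset_path = cache_zipped_file("train.csv", "https://example.com/train.zip")
print(dataset_path)  # <data path>/train.csv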
from pathlib import Path

import requests

from pandas_profiling.utils.paths import get_data_path


def cache_file(file_name: str, url: str) -> Path:
    """Check if file_name already is in the data path, otherwise download it from url.

    Args:
        file_name: the file name
        url: the URL of the dataset

    Returns:
        The relative path to the dataset
    """
    data_path = get_data_path()
    data_path.mkdir(exist_ok=True)

    file_path = data_path / file_name

    # If not exists, download and create file
    if not file_path.exists():
        response = requests.get(url)
        if response.status_code != 200:
            raise FileNotFoundError("Could not download resource")
        file_path.write_bytes(response.content)

    return file_path
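Usage is a one-liner. In this sketch the URL is a placeholder for any direct-download link, and reading the cached file as CSV is our own assumption about the format.

import pandas as pd

# Hypothetical usage: the URL is a placeholder for a direct-download CSV.
csv_path = cache_file("dataset.csv", "https://example.com/dataset.csv")
df = pd.read_csv(csv_path)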
import kaggle
import pandas as pd

# Importing pandas_profiling registers the .profile_report() accessor on DataFrames
import pandas_profiling
from pandas_profiling.utils.paths import get_data_path

# The dataset in this example is obtained using the `kaggle` api.
# If you haven't done so already, you should set up the api credentials:
# https://github.com/Kaggle/kaggle-api#api-credentials
kaggle.api.authenticate()

# Download the dataset. Note that we use a small dataset as this example is
# automated. However, popular larger files shouldn't be a problem (LFW, CelebA).
data_path = get_data_path() / "cat-and-dog"
kaggle.api.dataset_download_files(
    "tongpython/cat-and-dog",
    path=str(data_path),
    quiet=False,
    unzip=True,
)

# On the first run, we find that the dataset contains more than just images:
# "_DS_Store" and "cat-and-dog.zip" are present too. We exclude these by
# narrowing the glob search to ".jpg" files.
files = [f for f in data_path.rglob("*.jpg") if f.is_file()]
series = pd.Series(files, name="files")

# pandas-profiling only accepts absolute paths
series = series.apply(lambda x: x.absolute()).apply(str)
df = pd.DataFrame(series)
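From here the report can be generated. A minimal sketch: the title is our own choice, and `explorative=True` is one way to switch on pandas-profiling's file and image analysis, which is disabled by default because it is relatively expensive.

# Sketch: summarize the image paths. The title is arbitrary;
# explorative=True enables the (otherwise disabled) file and image analysis.
profile = df.profile_report(
    title="Cat and dog image dataset",
    explorative=True,
)
profile.to_file("cat-and-dog.html")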
import kaggle
import pandas as pd

# Importing pandas_profiling registers the .profile_report() accessor on DataFrames
import pandas_profiling
from pandas_profiling.utils.paths import get_data_path

# The dataset in this example is obtained using the `kaggle` api.
# If you haven't done so already, you should set up the api credentials:
# https://github.com/Kaggle/kaggle-api#api-credentials
kaggle.api.authenticate()

# Download the dataset. Note that we use a small dataset as this example is
# automated. However, popular larger files shouldn't be a problem (LFW, CelebA).
data_path = get_data_path() / "celebrity-faces"
kaggle.api.dataset_download_files(
    "dansbecker/5-celebrity-faces-dataset",
    path=str(data_path),
    quiet=False,
    unzip=True,
)

p = data_path / "data/train/"

# As not all datasets have an index of files, we generate that ourselves.
files = [f for f in p.rglob("*") if f.is_file()]
series = pd.Series(files, name="files")

# pandas-profiling only accepts absolute paths
series = series.apply(lambda x: x.absolute()).apply(str)
df = pd.DataFrame(series)
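The same pattern as the previous example closes the loop: a sketch that summarizes the indexed image paths, again with a title of our own and `explorative=True` to enable the file and image analysis.

# Sketch: generate the report for the indexed image paths.
profile = df.profile_report(
    title="5 Celebrity Faces image dataset",
    explorative=True,
)
profile.to_file("celebrity-faces.html")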