def test_download_path():
    # Check that the temporary path is created and deleted
    with download_path() as path:
        assert os.path.isdir(path)
    assert not os.path.isdir(path)

    # Check the behavior when a path is provided
    tmp_dir = TemporaryDirectory()
    with download_path(tmp_dir.name) as path:
        assert os.path.isdir(path)
    assert os.path.isdir(path)
def load_item_df(
    size="100k",
    local_cache_path=None,
    movie_col=DEFAULT_ITEM_COL,
    title_col=None,
    genres_col=None,
    year_col=None,
):
    """Loads Movie info.

    Args:
        size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
        local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file.
            If None, all the intermediate files will be stored in a temporary directory and removed after use.
        movie_col (str): Movie id column name.
        title_col (str): Movie title column name. If None, the column will not be loaded.
        genres_col (str): Genres column name. Genres are '|' separated string.
            If None, the column will not be loaded.
        year_col (str): Movie release year column name. If None, the column will not be loaded.

    Returns:
        pandas.DataFrame: Movie information data, such as title, genres, and release year.
    """
    size = size.lower()
    if size not in DATA_FORMAT:
        raise ValueError(ERROR_MOVIE_LENS_SIZE)

    with download_path(local_cache_path) as path:
        filepath = os.path.join(path, "ml-{}.zip".format(size))
        _, item_datapath = _maybe_download_and_extract(size, filepath)
        item_df = _load_item_df(
            size, item_datapath, movie_col, title_col, genres_col, year_col
        )

    return item_df
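# Usage sketch for load_item_df (illustrative, not part of the original module).
# The column names "MovieId", "Title", and "Year" below are arbitrary example choices.
def _example_load_item_df():
    # Load movie id, title, and release year for the 100k variant; genres_col is
    # left as None, so the genres column is not loaded.
    item_df = load_item_df(
        size="100k", movie_col="MovieId", title_col="Title", year_col="Year"
    )
    print(item_df.head())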
def load_pandas_df(size="sample", local_cache_path=None, header=DEFAULT_HEADER):
    """Loads the Criteo DAC dataset as `pandas.DataFrame`. This function downloads, untars, and loads the dataset.

    The dataset consists of a portion of Criteo’s traffic over a period of 24 days.
    Each row corresponds to a display ad served by Criteo and the first column
    indicates whether this ad has been clicked or not.

    There are 13 features taking integer values (mostly count features) and 26
    categorical features. The values of the categorical features have been hashed
    onto 32 bits for anonymization purposes.

    The schema is:

    .. code-block:: python

        <label> <integer feature 1> ... <integer feature 13> <categorical feature 1> ... <categorical feature 26>

    More details (need to accept user terms to see the information):
    http://labs.criteo.com/2013/12/download-terabyte-click-logs/

    Args:
        size (str): Dataset size. It can be "sample" or "full".
        local_cache_path (str): Path where to cache the tar.gz file locally.
        header (list): Dataset header names.

    Returns:
        pandas.DataFrame: Criteo DAC sample dataset.
    """
    with download_path(local_cache_path) as path:
        filepath = download_criteo(size, path)
        filepath = extract_criteo(size, filepath)
        df = pd.read_csv(filepath, sep="\t", header=None, names=header)
    return df
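# Usage sketch for the Criteo pandas loader (illustrative, not part of the original module).
# It relies on the module-level DEFAULT_HEADER for column names.
def _example_load_criteo_pandas():
    # The "sample" split is small enough to fit comfortably in memory.
    df = load_pandas_df(size="sample")
    # First column is the click label, followed by 13 integer and 26 categorical features.
    print(df.shape)
    print(df.head())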
def get_spark_df(
    cls,
    spark,
    size: int = 3,
    seed: int = 100,
    keep_title_col: bool = False,
    keep_genre_col: bool = False,
    tmp_path: Optional[str] = None,
):
    """Return fake movielens dataset as a Spark Dataframe with specified rows.

    Args:
        spark (SparkSession): spark session to load the dataframe into
        size (int): number of rows to generate
        seed (int): seed for the pseudo-random number generator. Defaults to 100.
        keep_title_col (bool): remove the title column if False. Defaults to False.
        keep_genre_col (bool): remove the genre column if False. Defaults to False.
        tmp_path (str, optional): path to store files for serialization purposes
            when transferring data from python to java. If None, a temporary path is used instead.

    Returns:
        pyspark.sql.DataFrame: a mock dataset
    """
    pandas_df = cls.get_df(
        size=size, seed=seed, keep_title_col=True, keep_genre_col=True
    )

    # generate temp folder
    with download_path(tmp_path) as tmp_folder:
        filepath = os.path.join(tmp_folder, f"mock_movielens_{size}.csv")
        # serialize the pandas.df as a csv to avoid the expensive java <-> python communication
        pandas_df.to_csv(filepath, header=False, index=False)
        spark_df = spark.read.csv(
            filepath, schema=cls._get_spark_deserialization_schema()
        )
        # Cache and force trigger action since data-file might be removed.
        spark_df.cache()
        spark_df.count()

    if not keep_title_col:
        spark_df = spark_df.drop(DEFAULT_TITLE_COL)
    if not keep_genre_col:
        spark_df = spark_df.drop(DEFAULT_GENRE_COL)
    return spark_df
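# Usage sketch for the mock MovieLens Spark loader (illustrative, not part of the
# original module). MockMovielensSchema is assumed to be the class exposing the
# get_spark_df classmethod above, and an active SparkSession is assumed to exist.
def _example_mock_movielens_spark(spark):
    mock_df = MockMovielensSchema.get_spark_df(
        spark, size=5, seed=42, keep_title_col=True, keep_genre_col=True
    )
    mock_df.show()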
def download_mind(size="small", dest_path=None):
    """Download MIND dataset.

    Args:
        size (str): Dataset size. One of ["small", "large", "demo"].
        dest_path (str): Download path. If None, the dataset is downloaded to a temporary path.

    Returns:
        str, str: Path to train and validation sets.
    """
    size_options = ["small", "large", "demo"]
    if size not in size_options:
        raise ValueError(f"Wrong size option, available options are {size_options}")
    url_train, url_valid = URL_MIND[size]
    with download_path(dest_path) as path:
        train_path = maybe_download(url=url_train, work_directory=path)
        valid_path = maybe_download(url=url_valid, work_directory=path)
    return train_path, valid_path
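# Usage sketch for download_mind (illustrative, not part of the original module).
# Passing an explicit dest_path keeps the downloaded archives around after the call;
# with dest_path=None they would live in a temporary directory instead.
def _example_download_mind():
    train_path, valid_path = download_mind(size="demo", dest_path="./mind_demo")
    print(train_path, valid_path)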
def load_spark_df(
    spark,
    size="100k",
    header=None,
    schema=None,
    local_cache_path=None,
    dbutils=None,
    title_col=None,
    genres_col=None,
    year_col=None,
):
    """Loads the MovieLens dataset as `pyspark.sql.DataFrame`.

    Download the dataset from http://files.grouplens.org/datasets/movielens, unzip, and load as `pyspark.sql.DataFrame`.

    To load movie information only, you can use the `load_item_df` function.

    Args:
        spark (pyspark.SparkSession): Spark session.
        size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
        header (list or tuple): Rating dataset header.
            If schema is provided, this argument is ignored.
        schema (pyspark.StructType): Dataset schema.
        local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file.
            If None, all the intermediate files will be stored in a temporary directory and removed after use.
        dbutils (Databricks.dbutils): Databricks utility object.
        title_col (str): Title column name. If None, the column will not be loaded.
        genres_col (str): Genres column name. Genres are '|' separated string.
            If None, the column will not be loaded.
        year_col (str): Movie release year column name. If None, the column will not be loaded.

    Returns:
        pyspark.sql.DataFrame: Movie rating dataset.

    **Examples**

    .. code-block:: python

        # To load just user-id, item-id, and ratings from MovieLens-1M dataset:
        spark_df = load_spark_df(spark, '1m', ('UserId', 'ItemId', 'Rating'))

        # The schema can be defined as well:
        schema = StructType([
            StructField(DEFAULT_USER_COL, IntegerType()),
            StructField(DEFAULT_ITEM_COL, IntegerType()),
            StructField(DEFAULT_RATING_COL, FloatType()),
            StructField(DEFAULT_TIMESTAMP_COL, LongType()),
        ])
        spark_df = load_spark_df(spark, '1m', ('UserId', 'ItemId', 'Rating'), schema=schema)

        # To load rating's timestamp together:
        spark_df = load_spark_df(spark, '1m', ('UserId', 'ItemId', 'Rating', 'Timestamp'))

        # To load movie's title, genres, and released year info along with the ratings data:
        spark_df = load_spark_df(spark, '1m', ('UserId', 'ItemId', 'Rating', 'Timestamp'),
            title_col='Title',
            genres_col='Genres',
            year_col='Year'
        )

        # On DataBricks, pass the dbutils argument as follows:
        spark_df = load_spark_df(spark, dbutils=dbutils)
    """
    size = size.lower()
    if size not in DATA_FORMAT:
        raise ValueError(ERROR_MOVIE_LENS_SIZE)

    schema = _get_schema(header, schema)
    if len(schema) < 2:
        raise ValueError(ERROR_HEADER)

    movie_col = schema[1].name

    with download_path(local_cache_path) as path:
        filepath = os.path.join(path, "ml-{}.zip".format(size))
        datapath, item_datapath = _maybe_download_and_extract(size, filepath)
        spark_datapath = "file:///" + datapath  # shortened form of file://localhost/

        # Load movie features such as title, genres, and release year.
        # Since the file size is small, we directly load it as pd.DataFrame on the driver node
        # and then convert it into pyspark.sql.DataFrame
        item_pd_df = _load_item_df(
            size, item_datapath, movie_col, title_col, genres_col, year_col
        )
        item_df = spark.createDataFrame(item_pd_df) if item_pd_df is not None else None

        if is_databricks():
            if dbutils is None:
                raise ValueError(
                    """
                    To use on Databricks, the dbutils object should be passed as an argument.
                    E.g. load_spark_df(spark, dbutils=dbutils)
                    """
                )

            # Move rating file to DBFS in order to load into pyspark.sql.DataFrame
            dbfs_datapath = "dbfs:/tmp/" + datapath
            dbutils.fs.mv(spark_datapath, dbfs_datapath)
            spark_datapath = dbfs_datapath

        # pyspark's read csv currently doesn't support multi-character delimiter, thus we manually handle that
        separator = DATA_FORMAT[size].separator
        if len(separator) > 1:
            raw_data = spark.sparkContext.textFile(spark_datapath)
            data_rdd = raw_data.map(lambda l: l.split(separator)).map(
                lambda c: [int(c[0]), int(c[1]), float(c[2]), int(c[3])][: len(schema)]
            )
            df = spark.createDataFrame(data_rdd, schema)
        else:
            df = spark.read.csv(
                spark_datapath,
                schema=schema,
                sep=separator,
                header=DATA_FORMAT[size].has_header,
            )

        # Merge rating df w/ item_df
        if item_df is not None:
            df = df.join(item_df, movie_col, "left")

        # Cache and force trigger action since data-file might be removed.
        df.cache()
        df.count()

    return df
def load_pandas_df(
    size="100k",
    header=None,
    local_cache_path=None,
    title_col=None,
    genres_col=None,
    year_col=None,
):
    """Loads the MovieLens dataset as pd.DataFrame.

    Download the dataset from http://files.grouplens.org/datasets/movielens, unzip, and load.
    To load movie information only, you can use the load_item_df function.

    Args:
        size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
        header (list or tuple or None): Rating dataset header.
        local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file.
            If None, all the intermediate files will be stored in a temporary directory and removed after use.
        title_col (str): Movie title column name. If None, the column will not be loaded.
        genres_col (str): Genres column name. Genres are '|' separated string.
            If None, the column will not be loaded.
        year_col (str): Movie release year column name. If None, the column will not be loaded.

    Returns:
        pandas.DataFrame: Movie rating dataset.

    **Examples**

    .. code-block:: python

        # To load just user-id, item-id, and ratings from MovieLens-1M dataset,
        df = load_pandas_df('1m', ('UserId', 'ItemId', 'Rating'))

        # To load rating's timestamp together,
        df = load_pandas_df('1m', ('UserId', 'ItemId', 'Rating', 'Timestamp'))

        # To load movie's title, genres, and released year info along with the ratings data,
        df = load_pandas_df('1m', ('UserId', 'ItemId', 'Rating', 'Timestamp'),
            title_col='Title',
            genres_col='Genres',
            year_col='Year'
        )
    """
    size = size.lower()
    if size not in DATA_FORMAT:
        raise ValueError(ERROR_MOVIE_LENS_SIZE)

    if header is None:
        header = DEFAULT_HEADER
    elif len(header) < 2:
        raise ValueError(ERROR_HEADER)
    elif len(header) > 4:
        warnings.warn(WARNING_MOVIE_LENS_HEADER)
        header = header[:4]

    movie_col = header[1]

    with download_path(local_cache_path) as path:
        filepath = os.path.join(path, "ml-{}.zip".format(size))
        datapath, item_datapath = _maybe_download_and_extract(size, filepath)

        # Load movie features such as title, genres, and release year
        item_df = _load_item_df(
            size, item_datapath, movie_col, title_col, genres_col, year_col
        )

        # Load rating data
        df = pd.read_csv(
            datapath,
            sep=DATA_FORMAT[size].separator,
            engine="python",
            names=header,
            usecols=[*range(len(header))],
            header=0 if DATA_FORMAT[size].has_header else None,
        )

        # Convert 'rating' type to float
        if len(header) > 2:
            df[header[2]] = df[header[2]].astype(float)

        # Merge rating df w/ item_df
        if item_df is not None:
            df = df.merge(item_df, on=header[1])

    return df
def load_spark_df(
    spark,
    size="sample",
    header=DEFAULT_HEADER,
    local_cache_path=None,
    dbfs_datapath="dbfs:/FileStore/dac",
    dbutils=None,
):
    """Loads the Criteo DAC dataset as `pySpark.DataFrame`.

    The dataset consists of a portion of Criteo’s traffic over a period of 24 days.
    Each row corresponds to a display ad served by Criteo and the first column
    indicates whether this ad has been clicked or not.

    There are 13 features taking integer values (mostly count features) and 26
    categorical features. The values of the categorical features have been hashed
    onto 32 bits for anonymization purposes.

    The schema is:

    .. code-block:: python

        <label> <integer feature 1> ... <integer feature 13> <categorical feature 1> ... <categorical feature 26>

    More details (need to accept user terms to see the information):
    http://labs.criteo.com/2013/12/download-terabyte-click-logs/

    Args:
        spark (pySpark.SparkSession): Spark session.
        size (str): Dataset size. It can be "sample" or "full".
        local_cache_path (str): Path where to cache the tar.gz file locally.
        header (list): Dataset header names.
        dbfs_datapath (str): Where to store the extracted files on Databricks.
        dbutils (Databricks.dbutils): Databricks utility object.

    Returns:
        pyspark.sql.DataFrame: Criteo DAC training dataset.
    """
    with download_path(local_cache_path) as path:
        filepath = download_criteo(size, path)
        filepath = extract_criteo(size, filepath)

        if is_databricks():
            try:
                # Driver node's file path
                node_path = "file:" + filepath
                # The file needs to be on DBFS to load it into a pyspark.sql.DataFrame
                dbutils.fs.cp(node_path, dbfs_datapath, recurse=True)
                path = dbfs_datapath
            except Exception:
                raise ValueError(
                    "To use on a Databricks notebook, dbutils object should be passed as an argument"
                )
        else:
            path = filepath

        schema = get_spark_schema(header)
        df = spark.read.csv(path, schema=schema, sep="\t", header=False)
        df.cache().count()  # trigger execution to overcome spark's lazy evaluation

    return df
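# Usage sketch for the Criteo Spark loader (illustrative, not part of the original module).
# Assumes an active SparkSession and a non-Databricks environment, so the dbutils
# branch is not exercised.
def _example_load_criteo_spark(spark):
    df = load_spark_df(spark, size="sample")
    df.printSchema()
    print(df.count())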