def test_read_parquet(ray_start_regular_shared, tmp_path):
    df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})
    table = pa.Table.from_pandas(df1)
    pq.write_table(table, os.path.join(tmp_path, "test1.parquet"))
    df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]})
    table = pa.Table.from_pandas(df2)
    pq.write_table(table, os.path.join(tmp_path, "test2.parquet"))

    # Without column selection: each shard yields one file's DataFrame.
    ds = ml_data.read_parquet(tmp_path, num_shards=2)
    result = list(ds.gather_sync())
    assert df1.equals(result[0])
    assert df2.equals(result[1])

    # With column "one" only.
    ds = ml_data.read_parquet(tmp_path, num_shards=2, columns=["one"])
    result = list(ds.gather_sync())
    assert df1[["one"]].equals(result[0])
    assert df2[["one"]].equals(result[1])

    # With column "two" only.
    ds = ml_data.read_parquet(tmp_path, num_shards=2, columns=["two"])
    result = list(ds.gather_sync())
    assert df1[["two"]].equals(result[0])
    assert df2[["two"]].equals(result[1])
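# For reference, a hedged sketch of consuming the same MLDataset shard by
# shard instead of gathering everything on the driver. It assumes the
# ParallelIterator-style API (num_shards()/get_shard()) that MLDataset
# builds on in this Ray version; this is a standalone snippet, not one of
# the original tests.
def iterate_shards_example(tmp_path):
    ds = ml_data.read_parquet(tmp_path, num_shards=2)
    for shard_index in range(ds.num_shards()):
        # get_shard() returns a local iterator over that shard's batches
        # (pandas DataFrames here).
        for df in ds.get_shard(shard_index):
            print(shard_index, len(df))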
def testDetectDistributed(self):
    with tempfile.TemporaryDirectory() as dir:
        parquet_file = os.path.join(dir, "file.parquet")
        csv_file = os.path.join(dir, "file.csv")

        data_df = pd.DataFrame(self.x, columns=["a", "b", "c", "d"])
        data_df["label"] = pd.Series(self.y)

        data_df.to_parquet(parquet_file)
        data_df.to_csv(csv_file)

        mat = RayDMatrix(parquet_file, lazy=True)
        self.assertTrue(mat.distributed)

        mat = RayDMatrix(csv_file, lazy=True)
        # A single CSV file should not be loaded in a distributed fashion.
        self.assertFalse(mat.distributed)

        mat = RayDMatrix([parquet_file] * 3, lazy=True)
        self.assertTrue(mat.distributed)

        mat = RayDMatrix([csv_file] * 3, lazy=True)
        self.assertTrue(mat.distributed)

        try:
            from ray.util import data as ml_data
            mat = RayDMatrix(
                ml_data.read_parquet(parquet_file, num_shards=1), lazy=True)
            self.assertTrue(mat.distributed)
        except ImportError:
            print("MLDataset not available in current Ray version. "
                  "Skipping part of test.")
def testFromMLDataset(self):
    try:
        from ray.util import data as ml_data
    except ImportError:
        self.skipTest("MLDataset not available in current Ray version.")
        return

    with tempfile.TemporaryDirectory() as dir:
        data_file_1 = os.path.join(dir, "data_1.parquet")
        data_file_2 = os.path.join(dir, "data_2.parquet")

        data_df = pd.DataFrame(self.x, columns=["a", "b", "c", "d"])
        data_df["label"] = pd.Series(self.y)

        df_1 = data_df[0:len(data_df) // 2]
        df_2 = data_df[len(data_df) // 2:]

        df_1.to_parquet(data_file_1)
        df_2.to_parquet(data_file_2)

        dataset = ml_data.read_parquet(
            [data_file_1, data_file_2], num_shards=2)

        self._testMatrixCreation(dataset, "label", distributed=False)
        self._testMatrixCreation(dataset, "label", distributed=True)
def main(fname, num_actors=2):
    # Read the parquet data into one MLDataset shard per training actor.
    ml_dataset = read_parquet(fname, num_shards=num_actors)
    dtrain = RayDMatrix(ml_dataset, label="labels", ignore=["partition"])

    config = {
        "tree_method": "hist",
        "eval_metric": ["logloss", "error"],
    }

    evals_result = {}

    start = time.time()
    bst = train(
        config,
        dtrain,
        evals_result=evals_result,
        ray_params=RayParams(max_actor_restarts=1, num_actors=num_actors),
        num_boost_round=10,
        evals=[(dtrain, "train")])
    taken = time.time() - start
    print(f"TRAIN TIME TAKEN: {taken:.2f} seconds")

    bst.save_model("test_data.xgb")
    print("Final training error: {:.4f}".format(
        evals_result["train"]["error"][-1]))
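# A minimal, hypothetical driver for main() above, not part of the original
# script. It writes one small parquet file per actor with the "labels" and
# "partition" columns that main() expects (inferred from the label=/ignore=
# arguments), then runs training on the resulting directory.
if __name__ == "__main__":
    import os

    import numpy as np
    import pandas as pd
    import ray

    ray.init()

    # One file per shard, mirroring how test_read_parquet lays out its data.
    data_dir = "example_parquet"
    os.makedirs(data_dir, exist_ok=True)
    for i in range(2):
        df = pd.DataFrame(
            np.random.rand(512, 4), columns=["a", "b", "c", "d"])
        df["labels"] = np.random.randint(0, 2, size=512)
        df["partition"] = i  # dummy column, dropped via ignore= in main()
        df.to_parquet(os.path.join(data_dir, f"part_{i}.parquet"))

    main(data_dir, num_actors=2)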
def create_ml_dataset_from_spark(
        df: sql.DataFrame,
        num_shards: int,
        batch_size: int,
        fs_directory: Optional[str] = None,
        compression: Optional[str] = None) -> MLDataset:
    """Create an MLDataset from a Spark DataFrame.

    :param df: the pyspark.sql.DataFrame
    :param num_shards: the number of shards to create for the MLDataset
    :param batch_size: the batch size of the MLDataset
    :param fs_directory: an optional distributed file system directory to
        cache the DataFrame in. If provided, the DataFrame is written to
        the given directory in parquet format; otherwise it is written to
        the Ray object store.
    :param compression: optional compression codec to use when writing the
        DataFrame as parquet files. Only relevant when fs_directory is set.
    :return: an MLDataset
    """
    df = df.repartition(num_shards)
    if fs_directory is None:
        # fs_directory was not provided: save the Spark DataFrame to the
        # Ray object store.
        record_batch_set = _save_spark_df_to_object_store(df, num_shards)
        # TODO: we should specify the resource spec for each shard
        it = parallel_it.from_iterators(
            generators=record_batch_set,
            name="Spark DataFrame",
            repeat=False)
        ds = ml_dataset.from_parallel_iter(
            it, need_convert=False, batch_size=batch_size, repeated=False)
        return ds
    else:
        # fs_directory was provided: write the Spark DataFrame as parquet
        # files and create the MLDataset from those files.
        df.write.parquet(fs_directory, compression=compression)
        ds = ml_dataset.read_parquet(fs_directory, num_shards)
        return ds
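# A hypothetical usage sketch for create_ml_dataset_from_spark, not from the
# original source. It assumes an active SparkSession (`spark`) and an
# initialized Ray cluster, e.g. as set up by raydp; the sample columns and
# the fs_directory path are illustrative only.
def usage_example(spark):
    spark_df = spark.createDataFrame(
        [(1.0, 0), (2.0, 1), (3.0, 0)], ["feature", "label"])

    # Object-store path: Spark partitions are pulled into Ray directly.
    ds = create_ml_dataset_from_spark(spark_df, num_shards=2, batch_size=32)

    # Parquet path: the DataFrame is cached to a shared filesystem first.
    ds_fs = create_ml_dataset_from_spark(
        spark_df, num_shards=2, batch_size=32,
        fs_directory="hdfs:///tmp/mldataset_cache", compression="snappy")
    return ds, ds_fs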