def test_batch_mapper():
    """Tests batch mapper functionality."""
    old_column = [1, 2, 3, 4]
    to_be_modified = [1, -1, 1, -1]
    in_df = pd.DataFrame.from_dict(
        {"old_column": old_column, "to_be_modified": to_be_modified}
    )
    ds = ray.data.from_pandas(in_df)

    def add_and_modify_udf(df: "pd.DataFrame"):
        df["new_col"] = df["old_column"] + 1
        df["to_be_modified"] *= 2
        return df

    batch_mapper = BatchMapper(fn=add_and_modify_udf)
    batch_mapper.fit(ds)
    transformed = batch_mapper.transform(ds)
    out_df = transformed.to_pandas()

    expected_df = pd.DataFrame.from_dict(
        {
            "old_column": old_column,
            "to_be_modified": [2, -2, 2, -2],
            "new_col": [2, 3, 4, 5],
        }
    )

    assert out_df.equals(expected_df)
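
# Illustrative sketch, not part of the test above: BatchMapper is stateless, so
# a UDF of the same shape can also be applied to an in-memory pandas batch via
# Preprocessor.transform_batch. This assumes transform_batch works without a
# prior fit() for stateless preprocessors; the helper name is hypothetical.
def _batch_mapper_transform_batch_sketch():
    def add_one(df: pd.DataFrame) -> pd.DataFrame:
        df["new_col"] = df["old_column"] + 1
        return df

    batch = pd.DataFrame({"old_column": [10, 20]})
    # Expected under the assumptions above: new_col == [11, 21].
    return BatchMapper(fn=add_one).transform_batch(batch)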
def test_fit_transform_config(ray_start_4_cpus):
    ds = ray.data.range_table(10)

    def drop_odd(rows):
        key = list(rows)[0]
        return rows[(rows[key] % 2 == 0)]

    prep = BatchMapper(drop_odd)

    # Single worker basic case.
    test = TestBasic(
        1,
        True,
        {"train": 5, "test": 5},
        dataset_config={},
        datasets={"train": ds, "test": ds},
        preprocessor=prep,
    )
    test.fit()

    # No transform for test.
    test = TestBasic(
        1,
        True,
        {"train": 5, "test": 10},
        dataset_config={"test": DatasetConfig(transform=False)},
        datasets={"train": ds, "test": ds},
        preprocessor=prep,
    )
    test.fit()
def test_error(ray_start_4_cpus):
    ds = ray.data.range_table(10)

    # Missing required dataset.
    with pytest.raises(ValueError):
        TestBasic(
            1,
            True,
            {"train": 10, "test": 10},
            dataset_config={},
            datasets={"test": ds},
        )

    # Missing optional dataset is OK.
    test = TestBasic(
        1,
        True,
        {"train": 10},
        dataset_config={},
        datasets={"train": ds},
        preprocessor=BatchMapper(lambda x: x),
    )
    test.fit()

    # Extra dataset.
    with pytest.raises(ValueError):
        TestBasic(
            1,
            True,
            {"train": 10, "test": 10},
            dataset_config={},
            datasets={"train": ds, "blah": ds},
        )
def main(data_size_gb: int, num_epochs=2, num_workers=1):
    data_url = f"s3://air-example-data-2/{data_size_gb}G-image-data-synthetic-raw"
    print(
        "Running PyTorch image model training with "
        f"{data_size_gb}GB data from {data_url}"
    )
    print(f"Training for {num_epochs} epochs with {num_workers} workers.")
    start = time.time()

    # Enable cross-host NCCL for larger scale tests.
    runtime_env = {"env_vars": {"NCCL_SOCKET_IFNAME": "ens3"}}
    ray.init(runtime_env=runtime_env)

    dataset = ray.data.read_datasource(ImageFolderDatasource(), paths=[data_url])

    preprocessor = BatchMapper(preprocess_image_with_label)

    trainer = TorchTrainer(
        train_loop_per_worker=train_loop_per_worker,
        train_loop_config={"batch_size": 64, "num_epochs": num_epochs},
        datasets={"train": dataset},
        preprocessor=preprocessor,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=True),
    )
    trainer.fit()

    total_time_s = round(time.time() - start, 2)

    # For structured output integration with internal tooling.
    results = {"data_size_gb": data_size_gb, "num_epochs": num_epochs}
    results["perf_metrics"] = [
        {
            "perf_metric_name": "total_time_s",
            "perf_metric_value": total_time_s,
            "perf_metric_type": "LATENCY",
        },
        {
            "perf_metric_name": "throughput_MB_s",
            "perf_metric_value": round(
                num_epochs * data_size_gb * 1024 / total_time_s, 2
            ),
            "perf_metric_type": "THROUGHPUT",
        },
    ]
    test_output_json = os.environ.get("TEST_OUTPUT_JSON", "/tmp/release_test_out.json")
    with open(test_output_json, "wt") as f:
        json.dump(results, f)

    print(results)
def test_stream_finite_window_nocache_prep(ray_start_4_cpus):
    def rand(x):
        return [random.random() for _ in range(len(x))]

    prep = BatchMapper(rand)
    ds = ray.data.range_table(5)

    # Test the default 1GiB window size.
    def checker(shard, results):
        results = [sorted(r) for r in results]
        assert int(results[0][0]) != results[0][0]
        assert len(results[0]) == 5, results
        assert results[0] != results[1], results
        stats = shard.stats()
        assert str(shard) == "DatasetPipeline(num_windows=inf, num_stages=1)", shard
        assert (
            "Stage 1 read->randomize_block_order->map_batches: 5/5 blocks executed "
            in stats
        ), stats

    test = TestStream(
        checker,
        preprocessor=prep,
        datasets={"train": ds},
        dataset_config={"train": DatasetConfig()},
    )
    test.fit()

    # Test a smaller window size.
    def checker(shard, results):
        results = [sorted(r) for r in results]
        assert int(results[0][0]) != results[0][0]
        assert len(results[0]) == 5, results
        assert results[0] != results[1], results
        stats = shard.stats()
        assert str(shard) == "DatasetPipeline(num_windows=inf, num_stages=1)", shard
        assert (
            "Stage 1 read->randomize_block_order->map_batches: 1/1 blocks executed "
            in stats
        ), stats

    test = TestStream(
        checker,
        preprocessor=prep,
        datasets={"train": ds},
        dataset_config={"train": DatasetConfig(stream_window_size=10)},
    )
    test.fit()
def run_ingest_bulk(dataset, num_workers, num_cpus_per_worker):
    dummy_prep = BatchMapper(lambda df: df * 2)
    trainer = DummyTrainer(
        scaling_config=ScalingConfig(
            num_workers=num_workers,
            trainer_resources={"CPU": 0},
            resources_per_worker={"CPU": num_cpus_per_worker},
            _max_cpu_fraction_per_node=0.1,
        ),
        datasets={"train": dataset},
        preprocessor=dummy_prep,
        num_epochs=1,
        prefetch_blocks=1,
        dataset_config={"train": DatasetConfig(split=True)},
    )
    trainer.fit()
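
# Minimal invocation sketch for run_ingest_bulk. The dataset size, block count,
# and worker counts below are illustrative assumptions, not values taken from
# the benchmark itself.
def _run_ingest_bulk_example():
    ds = ray.data.range_tensor(1000, shape=(80, 80, 4), parallelism=20)
    run_ingest_bulk(ds, num_workers=4, num_cpus_per_worker=1)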
def main(data_size_gb: int):
    data_url = f"s3://air-example-data-2/{data_size_gb}G-image-data-synthetic-raw"
    print(f"Running GPU batch prediction with {data_size_gb}GB data from {data_url}")
    start = time.time()
    dataset = ray.data.read_datasource(ImageFolderDatasource(), paths=[data_url])

    model = resnet18(pretrained=True)

    preprocessor = BatchMapper(preprocess)
    ckpt = TorchCheckpoint.from_model(model=model, preprocessor=preprocessor)

    predictor = BatchPredictor.from_checkpoint(ckpt, TorchPredictor)
    predictor.predict(dataset, num_gpus_per_worker=1, feature_columns=["image"])
    total_time_s = round(time.time() - start, 2)

    # For structured output integration with internal tooling.
    results = {
        "data_size_gb": data_size_gb,
    }
    results["perf_metrics"] = [
        {
            "perf_metric_name": "total_time_s",
            "perf_metric_value": total_time_s,
            "perf_metric_type": "LATENCY",
        },
        {
            "perf_metric_name": "throughput_MB_s",
            "perf_metric_value": (data_size_gb * 1024 / total_time_s),
            "perf_metric_type": "THROUGHPUT",
        },
    ]
    test_output_json = os.environ.get("TEST_OUTPUT_JSON", "/tmp/release_test_out.json")
    with open(test_output_json, "wt") as f:
        json.dump(results, f)

    print(results)
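
# Hypothetical CLI entry point for the benchmark above. The flag name and
# default are assumptions for illustration, not taken from the original script.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--data-size-gb", type=int, default=1)
    main(parser.parse_args().data_size_gb)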
def test_stream_inf_window_cache_prep(ray_start_4_cpus):
    def checker(shard, results):
        results = [sorted(r) for r in results]
        assert len(results[0]) == 5, results
        assert results[0] == results[1], results
        stats = shard.stats()
        assert str(shard) == "DatasetPipeline(num_windows=inf, num_stages=1)", shard
        assert "Stage 1 read->map_batches: 5/5 blocks executed " in stats, stats

    def rand(x):
        return [random.random() for _ in range(len(x))]

    prep = BatchMapper(rand)
    ds = ray.data.range_table(5)
    test = TestStream(
        checker,
        preprocessor=prep,
        datasets={"train": ds},
        dataset_config={"train": DatasetConfig(stream_window_size=-1)},
    )
    test.fit()
parser.add_argument(
    "--use-stream-api",
    "-s",
    action="store_true",
    help="If enabled, the input Dataset will be streamed (as a DatasetPipeline).",
)
args = parser.parse_args()

# Generate a synthetic dataset of ~10GiB of float64 data. The dataset is sharded
# into 100 blocks (parallelism=100).
dataset = ray.data.range_tensor(50000, shape=(80, 80, 4), parallelism=100)

# An example preprocessor chain that just scales all values by 4.0 in two stages.
preprocessor = Chain(
    BatchMapper(lambda df: df * 2),
    BatchMapper(lambda df: df * 2),
)

# Setup the dummy trainer that prints ingest stats.
# Run and print ingest stats.
trainer = DummyTrainer(
    scaling_config={"num_workers": 1, "use_gpu": False},
    datasets={"train": dataset},
    preprocessor=preprocessor,
    num_epochs=args.num_epochs,
    prefetch_blocks=args.prefetch_blocks,
    dataset_config={
def test_chain():
    """Tests basic Chain functionality."""
    col_a = [-1, -1, 1, 1]
    col_b = [1, 1, 1, None]
    col_c = ["sunday", "monday", "tuesday", "tuesday"]
    in_df = pd.DataFrame.from_dict({"A": col_a, "B": col_b, "C": col_c})
    ds = ray.data.from_pandas(in_df)

    def udf(df):
        df["A"] *= 2
        return df

    batch_mapper = BatchMapper(fn=udf)
    imputer = SimpleImputer(["B"])
    scaler = StandardScaler(["A", "B"])
    encoder = LabelEncoder("C")
    chain = Chain(scaler, imputer, encoder, batch_mapper)

    # Fit data.
    chain.fit(ds)
    assert imputer.stats_ == {
        "mean(B)": 0.0,
    }
    assert scaler.stats_ == {
        "mean(A)": 0.0,
        "mean(B)": 1.0,
        "std(A)": 1.0,
        "std(B)": 0.0,
    }
    assert encoder.stats_ == {
        "unique_values(C)": {"monday": 0, "sunday": 1, "tuesday": 2}
    }

    # Transform data.
    transformed = chain.transform(ds)
    out_df = transformed.to_pandas()

    processed_col_a = [-2.0, -2.0, 2.0, 2.0]
    processed_col_b = [0.0, 0.0, 0.0, 0.0]
    processed_col_c = [1, 0, 2, 2]
    expected_df = pd.DataFrame.from_dict(
        {"A": processed_col_a, "B": processed_col_b, "C": processed_col_c}
    )

    assert out_df.equals(expected_df)

    # Transform batch.
    pred_col_a = [1, 2, None]
    pred_col_b = [0, None, 2]
    pred_col_c = ["monday", "tuesday", "wednesday"]
    pred_in_df = pd.DataFrame.from_dict(
        {"A": pred_col_a, "B": pred_col_b, "C": pred_col_c}
    )
    pred_out_df = chain.transform_batch(pred_in_df)

    pred_processed_col_a = [2, 4, None]
    pred_processed_col_b = [-1.0, 0.0, 1.0]
    pred_processed_col_c = [0, 2, None]
    pred_expected_df = pd.DataFrame.from_dict(
        {
            "A": pred_processed_col_a,
            "B": pred_processed_col_b,
            "C": pred_processed_col_c,
        }
    )

    assert pred_out_df.equals(pred_expected_df)
dataset_transformed = preprocessor.fit_transform(dataset)
print(dataset_transformed.take())
# [{'value': 0.0}, {'value': 0.3333333333333333}, {'value': 0.6666666666666666}, {'value': 1.0}, {'value': 0.5}]
# __chain_end__

# __custom_stateless_start__
import ray
from ray.data.preprocessors import BatchMapper

# Generate a simple dataset.
dataset = ray.data.range_table(4)
print(dataset.take())
# [{'value': 0}, {'value': 1}, {'value': 2}, {'value': 3}]

# Create a stateless preprocessor that multiplies values by 2.
preprocessor = BatchMapper(lambda df: df * 2)
dataset_transformed = preprocessor.transform(dataset)
print(dataset_transformed.take())
# [{'value': 0}, {'value': 2}, {'value': 4}, {'value': 6}]
# __custom_stateless_end__

# __custom_stateful_start__
from typing import Dict

import ray
from pandas import DataFrame
from ray.data.preprocessors import CustomStatefulPreprocessor
from ray.data import Dataset
from ray.data.aggregate import Max


def get_max(ds: Dataset):
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """
    User PyTorch code to transform a user image.

    Note: we still use pandas as the intermediate format to hold images,
    as a convenient stand-in for a Python dictionary.
    """
    preprocess = transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.Normalize(
                mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
            ),
        ]
    )
    df["image"] = TensorArray([preprocess(x.to_numpy()) for x in df["image"]])
    return df


data_url = "s3://anonymous@air-example-data-2/1G-image-data-synthetic-raw"
print(f"Running GPU batch prediction with 1GB data from {data_url}")
dataset = ray.data.read_datasource(ImageFolderDatasource(), paths=[data_url])

model = resnet18(pretrained=True)

preprocessor = BatchMapper(preprocess)
ckpt = TorchCheckpoint.from_model(model=model, preprocessor=preprocessor)

predictor = BatchPredictor.from_checkpoint(ckpt, TorchPredictor)
predictor.predict(dataset, feature_columns=["image"])