Example #1
def test_batch_mapper():
    """Tests batch mapper functionality."""
    old_column = [1, 2, 3, 4]
    to_be_modified = [1, -1, 1, -1]
    in_df = pd.DataFrame.from_dict({
        "old_column": old_column,
        "to_be_modified": to_be_modified
    })
    ds = ray.data.from_pandas(in_df)

    def add_and_modify_udf(df: "pd.DataFrame"):
        df["new_col"] = df["old_column"] + 1
        df["to_be_modified"] *= 2
        return df

    batch_mapper = BatchMapper(fn=add_and_modify_udf)
    batch_mapper.fit(ds)
    transformed = batch_mapper.transform(ds)
    out_df = transformed.to_pandas()

    expected_df = pd.DataFrame.from_dict({
        "old_column": old_column,
        "to_be_modified": [2, -2, 2, -2],
        "new_col": [2, 3, 4, 5],
    })

    assert out_df.equals(expected_df)
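The UDF here is stateless, so the same transformation can also be run directly through Dataset.map_batches, which is essentially what BatchMapper wraps. A minimal sketch for comparison (not part of the original test):

import pandas as pd
import ray

def add_and_modify_udf(df: pd.DataFrame) -> pd.DataFrame:
    df["new_col"] = df["old_column"] + 1
    df["to_be_modified"] *= 2
    return df

ds = ray.data.from_pandas(
    pd.DataFrame({"old_column": [1, 2, 3, 4], "to_be_modified": [1, -1, 1, -1]})
)
# Apply the per-batch pandas UDF directly; the output matches the BatchMapper result.
print(ds.map_batches(add_and_modify_udf, batch_format="pandas").to_pandas())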
Example #2
def test_fit_transform_config(ray_start_4_cpus):
    ds = ray.data.range_table(10)

    def drop_odd(rows):
        key = list(rows)[0]
        return rows[(rows[key] % 2 == 0)]

    prep = BatchMapper(drop_odd)

    # Single worker basic case.
    test = TestBasic(
        1,
        True,
        {"train": 5, "test": 5},
        dataset_config={},
        datasets={"train": ds, "test": ds},
        preprocessor=prep,
    )
    test.fit()

    # No transform for test.
    test = TestBasic(
        1,
        True,
        {"train": 5, "test": 10},
        dataset_config={"test": DatasetConfig(transform=False)},
        datasets={"train": ds, "test": ds},
        preprocessor=prep,
    )
    test.fit()
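Because drop_odd relies only on plain pandas indexing, it can be sanity-checked outside of Ray. A small standalone sketch (the column name is illustrative and assumes pandas-format batches):

import pandas as pd

def drop_odd(rows):
    key = list(rows)[0]
    return rows[(rows[key] % 2 == 0)]

batch = pd.DataFrame({"value": [0, 1, 2, 3, 4]})
print(drop_odd(batch))  # keeps only the even-valued rows: 0, 2, 4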
Example #3
def test_error(ray_start_4_cpus):
    ds = ray.data.range_table(10)

    # Missing required dataset.
    with pytest.raises(ValueError):
        TestBasic(
            1, True, {"train": 10, "test": 10}, dataset_config={}, datasets={"test": ds}
        )

    # Missing optional dataset is OK.
    test = TestBasic(
        1,
        True,
        {"train": 10},
        dataset_config={},
        datasets={"train": ds},
        preprocessor=BatchMapper(lambda x: x),
    )
    test.fit()

    # Extra dataset.
    with pytest.raises(ValueError):
        TestBasic(
            1,
            True,
            {"train": 10, "test": 10},
            dataset_config={},
            datasets={"train": ds, "blah": ds},
        )
Example #4
def main(data_size_gb: int, num_epochs=2, num_workers=1):
    data_url = f"s3://air-example-data-2/{data_size_gb}G-image-data-synthetic-raw"
    print("Running Pytorch image model training with "
          f"{data_size_gb}GB data from {data_url}")
    print(f"Training for {num_epochs} epochs with {num_workers} workers.")
    start = time.time()
    # Enable cross host NCCL for larger scale tests
    runtime_env = {"env_vars": {"NCCL_SOCKET_IFNAME": "ens3"}}
    ray.init(runtime_env=runtime_env)
    dataset = ray.data.read_datasource(ImageFolderDatasource(),
                                       paths=[data_url])

    preprocessor = BatchMapper(preprocess_image_with_label)

    trainer = TorchTrainer(
        train_loop_per_worker=train_loop_per_worker,
        train_loop_config={
            "batch_size": 64,
            "num_epochs": num_epochs
        },
        datasets={"train": dataset},
        preprocessor=preprocessor,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=True),
    )
    trainer.fit()

    total_time_s = round(time.time() - start, 2)

    # For structured output integration with internal tooling
    results = {"data_size_gb": data_size_gb, "num_epochs": num_epochs}
    results["perf_metrics"] = [
        {
            "perf_metric_name": "total_time_s",
            "perf_metric_value": total_time_s,
            "perf_metric_type": "LATENCY",
        },
        {
            "perf_metric_name":
            "throughout_MB_s",
            "perf_metric_value":
            round(num_epochs * data_size_gb * 1024 / total_time_s, 2),
            "perf_metric_type":
            "THROUGHPUT",
        },
    ]

    test_output_json = os.environ.get("TEST_OUTPUT_JSON",
                                      "/tmp/release_test_out.json")
    with open(test_output_json, "wt") as f:
        json.dump(results, f)

    print(results)
Example #5
def test_stream_finite_window_nocache_prep(ray_start_4_cpus):
    def rand(x):
        return [random.random() for _ in range(len(x))]

    prep = BatchMapper(rand)
    ds = ray.data.range_table(5)

    # Test the default 1GiB window size.
    def checker(shard, results):
        results = [sorted(r) for r in results]
        assert int(results[0][0]) != results[0][0]
        assert len(results[0]) == 5, results
        assert results[0] != results[1], results
        stats = shard.stats()
        assert str(shard) == "DatasetPipeline(num_windows=inf, num_stages=1)", shard
        assert (
            "Stage 1 read->randomize_block_order->map_batches: 5/5 blocks executed "
            in stats
        ), stats

    test = TestStream(
        checker,
        preprocessor=prep,
        datasets={"train": ds},
        dataset_config={"train": DatasetConfig()},
    )
    test.fit()

    # Test a smaller window size.
    def checker(shard, results):
        results = [sorted(r) for r in results]
        assert int(results[0][0]) != results[0][0]
        assert len(results[0]) == 5, results
        assert results[0] != results[1], results
        stats = shard.stats()
        assert str(shard) == "DatasetPipeline(num_windows=inf, num_stages=1)", shard
        assert (
            "Stage 1 read->randomize_block_order->map_batches: 1/1 blocks executed "
            in stats
        ), stats

    test = TestStream(
        checker,
        preprocessor=prep,
        datasets={"train": ds},
        dataset_config={"train": DatasetConfig(stream_window_size=10)},
    )
    test.fit()
Example #6
def run_ingest_bulk(dataset, num_workers, num_cpus_per_worker):
    dummy_prep = BatchMapper(lambda df: df * 2)
    trainer = DummyTrainer(
        scaling_config=ScalingConfig(
            num_workers=num_workers,
            trainer_resources={"CPU": 0},
            resources_per_worker={"CPU": num_cpus_per_worker},
            _max_cpu_fraction_per_node=0.1,
        ),
        datasets={"train": dataset},
        preprocessor=dummy_prep,
        num_epochs=1,
        prefetch_blocks=1,
        dataset_config={"train": DatasetConfig(split=True)},
    )
    trainer.fit()
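A possible invocation of the helper above; the dataset and sizing are made up for illustration and are not part of the original benchmark:

ds = ray.data.range_tensor(1000, shape=(80, 80, 4), parallelism=10)
run_ingest_bulk(ds, num_workers=1, num_cpus_per_worker=1)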
Example #7
def main(data_size_gb: int):
    data_url = f"s3://air-example-data-2/{data_size_gb}G-image-data-synthetic-raw"
    print(
        f"Running GPU batch prediction with {data_size_gb}GB data from {data_url}"
    )
    start = time.time()
    dataset = ray.data.read_datasource(ImageFolderDatasource(),
                                       paths=[data_url])

    model = resnet18(pretrained=True)

    preprocessor = BatchMapper(preprocess)
    ckpt = TorchCheckpoint.from_model(model=model, preprocessor=preprocessor)

    predictor = BatchPredictor.from_checkpoint(ckpt, TorchPredictor)
    predictor.predict(dataset,
                      num_gpus_per_worker=1,
                      feature_columns=["image"])
    total_time_s = round(time.time() - start, 2)

    # For structured output integration with internal tooling
    results = {
        "data_size_gb": data_size_gb,
    }
    results["perf_metrics"] = [
        {
            "perf_metric_name": "total_time_s",
            "perf_metric_value": total_time_s,
            "perf_metric_type": "LATENCY",
        },
        {
            "perf_metric_name": "throughout_MB_s",
            "perf_metric_value": (data_size_gb * 1024 / total_time_s),
            "perf_metric_type": "THROUGHPUT",
        },
    ]

    test_output_json = os.environ.get("TEST_OUTPUT_JSON",
                                      "/tmp/release_test_out.json")
    with open(test_output_json, "wt") as f:
        json.dump(results, f)

    print(results)
Example #8
def test_stream_inf_window_cache_prep(ray_start_4_cpus):
    def checker(shard, results):
        results = [sorted(r) for r in results]
        assert len(results[0]) == 5, results
        assert results[0] == results[1], results
        stats = shard.stats()
        assert str(shard) == "DatasetPipeline(num_windows=inf, num_stages=1)", shard
        assert "Stage 1 read->map_batches: 5/5 blocks executed " in stats, stats

    def rand(x):
        return [random.random() for _ in range(len(x))]

    prep = BatchMapper(rand)
    ds = ray.data.range_table(5)
    test = TestStream(
        checker,
        preprocessor=prep,
        datasets={"train": ds},
        dataset_config={"train": DatasetConfig(stream_window_size=-1)},
    )
    test.fit()
Example #9
    parser.add_argument(
        "--use-stream-api",
        "-s",
        action="store_true",
        help="If enabled, the input Dataset will be streamed (as a DatasetPipeline).",
    )
    args = parser.parse_args()

    # Generate a synthetic dataset of ~10GiB of float64 data. The dataset is sharded
    # into 100 blocks (parallelism=100).
    dataset = ray.data.range_tensor(50000, shape=(80, 80, 4), parallelism=100)

    # An example preprocessor chain that just scales all values by 4.0 in two stages.
    preprocessor = Chain(
        BatchMapper(lambda df: df * 2),
        BatchMapper(lambda df: df * 2),
    )

    # Set up the dummy trainer that prints ingest stats.
    # Run and print ingest stats.
    trainer = DummyTrainer(
        scaling_config={
            "num_workers": 1,
            "use_gpu": False
        },
        datasets={"train": dataset},
        preprocessor=preprocessor,
        num_epochs=args.num_epochs,
        prefetch_blocks=args.prefetch_blocks,
        dataset_config={
Example #10
def test_chain():
    """Tests basic Chain functionality."""
    col_a = [-1, -1, 1, 1]
    col_b = [1, 1, 1, None]
    col_c = ["sunday", "monday", "tuesday", "tuesday"]
    in_df = pd.DataFrame.from_dict({"A": col_a, "B": col_b, "C": col_c})
    ds = ray.data.from_pandas(in_df)

    def udf(df):
        df["A"] *= 2
        return df

    batch_mapper = BatchMapper(fn=udf)
    imputer = SimpleImputer(["B"])
    scaler = StandardScaler(["A", "B"])
    encoder = LabelEncoder("C")
    chain = Chain(scaler, imputer, encoder, batch_mapper)

    # Fit data.
    chain.fit(ds)
    assert imputer.stats_ == {
        "mean(B)": 0.0,
    }
    assert scaler.stats_ == {
        "mean(A)": 0.0,
        "mean(B)": 1.0,
        "std(A)": 1.0,
        "std(B)": 0.0,
    }
    assert encoder.stats_ == {
        "unique_values(C)": {
            "monday": 0,
            "sunday": 1,
            "tuesday": 2
        }
    }

    # Transform data.
    transformed = chain.transform(ds)
    out_df = transformed.to_pandas()

    processed_col_a = [-2.0, -2.0, 2.0, 2.0]
    processed_col_b = [0.0, 0.0, 0.0, 0.0]
    processed_col_c = [1, 0, 2, 2]
    expected_df = pd.DataFrame.from_dict({
        "A": processed_col_a,
        "B": processed_col_b,
        "C": processed_col_c
    })

    assert out_df.equals(expected_df)

    # Transform batch.
    pred_col_a = [1, 2, None]
    pred_col_b = [0, None, 2]
    pred_col_c = ["monday", "tuesday", "wednesday"]
    pred_in_df = pd.DataFrame.from_dict({
        "A": pred_col_a,
        "B": pred_col_b,
        "C": pred_col_c
    })

    pred_out_df = chain.transform_batch(pred_in_df)

    pred_processed_col_a = [2, 4, None]
    pred_processed_col_b = [-1.0, 0.0, 1.0]
    pred_processed_col_c = [0, 2, None]
    pred_expected_df = pd.DataFrame.from_dict({
        "A": pred_processed_col_a,
        "B": pred_processed_col_b,
        "C": pred_processed_col_c,
    })

    assert pred_out_df.equals(pred_expected_df)
Example #11
dataset_transformed = preprocessor.fit_transform(dataset)
print(dataset_transformed.take())
# [{'value': 0.0}, {'value': 0.3333333333333333}, {'value': 0.6666666666666666}, {'value': 1.0}, {'value': 0.5}]
# __chain_end__

# __custom_stateless_start__
import ray
from ray.data.preprocessors import BatchMapper

# Generate a simple dataset.
dataset = ray.data.range_table(4)
print(dataset.take())
# [{'value': 0}, {'value': 1}, {'value': 2}, {'value': 3}]

# Create a stateless preprocessor that multiplies values by 2.
preprocessor = BatchMapper(lambda df: df * 2)
dataset_transformed = preprocessor.transform(dataset)
print(dataset_transformed.take())
# [{'value': 0}, {'value': 2}, {'value': 4}, {'value': 6}]
# __custom_stateless_end__

# __custom_stateful_start__
from typing import Dict
import ray
from pandas import DataFrame
from ray.data.preprocessors import CustomStatefulPreprocessor
from ray.data import Dataset
from ray.data.aggregate import Max


def get_max(ds: Dataset):
Example #12

import pandas as pd
import ray
from torchvision import transforms
from torchvision.models import resnet18

# Ray AIR imports assumed for this standalone snippet (Ray 2.x-era module paths).
from ray.data.datasource import ImageFolderDatasource
from ray.data.extensions import TensorArray
from ray.data.preprocessors import BatchMapper
from ray.train.batch_predictor import BatchPredictor
from ray.train.torch import TorchCheckpoint, TorchPredictor


def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """
    User PyTorch code to transform user images. Note that pandas is still used
    as the intermediate format to hold the images, as a convenient stand-in for
    a plain Python dictionary.
    """
    preprocess = transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ]
    )
    df["image"] = TensorArray([preprocess(x.to_numpy()) for x in df["image"]])
    return df


data_url = "s3://anonymous@air-example-data-2/1G-image-data-synthetic-raw"
print(f"Running GPU batch prediction with 1GB data from {data_url}")
dataset = ray.data.read_datasource(ImageFolderDatasource(), paths=[data_url])

model = resnet18(pretrained=True)

preprocessor = BatchMapper(preprocess)
ckpt = TorchCheckpoint.from_model(model=model, preprocessor=preprocessor)

predictor = BatchPredictor.from_checkpoint(ckpt, TorchPredictor)
predictor.predict(dataset, feature_columns=["image"])
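The final call above discards the returned predictions. Since BatchPredictor.predict returns a Ray Dataset, the end of the script could instead keep and inspect the result (a small variation, not in the original):

predictions = predictor.predict(dataset, feature_columns=["image"])
predictions.show(1)  # print a single row of model output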