Example #1
def test_local_shuffle(ray_start_regular_shared):
    para_it = parallel_it.from_range(100).for_each(lambda x: [x])

    # batch_size larger than 1 and shuffle_buffer_size larger than 1
    ds = ml_data.from_parallel_iter(para_it, batch_size=10)
    ds1 = ds.local_shuffle(shuffle_buffer_size=5)
    ds2 = ds.local_shuffle(shuffle_buffer_size=5)

    l1 = list(ds1.gather_sync())
    l2 = list(ds2.gather_sync())
    assert not all(df1.equals(df2) for df1, df2 in zip(l1, l2))

    # batch_size equals 1 and shuffle_buffer_size larger than 1
    ds = ml_data.from_parallel_iter(para_it, batch_size=1)
    ds1 = ds.local_shuffle(shuffle_buffer_size=5)
    ds2 = ds.local_shuffle(shuffle_buffer_size=5)

    l1 = list(ds1.gather_sync())
    l2 = list(ds2.gather_sync())
    assert not all(df1.equals(df2) for df1, df2 in zip(l1, l2))

    # batch_size equals 1 and shuffle_buffer_size equals 1
    ds = ml_data.from_parallel_iter(para_it, batch_size=1)
    ds1 = ds.local_shuffle(shuffle_buffer_size=1)
    ds2 = ds.local_shuffle(shuffle_buffer_size=1)

    l1 = list(ds1.gather_sync())
    l2 = list(ds2.gather_sync())
    assert all(df1.equals(df2) for df1, df2 in zip(l1, l2))
Example #2
def _create_ml_dataset(name: str,
                       record_pieces: List[RecordPiece],
                       record_sizes: List[int],
                       num_shards: int,
                       shuffle: bool,
                       shuffle_seed: int,
                       RecordBatchCls,
                       node_hints: List[str] = None) -> MLDataset:
    if node_hints is not None:
        assert num_shards % len(node_hints) == 0,\
            f"num_shards: {num_shards} should be a multiple of length of node_hints: {node_hints}"
    if shuffle_seed:
        np.random.seed(shuffle_seed)
    else:
        np.random.seed(0)

    # split the record pieces into num_shards partitions
    divided_blocks = divide_blocks(blocks=record_sizes,
                                   world_size=num_shards,
                                   shuffle=shuffle,
                                   shuffle_seed=shuffle_seed)
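    # divided_blocks maps each shard rank to its list of (block index, num_samples) pairs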

    record_batches = []

    for rank, blocks in divided_blocks.items():
        pieces = []
        for index, num_samples in blocks:
            record_size = record_sizes[index]
            piece = record_pieces[index]
            if num_samples != record_size:
                assert num_samples < record_size
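                # randomly re-sample the piece down to the num_samples rows assigned to this shard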
                new_row_ids = np.random.choice(
                    record_size, size=num_samples).tolist()
                piece = piece.with_row_ids(new_row_ids)
            pieces.append(piece)

        if shuffle:
            np.random.shuffle(pieces)
        record_batches.append(RecordBatchCls(shard_id=rank,
                                             prefix=name,
                                             record_pieces=pieces,
                                             shuffle=shuffle,
                                             shuffle_seed=shuffle_seed))

    worker_cls = ray.remote(ParallelIteratorWorkerWithLen)
    if node_hints is not None:
        actors = []
        multiplier = num_shards // len(node_hints)
        resource_keys = [
            f"node:{node_hints[i // multiplier]}" for i in range(num_shards)
        ]
        for g, resource_key in zip(record_batches, resource_keys):
            # pin each shard actor to its hinted node via the node:<address> resource
            actor = worker_cls.options(
                resources={resource_key: 0.01}).remote(g, False, len(g))
            actors.append(actor)
    else:
        actors = [worker_cls.remote(g, False, len(g)) for g in record_batches]

    it = parallel_it.from_actors(actors, name)
    ds = ml_dataset.from_parallel_iter(
        it, need_convert=False, batch_size=0, repeated=False)
    return ds
Example #3
def test_tf_dataset(ray_start_4_cpus):  # noqa: F811
    num_points = 32 * 100 * 2
    data = [i * (1 / num_points) for i in range(num_points)]
    it = parallel_it.from_items(data, 2, False).for_each(lambda x: [x, x])
    # this will create MLDataset with column RangeIndex(range(2))
    ds = ml_data.from_parallel_iter(it, True, batch_size=32, repeated=False)
    tf_ds = ds.to_tf(feature_columns=[0], label_column=1)
    trainer = TFTrainer(
        model_creator=model_creator,
        data_creator=make_data_creator(tf_ds),
        num_replicas=2,
        config={
            "batch_size": 32,
            "fit_config": {
                "steps_per_epoch": 100,
            },
        },
    )

    for _ in range(10):
        trainer.train()

    model = trainer.get_model()
    prediction = model.predict([0.5])[0][0]
    assert 0.4 <= prediction <= 0.6
    trainer.shutdown()
Example #4
def main():
    num_points = 32 * 100 * 2
    data = [i * (1 / num_points) for i in range(num_points)]
    it = parallel_it.from_items(data, 2, False).for_each(lambda x: [x, x])
    # this will create MLDataset with column RangeIndex(range(2))
    ds = ml_data.from_parallel_iter(it, True, batch_size=32, repeated=False)
    tf_ds = ds.to_tf(feature_columns=[0], label_column=1)

    trainer = TFTrainer(
        model_creator=model_creator,
        data_creator=make_data_creator(tf_ds),
        num_replicas=2,
        config={
            "batch_size": 32,
            "fit_config": {
                "steps_per_epoch": 100,
            },
        },
    )

    for _ in range(10):
        trainer.train()

    model = trainer.get_model()
    print("f(0.5)=", float(model.predict([0.5])))
Example #5
def test_read_ray_mldataset(ray_start_regular):
    test_df1 = pd.DataFrame(
        {
            "a": np.arange(10).astype(np.int64, copy=False),
            "b": [f"s{i}" for i in range(10)],
        }
    )
    test_df2 = pd.DataFrame(
        {
            "a": np.arange(10).astype(np.int64, copy=False),
            "b": [f"s{i}" for i in range(10)],
        }
    )
    df = pd.concat([test_df1, test_df2])
    import ray.util.iter
    from ray.util.data import from_parallel_iter

    ml_dataset = from_parallel_iter(
        ray.util.iter.from_items([test_df1, test_df2], num_shards=2), need_convert=False
    )
    mdf = read_ray_mldataset(ml_dataset)

    assert mdf.shape[1] == 2
    pd.testing.assert_index_equal(df.columns, mdf.columns_value.to_pandas())
    pd.testing.assert_series_equal(df.dtypes, mdf.dtypes)

    mdf = tile(mdf)
    assert len(mdf.chunks) == 2
    for chunk in mdf.chunks:
        assert isinstance(chunk.op, DataFrameReadMLDataset)
Example #6
def test_from_parallel_it(ray_start_regular_shared):
    para_it = parallel_it.from_range(4).for_each(lambda x: [x])
    ds = ml_data.from_parallel_iter(para_it, batch_size=2)
    assert repr(ds) == ("MLDataset[from_range[4, shards=2]"
                        ".for_each().batch(2).to_pandas()]")
    collected = list(ds.gather_sync())
    assert len(collected) == 2
    assert all(d.shape == (2, 1) for d in collected)
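    # the batched pandas output should match batching the raw iterator directly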
    expected = para_it.flatten().batch(2).gather_sync().flatten()
    flattened = ds.gather_sync().for_each(lambda x: x[0].to_list()).flatten()
    assert list(flattened) == list(expected)
Example #7
def test_union(ray_start_regular_shared):
    para_it1 = parallel_it.from_range(4, 2, False).for_each(lambda x: [x])
    ds1 = ml_data.from_parallel_iter(para_it1, True, 2, False)
    para_it2 = parallel_it.from_range(4, 2, True).for_each(lambda x: [x])
    ds2 = ml_data.from_parallel_iter(para_it2, True, 2, True)

    with pytest.raises(TypeError) as ex:
        ds1.union(ds2)
    assert "two MLDataset which have different repeated type" in str(ex.value)

    # union two MLDataset with same batch size
    para_it2 = parallel_it.from_range(4, 2, False).for_each(lambda x: [x])
    ds2 = ml_data.from_parallel_iter(para_it2, True, 2, False)
    ds = ds1.union(ds2)
    assert ds.batch_size == 2

    # union two MLDataset with different batch size
    para_it2 = parallel_it.from_range(4, 2, False).for_each(lambda x: [x])
    ds2 = ml_data.from_parallel_iter(para_it2, True, 1, False)
    ds = ds1.union(ds2)
    # batch_size 0 means batch_size unknown
    assert ds.batch_size == 0
Example #8
def main():
    num_points = 32 * 100 * 2
    data = [i * (1 / num_points) for i in range(num_points)]
    it = parallel_it.from_items(data, 2, False).for_each(lambda x: [x, x])
    # this will create MLDataset with column RangeIndex(range(2))
    ds = ml_data.from_parallel_iter(it, True, batch_size=32, repeated=False)
    torch_ds = ds.to_torch(feature_columns=[0], label_column=1)

    trainer = TorchTrainer(
        num_workers=2,
        training_operator_cls=make_train_operator(torch_ds),
        add_dist_sampler=False,
        config={"batch_size": 32})
    for i in range(10):
        trainer.train(num_steps=100)
        model = trainer.get_model()
        print("f(0.5)=", float(model(torch.tensor([[0.5]]).float())[0][0]))
Example #9
def test_torch_dataset(ray_start_4_cpus, use_local):
    num_points = 32 * 100 * 2
    data = [i * (1 / num_points) for i in range(num_points)]
    para_it = parallel_it.from_items(data, 2, False).for_each(lambda x: [x, x])
    ds = ml_data.from_parallel_iter(para_it, batch_size=32)

    torch_ds = ds.to_torch(feature_columns=[0], label_column=1)
    operator = make_train_operator(torch_ds)
    trainer = TorchTrainer(training_operator_cls=operator,
                           num_workers=2,
                           use_local=use_local,
                           add_dist_sampler=False,
                           config={"batch_size": 32})
    for i in range(10):
        trainer.train(num_steps=100)

    model = trainer.get_model()
    prediction = float(model(torch.tensor([[0.5]]).float())[0][0])
    assert 0.4 <= prediction <= 0.6
    trainer.shutdown()
Example #10
def create_ml_dataset_from_spark(
        df: sql.DataFrame,
        num_shards: int,
        batch_size: int,
        fs_directory: Optional[str] = None,
        compression: Optional[str] = None) -> MLDataset:
    """ Create a MLDataset from Spark DataFrame

    This method will create a MLDataset from Spark DataFrame.

    :param df: the pyspark.sql.DataFrame
    :param num_shards: the number of shards will be created for the MLDataset
    :param batch_size: the batch size for the MLDataset
    :param fs_directory: an optional distributed file system directory for cache the
           DataFrame. We will write the DataFrame to the given directory with parquet
           format if this is provided. Otherwise, we will write the DataFrame to ray
           object store.
    :param compression: the optional compression for write the DataFrame as parquet
           file. This is only useful when the fs_directory set.
    :return: a MLDataset
    """
    df = df.repartition(num_shards)
    if fs_directory is None:
        # fs_directory was not provided, so save the Spark DataFrame to the Ray object store
        record_batch_set = _save_spark_df_to_object_store(df, num_shards)
        # TODO: we should specify the resource spec for each shard
        it = parallel_it.from_iterators(generators=record_batch_set,
                                        name="Spark DataFrame",
                                        repeat=False)
        ds = ml_dataset.from_parallel_iter(it,
                                           need_convert=False,
                                           batch_size=batch_size,
                                           repeated=False)
        return ds
    else:
        # fs_directory was provided, so write the Spark DataFrame as Parquet files
        df.write.parquet(fs_directory, compression=compression)
        # create the MLDataset from the Parquet files
        ds = ml_dataset.read_parquet(fs_directory, num_shards)
        return ds
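
For context, a minimal hypothetical call sketch for create_ml_dataset_from_spark. It assumes ray.init() has already been called and that spark_df is an existing pyspark.sql.DataFrame whose Spark session can reach the Ray cluster; the HDFS path and compression codec below are illustrative values only, not taken from the examples above.

# spark_df: an existing pyspark.sql.DataFrame (assumed, see note above)

# cache the shards in the Ray object store (no fs_directory given)
ds = create_ml_dataset_from_spark(spark_df, num_shards=2, batch_size=32)

# alternatively, cache the shards on a shared file system as Parquet files
# ds = create_ml_dataset_from_spark(spark_df, num_shards=2, batch_size=32,
#                                   fs_directory="hdfs:///tmp/mldataset_cache",
#                                   compression="snappy")

# iterate the resulting shards; each batch is expected to be a pandas DataFrame
for shard in ds.shards():
    for batch in shard:
        print(batch.shape)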
Example #11
def test_read_ray_mldataset(ray_start_regular, ray_create_mars_cluster):
    test_dfs = [
        pd.DataFrame({
            "a":
            np.arange(i * 10, (i + 1) * 10).astype(np.int64, copy=False),
            "b": [f"s{j}" for j in range(i * 10, (i + 1) * 10)],
        }) for i in range(5)
    ]
    import ray.util.iter
    from ray.util.data import from_parallel_iter

    ml_dataset = from_parallel_iter(ray.util.iter.from_items(test_dfs,
                                                             num_shards=4),
                                    need_convert=False)
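    # gather all shards locally to build the expected pandas DataFrame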
    dfs = []
    for shard in ml_dataset.shards():
        dfs.extend(list(shard))
    df = pd.concat(dfs).reset_index(drop=True)
    mdf = md.read_ray_mldataset(ml_dataset)
    pd.testing.assert_frame_equal(df, mdf.execute().fetch())
    pd.testing.assert_frame_equal(df.head(5), mdf.head(5).execute().fetch())
    pd.testing.assert_frame_equal(df.head(15), mdf.head(15).execute().fetch())