Example #1
def test_local_shuffle(ray_start_regular_shared):
    para_it = parallel_it.from_range(100).for_each(lambda x: [x])

    # batch_size larger than 1 and shuffle_buffer_size larger than 1
    ds = ml_data.from_parallel_iter(para_it, batch_size=10)
    ds1 = ds.local_shuffle(shuffle_buffer_size=5)
    ds2 = ds.local_shuffle(shuffle_buffer_size=5)

    l1 = list(ds1.gather_sync())
    l2 = list(ds2.gather_sync())
    assert not all(df1.equals(df2) for df1, df2 in zip(l1, l2))

    # batch_size equals 1 and shuffle_buffer_size larger than 1
    ds = ml_data.from_parallel_iter(para_it, batch_size=1)
    ds1 = ds.local_shuffle(shuffle_buffer_size=5)
    ds2 = ds.local_shuffle(shuffle_buffer_size=5)

    l1 = list(ds1.gather_sync())
    l2 = list(ds2.gather_sync())
    assert not all(df1.equals(df2) for df1, df2 in zip(l1, l2))

    # batch_size equals 1 and shuffle_buffer_size equals 1
    ds = ml_data.from_parallel_iter(para_it, batch_size=1)
    ds1 = ds.local_shuffle(shuffle_buffer_size=1)
    ds2 = ds.local_shuffle(shuffle_buffer_size=1)

    l1 = list(ds1.gather_sync())
    l2 = list(ds2.gather_sync())
    assert all(df1.equals(df2) for df1, df2 in zip(l1, l2))
Example #2
def _create_ml_dataset(name: str,
                       record_pieces: List[RecordPiece],
                       record_sizes: List[int],
                       num_shards: int,
                       shuffle: bool,
                       shuffle_seed: int,
                       RecordBatchCls,
                       node_hints: List[str] = None) -> MLDataset:
    if node_hints is not None:
        assert num_shards % len(node_hints) == 0,\
            f"num_shards: {num_shards} should be a multiple of length of node_hints: {node_hints}"
    if shuffle_seed:
        np.random.seed(shuffle_seed)
    else:
        np.random.seed(0)

    # split the record pieces into num_shards partitions
    divided_blocks = divide_blocks(blocks=record_sizes,
                                   world_size=num_shards,
                                   shuffle=shuffle,
                                   shuffle_seed=shuffle_seed)
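    # divided_blocks maps each shard rank to its list of (block index, num_samples) pairs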

    record_batches = []

    for rank, blocks in divided_blocks.items():
        pieces = []
        for index, num_samples in blocks:
            record_size = record_sizes[index]
            piece = record_pieces[index]
            if num_samples != record_size:
                assert num_samples < record_size
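                # randomly re-sample the piece down to the num_samples rows assigned to this shard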
                new_row_ids = np.random.choice(
                    record_size, size=num_samples).tolist()
                piece = piece.with_row_ids(new_row_ids)
            pieces.append(piece)

        if shuffle:
            np.random.shuffle(pieces)
        record_batches.append(RecordBatchCls(shard_id=rank,
                                             prefix=name,
                                             record_pieces=pieces,
                                             shuffle=shuffle,
                                             shuffle_seed=shuffle_seed))

    worker_cls = ray.remote(ParallelIteratorWorkerWithLen)
    if node_hints is not None:
        actors = []
        multiplier = num_shards // len(node_hints)
        resource_keys = [
            f"node:{node_hints[i // multiplier]}" for i in range(num_shards)
        ]
        for g, resource_key in zip(record_batches, resource_keys):
            # pin each shard actor to its hinted node via the node:<address> resource
            actor = worker_cls.options(
                resources={resource_key: 0.01}).remote(g, False, len(g))
            actors.append(actor)
    else:
        actors = [worker_cls.remote(g, False, len(g)) for g in record_batches]

    it = parallel_it.from_actors(actors, name)
    ds = ml_dataset.from_parallel_iter(
        it, need_convert=False, batch_size=0, repeated=False)
    return ds
Example #3
def test_tf_dataset(ray_start_4_cpus):  # noqa: F811
    num_points = 32 * 100 * 2
    data = [i * (1 / num_points) for i in range(num_points)]
    it = parallel_it.from_items(data, 2, False).for_each(lambda x: [x, x])
    # this will create MLDataset with column RangeIndex(range(2))
    ds = ml_data.from_parallel_iter(it, True, batch_size=32, repeated=False)
    tf_ds = ds.to_tf(feature_columns=[0], label_column=1)
    trainer = TFTrainer(
        model_creator=model_creator,
        data_creator=make_data_creator(tf_ds),
        num_replicas=2,
        config={
            "batch_size": 32,
            "fit_config": {
                "steps_per_epoch": 100,
            },
        },
    )

    for _ in range(10):
        trainer.train()

    model = trainer.get_model()
    prediction = model.predict([0.5])[0][0]
    assert 0.4 <= prediction <= 0.6
    trainer.shutdown()
Example #4
def main():
    num_points = 32 * 100 * 2
    data = [i * (1 / num_points) for i in range(num_points)]
    it = parallel_it.from_items(data, 2, False).for_each(lambda x: [x, x])
    # this will create MLDataset with column RangeIndex(range(2))
    ds = ml_data.from_parallel_iter(it, True, batch_size=32, repeated=False)
    tf_ds = ds.to_tf(feature_columns=[0], label_column=1)

    trainer = TFTrainer(
        model_creator=model_creator,
        data_creator=make_data_creator(tf_ds),
        num_replicas=2,
        config={
            "batch_size": 32,
            "fit_config": {
                "steps_per_epoch": 100,
            },
        },
    )

    for _ in range(10):
        trainer.train()

    model = trainer.get_model()
    print("f(0.5)=", float(model.predict([0.5])))
Example #5
def test_read_ray_mldataset(ray_start_regular):
    test_df1 = pd.DataFrame(
        {
            "a": np.arange(10).astype(np.int64, copy=False),
            "b": [f"s{i}" for i in range(10)],
        }
    )
    test_df2 = pd.DataFrame(
        {
            "a": np.arange(10).astype(np.int64, copy=False),
            "b": [f"s{i}" for i in range(10)],
        }
    )
    df = pd.concat([test_df1, test_df2])
    import ray.util.iter
    from ray.util.data import from_parallel_iter

    ml_dataset = from_parallel_iter(
        ray.util.iter.from_items([test_df1, test_df2], num_shards=2), need_convert=False
    )
    mdf = read_ray_mldataset(ml_dataset)

    assert mdf.shape[1] == 2
    pd.testing.assert_index_equal(df.columns, mdf.columns_value.to_pandas())
    pd.testing.assert_series_equal(df.dtypes, mdf.dtypes)

    mdf = tile(mdf)
    assert len(mdf.chunks) == 2
    for chunk in mdf.chunks:
        assert isinstance(chunk.op, DataFrameReadMLDataset)
Example #6
def test_from_parallel_it(ray_start_regular_shared):
    para_it = parallel_it.from_range(4).for_each(lambda x: [x])
    ds = ml_data.from_parallel_iter(para_it, batch_size=2)
    assert repr(ds) == ("MLDataset[from_range[4, shards=2]"
                        ".for_each().batch(2).to_pandas()]")
    collected = list(ds.gather_sync())
    assert len(collected) == 2
    assert all(d.shape == (2, 1) for d in collected)
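    # the batched pandas output should match batching the raw iterator directly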
    expected = para_it.flatten().batch(2).gather_sync().flatten()
    flattened = ds.gather_sync().for_each(lambda x: x[0].to_list()).flatten()
    assert list(flattened) == list(expected)
Example #7
def test_union(ray_start_regular_shared):
    para_it1 = parallel_it.from_range(4, 2, False).for_each(lambda x: [x])
    ds1 = ml_data.from_parallel_iter(para_it1, True, 2, False)
    para_it2 = parallel_it.from_range(4, 2, True).for_each(lambda x: [x])
    ds2 = ml_data.from_parallel_iter(para_it2, True, 2, True)

    with pytest.raises(TypeError) as ex:
        ds1.union(ds2)
    assert "two MLDataset which have different repeated type" in str(ex.value)

    # union two MLDataset with same batch size
    para_it2 = parallel_it.from_range(4, 2, False).for_each(lambda x: [x])
    ds2 = ml_data.from_parallel_iter(para_it2, True, 2, False)
    ds = ds1.union(ds2)
    assert ds.batch_size == 2

    # union two MLDataset with different batch size
    para_it2 = parallel_it.from_range(4, 2, False).for_each(lambda x: [x])
    ds2 = ml_data.from_parallel_iter(para_it2, True, 1, False)
    ds = ds1.union(ds2)
    # batch_size 0 means batch_size unknown
    assert ds.batch_size == 0
Example #8
def main():
    num_points = 32 * 100 * 2
    data = [i * (1 / num_points) for i in range(num_points)]
    it = parallel_it.from_items(data, 2, False).for_each(lambda x: [x, x])
    # this will create MLDataset with column RangeIndex(range(2))
    ds = ml_data.from_parallel_iter(it, True, batch_size=32, repeated=False)
    torch_ds = ds.to_torch(feature_columns=[0], label_column=1)

    trainer = TorchTrainer(
        num_workers=2,
        training_operator_cls=make_train_operator(torch_ds),
        add_dist_sampler=False,
        config={"batch_size": 32})
    for i in range(10):
        trainer.train(num_steps=100)
        model = trainer.get_model()
        print("f(0.5)=", float(model(torch.tensor([[0.5]]).float())[0][0]))
Example #9
def test_torch_dataset(ray_start_4_cpus, use_local):
    num_points = 32 * 100 * 2
    data = [i * (1 / num_points) for i in range(num_points)]
    para_it = parallel_it.from_items(data, 2, False).for_each(lambda x: [x, x])
    ds = ml_data.from_parallel_iter(para_it, batch_size=32)

    torch_ds = ds.to_torch(feature_columns=[0], label_column=1)
    operator = make_train_operator(torch_ds)
    trainer = TorchTrainer(training_operator_cls=operator,
                           num_workers=2,
                           use_local=use_local,
                           add_dist_sampler=False,
                           config={"batch_size": 32})
    for i in range(10):
        trainer.train(num_steps=100)

    model = trainer.get_model()
    prediction = float(model(torch.tensor([[0.5]]).float())[0][0])
    assert 0.4 <= prediction <= 0.6
    trainer.shutdown()
Example #10
def create_ml_dataset_from_spark(
        df: sql.DataFrame,
        num_shards: int,
        batch_size: int,
        fs_directory: Optional[str] = None,
        compression: Optional[str] = None) -> MLDataset:
    """ Create a MLDataset from Spark DataFrame

    This method will create a MLDataset from Spark DataFrame.

    :param df: the pyspark.sql.DataFrame
    :param num_shards: the number of shards will be created for the MLDataset
    :param batch_size: the batch size for the MLDataset
    :param fs_directory: an optional distributed file system directory for cache the
           DataFrame. We will write the DataFrame to the given directory with parquet
           format if this is provided. Otherwise, we will write the DataFrame to ray
           object store.
    :param compression: the optional compression for write the DataFrame as parquet
           file. This is only useful when the fs_directory set.
    :return: a MLDataset
    """
    df = df.repartition(num_shards)
    if fs_directory is None:
        # fs_directory was not provided, so save the Spark DataFrame to the Ray object store
        record_batch_set = _save_spark_df_to_object_store(df, num_shards)
        # TODO: we should specify the resource spec for each shard
        it = parallel_it.from_iterators(generators=record_batch_set,
                                        name="Spark DataFrame",
                                        repeat=False)
        ds = ml_dataset.from_parallel_iter(it,
                                           need_convert=False,
                                           batch_size=batch_size,
                                           repeated=False)
        return ds
    else:
        # fs_directory was provided, so write the Spark DataFrame as Parquet files
        df.write.parquet(fs_directory, compression=compression)
        # create the MLDataset from the Parquet files
        ds = ml_dataset.read_parquet(fs_directory, num_shards)
        return ds
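
For context, a minimal hypothetical call sketch for create_ml_dataset_from_spark. It assumes ray.init() has already been called and that spark_df is an existing pyspark.sql.DataFrame whose Spark session can reach the Ray cluster; the HDFS path and compression codec below are illustrative values only, not taken from the examples above.

# spark_df: an existing pyspark.sql.DataFrame (assumed, see note above)

# cache the shards in the Ray object store (no fs_directory given)
ds = create_ml_dataset_from_spark(spark_df, num_shards=2, batch_size=32)

# alternatively, cache the shards on a shared file system as Parquet files
# ds = create_ml_dataset_from_spark(spark_df, num_shards=2, batch_size=32,
#                                   fs_directory="hdfs:///tmp/mldataset_cache",
#                                   compression="snappy")

# iterate the resulting shards; each batch is expected to be a pandas DataFrame
for shard in ds.shards():
    for batch in shard:
        print(batch.shape)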
Example #11
def test_read_ray_mldataset(ray_start_regular, ray_create_mars_cluster):
    test_dfs = [
        pd.DataFrame({
            "a":
            np.arange(i * 10, (i + 1) * 10).astype(np.int64, copy=False),
            "b": [f"s{j}" for j in range(i * 10, (i + 1) * 10)],
        }) for i in range(5)
    ]
    import ray.util.iter
    from ray.util.data import from_parallel_iter

    ml_dataset = from_parallel_iter(ray.util.iter.from_items(test_dfs,
                                                             num_shards=4),
                                    need_convert=False)
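    # gather all shards locally to build the expected pandas DataFrame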
    dfs = []
    for shard in ml_dataset.shards():
        dfs.extend(list(shard))
    df = pd.concat(dfs).reset_index(drop=True)
    mdf = md.read_ray_mldataset(ml_dataset)
    pd.testing.assert_frame_equal(df, mdf.execute().fetch())
    pd.testing.assert_frame_equal(df.head(5), mdf.head(5).execute().fetch())
    pd.testing.assert_frame_equal(df.head(15), mdf.head(15).execute().fetch())