Example #1
def test_metrics_union(ray_start_regular_shared):
    it1 = from_items([1, 2, 3, 4], num_shards=1)
    it2 = from_items([1, 2, 3, 4], num_shards=1)

    def foo_metrics(x):
        metrics = LocalIterator.get_metrics()
        metrics.counters["foo"] += x
        return metrics.counters["foo"]

    def bar_metrics(x):
        metrics = LocalIterator.get_metrics()
        metrics.counters["bar"] += 100
        return metrics.counters["bar"]

    def verify_metrics(x):
        metrics = LocalIterator.get_metrics()
        metrics.counters["n"] += 1
        # Check the metrics context is shared.
        if metrics.counters["n"] >= 2:
            assert "foo" in metrics.counters
            assert "bar" in metrics.counters
        return x

    it1 = it1.gather_async().for_each(foo_metrics)
    it2 = it2.gather_async().for_each(bar_metrics)
    it3 = it1.union(it2, deterministic=True)
    it3 = it3.for_each(verify_metrics)
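    # With deterministic=True, the union alternates between it1 and it2:
    # it1 yields running sums (1, 3, 6, 10) and it2 yields multiples of 100.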
    assert it3.take(10) == [1, 100, 3, 200, 6, 300, 10, 400]
Example #2
def test_union(ray_start_regular_shared):
    it1 = from_items(["a", "b", "c"], 1)
    it2 = from_items(["x", "y", "z"], 1)
    it = it1.union(it2)
    assert (repr(it) == "ParallelIterator[ParallelUnion[ParallelIterator["
            "from_items[str, 3, shards=1]], ParallelIterator["
            "from_items[str, 3, shards=1]]]]")
    assert list(it.gather_sync()) == ["a", "x", "b", "y", "c", "z"]
Example #3
def test_metrics(ray_start_regular_shared):
    it = from_items([1, 2, 3, 4], num_shards=1)
    it2 = from_items([1, 2, 3, 4], num_shards=1)

    def f(x):
        metrics = LocalIterator.get_metrics()
        metrics.counters["foo"] += x
        return metrics.counters["foo"]

    it = it.gather_sync().for_each(f)
    it2 = it2.gather_sync().for_each(f)

    # Tests iterators have isolated contexts.
    assert it.take(4) == [1, 3, 6, 10]
    assert it2.take(4) == [1, 3, 6, 10]
Example #4
def test_local_shuffle(ray_start_regular_shared):
    # confirm that no data disappears, and they all stay within the same shard
    it = from_range(8, num_shards=2).local_shuffle(shuffle_buffer_size=2)
    assert repr(it) == ("ParallelIterator[from_range[8, shards=2]" +
                        ".local_shuffle(shuffle_buffer_size=2, seed=None)]")
    shard_0 = it.get_shard(0)
    shard_1 = it.get_shard(1)
    assert set(shard_0) == {0, 1, 2, 3}
    assert set(shard_1) == {4, 5, 6, 7}

    # check that shuffling results in different orders
    it1 = from_range(100, num_shards=10).local_shuffle(shuffle_buffer_size=5)
    it2 = from_range(100, num_shards=10).local_shuffle(shuffle_buffer_size=5)
    assert list(it1.gather_sync()) != list(it2.gather_sync())

    # buffer size of 1 should not result in any shuffling
    it3 = from_range(10, num_shards=1).local_shuffle(shuffle_buffer_size=1)
    assert list(it3.gather_sync()) == list(range(10))

    # statistical test
    it4 = from_items([0, 1] * 10000,
                     num_shards=1).local_shuffle(shuffle_buffer_size=100)
    result = "".join(it4.gather_sync().for_each(str))
    freq_counter = Counter(zip(result[:-1], result[1:]))
    assert len(freq_counter) == 4
    for key, value in freq_counter.items():
        assert value / len(result) > 0.2  # each of the 4 bigrams should cover roughly a quarter of positions
Example #5
def main():
    num_points = 32 * 100 * 2
    data = [i * (1 / num_points) for i in range(num_points)]
    it = parallel_it.from_items(data, 2, False).for_each(lambda x: [x, x])
    # This creates an MLDataset with columns RangeIndex(range(2)).
    ds = ml_data.from_parallel_iter(it, True, batch_size=32, repeated=False)
    tf_ds = ds.to_tf(feature_columns=[0], label_column=1)

    trainer = TFTrainer(
        model_creator=model_creator,
        data_creator=make_data_creator(tf_ds),
        num_replicas=2,
        config={
            "batch_size": 32,
            "fit_config": {
                "steps_per_epoch": 100,
            },
        },
    )

    for _ in range(10):
        trainer.train()

    model = trainer.get_model()
    print("f(0.5)=", float(model.predict([0.5])))
Example #6
def test_for_each_concur_sync(ray_start_regular_shared):
    main_wait = Semaphore.remote(value=0)
    test_wait = Semaphore.remote(value=0)

    def task(x):
        i, main_wait, test_wait = x
        ray.get(main_wait.release.remote())
        ray.get(test_wait.acquire.remote())
        return i + 10

    @ray.remote(num_cpus=0.01)
    def to_list(it):
        return list(it)

    it = from_items([(i, main_wait, test_wait) for i in range(8)],
                    num_shards=2)
    it = it.for_each(task, max_concurrency=2, resources={"num_cpus": 0.01})

    list_promise = to_list.remote(it.gather_sync())

    for i in range(4):
        assert i in [0, 1, 2, 3]
        ray.get(main_wait.acquire.remote())

    # There should be exactly 4 tasks executing at this point.
    assert ray.get(main_wait.locked.remote()) is True, "Too much parallelism"

    for i in range(8):
        ray.get(test_wait.release.remote())

    assert repr(
        it) == "ParallelIterator[from_items[tuple, 8, shards=2].for_each()]"
    result_list = ray.get(list_promise)
    assert set(result_list) == set(range(10, 18))
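Both this test and Example #16 below rely on a Semaphore actor that is not defined in the excerpt; it comes from Ray's test utilities. A minimal sketch of such an actor, an assumption rather than the exact Ray implementation, might look like this:

import asyncio
import ray

@ray.remote
class Semaphore:
    def __init__(self, value=1):
        # Wrap an asyncio.Semaphore so acquire/release can be awaited inside the actor.
        self._sema = asyncio.Semaphore(value=value)

    async def acquire(self):
        await self._sema.acquire()

    async def release(self):
        self._sema.release()

    async def locked(self):
        # True if the semaphore cannot be acquired immediately.
        return self._sema.locked()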
Example #7
def test_tf_dataset(ray_start_4_cpus):  # noqa: F811
    num_points = 32 * 100 * 2
    data = [i * (1 / num_points) for i in range(num_points)]
    it = parallel_it.from_items(data, 2, False).for_each(lambda x: [x, x])
    # This creates an MLDataset with columns RangeIndex(range(2)).
    ds = ml_data.from_parallel_iter(it, True, batch_size=32, repeated=False)
    tf_ds = ds.to_tf(feature_columns=[0], label_column=1)
    trainer = TFTrainer(
        model_creator=model_creator,
        data_creator=make_data_creator(tf_ds),
        num_replicas=2,
        config={
            "batch_size": 32,
            "fit_config": {
                "steps_per_epoch": 100,
            },
        },
    )

    for _ in range(10):
        trainer.train()

    model = trainer.get_model()
    prediction = model.predict([0.5])[0][0]
    assert 0.4 <= prediction <= 0.6
    trainer.shutdown()
Example #8
def test_zip_with_source_actor(ray_start_regular_shared):
    it = from_items([1, 2, 3, 4], num_shards=2)
    counts = collections.defaultdict(int)
    for actor, value in it.gather_async().zip_with_source_actor():
        counts[actor] += 1
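    # 4 items over 2 shards -> 2 source actors, each contributing 2 items.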
    assert len(counts) == 2
    for a, count in counts.items():
        assert count == 2
Example #9
def test_serialization(ray_start_regular_shared):
    it = (from_items([1, 2, 3, 4]).gather_sync().for_each(lambda x: x).filter(
        lambda x: True).batch(2).flatten())
    assert (repr(it) == "LocalIterator[ParallelIterator["
            "from_items[int, 4, shards=2]].gather_sync()."
            "for_each().filter().batch(2).flatten()]")

    @ray.remote
    def get(it):
        return list(it)

    assert ray.get(get.remote(it)) == [1, 2, 3, 4]
Example #10
def test_metrics_union_recursive(ray_start_regular_shared):
    it1 = from_items([1, 2, 3, 4], num_shards=1)
    it2 = from_items([1, 2, 3, 4], num_shards=1)
    it3 = from_items([1, 2, 3, 4], num_shards=1)

    def foo_metrics(x):
        metrics = LocalIterator.get_metrics()
        metrics.counters["foo"] += 1
        return metrics.counters["foo"]

    def bar_metrics(x):
        metrics = LocalIterator.get_metrics()
        metrics.counters["bar"] += 1
        return metrics.counters["bar"]

    def baz_metrics(x):
        metrics = LocalIterator.get_metrics()
        metrics.counters["baz"] += 1
        return metrics.counters["baz"]

    def verify_metrics(x):
        metrics = LocalIterator.get_metrics()
        metrics.counters["n"] += 1
        # Check the metrics context is shared recursively.
        print(metrics.counters)
        if metrics.counters["n"] >= 3:
            assert "foo" in metrics.counters
            assert "bar" in metrics.counters
            assert "baz" in metrics.counters
        return x

    it1 = it1.gather_async().for_each(foo_metrics)
    it2 = it2.gather_async().for_each(bar_metrics)
    it3 = it3.gather_async().for_each(baz_metrics)
    it12 = it1.union(it2, deterministic=True)
    it123 = it12.union(it3, deterministic=True)
    out = it123.for_each(verify_metrics)
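    # it123 alternates between it12 and it3, and it12 itself alternates between
    # it1 and it2, which produces the interleaving checked below.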
    assert out.take(20) == [1, 1, 1, 2, 2, 3, 2, 4, 3, 3, 4, 4]
Example #11
def test_metrics(ray_start_regular_shared):
    it = from_items([1, 2, 3, 4], num_shards=1)
    it2 = from_items([1, 2, 3, 4], num_shards=1)

    def f(x):
        metrics = LocalIterator.get_metrics()
        metrics.counters["foo"] += x
        return metrics.counters["foo"]

    it = it.gather_sync().for_each(f)
    it2 = it2.gather_sync().for_each(f)

    # Context cannot be accessed outside the iterator.
    with pytest.raises(ValueError):
        LocalIterator.get_metrics()

    # Tests iterators have isolated contexts.
    assert it.take(4) == [1, 3, 6, 10]
    assert it2.take(4) == [1, 3, 6, 10]

    # Context cannot be accessed outside the iterator.
    with pytest.raises(ValueError):
        LocalIterator.get_metrics()
Example #12
def main():
    num_points = 32 * 100 * 2
    data = [i * (1 / num_points) for i in range(num_points)]
    it = parallel_it.from_items(data, 2, False).for_each(lambda x: [x, x])
    # This creates an MLDataset with columns RangeIndex(range(2)).
    ds = ml_data.from_parallel_iter(it, True, batch_size=32, repeated=False)
    torch_ds = ds.to_torch(feature_columns=[0], label_column=1)

    trainer = TorchTrainer(
        num_workers=2,
        training_operator_cls=make_train_operator(torch_ds),
        add_dist_sampler=False,
        config={"batch_size": 32})
    for i in range(10):
        trainer.train(num_steps=100)
        model = trainer.get_model()
        print("f(0.5)=", float(model(torch.tensor([[0.5]]).float())[0][0]))
Example #13
    def from_modin(cls, df, num_shards: int = 2):
        """Create a MLDataset from a Modin Dataframe.

        Args:
            df (modin.pandas.DataFrame): A Modin Dataframe.
            num_shards (int): The number of worker actors to create.
        """
        try:
            import modin.pandas as pd
        except ImportError:
            raise ImportError("Cannot convert from Modin because "
                              "Modin is not installed.") from None
        if not isinstance(df, (pd.DataFrame, pd.Series)):
            raise ValueError("Must provide a modin.pandas DataFrame or Series")
        from modin.distributed.dataframe.pandas.partitions import unwrap_partitions

        parts = unwrap_partitions(df)
        modin_iter = from_items(parts, num_shards=num_shards, repeat=False)
        return cls.from_parallel_it(modin_iter, batch_size=0, repeated=False)
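For reference, a minimal usage sketch of this classmethod. The import path for MLDataset is an assumption based on the surrounding examples, and it requires Modin to be installed and Ray to be initialized:

import ray
import modin.pandas as pd
from ray.util.data import MLDataset  # assumed import path for the class defining from_modin

ray.init()
# A small Modin DataFrame whose partitions are spread across 2 shard actors.
df = pd.DataFrame({"feature": range(8), "label": range(8)})
ds = MLDataset.from_modin(df, num_shards=2)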
Example #14
def test_torch_dataset(ray_start_4_cpus, use_local):
    num_points = 32 * 100 * 2
    data = [i * (1 / num_points) for i in range(num_points)]
    para_it = parallel_it.from_items(data, 2, False).for_each(lambda x: [x, x])
    ds = ml_data.from_parallel_iter(para_it, batch_size=32)

    torch_ds = ds.to_torch(feature_columns=[0], label_column=1)
    operator = make_train_operator(torch_ds)
    trainer = TorchTrainer(training_operator_cls=operator,
                           num_workers=2,
                           use_local=use_local,
                           add_dist_sampler=False,
                           config={"batch_size": 32})
    for i in range(10):
        trainer.train(num_steps=100)

    model = trainer.get_model()
    prediction = float(model(torch.tensor([[0.5]]).float())[0][0])
    assert 0.4 <= prediction <= 0.6
    trainer.shutdown()
Example #15
def process_data(data_set_type: str, parallel=True):
    files = recurse_files(path.join("./data/webnlg", "raw", data_set_type))
    xml_objs = [parse_xml_file(f) for f in files]

    entries = []
    if not parallel:
        print(f"[Info] Processing data...")

        chunks = [RDFFileReader(x).data for x in xml_objs]
        entries = flatten_list(chunks)

    else:
        num_shards: int = num_cpus if parallel else 1
        print(f"[Info] Processing data in {num_shards} shards...")

        iterator = (
            pariter.from_items(xml_objs[0:5], num_shards=num_shards)
            # .for_each(lambda f: cleaner.clean)
            .for_each(lambda xmldata: RDFFileReader(xmldata).data).flatten())

        entries = iterator.gather_async()

    return tqdm(entries, desc="WebNLG", unit="entry")
Example #16
def test_for_each_concur(ray_start_regular_shared):
    main_wait = Semaphore.remote(value=0)
    test_wait = Semaphore.remote(value=0)

    def task(x):
        i, main_wait, test_wait = x
        ray.get(main_wait.release.remote())
        ray.get(test_wait.acquire.remote())
        return i + 10

    @ray.remote(num_cpus=0.1)
    def to_list(it):
        return list(it)

    it = from_items([(i, main_wait, test_wait) for i in range(8)],
                    num_shards=2)
    it = it.for_each(task, max_concurrency=2, resources={"num_cpus": 0.1})

    for i in range(4):
        ray.get(main_wait.acquire.remote())

    # There should be exactly 4 tasks executing at this point.
    assert ray.get(main_wait.locked.remote()) is True, "Too much parallelism"

    # When we finish one task, exactly one more should start.
    ray.get(test_wait.release.remote())
    ray.get(main_wait.acquire.remote())
    assert ray.get(main_wait.locked.remote()) is True, "Too much parallelism"

    # Finish everything and make sure the output matches a regular iterator.
    for i in range(3):
        ray.get(test_wait.release.remote())

    assert repr(
        it) == "ParallelIterator[from_items[tuple, 8, shards=2].for_each()]"
    assert ray.get(to_list.remote(it.gather_sync())) == list(range(10, 18))
Example #17
def test_from_items_repeat(ray_start_regular_shared):
    it = from_items([1, 2, 3, 4], repeat=True)
    assert repr(
        it) == "ParallelIterator[from_items[int, 4, shards=2, repeat=True]]"
    assert it.take(8) == [1, 2, 3, 4, 1, 2, 3, 4]
Example #18
def test_select_shards(ray_start_regular_shared):
    it = from_items([1, 2, 3, 4], num_shards=4)
    it1 = it.select_shards([0, 2])
    it2 = it.select_shards([1, 3])
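    # select_shards restricts the iterator to the given shard indices; with 4 items
    # over 4 shards, each shard holds exactly one item.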
    assert it1.take(4) == [1, 3]
    assert it2.take(4) == [2, 4]
Example #19
def test_union_local(ray_start_regular_shared):
    it1 = from_items(["a", "b", "c"], 1).gather_async()
    it2 = from_range(5, 2).for_each(str).gather_async()
    it = it1.union(it2)
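    # Async gathering makes the merge order nondeterministic, hence the sorted comparison.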
    assert sorted(it) == ["0", "1", "2", "3", "4", "a", "b", "c"]
Example #20
def test_from_items(ray_start_regular_shared):
    it = from_items([1, 2, 3, 4])
    assert repr(it) == "ParallelIterator[from_items[int, 4, shards=2]]"
    assert list(it.gather_sync()) == [1, 2, 3, 4]
    assert next(it.gather_sync()) == 1
Example #21
def test_flatten(ray_start_regular_shared):
    it = from_items([[1, 2], [3, 4]], 1).flatten()
    assert repr(
        it) == "ParallelIterator[from_items[list, 2, shards=1].flatten()]"
    assert list(it.gather_sync()) == [1, 2, 3, 4]