Example #1
def test_reduce_result(ray_start_2_cpus, use_local):
    if not dist.is_available():
        return

    def data_creator(config):
        """Returns training dataloader, validation dataloader."""
        train_dataset = LinearDataset(2, 5, size=config["data_size"])
        test_dataset = LinearDataset(2, 5, size=config["data_size"])
        return (DataLoader(train_dataset, batch_size=1),
                DataLoader(test_dataset, batch_size=1))

    data_size = 600

    TestOperator = TrainingOperator.from_creators(
        model_creator,
        optimizer_creator,
        data_creator,
        loss_creator=lambda config: nn.MSELoss())
    trainer = TorchTrainer(
        training_operator_cls=TestOperator,
        num_workers=2,
        use_local=use_local,
        config={"data_size": data_size})
    list_stats = trainer.train(reduce_results=False, profile=True)
    assert len(list_stats) == 2
    assert all(stats[NUM_SAMPLES] == data_size for stats in list_stats)
    assert all(stats[BATCH_COUNT] == (data_size // 2) for stats in list_stats)
    list_stats = trainer.validate(reduce_results=False, profile=True)
    assert len(list_stats) == 2
    assert all(stats[NUM_SAMPLES] == data_size for stats in list_stats)
    assert all(stats[BATCH_COUNT] == (data_size // 2) for stats in list_stats)
    trainer.shutdown()
Example #2
def test_fail_with_recover(ray_start_2_cpus, use_local):  # noqa: F811
    if not dist.is_available():
        return

    def single_loader(config):
        dataset = LinearDataset(2, 5, size=1000000)
        return DataLoader(dataset, batch_size=config.get("batch_size", 32))

    TestOperator = TrainingOperator.from_creators(
        model_creator,
        optimizer_creator,
        single_loader,
        loss_creator=lambda config: nn.MSELoss())

    start_with_fail = gen_start_with_fail(3)

    with patch.object(TorchTrainer, "_start_workers", start_with_fail):
        trainer1 = TorchTrainer(training_operator_cls=TestOperator,
                                config={"batch_size": 100000},
                                timeout_s=5,
                                use_local=use_local,
                                num_workers=2)

        with pytest.raises(RuntimeError):
            trainer1.train(max_retries=1)

        trainer1.shutdown(force=True)
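Note: gen_start_with_fail is defined elsewhere in the test module and is not shown here. Judging from the assertions in these examples (construction succeeds, and train() either retries past the failures or raises once max_retries is exhausted), it builds a replacement for TorchTrainer._start_workers that injects a bounded number of simulated failures. The following is a self-contained toy illustrating the underlying "fail N times, then delegate" patching pattern; it is a hypothetical sketch, not Ray's actual helper (gen_start_with_startup_fail, used in a later example, presumably varies it by failing during startup itself):

from unittest.mock import patch

class Service:
    def start(self):
        return "started"

def gen_start_with_fail(num_fails):
    # Build a replacement for Service.start that raises on its first
    # `num_fails` calls and then delegates to the original method.
    original = Service.start
    remaining = {"count": num_fails}

    def start_with_fail(self, *args, **kwargs):
        if remaining["count"] > 0:
            remaining["count"] -= 1
            raise RuntimeError("simulated failure")
        return original(self, *args, **kwargs)

    return start_with_fail

with patch.object(Service, "start", gen_start_with_fail(2)):
    svc = Service()
    for attempt in range(3):
        try:
            print(svc.start())  # raises twice, returns "started" the third time
        except RuntimeError as exc:
            print(f"attempt {attempt}: {exc}")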
Example #3
def test_split_batch(ray_start_2_cpus, use_local):
    if not dist.is_available():
        return

    def data_creator(config):
        """Returns training dataloader, validation dataloader."""
        train_dataset = LinearDataset(2, 5, size=config["data_size"])
        return DataLoader(
            train_dataset,
            batch_size=config[BATCH_SIZE],
        )

    data_size = 600
    batch_size = 21
    TestOperator = TrainingOperator.from_creators(
        model_creator,
        optimizer_creator,
        data_creator,
        loss_creator=lambda config: nn.MSELoss())
    trainer = TorchTrainer(
        training_operator_cls=TestOperator,
        num_workers=2,
        use_local=use_local,
        config={
            BATCH_SIZE: batch_size,
            "data_size": data_size,
        })
    stats = trainer.train()
    # An odd global batch of 21 cannot be split evenly across 2 workers, so
    # the trainer rounds down and rewrites the config to the effective
    # global batch size of 20.
    assert trainer.config[BATCH_SIZE] == (batch_size - 1)
    assert stats[NUM_SAMPLES] == data_size
    assert stats[BATCH_COUNT] == (data_size // 20)
    trainer.shutdown()
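Worked arithmetic behind the assertions above, assuming the trainer divides the global batch evenly across workers (which the rewritten config value reflects):

num_workers = 2
batch_size = 21
data_size = 600

per_worker = batch_size // num_workers       # 10; the odd sample is dropped
effective_global = per_worker * num_workers  # 20
assert effective_global == batch_size - 1
assert data_size // effective_global == 30   # the expected BATCH_COUNT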
Example #4
def test_resize(ray_start_2_cpus, use_local):  # noqa: F811
    if not dist.is_available():
        return

    def single_loader(config):
        dataset = LinearDataset(2, 5, size=1000000)
        return DataLoader(dataset, batch_size=config.get("batch_size", 32))

    start_with_fail = gen_start_with_fail(1)

    TestOperator = TrainingOperator.from_creators(
        model_creator,
        optimizer_creator,
        single_loader,
        loss_creator=lambda config: nn.MSELoss())
    with patch.object(TorchTrainer, "_start_workers", start_with_fail):
        trainer1 = TorchTrainer(training_operator_cls=TestOperator,
                                config={"batch_size": 100000},
                                use_local=use_local,
                                num_workers=2)

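        # Occupy one of the two CPUs with a long-running task so that the
        # trainer can only restart with a single worker after the failure.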
        @ray.remote
        def try_test():
            import time
            time.sleep(100)

        try_test.remote()
        trainer1.train(max_retries=1)
        assert trainer1.worker_group.num_workers == 1

        trainer1.shutdown(force=True)
Example #5
def test_dataset(ray_start_4_cpus, use_local):
    """
    This test tries training the mlp_identity example. We check the accuracy of
    the model as an all inclusive way of ensuring that we are properly sharding
    and iterating over the entire dataset (instead of repeating the first set
    of points for example).
    """

    model_creator = mlp_identity.model_creator
    optimizer_creator = mlp_identity.optimizer_creator
    dataset_creator = mlp_identity.dataset_creator

    DatasetOperator = TrainingOperator.from_creators(
        model_creator=model_creator,
        optimizer_creator=optimizer_creator,
        loss_creator=nn.MSELoss)

    trainer = TorchTrainer(
        training_operator_cls=DatasetOperator,
        use_local=use_local,
        num_workers=2,
    )

    dataset = dataset_creator()
    for _ in range(5):
        trainer.train(dataset=dataset, num_steps=100)

    x = mlp_identity.to_mat(0.5)
    prediction = float(trainer.get_model()(x)[0][0])
    assert 0.4 <= prediction <= 0.6
    trainer.shutdown()
Example #6
def test_failure_during_resize(ray_start_2_cpus):  # noqa: F811
    """Tests if training succeeds even with failures during worker resizing."""
    if not dist.is_available():
        return

    def single_loader(config):
        dataset = LinearDataset(2, 5, size=1000000)
        return DataLoader(dataset, batch_size=config.get("batch_size", 32))

    TestOperator = TrainingOperator.from_creators(
        model_creator,
        optimizer_creator,
        single_loader,
        loss_creator=lambda config: nn.MSELoss())

    start_with_fail = gen_start_with_startup_fail(1)
    with patch.object(TorchTrainer, "_start_workers", start_with_fail):
        trainer1 = TorchTrainer(training_operator_cls=TestOperator,
                                config={"batch_size": 100000},
                                timeout_s=5,
                                use_local=False,
                                num_workers=2)
        trainer1.train()

    trainer1.shutdown()
Example #7
def test_fail_twice(ray_start_2_cpus, use_local):  # noqa: F811
    if not dist.is_available():
        return

    def single_loader(config):
        dataset = LinearDataset(2, 5, size=1000000)
        return DataLoader(dataset, batch_size=config.get("batch_size", 32))

    TestOperator = TrainingOperator.from_creators(
        model_creator,
        optimizer_creator,
        single_loader,
        loss_creator=lambda config: nn.MSELoss())

    start_with_fail = gen_start_with_fail(2)

    with patch.object(TorchTrainer, "_start_workers", start_with_fail):
        trainer1 = TorchTrainer(training_operator_cls=TestOperator,
                                config={"batch_size": 100000},
                                use_local=use_local,
                                num_workers=2)

        # Retries are enabled by default (no max_retries passed), so
        # train() should recover on its own.
        trainer1.train()
        assert trainer1._num_failures == 2
        assert trainer1.worker_group.num_workers == 2
        trainer1.shutdown(force=True)
Example #8
    def setUp(self):
        self.Operator = TrainingOperator.from_creators(
            model_creator,
            optimizer_creator,
            create_dataloaders,
            loss_creator=loss_creator,
        )
Example #9
def test_fail_state(ray_start_2_cpus):  # noqa: F811
    """Tests if state of training with failure is same as training without."""
    if not dist.is_available():
        return

    torch.manual_seed(0)

    def single_loader(config):
        dataset = LinearDataset(2, 5, size=1000000)
        return DataLoader(dataset, batch_size=config.get("batch_size", 32))

    TestOperator = TrainingOperator.from_creators(
        model_creator,
        optimizer_creator,
        single_loader,
        loss_creator=lambda config: nn.MSELoss(),
    )

    def init_hook():
        torch.manual_seed(0)

    trainer1 = TorchTrainer(
        training_operator_cls=TestOperator,
        config={"batch_size": 100000},
        timeout_s=5,
        initialization_hook=init_hook,
        num_workers=2,
    )
    initial_state = trainer1.state_dict()
    trainer1.train()
    trainer1_state = trainer1.state_dict()
    assert trainer1_state != initial_state
    trainer1.shutdown()

    trainer2 = TorchTrainer(
        training_operator_cls=TestOperator,
        config={"batch_size": 100000},
        timeout_s=5,
        initialization_hook=init_hook,
        num_workers=2,
    )
    trainer2.load_state_dict(initial_state)
    trainer2.train()
    assert trainer2.state_dict() == trainer1_state
    trainer2.shutdown()

    start_with_fail = gen_start_with_fail(1)
    with patch.object(TorchTrainer, "_start_workers", start_with_fail):
        trainer3 = TorchTrainer(
            training_operator_cls=TestOperator,
            config={"batch_size": 100000},
            timeout_s=5,
            initialization_hook=init_hook,
            num_workers=2,
        )
        trainer3.load_state_dict(initial_state)
        trainer3.train()
        assert trainer3.state_dict() == trainer1_state
        trainer3.shutdown()
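For context, a standalone PyTorch check of the idea behind init_hook: parameters are created on the workers, and seeding immediately before creation makes the initial weights identical across runs, which is what makes the three trainers' states comparable.

import torch
import torch.nn as nn

torch.manual_seed(0)
w1 = nn.Linear(1, 1).weight.detach().clone()
torch.manual_seed(0)
w2 = nn.Linear(1, 1).weight.detach().clone()
assert torch.equal(w1, w2)  # same seed, same initialization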
Example #10
    def testNativeLoss(self):
        NativeOperator = TrainingOperator.from_creators(
            model_creator,
            optimizer_creator,
            single_loader,
            loss_creator=nn.MSELoss)
        runner = TorchRunner(training_operator_cls=NativeOperator)
        runner.setup_operator()
        runner.train_epoch()
Example #11
    def testSingleLoader(self):
        SingleOperator = TrainingOperator.from_creators(
            model_creator,
            optimizer_creator,
            single_loader,
            loss_creator=loss_creator)
        runner = TorchRunner(training_operator_cls=SingleOperator)
        runner.setup_operator()
        runner.train_epoch()
        with self.assertRaises(ValueError):
            runner.validate()
Example #12
def test_resize(ray_start_2_cpus, use_local):  # noqa: F811
    if not dist.is_available():
        return

    def single_loader(config):
        dataset = LinearDataset(2, 5, size=1000000)
        return DataLoader(dataset, batch_size=config.get("batch_size", 32))

    start_with_fail = gen_start_with_fail(1)

    TestOperator = TrainingOperator.from_creators(
        model_creator,
        optimizer_creator,
        single_loader,
        loss_creator=lambda config: nn.MSELoss(),
    )
    with patch.object(TorchTrainer, "_start_workers", start_with_fail):
        trainer1 = TorchTrainer(
            training_operator_cls=TestOperator,
            config={"batch_size": 100000},
            use_local=use_local,
            num_workers=2,
        )

        # Occupy the remaining CPU with a placement group so that the
        # trainer can only restart with a single worker after the failure.
        bundle = {"CPU": 1}
        bundles = [bundle]
        dummy_pg = ray.util.placement_group(bundles, strategy="SPREAD")

        trainer1.train(max_retries=1)
        assert trainer1.worker_group.num_workers == 1
        assert trainer1._num_failures == 1

        ray.util.remove_placement_group(dummy_pg)

        def is_placement_group_removed():
            table = ray.util.placement_group_table(dummy_pg)
            if "state" not in table:
                return False
            return table["state"] == "REMOVED"

        # Wait for the placement group's resources to be freed.
        wait_for_condition(is_placement_group_removed)

        # Trigger scale-up back to two workers.
        trainer1.train()
        assert trainer1.worker_group.num_workers == 2

        trainer1.shutdown(force=True)
Example #13
def test_multi_input_model(ray_start_2_cpus, use_local):
    def model_creator(config):
        class MultiInputModel(nn.Module):
            def __init__(self):
                super(MultiInputModel, self).__init__()
                self._fc1 = torch.nn.Linear(1, 1)
                self._fc2 = torch.nn.Linear(1, 1)

            def forward(self, x, y):
                return self._fc1(x) + self._fc2(y)

        return MultiInputModel()

    def data_creator(config):
        class LinearDataset(torch.utils.data.Dataset):
            def __init__(self, a, b, size=1000):
                x = np.random.randn(size)
                y = np.random.randn(size)
                self.x = torch.tensor(x, dtype=torch.float32)
                self.y = torch.tensor(y, dtype=torch.float32)
                self.z = torch.tensor(a * (x + y) + 2 * b, dtype=torch.float32)

            def __getitem__(self, index):
                return (self.x[index, None], self.y[index, None],
                        self.z[index, None])

            def __len__(self):
                return len(self.x)

        train_dataset = LinearDataset(3, 4)
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=config.get("batch_size", 32),
        )
        return train_loader, None

    Operator = TrainingOperator.from_creators(
        model_creator,
        optimizer_creator,
        data_creator,
        loss_creator=lambda config: nn.MSELoss())

    trainer = TorchTrainer(training_operator_cls=Operator,
                           num_workers=1,
                           use_local=use_local)

    metrics = trainer.train(num_steps=1)
    assert metrics[BATCH_COUNT] == 1

    trainer.shutdown()
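For reference, a standalone PyTorch shape check (no Ray involved) of what this test exercises: two (B, 1) feature tensors flow through the two linear layers exactly as in MultiInputModel.forward(x, y).

import torch
import torch.nn as nn

fc1, fc2 = nn.Linear(1, 1), nn.Linear(1, 1)
x, y = torch.randn(4, 1), torch.randn(4, 1)
out = fc1(x) + fc2(y)  # the same computation as forward(x, y)
assert out.shape == (4, 1)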
Example #14
    def testMultiLoaders(self):
        def three_data_loader(config):
            return (LinearDataset(2, 5), LinearDataset(2, 5, size=400),
                    LinearDataset(2, 5, size=400))

        ThreeOperator = TrainingOperator.from_creators(
            model_creator,
            optimizer_creator,
            three_data_loader,
            loss_creator=loss_creator)

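        # from_creators accepts at most two loaders (train, validation), so
        # a three-tuple raises a ValueError during operator setup.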
        runner = TorchRunner(training_operator_cls=ThreeOperator)
        with self.assertRaises(ValueError):
            runner.setup_operator()

        runner2 = TorchRunner(training_operator_cls=ThreeOperator)
        with self.assertRaises(ValueError):
            runner2.setup_operator()
Example #15
def test_scheduler_validate(ray_start_2_cpus):  # noqa: F811
    from torch.optim.lr_scheduler import ReduceLROnPlateau

    TestOperator = TrainingOperator.from_creators(
        model_creator,
        optimizer_creator,
        data_creator,
        scheduler_creator=lambda optimizer, cfg: ReduceLROnPlateau(optimizer),
        loss_creator=lambda config: nn.MSELoss())
    TestOperator = get_test_operator(TestOperator)
    trainer = TorchTrainer(scheduler_step_freq="manual",
                           training_operator_cls=TestOperator)
    trainer.update_scheduler(0.5)
    trainer.update_scheduler(0.5)
    assert all(
        trainer.apply_all_operators(
            lambda op: op._schedulers[0].last_epoch == 2))
    trainer.shutdown()
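The assertion leans on a ReduceLROnPlateau detail: each step(metric) call advances last_epoch by one. A minimal standalone check of that behavior, assuming update_scheduler(0.5) forwards the metric to the scheduler's step method:

import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau

optimizer = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1)
scheduler = ReduceLROnPlateau(optimizer)
scheduler.step(0.5)  # pass the validation metric
scheduler.step(0.5)
assert scheduler.last_epoch == 2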
Example #16
def test_resize(ray_start_2_cpus, use_local):  # noqa: F811
    if not dist.is_available():
        return

    def single_loader(config):
        dataset = LinearDataset(2, 5, size=1000000)
        return DataLoader(dataset, batch_size=config.get("batch_size", 32))

    start_with_fail = gen_start_with_fail(1)

    TestOperator = TrainingOperator.from_creators(
        model_creator,
        optimizer_creator,
        single_loader,
        loss_creator=lambda config: nn.MSELoss())
    with patch.object(TorchTrainer, "_start_workers", start_with_fail):
        trainer1 = TorchTrainer(training_operator_cls=TestOperator,
                                config={"batch_size": 100000},
                                use_local=use_local,
                                num_workers=2)

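        # Occupy one CPU with a dummy actor so that the trainer can only
        # restart with a single worker after the failure.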
        @ray.remote(num_cpus=1)
        class DummyActor:
            def get(self):
                return 1

        dummy_handler = DummyActor.remote()
        trainer1.train(max_retries=1)
        assert trainer1.worker_group.num_workers == 1
        assert trainer1._num_failures == 1

        ray.get(dummy_handler.get.remote())
        ray.kill(dummy_handler)
        time.sleep(1)  # give Ray time to reclaim the actor's CPU
        # Trigger scale-up back to two workers.
        trainer1.train()
        assert trainer1.worker_group.num_workers == 2

        trainer1.shutdown(force=True)
Example #17
def test_num_steps(ray_start_2_cpus, use_local):
    """Tests if num_steps continues training from the subsampled dataset."""

    def data_creator(config):
        train_dataset = [0] * 5 + [1] * 5
        val_dataset = [0] * 5 + [1] * 5
        return (DataLoader(train_dataset, batch_size=config["batch_size"]),
                DataLoader(val_dataset, batch_size=config["batch_size"]))

    batch_size = 1
    Operator = TrainingOperator.from_creators(model_creator, optimizer_creator,
                                              data_creator)

    def train_func(self, iterator, info=None):
        total_sum = 0
        num_items = 0
        for e in iterator:
            total_sum += e
            num_items += 1
        return {"average": total_sum.item() / num_items}

    TestOperator = get_test_operator(Operator)
    trainer = TorchTrainer(
        training_operator_cls=TestOperator,
        num_workers=2,
        use_local=use_local,
        add_dist_sampler=False,
        config={
            "batch_size": batch_size,
            "custom_func": train_func
        })

    # If num_steps not passed, should do one full epoch.
    result = trainer.train()
    # Average of 5 0s and 5 1s
    assert result["average"] == 0.5
    assert result["epoch"] == 1
    val_result = trainer.validate()
    assert val_result["average"] == 0.5

    # Train again with num_steps.
    result = trainer.train(num_steps=5)
    # 5 zeros
    assert result["average"] == 0
    assert result["epoch"] == 2
    val_result = trainer.validate(num_steps=5)
    assert val_result["average"] == 0

    # Should continue where last train run left off.
    result = trainer.train(num_steps=3)
    # 3 ones.
    assert result["average"] == 1
    assert result["epoch"] == 2
    val_result = trainer.validate(num_steps=3)
    assert val_result["average"] == 1

    # Should continue from last train run, and cycle to beginning.
    result = trainer.train(num_steps=5)
    # 2 ones and 3 zeros.
    assert result["average"] == 0.4
    assert result["epoch"] == 3
    val_result = trainer.validate(num_steps=5)
    assert val_result["average"] == 0.4

    # Should continue, and since num_steps not passed in, just finishes epoch.
    result = trainer.train()
    # 2 zeros and 5 ones.
    assert result["average"] == 5 / 7
    assert result["epoch"] == 3
    val_result = trainer.validate()
    assert val_result["average"] == 5 / 7

    trainer.shutdown()
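The expected averages fall out of a single persistent data iterator that survives across train() calls and wraps around when exhausted. A plain-Python sketch of that continuation behavior, mirroring the assertions above (starting from a freshly completed epoch):

data = [0] * 5 + [1] * 5
it = iter(data)

def take(n):
    # Draw n items, restarting the iterator when the dataset is exhausted.
    global it
    batch = []
    while len(batch) < n:
        try:
            batch.append(next(it))
        except StopIteration:
            it = iter(data)
    return batch

assert sum(take(5)) / 5 == 0    # five zeros
assert sum(take(3)) / 3 == 1    # continues with three ones
assert sum(take(5)) / 5 == 0.4  # two ones left, then wraps to zeros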
Example #18
import pytest
import torch.nn as nn
import torch.distributed as dist
from torch.utils.data import DataLoader

import ray
from ray.util.sgd.torch import TorchTrainer
from ray.util.sgd.torch.worker_group import RemoteWorkerGroup
from ray.util.sgd.torch.training_operator import TrainingOperator

from ray.util.sgd.torch.examples.train_example import (model_creator,
                                                       optimizer_creator,
                                                       data_creator,
                                                       LinearDataset)

Operator = TrainingOperator.from_creators(model_creator,
                                          optimizer_creator,
                                          data_creator,
                                          loss_creator=nn.MSELoss)


@pytest.fixture
def ray_start_2_cpus():
    address_info = ray.init(num_cpus=2)
    yield address_info
    # The code after the yield will run as teardown code.
    ray.shutdown()
    # Ensure that a lingering process group doesn't make subsequent tests fail.
    if dist.is_initialized():
        dist.destroy_process_group()


@pytest.fixture