def train_func(config: Dict):
    batch_size = config["batch_size"]
    lr = config["lr"]
    epochs = config["epochs"]

    # Create data loaders.
    train_dataloader = DataLoader(training_data, batch_size=batch_size)
    test_dataloader = DataLoader(test_data, batch_size=batch_size)

    train_dataloader = train.torch.prepare_data_loader(train_dataloader)
    test_dataloader = train.torch.prepare_data_loader(test_dataloader)

    # Create model.
    model = NeuralNetwork()
    model = train.torch.prepare_model(model)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    loss_results = []

    for _ in range(epochs):
        train_epoch(train_dataloader, model, loss_fn, optimizer)
        loss = validate_epoch(test_dataloader, model, loss_fn)
        train.report(loss=loss)
        loss_results.append(loss)

    return loss_results
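These training functions target the legacy function-based `ray.train` API, so a driver script is needed to launch them on workers. Below is a minimal sketch, assuming the legacy `ray.train.Trainer` class from Ray 1.x; the worker count and config values are illustrative and not part of the original example.

from ray.train import Trainer

# Launch the training function above on two distributed workers (illustrative values).
trainer = Trainer(backend="torch", num_workers=2, use_gpu=False)
trainer.start()
results = trainer.run(
    train_func,
    config={"batch_size": 64, "lr": 1e-3, "epochs": 4},
)
trainer.shutdown()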
Example No. 2
def train_func(config: Dict):
    batch_size = config["batch_size"]
    lr = config["lr"]
    epochs = config["epochs"]

    device = torch.device(
        f"cuda:{train.local_rank()}" if torch.cuda.is_available() else "cpu")

    # Create data loaders.
    train_dataloader = DataLoader(training_data,
                                  batch_size=batch_size,
                                  sampler=DistributedSampler(training_data))
    test_dataloader = DataLoader(test_data,
                                 batch_size=batch_size,
                                 sampler=DistributedSampler(test_data))

    # Create model.
    model = NeuralNetwork()
    model = model.to(device)
    model = DistributedDataParallel(
        model,
        device_ids=[device.index] if torch.cuda.is_available() else None)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    loss_results = []

    for _ in range(epochs):
        train_epoch(train_dataloader, model, loss_fn, optimizer, device)
        loss = validate_epoch(test_dataloader, model, loss_fn, device)
        train.report(loss=loss)
        loss_results.append(loss)

    return loss_results
Example No. 3
def train_epochs_remote(config):
    '''
    This function will be run on each remote worker. It contains the epoch loop.
    '''
    train_dataset, val_dataset, model, loss_fn, optimizer = training_setup(
        config)

    batch_size = config.get('batch_size')
    epochs = config.get('epochs')

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        sampler=DistributedSampler(train_dataset))

    validation_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=batch_size,
        sampler=DistributedSampler(val_dataset))

    # Prepare the data and the model for distributed training.
    train_loader = train.torch.prepare_data_loader(train_loader)
    validation_loader = train.torch.prepare_data_loader(validation_loader)
    model = train.torch.prepare_model(model)

    # epoch loop
    results = []
    for epoch in range(epochs):
        train_batches(train_loader, model, loss_fn, optimizer, config)
        result = validate_epoch(validation_loader, model, loss_fn)
        result['epoch'] = epoch + 1
        train.report(**result)
        results.append(result)

    return model.state_dict(), results
Example No. 4
def train_func(config):
    epochs = config.pop("epochs", 3)
    model = ResNet18(config)
    model = train.torch.prepare_model(model)

    # Create optimizer.
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=config.get("lr", 0.1),
        momentum=config.get("momentum", 0.9),
    )

    # Load in training and validation data.
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])  # meanstd transformation

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    with FileLock(".ray.lock"):
        train_dataset = CIFAR10(root="~/data",
                                train=True,
                                download=True,
                                transform=transform_train)
        validation_dataset = CIFAR10(root="~/data",
                                     train=False,
                                     download=False,
                                     transform=transform_test)

    if config.get("test_mode"):
        train_dataset = Subset(train_dataset, list(range(64)))
        validation_dataset = Subset(validation_dataset, list(range(64)))

    train_loader = DataLoader(train_dataset, batch_size=config["batch_size"])
    validation_loader = DataLoader(validation_dataset,
                                   batch_size=config["batch_size"])

    train_loader = train.torch.prepare_data_loader(train_loader)
    validation_loader = train.torch.prepare_data_loader(validation_loader)

    # Create loss.
    criterion = nn.CrossEntropyLoss()

    results = []

    for _ in range(epochs):
        train_epoch(train_loader, model, criterion, optimizer)
        result = validate_epoch(validation_loader, model, criterion)
        train.report(**result)
        results.append(result)

    return results
Example No. 5
def train_epochs_remote(config):
    '''
    This function will be run on a remote worker.
    '''
    train_dataset, val_dataset, model, loss_fn, optimizer = training_setup(config)

    batch_size = config.get("batch_size", 32)
    epochs = config.get("epochs", 3)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size)

    validation_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=batch_size)

    # Prepare the data and the model for distributed training.
    train_loader = train.torch.prepare_data_loader(train_loader)
    validation_loader = train.torch.prepare_data_loader(validation_loader)
    model = train.torch.prepare_model(model)
    # Note: prepare_model already wraps the model in DistributedDataParallel.

    results = []
    for epoch in range(epochs):
        train_batches(train_loader, model, loss_fn, optimizer)
        result = validate_epoch(validation_loader, model, loss_fn)
        result['epoch'] = epoch + 1
        train.report(**result)
        results.append(result)

    return model.state_dict(), results
Example No. 6
def train_func(config):
    data_size = config.get("data_size", 1000)
    val_size = config.get("val_size", 400)
    batch_size = config.get("batch_size", 32)
    hidden_size = config.get("hidden_size", 1)
    lr = config.get("lr", 1e-2)
    epochs = config.get("epochs", 3)

    train_dataset = LinearDataset(2, 5, size=data_size)
    val_dataset = LinearDataset(2, 5, size=val_size)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size)
    validation_loader = torch.utils.data.DataLoader(val_dataset,
                                                    batch_size=batch_size)

    train_loader = train.torch.prepare_data_loader(train_loader)
    validation_loader = train.torch.prepare_data_loader(validation_loader)

    model = nn.Linear(1, hidden_size)
    model = train.torch.prepare_model(model)

    loss_fn = nn.MSELoss()

    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    results = []

    for _ in range(epochs):
        train_epoch(train_loader, model, loss_fn, optimizer)
        result = validate_epoch(validation_loader, model, loss_fn)
        train.report(**result)
        results.append(result)

    return results
Example No. 7
    def train_func(config):
        itr = 0
        ckpt = train.load_checkpoint()
        if ckpt is not None:
            itr = ckpt["iter"] + 1

        for i in range(itr, config["max_iter"]):
            train.save_checkpoint(iter=i)
            train.report(test=i, training_iteration=i)
Example No. 8
def train_func():
    checkpoint = train.load_checkpoint()
    if checkpoint:
        epoch = checkpoint["epoch"]
    else:
        epoch = 0
    print("Epoch: ", epoch)
    for i in range(epoch, 2):
        train.report(loss=1, iter=i)
        train.save_checkpoint(epoch=i + 1)
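Examples No. 7 and No. 8 show the checkpoint-resume pattern: load the latest checkpoint if one exists and continue from there. A minimal sketch of the driver side, assuming the legacy `Trainer.run(..., checkpoint=...)` argument and `Trainer.latest_checkpoint` attribute from Ray 1.x:

from ray.train import Trainer

trainer = Trainer(backend="torch", num_workers=2)
trainer.start()
# First run; workers call train.save_checkpoint(...) on each iteration.
trainer.run(train_func)
# Start a second run from the last checkpoint reported by the first one.
trainer.run(train_func, checkpoint=trainer.latest_checkpoint)
trainer.shutdown()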
Example No. 9
def train_func(config):
    batch_size = config.get("batch_size", 32)
    hidden_size = config.get("hidden_size", 1)
    lr = config.get("lr", 1e-2)
    epochs = config.get("epochs", 3)

    train_dataset_pipeline_shard = train.get_dataset_shard("train")
    validation_dataset_pipeline_shard = train.get_dataset_shard("validation")

    device = torch.device(
        f"cuda:{train.local_rank()}" if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        torch.cuda.set_device(device)

    model = nn.Linear(1, hidden_size)
    model = model.to(device)
    model = DistributedDataParallel(
        model,
        device_ids=[train.local_rank()] if torch.cuda.is_available() else None)

    loss_fn = nn.MSELoss()

    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    results = []

    train_dataset_iterator = train_dataset_pipeline_shard.iter_datasets()
    validation_dataset_iterator = \
        validation_dataset_pipeline_shard.iter_datasets()

    for _ in range(epochs):
        train_dataset = next(train_dataset_iterator)
        validation_dataset = next(validation_dataset_iterator)

        train_torch_dataset = train_dataset.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=[torch.float],
            batch_size=batch_size,
        )
        validation_torch_dataset = validation_dataset.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=[torch.float],
            batch_size=batch_size)

        train_epoch(train_torch_dataset, model, loss_fn, optimizer, device)
        result = validate_epoch(validation_torch_dataset, model, loss_fn,
                                device)
        train.report(**result)
        results.append(result)

    return results
Example No. 10
        def train_loop_per_worker():
            import pandas as pd

            rank = train.world_rank()
            data_shard = train.get_dataset_shard("train")
            start = time.perf_counter()
            num_epochs, num_batches, num_bytes = 0, 0, 0
            batch_delays = []

            print("Starting train loop on worker", rank)
            while time.perf_counter() - start < runtime_seconds:
                num_epochs += 1
                batch_start = time.perf_counter()
                for batch in data_shard.iter_batches(
                    prefetch_blocks=prefetch_blocks, batch_size=batch_size
                ):
                    batch_delay = time.perf_counter() - batch_start
                    batch_delays.append(batch_delay)
                    num_batches += 1
                    if isinstance(batch, pd.DataFrame):
                        num_bytes += int(
                            batch.memory_usage(index=True, deep=True).sum()
                        )
                    elif isinstance(batch, np.ndarray):
                        num_bytes += batch.nbytes
                    else:
                        # NOTE: This isn't recursive and will just return the size of
                        # the object pointers if list of non-primitive types.
                        num_bytes += sys.getsizeof(batch)
                    train.report(
                        bytes_read=num_bytes,
                        num_batches=num_batches,
                        num_epochs=num_epochs,
                        batch_delay=batch_delay,
                    )
                    batch_start = time.perf_counter()
            delta = time.perf_counter() - start

            print("Time to read all data", delta, "seconds")
            print(
                "P50/P95/Max batch delay (s)",
                np.quantile(batch_delays, 0.5),
                np.quantile(batch_delays, 0.95),
                np.max(batch_delays),
            )
            print("Num epochs read", num_epochs)
            print("Num batches read", num_batches)
            print("Num bytes read", round(num_bytes / (1024 * 1024), 2), "MiB")
            print(
                "Mean throughput", round(num_bytes / (1024 * 1024) / delta, 2), "MiB/s"
            )

            if rank == 0:
                print("Ingest stats from rank=0:\n\n{}".format(data_shard.stats()))
Example No. 11
    def train_fn():
        model = torch.nn.Linear(1, 1)

        # Wrap in DDP.
        model = train.torch.prepare_model(model)

        # Save DDP wrapped model.
        train.save_checkpoint(model=model)

        # Report DDP wrapped model.
        train.report(model=model)
Example No. 12
    def train_func():
        ckpt = train.load_checkpoint()
        restored = bool(ckpt)  # Does a previous checkpoint exist?
        itr = 0
        if ckpt:
            itr = ckpt["iter"] + 1

        for i in range(itr, 4):
            if i == 2 and not restored:
                raise Exception("try to fail me")
            train.save_checkpoint(iter=i)
            train.report(test=i, training_iteration=i)
Example No. 13
def train_func(config):
    batch_size = config.get("batch_size", 32)
    hidden_size = config.get("hidden_size", 1)
    lr = config.get("lr", 1e-2)
    epochs = config.get("epochs", 3)

    train_dataset_pipeline_shard = train.get_dataset_shard("train")
    validation_dataset_pipeline_shard = train.get_dataset_shard("validation")

    model = nn.Linear(1, hidden_size)
    model = train.torch.prepare_model(model)

    loss_fn = nn.MSELoss()

    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    results = []

    train_dataset_iterator = train_dataset_pipeline_shard.iter_epochs()
    validation_dataset_iterator = validation_dataset_pipeline_shard.iter_epochs(
    )

    for _ in range(epochs):
        train_dataset = next(train_dataset_iterator)
        validation_dataset = next(validation_dataset_iterator)

        train_torch_dataset = train_dataset.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=torch.float,
            batch_size=batch_size,
        )
        validation_torch_dataset = validation_dataset.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=torch.float,
            batch_size=batch_size,
        )

        device = train.torch.get_device()

        train_epoch(train_torch_dataset, model, loss_fn, optimizer, device)
        result = validate_epoch(validation_torch_dataset, model, loss_fn,
                                device)
        train.report(**result)
        results.append(result)

    return results
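The example above reads from Ray Dataset pipeline shards via `train.get_dataset_shard`, so the driver has to hand the datasets to the trainer. A minimal sketch, assuming the legacy `dataset=` argument to `Trainer.run` from Ray 1.x; the dataset construction and config values are illustrative:

import ray
from ray.train import Trainer

# Build small "x" -> "y" datasets and repeat them so iter_epochs() yields one window per epoch.
train_ds = ray.data.from_items([{"x": float(i), "y": 2.0 * i} for i in range(1000)])
val_ds = ray.data.from_items([{"x": float(i), "y": 2.0 * i} for i in range(200)])

trainer = Trainer(backend="torch", num_workers=2)
trainer.start()
results = trainer.run(
    train_func,
    config={"batch_size": 32, "epochs": 3},
    dataset={"train": train_ds.repeat(), "validation": val_ds.repeat()},
)
trainer.shutdown()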
Example No. 14
def training_loop(config):
    # Create model.
    model = ResNet18(config)
    model.conv1 = nn.Conv2d(1,
                            64,
                            kernel_size=7,
                            stride=1,
                            padding=3,
                            bias=False)
    model = train.torch.prepare_model(model)

    # Create optimizer.
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=config.get("lr", 0.1),
        momentum=config.get("momentum", 0.9),
    )

    # Load in training and validation data.
    train_dataset = load_mnist_data(True, True)
    validation_dataset = load_mnist_data(False, False)

    if config["test_mode"]:
        train_dataset = Subset(train_dataset, list(range(64)))
        validation_dataset = Subset(validation_dataset, list(range(64)))

    train_loader = DataLoader(train_dataset,
                              batch_size=config["batch_size"],
                              num_workers=2)
    validation_loader = DataLoader(validation_dataset,
                                   batch_size=config["batch_size"],
                                   num_workers=2)

    train_loader = train.torch.prepare_data_loader(train_loader)
    validation_loader = train.torch.prepare_data_loader(validation_loader)

    # Create loss.
    criterion = nn.CrossEntropyLoss()

    for epoch_idx in range(2):
        train_epoch(train_loader, model, criterion, optimizer)
        validation_loss = validate_epoch(validation_loader, model, criterion)

        train.save_checkpoint(model_state_dict=model.module.state_dict())
        train.report(**validation_loss)
Example No. 15
        def train_loop_per_worker():
            rank = train.world_rank()
            data_shard = train.get_dataset_shard("train")
            start = time.perf_counter()
            num_epochs, num_batches, num_bytes = 0, 0, 0
            batch_delays = []

            print("Starting train loop on worker", rank)
            while time.perf_counter() - start < runtime_seconds:
                num_epochs += 1
                batch_start = time.perf_counter()
                for batch in data_shard.iter_batches(
                        prefetch_blocks=prefetch_blocks,
                        batch_size=batch_size):
                    batch_delay = time.perf_counter() - batch_start
                    batch_delays.append(batch_delay)
                    num_batches += 1
                    num_bytes += int(
                        batch.memory_usage(index=True, deep=True).sum())
                    train.report(
                        bytes_read=num_bytes,
                        num_batches=num_batches,
                        num_epochs=num_epochs,
                        batch_delay=batch_delay,
                    )
                    batch_start = time.perf_counter()
            delta = time.perf_counter() - start

            print("Time to read all data", delta, "seconds")
            print(
                "P50/P95/Max batch delay (s)",
                np.quantile(batch_delays, 0.5),
                np.quantile(batch_delays, 0.95),
                np.max(batch_delays),
            )
            print("Num epochs read", num_epochs)
            print("Num batches read", num_batches)
            print("Num bytes read", round(num_bytes / (1024 * 1024), 2), "MiB")
            print("Mean throughput", round(num_bytes / (1024 * 1024) / delta,
                                           2), "MiB/s")

            if rank == 0:
                print("Ingest stats from rank=0:\n\n{}".format(
                    data_shard.stats()))
Example No. 16
    def train_func():
        from ray.train.torch import TorchWorkerProfiler
        from torch.profiler import profile, record_function, schedule

        twp = TorchWorkerProfiler()
        with profile(
                activities=[],
                schedule=schedule(wait=0, warmup=0, active=1),
                on_trace_ready=twp.trace_handler,
        ) as p:

            for epoch in range(num_epochs):
                with record_function("test_function"):
                    pass

                p.step()

                profile_results = twp.get_and_clear_profile_traces()
                train.report(epoch=epoch, **profile_results)
Example No. 17
def train_loop_per_worker(config):
    import torch
    import horovod.torch as hvd

    hvd.init()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    mode = config["mode"]
    net = Net(mode).to(device)
    optimizer = torch.optim.SGD(
        net.parameters(),
        lr=config["lr"],
    )
    optimizer = hvd.DistributedOptimizer(optimizer)

    num_steps = 5
    print(hvd.size())
    np.random.seed(1 + hvd.rank())
    torch.manual_seed(1234)
    # Broadcast initial parameters and optimizer state so all workers start consistently.
    hvd.broadcast_parameters(net.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    start = time.time()
    x_max = config["x_max"]
    for step in range(1, num_steps + 1):
        features = torch.Tensor(np.random.rand(1) * 2 * x_max -
                                x_max).to(device)
        if mode == "square":
            labels = sq(features)
        else:
            labels = qu(features)
        optimizer.zero_grad()
        outputs = net(features)
        loss = torch.nn.MSELoss()(outputs, labels)
        loss.backward()

        optimizer.step()
        time.sleep(0.1)
        train.report(loss=loss.item())
    total = time.time() - start
    print(f"Took {total:0.3f} s. Avg: {total / num_steps:0.3f} s.")
Example No. 18
def train_func():
    twp = TorchWorkerProfiler()
    with profile(
            activities=[],
            schedule=schedule(wait=0, warmup=0, active=1),
            on_trace_ready=twp.trace_handler,
    ) as p:

        # Setup model.
        model = torch.nn.Linear(1, 1)
        model = train.torch.prepare_model(model)
        loss_fn = torch.nn.MSELoss()
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

        # Setup data.
        input = torch.randn(1000, 1)
        labels = input * 2
        dataset = torch.utils.data.TensorDataset(input, labels)
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=32)
        dataloader = train.torch.prepare_data_loader(dataloader)

        # Train.
        for epoch in range(5):
            with record_function("train_epoch"):
                for X, y in dataloader:
                    pred = model(X)
                    loss = loss_fn(pred, y)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            with record_function("train_checkpoint"):
                state_dict = model.state_dict()
                consume_prefix_in_state_dict_if_present(state_dict, "module.")
                train.save_checkpoint(epoch=epoch, model_weights=state_dict)

            p.step()

            with record_function("train_report"):
                profile_results = twp.get_and_clear_profile_traces()
                train.report(epoch=epoch, **profile_results)
Example No. 19
def train_func():
    train.report(episode_reward_mean=4)
    train.report(episode_reward_mean=5)
    train.report(episode_reward_mean=6,
                 score=[1, 2, 3],
                 hello={"world": 1})
    return 1
Example No. 20
def fail_train_2():
    for _ in range(2):
        train.report(loss=1)
    raise NotImplementedError
Example No. 21
def on_epoch_end(self, epoch, logs=None):
    train.report(**logs)
Example No. 22
def train_func():
    checkpoint = train.load_checkpoint()
    train.report(**checkpoint)
    train.save_checkpoint(**checkpoint)
    return checkpoint[key]
Example No. 23
def train_mismatch():
    train.save_checkpoint(epoch=0)
    train.report(index=0)
    # skip checkpoint
    train.report(index=1)
Example No. 24
def train_func():
    train.report(rank=train.world_rank())
Example No. 25
def train_func(config):
    train.report(episode_reward_mean=4)
    train.report(episode_reward_mean=5)
    train.report(episode_reward_mean=6)
    return 1
Example No. 26
def train_func():
    for i in range(num_iters):
        train.report(index=i)
    return 1
Example No. 27
def train_func():
    for _ in range(2):
        train.report(loss=1)
    return 1
Example No. 28
def train_func():
    for i in range(3):
        train.report(epoch=i)
Example No. 29
def train_actor_failure():
    for _ in range(2):
        train.report(loss=1)
    import sys
    sys.exit(0)
Example No. 30
def train_func():
    for i in range(2):
        train.report(loss=1, iter=i)