def run(self):
    tensorboard_dir = os.path.join(
        os.getenv("ADAPTDL_TENSORBOARD_LOGDIR", "/tmp"),
        adaptdl.env.job_id())
    with SummaryWriter(tensorboard_dir) as writer:
        for epoch in adl.remaining_epochs_until(args.epochs):
            epoch_start_time = time.time()
            self.train(train_txt, epoch, writer)
            val_loss = self.evaluate(self.model, val_txt, epoch, writer)
            print('-' * 89)
            print(f'| end of epoch {epoch:3d} '
                  f'| time: {(time.time() - epoch_start_time):5.2f}s '
                  f'| valid loss {val_loss:5.2f} '
                  f'| valid ppl {np.exp(val_loss):8.2f}')
            print('-' * 89)
            if val_loss < self.best_val_loss:
                self.best_val_loss = val_loss
                self.best_model = self.model
            self.scheduler.step()
    test_loss = self.evaluate(self.best_model, test_txt)
    print('=' * 89)
    print(f'| End of training | test loss {test_loss:5.2f} | '
          f'test ppl {np.exp(test_loss):8.2f}')
    print('=' * 89)
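# A hypothetical entry point for the run() method above; the enclosing class
# is not shown in this snippet, so its name here is an assumption. AdaptDL
# programs initialize the process group once, before any AdaptiveDataParallel
# model or AdaptiveDataLoader is constructed.
if __name__ == "__main__":
    adl.init_process_group("nccl" if torch.cuda.is_available() else "gloo")
    TransformerTrainer().run()  # hypothetical class name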
import numpy as np
import torch

import adaptdl.torch as adl


def test_single_replica_parallel():
    adl.init_process_group("gloo")
    true_values = np.asarray([3.0, 4.0])
    dataset = LRIterableDataset(1000, true_values, 1.0)
    dataloader = adl.AdaptiveDataLoader(dataset, batch_size=32,
                                        shuffle=False, num_workers=1)
    model = torch.nn.Linear(1, 1, bias=True)
    params = [model.bias, model.weight]
    sgd = torch.optim.SGD([{"params": [param]} for param in params], lr=0.01)
    schedule = torch.optim.lr_scheduler.MultiStepLR(sgd, [50])
    model = adl.AdaptiveDataParallel(model, sgd, schedule)
    loss = torch.nn.MSELoss()
    for epoch in adl.remaining_epochs_until(100):
        for inputs, targets in dataloader:
            inputs = inputs.float()
            targets = targets.float()
            sgd.zero_grad()
            output = model(torch.reshape(inputs, (-1, 1)))
            targets = torch.reshape(targets, (-1, 1))
            loss_value = loss(output, targets)
            loss_value.backward()
            sgd.step()
        schedule.step()
    params = np.asarray([param.item() for param in params])
    assert np.all(np.isclose(params, true_values, atol=0.1)), \
        (params, true_values)
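# A minimal sketch of the LRIterableDataset assumed by the test above; the
# real implementation may differ. Consistent with the assertion on
# [model.bias, model.weight], true_values is taken as (bias, weight), and
# each of `size` samples is a noisy point on the line y = weight * x + bias.
import torch


class LRIterableDataset(torch.utils.data.IterableDataset):
    def __init__(self, size, true_values, noise_scale):
        self.size = size
        self.bias, self.weight = true_values
        self.noise_scale = noise_scale

    def __iter__(self):
        for _ in range(self.size):
            x = torch.randn(())
            noise = self.noise_scale * torch.randn(())
            yield x, self.weight * x + self.bias + noise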
from typing import Dict, Optional

import torch
import torch.optim as optim
from torchvision import datasets, transforms

import adaptdl.torch as adl
from ray import tune

# ConvNet, train, and test are helpers defined elsewhere in the example.


def train_mnist(config: Dict, checkpoint_dir: Optional[str] = None):
    # Data setup
    mnist_transforms = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307,), (0.3081,))])
    train_loader = adl.AdaptiveDataLoader(
        datasets.MNIST("~/data", train=True, download=True,
                       transform=mnist_transforms),
        batch_size=64, shuffle=True)
    # Autoscale the global batch size up to 4096,
    # with 16-1024 samples per replica.
    train_loader.autoscale_batch_size(4096, local_bsz_bounds=(16, 1024))
    test_loader = adl.AdaptiveDataLoader(
        datasets.MNIST("~/data", train=False, transform=mnist_transforms),
        batch_size=64, shuffle=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = ConvNet()
    optimizer = optim.SGD(model.parameters(),
                          lr=config.get("lr", 0.01),
                          momentum=config.get("momentum", 0.79))
    model.to(device)
    model = adl.AdaptiveDataParallel(model, optimizer)

    for epoch in adl.remaining_epochs_until(config.get("epochs", 10)):
        train(model, optimizer, train_loader)
        acc = test(model, test_loader)
        # Send the current training result back to Tune.
        tune.report(mean_accuracy=acc)
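# A sketch of launching train_mnist with classic Ray Tune (tune.run pairs
# with the tune.report API used above). Running it elastically may also
# require AdaptDL's Ray/Tune integration; this launch and its search space
# are illustrative assumptions.
from ray import tune

analysis = tune.run(
    train_mnist,
    config={"lr": tune.grid_search([0.01, 0.1]),
            "momentum": 0.79,
            "epochs": 10},
    metric="mean_accuracy",
    mode="max")
print("Best config:", analysis.best_config)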
def _train_simple(config: Dict, checkpoint_dir: Optional[str] = None):
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    H = config.get("H", 16)
    N = config.get("N", 16)

    # `dataset`, `D_in`, and `D_out` are defined as in the full listing below.
    dataloader = adl.AdaptiveDataLoader(dataset, batch_size=N)
    dataloader.autoscale_batch_size(4096, local_bsz_bounds=(16, 1024))
    loss_fn = nn.MSELoss()

    # Use the nn package to define our model and loss function.
    model = torch.nn.Sequential(
        torch.nn.Linear(D_in, H),
        torch.nn.ReLU(),
        torch.nn.Linear(H, D_out),
    )
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    model = model.to(device)
    model = adl.AdaptiveDataParallel(model, optimizer)

    loss = torch.Tensor([0.0])
    for epoch in adl.remaining_epochs_until(config.get("epochs", 10)):
        for (x, y) in dataloader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            output = model(x)
            loss = loss_fn(output, y)
            loss.backward()
            optimizer.step()
        tune.report(mean_loss=loss.item())
from typing import Dict, Optional


def _train_simple(config: Dict, checkpoint_dir: Optional[str] = None):
    # Imports happen inside the function so it can run as a Ray Tune
    # trainable on remote workers.
    import torch
    import torch.nn as nn
    import torch.optim as optim
    import adaptdl.torch as adl
    from ray import tune

    class MyDataset:
        def __init__(self, xs, ys):
            self.xs = xs
            self.ys = ys

        def __getitem__(self, i):
            return self.xs[i], self.ys[i]

        def __len__(self):
            return len(self.xs)

    # N is batch size; D_in is input dimension;
    # H is hidden dimension; D_out is output dimension.
    N, D_in, H, D_out = 64, 5, 5, 5
    # Create random Tensors to hold inputs and outputs.
    dataset = MyDataset(torch.randn(N, D_in), torch.randn(N, D_out))
    H = config.get("H", 16)
    N = config.get("N", 16)

    dataloader = adl.AdaptiveDataLoader(dataset, batch_size=N)
    dataloader.autoscale_batch_size(4096, local_bsz_bounds=(16, 1024))
    loss_fn = nn.MSELoss()

    # Use the nn package to define our model and loss function.
    model = torch.nn.Sequential(
        torch.nn.Linear(D_in, H),
        torch.nn.ReLU(),
        torch.nn.Linear(H, D_out),
    )
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    model = adl.AdaptiveDataParallel(model, optimizer)

    loss = torch.Tensor([0.0])
    for epoch in adl.remaining_epochs_until(config.get("epochs", 10)):
        for (x, y) in dataloader:
            optimizer.zero_grad()
            output = model(x)
            loss = loss_fn(output, y)
            loss.backward()
            optimizer.step()
        tune.report(mean_loss=loss.item())
class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, size):
        # The original snippet begins mid-__init__; the code that generates
        # the inputs `x` and targets `y` (of length `size`) is elided. The
        # signature is inferred from SimpleDataset(10000) below.
        self.data = list(zip(x, y))

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)


dataset = SimpleDataset(10000)
dataloader = adl.AdaptiveDataLoader(dataset, batch_size=args.bs, shuffle=True,
                                    num_workers=2, drop_last=True)
optimizer = optim.SGD(net.parameters(), lr=args.lr,
                      momentum=0.9, weight_decay=5e-4)
lr_scheduler = MultiStepLR(optimizer, [30, 45], 0.1)
net = adl.AdaptiveDataParallel(net, optimizer, lr_scheduler)
trainer = Trainer(net, optimizer, lr_scheduler)

for epoch in adl.remaining_epochs_until(args.epochs):
    for inputs, targets in dataloader:
        batch_stat = trainer.train(inputs, targets)
        print(batch_stat)
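# A minimal Trainer sketch consistent with how it is used above; the original
# Trainer is not shown, so the loss function and returned statistics here are
# assumptions.
import torch.nn as nn


class Trainer:
    def __init__(self, net, optimizer, lr_scheduler):
        self.net = net
        self.optimizer = optimizer
        self.lr_scheduler = lr_scheduler
        self.criterion = nn.CrossEntropyLoss()

    def train(self, inputs, targets):
        # One optimization step over a single batch.
        self.optimizer.zero_grad()
        outputs = self.net(inputs)
        loss = self.criterion(outputs, targets)
        loss.backward()
        self.optimizer.step()
        return {"loss": loss.item()}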
# Inside train(epoch), at the end of each batch: accumulate per-replica
# statistics for this epoch.
# Save losses for plotting later.
G_losses.append(errG.item())
D_losses.append(errD.item())
stats["g_loss_sum"] += errG.item()
stats["d_loss_sum"] += errD.item()
stats["norm"] += metrics._metrics_state().grad_params[0]
stats["var"] += metrics._metrics_state().grad_params[1]
stats["replicas"] += 1.0

# At the end of each epoch: step the schedulers, then log the synchronized
# (summed-across-replicas) statistics to TensorBoard.
scheduleD.step()
scheduleG.step()
with stats.synchronized():
    with SummaryWriter(adaptdl.get_tensorboard_dir()) as writer:
        writer.add_scalar("Loss/G",
                          stats["g_loss_sum"] / stats["replicas"], epoch)
        writer.add_scalar("Loss/D",
                          stats["d_loss_sum"] / stats["replicas"], epoch)
        writer.add_scalar("Performance/GlobalBatchsize",
                          b_size * stats["replicas"], epoch)
        writer.add_scalar("Performance/Replicas", stats["replicas"], epoch)
        writer.add_scalar("Stats/Norm",
                          stats["norm"] / stats["replicas"], epoch)
        writer.add_scalar("Stats/Variance",
                          stats["var"] / stats["replicas"], epoch)


for epoch in adl.remaining_epochs_until(num_epochs):
    train(epoch)
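# The `stats` object used above is consistent with adaptdl.torch.Accumulator,
# which sums the values contributed by every replica and exposes the totals
# inside a `with stats.synchronized():` block. Its setup is not part of this
# snippet; a sketch of the assumed initialization:
import adaptdl.torch as adl

stats = adl.Accumulator()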