import os
import socket

import torch
import torch.optim as optim
from ray import tune

# `ConvNet`, `get_data_loaders`, `train`, and `test` are the helpers from
# Ray's mnist_pytorch example (exact import path may vary across Ray versions).
from ray.tune.examples.mnist_pytorch import ConvNet, get_data_loaders, train, test


class TrainMNIST(tune.Trainable):
    def setup(self, config):
        use_cuda = config.get("use_gpu") and torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")
        self.train_loader, self.test_loader = get_data_loaders()
        self.model = ConvNet().to(self.device)
        self.optimizer = optim.SGD(
            self.model.parameters(),
            lr=config.get("lr", 0.01),
            momentum=config.get("momentum", 0.9))

    def step(self):
        self.current_ip()
        train(self.model, self.optimizer, self.train_loader, device=self.device)
        acc = test(self.model, self.test_loader, self.device)
        return {"mean_accuracy": acc}

    def save_checkpoint(self, checkpoint_dir):
        checkpoint_path = os.path.join(checkpoint_dir, "model.pth")
        torch.save(self.model.state_dict(), checkpoint_path)
        return checkpoint_path

    def load_checkpoint(self, checkpoint_path):
        self.model.load_state_dict(torch.load(checkpoint_path))

    # This is currently needed to handle multiple network interfaces
    # on the Cori GPU nodes.
    def current_ip(self):
        hostname = socket.getfqdn(socket.gethostname())
        self._local_ip = socket.gethostbyname(hostname)
        return self._local_ip
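# A minimal sketch of how this Trainable might be launched with Tune's
# `tune.run` API. The config values and stopping criterion here are
# illustrative assumptions, not part of the original example.
if __name__ == "__main__":
    analysis = tune.run(
        TrainMNIST,
        config={
            "use_gpu": True,
            "lr": tune.grid_search([0.01, 0.1]),
            "momentum": 0.9,
        },
        stop={"training_iteration": 10},
        checkpoint_at_end=True,
    )
    print("Best config:",
          analysis.get_best_config(metric="mean_accuracy", mode="max"))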
import os

import torch
import torch.optim as optim
from ray import tune

from ray.tune.examples.mnist_pytorch import ConvNet, get_data_loaders, train, test


class PytorchTrainable(tune.Trainable):
    def _setup(self, config):
        self.device = torch.device("cpu")
        self.train_loader, self.test_loader = get_data_loaders()
        self.model = ConvNet().to(self.device)
        self.optimizer = optim.SGD(
            self.model.parameters(),
            lr=config.get("lr", 0.01),
            momentum=config.get("momentum", 0.9))

    def _train(self):
        train(self.model, self.optimizer, self.train_loader, device=self.device)
        acc = test(self.model, self.test_loader, self.device)
        return {"mean_accuracy": acc}

    def _save(self, checkpoint_dir):
        checkpoint_path = os.path.join(checkpoint_dir, "model.pth")
        torch.save(self.model.state_dict(), checkpoint_path)
        return checkpoint_path

    def _restore(self, checkpoint_path):
        self.model.load_state_dict(torch.load(checkpoint_path))

    def reset_config(self, new_config):
        # Rebuild the optimizer in place so the actor can be reused with new
        # hyperparameters instead of being torn down and restarted.
        del self.optimizer
        self.optimizer = optim.SGD(
            self.model.parameters(),
            lr=new_config.get("lr", 0.01),
            momentum=new_config.get("momentum", 0.9))
        return True
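# `reset_config` is only exercised when Tune is allowed to reuse actors
# across trials. A minimal sketch of enabling that path; the search space
# values are illustrative assumptions:
tune.run(
    PytorchTrainable,
    config={"lr": tune.grid_search([0.01, 0.1]), "momentum": 0.9},
    reuse_actors=True,  # triggers reset_config instead of actor restarts
)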
import os

import torch
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel
from ray import tune
# Legacy Ray torch integration; deprecated in later Ray versions.
from ray.tune.integration.torch import distributed_checkpoint_dir

from ray.tune.examples.mnist_pytorch import ConvNet, get_data_loaders, train, test


def train_mnist(config, checkpoint_dir=None):
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    train_loader, test_loader = get_data_loaders()
    model = ConvNet().to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.1)

    # If checkpoint_dir is set, we are resuming from a checkpoint.
    if checkpoint_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"))
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    model = DistributedDataParallel(model)

    for epoch in range(40):
        train(model, optimizer, train_loader, device)
        acc = test(model, test_loader, device)

        if epoch % 3 == 0:
            # `distributed_checkpoint_dir` coordinates checkpointing across
            # the DDP workers.
            with distributed_checkpoint_dir(step=epoch) as checkpoint_dir:
                path = os.path.join(checkpoint_dir, "checkpoint")
                torch.save((model.state_dict(), optimizer.state_dict()), path)
        tune.report(mean_accuracy=acc)
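# A sketch of how this DDP function was typically wrapped for multi-worker
# training with the same (since deprecated) integration module that provides
# `distributed_checkpoint_dir`; the worker count and GPU flag here are
# illustrative assumptions.
from ray.tune.integration.torch import DistributedTrainableCreator

trainable = DistributedTrainableCreator(
    train_mnist,
    num_workers=2,
    use_gpu=True,
)
tune.run(trainable, num_samples=1)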
import os

import torch
import torch.optim as optim
from ray import tune

from ray.tune.examples.mnist_pytorch import ConvNet, get_data_loaders, train, test


# Same Trainable as above, written against the legacy method names
# (_setup/_train/_save/_restore) used by older Ray versions.
class TrainMNIST(tune.Trainable):
    def _setup(self, config):
        use_cuda = config.get("use_gpu") and torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")
        self.train_loader, self.test_loader = get_data_loaders()
        self.model = ConvNet().to(self.device)
        self.optimizer = optim.SGD(
            self.model.parameters(),
            lr=config.get("lr", 0.01),
            momentum=config.get("momentum", 0.9))

    def _train(self):
        train(self.model, self.optimizer, self.train_loader, device=self.device)
        acc = test(self.model, self.test_loader, self.device)
        return {"mean_accuracy": acc}

    def _save(self, checkpoint_dir):
        checkpoint_path = os.path.join(checkpoint_dir, "model.pth")
        torch.save(self.model.state_dict(), checkpoint_path)
        return checkpoint_path

    def _restore(self, checkpoint_path):
        self.model.load_state_dict(torch.load(checkpoint_path))
import os

import torch
import torch.optim as optim
from ray import tune

from ray.tune.examples.mnist_pytorch import ConvNet, get_data_loaders, train, test


class PytorchTrainable(tune.Trainable):
    """Train a PyTorch ConvNet as a Trainable with the
    PopulationBasedTraining scheduler.

    The example reuses some of the functions in mnist_pytorch and is a good
    demo of how to add tuning without changing the original training code.
    """

    def _setup(self, config):
        self.train_loader, self.test_loader = get_data_loaders()
        self.model = ConvNet()
        self.optimizer = optim.SGD(
            self.model.parameters(),
            lr=config.get("lr", 0.01),
            momentum=config.get("momentum", 0.9))

    def _train(self):
        train(self.model, self.optimizer, self.train_loader)
        acc = test(self.model, self.test_loader)
        return {"mean_accuracy": acc}

    def _save(self, checkpoint_dir):
        checkpoint_path = os.path.join(checkpoint_dir, "model.pth")
        torch.save(self.model.state_dict(), checkpoint_path)
        return checkpoint_path

    def _restore(self, checkpoint_path):
        self.model.load_state_dict(torch.load(checkpoint_path))

    def reset_config(self, new_config):
        # Rebuild the optimizer so PBT can swap in mutated hyperparameters
        # without restarting the actor.
        del self.optimizer
        self.optimizer = optim.SGD(
            self.model.parameters(),
            lr=new_config.get("lr", 0.01),
            momentum=new_config.get("momentum", 0.9))
        return True
import os

import torch
import torch.optim as optim
from ray import tune
# The import path for ExportFormat may vary across Ray versions.
from ray.tune.trainable import ExportFormat

from ray.tune.examples.mnist_pytorch import ConvNet, get_data_loaders, train, test


class PytorchTrainable(tune.Trainable):
    """Train a PyTorch ConvNet as a Trainable with the
    PopulationBasedTraining scheduler.

    The example reuses some of the functions in mnist_pytorch and is a good
    demo of how to add tuning without changing the original training code.
    """

    def _setup(self, config):
        self.train_loader, self.test_loader = get_data_loaders()
        self.model = ConvNet()
        self.optimizer = optim.SGD(
            self.model.parameters(),
            lr=config.get("lr", 0.01),
            momentum=config.get("momentum", 0.9))

    def _train(self):
        train(self.model, self.optimizer, self.train_loader)
        acc = test(self.model, self.test_loader)
        return {"mean_accuracy": acc}

    def _save(self, checkpoint_dir):
        checkpoint_path = os.path.join(checkpoint_dir, "model.pth")
        torch.save(self.model.state_dict(), checkpoint_path)
        return checkpoint_path

    def _restore(self, checkpoint_path):
        self.model.load_state_dict(torch.load(checkpoint_path))

    def _export_model(self, export_formats, export_dir):
        if export_formats == [ExportFormat.MODEL]:
            path = os.path.join(export_dir, "exported_convnet.pt")
            torch.save(self.model.state_dict(), path)
            return {export_formats[0]: path}
        else:
            raise ValueError("unexpected formats: " + str(export_formats))

    def reset_config(self, new_config):
        # Mutate the existing optimizer's param groups in place instead of
        # rebuilding it, so optimizer state is preserved across a reset.
        for param_group in self.optimizer.param_groups:
            if "lr" in new_config:
                param_group["lr"] = new_config["lr"]
            if "momentum" in new_config:
                param_group["momentum"] = new_config["momentum"]
        self.config = new_config
        return True
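# A sketch of how the `_export_model` hook above can be triggered when the
# run ends, via the `export_formats` argument of `tune.run`; the stopping
# criterion is an illustrative assumption.
analysis = tune.run(
    PytorchTrainable,
    export_formats=[ExportFormat.MODEL],
    stop={"training_iteration": 20},
    checkpoint_at_end=True,
)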
import os

import torch
import torch.optim as optim
from ray.air import session
from ray.air.checkpoint import Checkpoint

from ray.tune.examples.mnist_pytorch import ConvNet, get_data_loaders, train, test


def train_convnet(config):
    # Create our data loaders, model, and optimizer.
    step = 0
    train_loader, test_loader = get_data_loaders()
    model = ConvNet()
    optimizer = optim.SGD(
        model.parameters(),
        lr=config.get("lr", 0.01),
        momentum=config.get("momentum", 0.9),
    )

    # If `session.get_checkpoint()` is not None, we are resuming from a
    # checkpoint. Load model state and iteration step from the checkpoint.
    if session.get_checkpoint():
        print("Loading from checkpoint.")
        loaded_checkpoint = session.get_checkpoint()
        with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
            path = os.path.join(loaded_checkpoint_dir, "checkpoint.pt")
            checkpoint = torch.load(path)
            model.load_state_dict(checkpoint["model_state_dict"])
            step = checkpoint["step"]

    while True:
        train(model, optimizer, train_loader)
        acc = test(model, test_loader)
        checkpoint = None
        if step % 5 == 0:
            # Every 5 steps, checkpoint our current state.
            # We need a directory under the current working directory to
            # construct an AIR Checkpoint object from.
            os.makedirs("my_model", exist_ok=True)
            torch.save(
                {
                    "step": step,
                    "model_state_dict": model.state_dict(),
                },
                "my_model/checkpoint.pt",
            )
            checkpoint = Checkpoint.from_directory("my_model")
        step += 1
        session.report({"mean_accuracy": acc}, checkpoint=checkpoint)
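# A sketch of how this function might be launched with the Ray AIR `Tuner`
# API and a PBT scheduler; all concrete values (mutation ranges, sample
# counts, stopping criteria) are illustrative assumptions.
import random

from ray import air, tune
from ray.tune.schedulers import PopulationBasedTraining

scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    perturbation_interval=5,  # lines up with the checkpoint period above
    hyperparam_mutations={
        "lr": lambda: random.uniform(0.0001, 1.0),
        "momentum": [0.8, 0.9, 0.99],
    },
)

tuner = tune.Tuner(
    train_convnet,
    run_config=air.RunConfig(stop={"training_iteration": 50}),
    tune_config=tune.TuneConfig(
        metric="mean_accuracy",
        mode="max",
        scheduler=scheduler,
        num_samples=4,
    ),
    param_space={"lr": 0.01, "momentum": 0.9},
)
results = tuner.fit()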
import os

import torch
import torch.optim as optim
from ray import tune

from ray.tune.examples.mnist_pytorch import ConvNet, get_data_loaders, train, test


def train_convnet(config, checkpoint_dir=None):
    # Create our data loaders, model, and optimizer.
    step = 0
    train_loader, test_loader = get_data_loaders()
    model = ConvNet()
    optimizer = optim.SGD(
        model.parameters(),
        lr=config.get("lr", 0.01),
        momentum=config.get("momentum", 0.9),
    )

    # If checkpoint_dir is not None, we are resuming from a checkpoint.
    # Load model state and iteration step from the checkpoint.
    if checkpoint_dir:
        print("Loading from checkpoint.")
        path = os.path.join(checkpoint_dir, "checkpoint")
        checkpoint = torch.load(path)
        model.load_state_dict(checkpoint["model_state_dict"])
        step = checkpoint["step"]

    while True:
        train(model, optimizer, train_loader)
        acc = test(model, test_loader)
        if step % 5 == 0:
            # Every 5 steps, checkpoint our current state.
            # First get the checkpoint directory from Tune.
            with tune.checkpoint_dir(step=step) as checkpoint_dir:
                # Then create a checkpoint file in this directory.
                path = os.path.join(checkpoint_dir, "checkpoint")
                # Save state to the checkpoint file.
                # No need to save the optimizer for SGD.
                torch.save(
                    {
                        "step": step,
                        "model_state_dict": model.state_dict(),
                        "mean_accuracy": acc,
                    },
                    path,
                )
        step += 1
        tune.report(mean_accuracy=acc)
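# When this function is driven by PBT, its checkpoint period (every 5 steps
# above) should line up with the scheduler's `perturbation_interval`.
# A sketch with the legacy `tune.run` API; mutation ranges and sample counts
# are illustrative assumptions.
import random

from ray.tune.schedulers import PopulationBasedTraining

scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="mean_accuracy",
    mode="max",
    perturbation_interval=5,
    hyperparam_mutations={
        "lr": lambda: random.uniform(0.0001, 1.0),
        "momentum": [0.8, 0.9, 0.99],
    },
)

tune.run(
    train_convnet,
    scheduler=scheduler,
    num_samples=4,
    stop={"training_iteration": 50},
)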