def train_mnist(config, checkpoint_dir=None):
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    train_loader, test_loader = get_data_loaders()
    model = ConvNet().to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.1)

    # If a checkpoint directory is passed in, restore model and optimizer state.
    if checkpoint_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"))
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    # Wrap the model for distributed data-parallel training.
    model = DistributedDataParallel(model)

    for epoch in range(40):
        train(model, optimizer, train_loader, device)
        acc = test(model, test_loader, device)

        if epoch % 3 == 0:
            # Every 3 epochs, save model and optimizer state to a Tune checkpoint.
            with distributed_checkpoint_dir(step=epoch) as checkpoint_dir:
                path = os.path.join(checkpoint_dir, "checkpoint")
                torch.save((model.state_dict(), optimizer.state_dict()), path)
        tune.report(mean_accuracy=acc)
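# How a DDP trainable like the one above is typically launched: it is wrapped
# with DistributedTrainableCreator so each trial runs across multiple workers.
# This is only a sketch; the worker count, sample count, and stop condition are
# assumptions, not part of the original snippet.
from ray import tune
from ray.tune.integration.torch import DistributedTrainableCreator

trainable_cls = DistributedTrainableCreator(train_mnist, num_workers=2)
analysis = tune.run(
    trainable_cls,
    num_samples=4,
    stop={"training_iteration": 10},
    metric="mean_accuracy",
    mode="max",
)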
def _train(self):
    train(self.model, self.optimizer, self.train_loader, device=self.device)
    acc = test(self.model, self.test_loader, self.device)
    return {"mean_accuracy": acc}
def step(self):
    self.current_ip()
    train(self.model, self.optimizer, self.train_loader, device=self.device)
    acc = test(self.model, self.test_loader, self.device)
    return {"mean_accuracy": acc}
def train_mnist(config):
    train_loader, test_loader = get_data_loaders()
    model = ConvNet()
    optimizer = optim.SGD(model.parameters(), lr=config["lr"])
    for i in range(10):
        train(model, optimizer, train_loader)
        acc = test(model, test_loader)
        tune.track.log(mean_accuracy=acc)
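# A minimal, illustrative way to launch the trainable above with Tune's legacy
# function API; the grid of learning rates and the metric/mode arguments are
# assumptions, not part of the original snippet.
from ray import tune

analysis = tune.run(
    train_mnist,
    config={"lr": tune.grid_search([0.001, 0.01, 0.1])})

print("Best config:",
      analysis.get_best_config(metric="mean_accuracy", mode="max"))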
def test_best_model(analysis):
    """Test the best model given the output of tune.run."""
    best_checkpoint_path = analysis.best_checkpoint
    best_model = ConvNet()
    best_checkpoint = torch.load(
        os.path.join(best_checkpoint_path, "checkpoint"))
    best_model.load_state_dict(best_checkpoint["model_state_dict"])
    # Note that `test` only runs on a small random subset of the test data, so the
    # accuracy may differ from the metrics reported during tuning.
    test_acc = test(best_model, get_data_loaders()[1])
    print("best model accuracy: ", test_acc)
def train_mnist(config):
    train_loader, test_loader = get_data_loaders()
    print("train num: ", len(train_loader))
    print("test num: ", len(test_loader))
    model = ConvNet()
    optimizer = optim.SGD(model.parameters(), lr=config["lr"])
    for i in range(30):
        train(model, optimizer, train_loader)
        acc = test(model, test_loader)
        tune.report(mean_accuracy=acc)  # added line: report the metric to Tune
def train_mnist(config):
    model = ConvNet()
    train_loader, test_loader = get_data_loaders()
    optimizer = optim.SGD(
        model.parameters(), lr=config["lr"], momentum=config["momentum"])
    for i in range(10):
        train(model, optimizer, train_loader)
        acc = test(model, test_loader)
        track.log(mean_accuracy=acc)

        if i % 5 == 0:
            # This saves the model to the trial directory
            torch.save(model, "./model.pth")
def train_mnist(config):
    model = ConvNet()
    train_loader, test_loader = get_data_loaders()
    optimizer = optim.SGD(
        model.parameters(), lr=config["lr"], momentum=config["momentum"])
    for i in range(20):
        train(model, optimizer, train_loader)  # Train for 1 epoch
        acc = test(model, test_loader)  # Obtain validation accuracy.
        tune.track.log(mean_accuracy=acc)  # Report the metric to Tune.
        if i % 5 == 0:
            # This saves the model to the trial directory
            torch.save(model, "./model.pth")
def train_convnet(config):
    # Create our data loaders, model, and optimizer.
    step = 0
    train_loader, test_loader = get_data_loaders()
    model = ConvNet()
    optimizer = optim.SGD(
        model.parameters(),
        lr=config.get("lr", 0.01),
        momentum=config.get("momentum", 0.9),
    )

    # If `session.get_checkpoint()` is not None, then we are resuming from a checkpoint.
    # Load model state and iteration step from checkpoint.
    if session.get_checkpoint():
        print("Loading from checkpoint.")
        loaded_checkpoint = session.get_checkpoint()
        with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
            path = os.path.join(loaded_checkpoint_dir, "checkpoint.pt")
            checkpoint = torch.load(path)
            model.load_state_dict(checkpoint["model_state_dict"])
            step = checkpoint["step"]

    while True:
        train(model, optimizer, train_loader)
        acc = test(model, test_loader)
        checkpoint = None
        if step % 5 == 0:
            # Every 5 steps, checkpoint our current state.
            # We need to create a directory under the current working directory
            # to construct an AIR Checkpoint object from.
            os.makedirs("my_model", exist_ok=True)
            torch.save(
                {
                    "step": step,
                    "model_state_dict": model.state_dict(),
                },
                "my_model/checkpoint.pt",
            )
            checkpoint = Checkpoint.from_directory("my_model")
        step += 1
        session.report({"mean_accuracy": acc}, checkpoint=checkpoint)
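# Rough sketch of launching the session-based trainable above through the Tuner
# API; the search space, stop condition, and sample count are assumptions, not
# part of the original snippet.
from ray import air, tune

tuner = tune.Tuner(
    train_convnet,
    param_space={
        "lr": tune.uniform(0.001, 1),
        "momentum": tune.uniform(0.001, 1),
    },
    tune_config=tune.TuneConfig(metric="mean_accuracy", mode="max", num_samples=4),
    run_config=air.RunConfig(stop={"training_iteration": 20}),
)
results = tuner.fit()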
def train_convnet(config, checkpoint_dir=None):
    # Create our data loaders, model, and optimizer.
    step = 0
    train_loader, test_loader = get_data_loaders()
    model = ConvNet()
    optimizer = optim.SGD(
        model.parameters(),
        lr=config.get("lr", 0.01),
        momentum=config.get("momentum", 0.9),
    )

    # If checkpoint_dir is not None, then we are resuming from a checkpoint.
    # Load model state and iteration step from checkpoint.
    if checkpoint_dir:
        print("Loading from checkpoint.")
        path = os.path.join(checkpoint_dir, "checkpoint")
        checkpoint = torch.load(path)
        model.load_state_dict(checkpoint["model_state_dict"])
        step = checkpoint["step"]

    while True:
        train(model, optimizer, train_loader)
        acc = test(model, test_loader)
        if step % 5 == 0:
            # Every 5 steps, checkpoint our current state.
            # First get the checkpoint directory from tune.
            with tune.checkpoint_dir(step=step) as checkpoint_dir:
                # Then create a checkpoint file in this directory.
                path = os.path.join(checkpoint_dir, "checkpoint")
                # Save state to checkpoint file.
                # No need to save optimizer for SGD.
                torch.save(
                    {
                        "step": step,
                        "model_state_dict": model.state_dict(),
                        "mean_accuracy": acc,
                    },
                    path,
                )
        step += 1
        tune.report(mean_accuracy=acc)
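# Because the trainable above loops indefinitely and checkpoints every 5 steps,
# it is typically paired with a scheduler such as Population Based Training.
# This is only a sketch; the mutation ranges, sample count, and stop condition
# are assumptions.
from ray import tune
from ray.tune.schedulers import PopulationBasedTraining

scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    perturbation_interval=5,  # matches the 5-step checkpoint cadence above
    hyperparam_mutations={
        "lr": tune.uniform(0.001, 1),
        "momentum": tune.uniform(0.001, 1),
    },
)

analysis = tune.run(
    train_convnet,
    metric="mean_accuracy",
    mode="max",
    scheduler=scheduler,
    num_samples=4,
    stop={"training_iteration": 20},
)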
@wandb_mixin
def train_mnist(config):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_loader, test_loader = get_data_loaders()
    model = ConvNet()
    model.to(device)

    optimizer = optim.SGD(
        model.parameters(), lr=config["lr"], momentum=config["momentum"])

    for _i in range(10):
        train(model, optimizer, train_loader, device=device)
        acc = test(model, test_loader, device=device)

        # When using WandbLogger, the metrics reported to Tune are also logged
        # in the W&B dashboard.
        tune.report(mean_accuracy=acc)

        # @wandb_mixin enables logging custom metrics using wandb.log().
        error_rate = 100 * (1 - acc)
        wandb.log({"error_rate": error_rate})
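# A hedged launch sketch for the wandb-enabled trainable above. The project
# name, search space, and sample count are illustrative assumptions.
from ray import tune
from ray.tune.integration.wandb import wandb_mixin  # decorator used by train_mnist

analysis = tune.run(
    train_mnist,
    metric="mean_accuracy",
    mode="max",
    num_samples=4,
    config={
        "lr": tune.loguniform(1e-4, 1e-1),
        "momentum": tune.uniform(0.1, 0.9),
        # @wandb_mixin reads its settings from the "wandb" entry of the config.
        "wandb": {"project": "my_mnist_project"},
    },
)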
stopper = CustomStopper()

analysis = tune.run(
    PytorchTrainble,
    name="pbt_test",
    scheduler=scheduler,
    reuse_actors=True,
    verbose=1,
    stop=stopper,
    export_formats=[ExportFormat.MODEL],
    checkpoint_score_attr="mean_accuracy",
    checkpoint_freq=5,
    keep_checkpoints_num=4,
    num_samples=4,
    config={
        "lr": tune.uniform(0.001, 1),
        "momentum": tune.uniform(0.001, 1),
    })

best_trial = analysis.get_best_trial("mean_accuracy")
# get_trial_checkpoints_paths returns (path, metric) pairs; pick the checkpoint
# with the highest mean_accuracy.
best_checkpoint = max(
    analysis.get_trial_checkpoints_paths(best_trial, "mean_accuracy"),
    key=lambda item: item[1])
restored_trainable = PytorchTrainble()
restored_trainable.restore(best_checkpoint[0])
best_model = restored_trainable.model
# Note that `test` only runs on a small random subset of the test data, so the
# accuracy may differ from the metrics reported during tuning.
test_acc = test(best_model, get_data_loaders()[1])
print("best model accuracy: ", test_acc)
def step(self):
    train(self.model, self.optimizer, self.train_loader)
    acc = test(self.model, self.test_loader)
    return {"mean_accuracy": acc}