示例#1
0
def train_mnist(config, checkpoint_dir=False):
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    train_loader, test_loader = get_data_loaders()
    model = ConvNet().to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.1)

    if checkpoint_dir:
        with open(os.path.join(checkpoint_dir, "checkpoint")) as f:
            model_state, optimizer_state = torch.load(f)

        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    model = DistributedDataParallel(model)

    for epoch in range(40):
        train(model, optimizer, train_loader, device)
        acc = test(model, test_loader, device)

        if epoch % 3 == 0:
            with distributed_checkpoint_dir(step=epoch) as checkpoint_dir:
                path = os.path.join(checkpoint_dir, "checkpoint")
                torch.save((model.state_dict(), optimizer.state_dict()), path)
        tune.report(mean_accuracy=acc)
 def _train(self):
     train(self.model,
           self.optimizer,
           self.train_loader,
           device=self.device)
     acc = test(self.model, self.test_loader, self.device)
     return {"mean_accuracy": acc}
示例#3
0
 def step(self):
     self.current_ip()
     train(self.model,
           self.optimizer,
           self.train_loader,
           device=self.device)
     acc = test(self.model, self.test_loader, self.device)
     return {"mean_accuracy": acc}
示例#4
0
文件: example.py 项目: zuoxiaolei/ray
def train_mnist(config):
    train_loader, test_loader = get_data_loaders()
    model = ConvNet()
    optimizer = optim.SGD(model.parameters(), lr=config["lr"])
    for i in range(10):
        train(model, optimizer, train_loader)
        acc = test(model, test_loader)
        tune.track.log(mean_accuracy=acc)
示例#5
0
def test_best_model(analysis):
    """Test the best model given output of tune.run"""
    best_checkpoint_path = analysis.best_checkpoint
    best_model = ConvNet()
    best_checkpoint = torch.load(os.path.join(best_checkpoint_path, "checkpoint"))
    best_model.load_state_dict(best_checkpoint["model_state_dict"])
    # Note that test only runs on a small random set of the test data, thus the
    # accuracy may be different from metrics shown in tuning process.
    test_acc = test(best_model, get_data_loaders()[1])
    print("best model accuracy: ", test_acc)
def train_mnist(config):
    train_loader, test_loader = get_data_loaders()
    print("train num: ", len(train_loader))
    print("test num: ", len(test_loader))
    model = ConvNet()
    optimizer = optim.SGD(model.parameters(), lr=config["lr"])
    for i in range(30):
        train(model, optimizer, train_loader)
        acc = test(model, test_loader)
        tune.report(mean_accuracy=acc)  # 添加的代码
示例#7
0
def train_mnist(config):
    model = ConvNet()
    train_loader, test_loader = get_data_loaders()
    optimizer = optim.SGD(
        model.parameters(), lr=config["lr"], momentum=config["momentum"])
    for i in range(10):
        train(model, optimizer, train_loader)
        acc = test(model, test_loader)
        track.log(mean_accuracy=acc)
        if i % 5 == 0:
            # This saves the model to the trial directory
            torch.save(model, "./model.pth")
示例#8
0
def train_mnist(config):
    model = ConvNet()
    train_loader, test_loader = get_data_loaders()

    optimizer = optim.SGD(model.parameters(),
                          lr=config["lr"],
                          momentum=config["momentum"])

    for i in range(20):
        train(model, optimizer, train_loader)  # Train for 1 epoch
        acc = test(model, test_loader)  # Obtain validation accuracy.
        tune.track.log(mean_accuracy=acc)  # here
        if i % 5 == 0:
            torch.save(
                model,
                "./model.pth")  # This saves the model to the trial directory
def train_convnet(config):
    # Create our data loaders, model, and optmizer.
    step = 0
    train_loader, test_loader = get_data_loaders()
    model = ConvNet()
    optimizer = optim.SGD(
        model.parameters(),
        lr=config.get("lr", 0.01),
        momentum=config.get("momentum", 0.9),
    )

    # If `session.get_checkpoint()` is not None, then we are resuming from a checkpoint.
    # Load model state and iteration step from checkpoint.
    if session.get_checkpoint():
        print("Loading from checkpoint.")
        loaded_checkpoint = session.get_checkpoint()
        with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
            path = os.path.join(loaded_checkpoint_dir, "checkpoint.pt")
            checkpoint = torch.load(path)
            model.load_state_dict(checkpoint["model_state_dict"])
            step = checkpoint["step"]

    while True:
        train(model, optimizer, train_loader)
        acc = test(model, test_loader)
        checkpoint = None
        if step % 5 == 0:
            # Every 5 steps, checkpoint our current state.
            # First get the checkpoint directory from tune.
            # Need to create a directory under current working directory
            # to construct an AIR Checkpoint object from.
            os.makedirs("my_model", exist_ok=True)
            torch.save(
                {
                    "step": step,
                    "model_state_dict": model.state_dict(),
                },
                "my_model/checkpoint.pt",
            )
            checkpoint = Checkpoint.from_directory("my_model")

        step += 1
        session.report({"mean_accuracy": acc}, checkpoint=checkpoint)
示例#10
0
def train_convnet(config, checkpoint_dir=None):
    # Create our data loaders, model, and optmizer.
    step = 0
    train_loader, test_loader = get_data_loaders()
    model = ConvNet()
    optimizer = optim.SGD(
        model.parameters(),
        lr=config.get("lr", 0.01),
        momentum=config.get("momentum", 0.9),
    )

    # If checkpoint_dir is not None, then we are resuming from a checkpoint.
    # Load model state and iteration step from checkpoint.
    if checkpoint_dir:
        print("Loading from checkpoint.")
        path = os.path.join(checkpoint_dir, "checkpoint")
        checkpoint = torch.load(path)
        model.load_state_dict(checkpoint["model_state_dict"])
        step = checkpoint["step"]

    while True:
        train(model, optimizer, train_loader)
        acc = test(model, test_loader)
        if step % 5 == 0:
            # Every 5 steps, checkpoint our current state.
            # First get the checkpoint directory from tune.
            with tune.checkpoint_dir(step=step) as checkpoint_dir:
                # Then create a checkpoint file in this directory.
                path = os.path.join(checkpoint_dir, "checkpoint")
                # Save state to checkpoint file.
                # No need to save optimizer for SGD.
                torch.save(
                    {
                        "step": step,
                        "model_state_dict": model.state_dict(),
                        "mean_accuracy": acc,
                    },
                    path,
                )
        step += 1
        tune.report(mean_accuracy=acc)
示例#11
0
def train_mnist(config):

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    train_loader, test_loader = get_data_loaders()

    model = ConvNet()
    model.to(device)

    optimizer = optim.SGD(model.parameters(),
                          lr=config["lr"],
                          momentum=config["momentum"])

    for _i in range(10):
        train(model, optimizer, train_loader, device=device)
        acc = test(model, test_loader, device=device)

        # When using WandbLogger, the metrics reported to tune are also logged in the W&B dashboard
        tune.report(mean_accuracy=acc)

        # @wandb_mixin enables logging custom metric using wandb.log()
        error_rate = 100 * (1 - acc)
        wandb.log({"error_rate": error_rate})
    stopper = CustomStopper()

    analysis = tune.run(
        PytorchTrainble,
        name="pbt_test",
        scheduler=scheduler,
        reuse_actors=True,
        verbose=1,
        stop=stopper,
        export_formats=[ExportFormat.MODEL],
        checkpoint_score_attr="mean_accuracy",
        checkpoint_freq=5,
        keep_checkpoints_num=4,
        num_samples=4,
        config={
            "lr": tune.uniform(0.001, 1),
            "momentum": tune.uniform(0.001, 1),
        })
    # __tune_end__

    best_trial = analysis.get_best_trial("mean_accuracy")
    best_checkpoint = max(
        analysis.get_trial_checkpoints_paths(best_trial, "mean_accuracy"))
    restored_trainable = PytorchTrainble()
    restored_trainable.restore(best_checkpoint[0])
    best_model = restored_trainable.model
    # Note that test only runs on a small random set of the test data, thus the
    # accuracy may be different from metrics shown in tuning process.
    test_acc = test(best_model, get_data_loaders()[1])
    print("best model accuracy: ", test_acc)
 def step(self):
     train(self.model, self.optimizer, self.train_loader)
     acc = test(self.model, self.test_loader)
     return {"mean_accuracy": acc}