예제 #1
0
    def __init__(self,
                 args,
                 model,
                 optims,
                 loss,
                 grad_accum_count=1,
                 n_gpu=1,
                 gpu_rank=1,
                 report_manager=None):
        """Store the training configuration and prepare the model.

        Args:
            args: Options object; ``args.save_checkpoint_steps`` is read
                and cached on the instance.
            model: Model to train. When truthy it is switched to training
                mode and handed to ``log_model`` as "pytorch-model".
            optims: Optimizer object(s), stored unchanged.
            loss: Loss object, stored unchanged.
            grad_accum_count: Stored unchanged; must be greater than zero.
            n_gpu: Stored unchanged.
            gpu_rank: Stored unchanged.
            report_manager: Stored unchanged; may be ``None``.

        Raises:
            AssertionError: If ``grad_accum_count`` is not greater than zero.
        """
        # Options, plus the one setting that is read off them eagerly.
        self.args = args
        self.save_checkpoint_steps = args.save_checkpoint_steps

        # Training collaborators.
        self.model = model
        self.optims = optims
        self.loss = loss
        self.report_manager = report_manager

        # Accumulation / device bookkeeping.
        self.grad_accum_count = grad_accum_count
        self.n_gpu = n_gpu
        self.gpu_rank = gpu_rank

        assert grad_accum_count > 0

        # Nothing more to do when no (truthy) model was supplied.
        if not model:
            return

        # Put the model in training mode, then log it.
        self.model.train()
        log_model(self.model, "pytorch-model")
예제 #2
0
def train_model(num_epochs=20,
                batch_size=64,
                num_features=20,
                learning_rate=0.0001):
    """Train a ``LinearVAE`` and record the run via the tracking helpers.

    The data comes from ``load_data()``; the model is validated once before
    training (epoch 0) and then fitted and validated once per epoch. The
    validation loss of every epoch is logged as the ``validation_loss``
    metric. After training, the contents of ``OUTPUT_FOLDER_PATH`` and the
    trained model are logged as well.

    Args:
        num_epochs: Number of fit/validate rounds to run.
        batch_size: Batch size used for both data loaders.
        num_features: Passed to the ``LinearVAE`` constructor and logged as
            a run parameter.
        learning_rate: Forwarded to ``fit`` and logged as a run parameter.

    Returns:
        None.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_data, val_data = load_data()

    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

    model = LinearVAE(num_features).to(device)
    criterion = nn.BCELoss(reduction="sum")

    # exist_ok=True avoids the check-then-create race of exists() + mkdir()
    # and also creates any missing parent directories.
    os.makedirs(OUTPUT_FOLDER_PATH, exist_ok=True)

    with start_run():
        # Record the hyperparameters up front so they are kept even if
        # training fails part-way through.
        log_param("num_features", num_features)
        log_param("learning_rate", learning_rate)

        # Baseline validation pass (epoch 0) before any training; its
        # return value is not used.
        validate(model, val_loader, 0, criterion, device)

        for epoch in range(1, num_epochs + 1):
            logging.info("Epoch %s/%s", epoch, num_epochs)
            train_epoch_loss = fit(
                model,
                train_loader,
                criterion,
                learning_rate,
                device,
            )
            val_epoch_loss = validate(model, val_loader, epoch, criterion,
                                      device)
            log_metric("validation_loss", val_epoch_loss)
            logging.info("Train Loss: %s", train_epoch_loss)
            logging.info("Val Loss: %s \n", val_epoch_loss)

        # NOTE(review): num_epochs is a configuration value but is recorded
        # as a metric; kept as-is so existing dashboards keep working.
        log_metric("num_epochs", num_epochs)
        log_artifacts(OUTPUT_FOLDER_PATH, ARTIFACTS_PATH)
        log_model(model, "LinearVAE")
예제 #3
0
 def log_torch_model(self, model):
     """Log ``model`` under the "models" path within the run ``self.run_id``.

     The run is opened with ``mlflow.start_run`` and used as a context
     manager around the single ``pytorch.log_model`` call.
     """
     # NOTE(review): `pytorch` is presumably `mlflow.pytorch` - confirm
     # against this file's imports.
     run = mlflow.start_run(run_id=self.run_id)
     with run:
         pytorch.log_model(model, "models")
예제 #4
0
    # Save checkpoint based on val
    if acc_vu > current_acc:
        torch.save({'model': net.state_dict(), 'acc_vu': acc_vu, "preds_vu": preds_vu, "softs_vu": softs_vu, "labs_vu": labs_vu, 'epoch': step}, ckpt)
        current_acc = acc_vu

    return current_acc

# =================================================================================================================================
# training and logging

# Select the experiment that the run below is recorded under.
mlflow.set_experiment(args.experiment)


# The `with` block ends the run on exit (including on error), so no explicit
# mlflow.end_run() call is needed inside it.
with mlflow.start_run(run_name=args.experiment_id):
    # Log every entry of vars(args) as a run parameter, except the keys
    # listed in no_log_keys.
    for _k, _v in vars(args).items():
        if _k not in no_log_keys:
            mlflow.log_param(_k, _v)

    # Train for n_epochs, threading the best accuracy through each call,
    # and record the wall-clock duration of the whole loop.
    start_time = time()
    for epoch in tqdm(range(args.n_epochs), desc="epochs", disable=args.no_pbar):
        best_acc = train(vu_loader, vu_loader_test, epoch, best_acc)
    mlflow.log_metric("duration", time() - start_time)

    if args.tensorboard:
        print("Uploading TensorBoard events as a run artifact...")
        mlflow.log_artifacts(log_path)

    # Log the network last, once all epochs have run.
    log_model(net, "model")
예제 #5
0
 def log_model(self):
     """Log ``self.model.network`` as 'trained-model' within the active run.

     The object returned by ``active_run()`` is used as a context manager
     around the ``pytorch.log_model`` call.
     """
     # TODO(review): the original note mentioned handling the model's
     # CPU/GPU device (moving it and restoring it); nothing is done yet.
     # NOTE(review): if `active_run` is MLflow's, it returns None when no
     # run is active, and leaving the `with` block ends the run - confirm
     # that both are intended.
     current_run = active_run()
     with current_run:
         pytorch.log_model(self.model.network, 'trained-model')
예제 #6
0
'''
Let's see how mlflow's log_model works
'''
import mlflow as mlf
from mlflow.pytorch import log_model
import torch


class DummyModel(torch.nn.Module):
    """Minimal module with a single 10-to-10 linear layer.

    It exists only so that there is something to pass to ``log_model``.
    """

    def __init__(self):
        """Create the module and its one ``linear`` layer."""
        super().__init__()
        self.linear = torch.nn.Linear(in_features=10, out_features=10)

    def forward(self, x):
        """Return ``x`` passed through the linear layer."""
        projected = self.linear(x)
        return projected


if __name__ == "__main__":
    # Everything below is recorded under the "Playground" experiment.
    mlf.set_experiment(experiment_name="Playground")

    with mlf.start_run(run_name="Testing mlflow log_model"):
        # One placeholder param and one placeholder metric for the run.
        mlf.log_param(key="Hello", value="World")
        mlf.log_metric(key="a", value=0)

        # Log a freshly constructed dummy network under "models".
        dummy = DummyModel()
        log_model(dummy, "models")