def __init__(self, args, model, optims, loss, grad_accum_count=1, n_gpu=1, gpu_rank=1, report_manager=None):
    """Build the trainer, storing configuration and collaborators.

    Args:
        args: parsed CLI/config namespace; ``args.save_checkpoint_steps`` is read here.
        model: the network to train (may be falsy to skip train-mode/logging).
        optims: optimizer(s) used by the training loop.
        loss: loss computation object.
        grad_accum_count (int): number of gradient-accumulation steps; must be > 0.
        n_gpu (int): number of GPUs in use.
        gpu_rank (int): rank of this process's GPU.
        report_manager: optional reporting/stats helper.

    Raises:
        ValueError: if ``grad_accum_count`` is not positive.
    """
    # Basic attributes.
    self.args = args
    self.save_checkpoint_steps = args.save_checkpoint_steps
    self.model = model
    self.optims = optims
    self.grad_accum_count = grad_accum_count
    self.n_gpu = n_gpu
    self.gpu_rank = gpu_rank
    self.report_manager = report_manager
    self.loss = loss
    # Validate with a real exception: `assert` is stripped under `python -O`.
    if grad_accum_count <= 0:
        raise ValueError(
            "grad_accum_count must be > 0, got %r" % (grad_accum_count,)
        )
    # Set model in training mode and record it with MLflow when one is provided.
    if model:
        self.model.train()
        log_model(self.model, "pytorch-model")
def train_model(num_epochs=20, batch_size=64, num_features=20, learning_rate=0.0001):
    """Train a LinearVAE and track the run with MLflow.

    Args:
        num_epochs (int): number of training epochs.
        batch_size (int): mini-batch size for both loaders.
        num_features (int): latent-feature count passed to ``LinearVAE``.
        learning_rate (float): learning rate forwarded to ``fit``.

    Side effects: creates ``OUTPUT_FOLDER_PATH`` if missing, and logs
    params/metrics/artifacts and the final model to the active MLflow run.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_data, val_data = load_data()
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
    model = LinearVAE(num_features).to(device)
    # Summed BCE reconstruction loss (standard for VAE training).
    criterion = nn.BCELoss(reduction="sum")
    # makedirs(exist_ok=True) avoids the exists()/mkdir() check-then-act race
    # and also creates missing parent directories.
    os.makedirs(OUTPUT_FOLDER_PATH, exist_ok=True)
    with start_run():
        # Baseline validation pass before any training (epoch 0).
        validate(model, val_loader, 0, criterion, device)
        val_epoch_loss = np.inf
        for epoch in range(1, num_epochs + 1):
            logging.info("Epoch %s/%s", epoch, num_epochs)
            train_epoch_loss = fit(
                model,
                train_loader,
                criterion,
                learning_rate,
                device,
            )
            val_epoch_loss = validate(model, val_loader, epoch, criterion, device)
            log_metric("validation_loss", val_epoch_loss)
            logging.info(
                "Train Loss: %s",
                train_epoch_loss,
            )
            # Fixed: was an f-string (`f"Val Loss: %s \n"`) used as a lazy
            # %-format template — the `f` prefix was misleading; plain string
            # keeps logging's deferred formatting.
            logging.info("Val Loss: %s \n", val_epoch_loss)
        log_param("num_features", num_features)
        log_param("learning_rate", learning_rate)
        log_metric("num_epochs", num_epochs)
        log_artifacts(OUTPUT_FOLDER_PATH, ARTIFACTS_PATH)
        log_model(model, "LinearVAE")
def log_torch_model(self, model):
    """Log *model* as a PyTorch artifact named ``models`` under the run
    identified by ``self.run_id``."""
    # Resume (or open) the tracked run, then attach the model artifact to it.
    run_context = mlflow.start_run(self.run_id)
    with run_context:
        pytorch.log_model(model, "models")
# Save checkpoint based on val if acc_vu > current_acc: torch.save({'model': net.state_dict(), 'acc_vu': acc_vu, "preds_vu": preds_vu, "softs_vu": softs_vu, "labs_vu": labs_vu, 'epoch': step}, ckpt) current_acc = acc_vu return current_acc # ================================================================================================================================= # training and logging mlflow.set_experiment(args.experiment) with mlflow.start_run(run_name=args.experiment_id): for _k, _v in vars(args).items(): if _k not in no_log_keys: mlflow.log_param(_k, _v) start_time = time() for epoch in tqdm(range(args.n_epochs), desc="epochs", disable=args.no_pbar): best_acc = train(vu_loader, vu_loader_test, epoch, best_acc) mlflow.log_metric("duration", time() - start_time) if args.tensorboard: print("Uploading TensorBoard events as a run artifact...") mlflow.log_artifacts(log_path) log_model(net, "model") mlflow.end_run()
def log_model(self):
    """Log the wrapped network (``self.model.network``) to MLflow under the
    artifact path ``trained-model``.

    NOTE(review): ``active_run()`` requires that a run is *already* active —
    it returns ``None`` otherwise, which would fail as a context manager.
    Confirm callers start a run first, or consider ``start_run()`` as used by
    the sibling snippets in this file.
    """
    # management of device cpu and gpu go back
    # device = self.model.device
    with active_run():
        pytorch.log_model(self.model.network, 'trained-model')
"""Minimal demonstration of ``mlflow.pytorch.log_model`` on a toy module."""
import mlflow as mlf
from mlflow.pytorch import log_model
import torch


class DummyModel(torch.nn.Module):
    """A single 10->10 linear layer, used purely as a loggable artifact."""

    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(10, 10)

    def forward(self, x):
        return self.linear(x)


def _main():
    """Run one throwaway MLflow run that logs a param, a metric and a model."""
    mlf.set_experiment("Playground")
    with mlf.start_run(run_name="Testing mlflow log_model"):
        mlf.log_param("Hello", "World")
        mlf.log_metric("a", 0)
        model = DummyModel()
        log_model(model, "models")


if __name__ == "__main__":
    _main()