def train_and_evaluate(model, data_iter_tr, data_iter_ts, loss_fn, optimizer,
                       metrics, exp_dir):
    """Run the training loop, dumping encodings/metrics at checkpoints,
    then evaluate on the test iterator.

    Checkpoints fire when the mean epoch loss drops below 0.1 or on the
    final epoch. Returns (mrn, encoded, encoded_avg, test_metrics) from
    the final evaluation pass.
    """
    losses = []
    n_epoch = ut.model_param['num_epochs']
    for epoch in range(1, n_epoch + 1):
        print('Epoch {0} of {1}'.format(epoch, n_epoch))
        start = time()
        mrn, encoded, encoded_avg, loss_mean = train(
            model, optimizer, loss_fn, data_iter_tr)
        print('-- time = ', round(time() - start, 3))
        print('-- mean loss: {0}'.format(round(loss_mean, 3)))
        losses.append(loss_mean)

        hit_threshold = loss_mean < 0.1
        final_epoch = epoch == n_epoch
        if hit_threshold or final_epoch:
            # Averaged encodings: one row per MRN.
            with open(os.path.join(exp_dir, 'TRconvae-avg_vect.csv'), 'w') as f:
                wr = csv.writer(f)
                for m, e in zip(mrn, encoded_avg):
                    wr.writerow([m] + e)
            # Raw encodings: one row per encoded vector, MRN repeated.
            with open(os.path.join(exp_dir, 'TRconvae_vect.csv'), 'w') as f:
                wr = csv.writer(f)
                for m, evs in zip(mrn, encoded):
                    for e in evs:
                        wr.writerow([m] + e)
            with open(os.path.join(exp_dir, 'TRmetrics.txt'), 'w') as f:
                f.write('Mean Loss: %.3f\n' % loss_mean)
            # Loss history accumulated so far, one row per epoch.
            with open(os.path.join(exp_dir, 'TRlosses.csv'), 'w') as f:
                wr = csv.writer(f)
                wr.writerow(['Epoch', 'Loss'])
                for idx, l in enumerate(losses):
                    wr.writerow([idx, l])
            print('\nFound new best model at epoch {0}'.format(epoch))
            ut.save_best_model(epoch, model, optimizer, loss_mean, exp_dir)

    print('\nEvaluating the model')
    mrn, encoded, encoded_avg, test_metrics = evaluate(
        model, loss_fn, data_iter_ts, metrics, best_eval=True)
    return mrn, encoded, encoded_avg, test_metrics
def train_and_evaluate(model, data_iterator, loss_fn, optimizer, model_dir,
                       metrics, experiment_folder):
    """Train for the configured number of epochs, then evaluate.

    Encoded vectors, MRNs, and the mean loss are written to
    ``experiment_folder`` whenever the loss drops below 0.001 or on the
    final epoch.

    Args:
        model: autoencoder to train.
        data_iterator: iterator yielding training batches.
        loss_fn: reconstruction loss function.
        optimizer: torch optimizer for ``model``.
        model_dir: kept for interface compatibility; not used here.
        metrics: metric callables forwarded to ``evaluate``.
        experiment_folder: output directory for the TR* artifacts.

    Returns:
        (mrn, encoded, test_metrics) from the final evaluation pass.
    """
    # NOTE: removed large blocks of commented-out dead code (accuracy-based
    # best-model tracking and an alternate save_best_model call) that
    # obscured the live logic; behavior is unchanged.
    num_epochs = model_pars['num_epochs']
    for epoch in range(num_epochs):
        print("Epoch {0} of {1}".format(epoch, num_epochs))
        mrn, encoded, loss_mean = train(model, optimizer, loss_fn,
                                        data_iterator)
        print("Mean loss: {0}, epoch {1}".format(loss_mean, epoch))

        # Checkpoint on a sufficiently low loss, or always on the last epoch.
        is_best = loss_mean < 0.001
        if is_best or epoch == (num_epochs - 1):
            with open(experiment_folder + '/TRencoded_vect.csv', 'w') as f:
                wr = csv.writer(f, delimiter=',')
                for e in encoded:
                    wr.writerow(e)
            with open(experiment_folder + '/TRmrns.csv', 'w') as f:
                wr = csv.writer(f, delimiter=',')
                for m in mrn:
                    wr.writerow([m])
            with open(experiment_folder + '/TRmetrics.txt', 'w') as f:
                wr = csv.writer(f, delimiter='\t')
                wr.writerow(["Mean loss:", loss_mean])
            print("-- Found new best at epoch {0}".format(epoch))
            utils.save_best_model(model, experiment_folder)

    print("Evaluating the model...")
    mrn, encoded, test_metrics = evaluate(model, loss_fn, data_iterator,
                                          metrics, best_eval=True)
    return mrn, encoded, test_metrics
def train(net, criterion, args, experiment_dir, train_loader, valid_loader):
    """Train ``net``, checkpointing whenever validation TPR improves.

    Training resumes from ``args.nb_epochs_complete`` (tracked in case a
    run was interrupted) and stops early once the learning rate decays
    below 1e-6.
    """
    optimizer = torch.optim.Adamax(net.parameters(), lr=args.lrate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max')

    for epoch in range(args.nb_epochs_complete, args.nb_epoch):
        t0 = time.time()
        logging.info("\nEpoch {}".format(epoch + 1))
        logging.info("Learning rate: {0:.3g}".format(args.lrate))

        train_stats = train_one_epoch(net, criterion, optimizer, args,
                                      experiment_dir, train_loader)
        val_stats = evaluate(net, criterion, experiment_dir, args,
                             valid_loader, 'Valid')
        utils.track_epoch_stats(epoch, args.lrate, 0, train_stats,
                                val_stats, experiment_dir)

        # Step the plateau scheduler on validation TPR, then mirror the
        # (possibly reduced) learning rate and progress back into args.
        scheduler.step(val_stats[0])
        args.lrate = optimizer.param_groups[0]['lr']
        args.nb_epochs_complete += 1

        # New best validation TPR -> refresh all best-model artifacts.
        if val_stats[0] > args.best_tpr:
            logging.warning("Best performance on valid set.")
            args.best_tpr = float(val_stats[0])
            utils.update_best_plots(experiment_dir)
            utils.save_best_model(experiment_dir, net)
            utils.save_best_scores(epoch, val_stats[2], val_stats[0],
                                   val_stats[1], experiment_dir)

        utils.save_epoch_model(experiment_dir, net)
        utils.save_args(experiment_dir, args)
        logging.info("Epoch took {} seconds.".format(int(time.time() - t0)))

        if args.lrate < 10**-6:
            logging.warning("Minimum learning rate reched.")
            break

    logging.warning("Training completed.")
def train_and_evaluate(model, data_iterator, loss_fn, optimizer, metrics,
                       experiment_folder):
    """Train the model, writing encodings and loss history at checkpoints,
    then run a final evaluation.

    A checkpoint is written when the mean loss falls below 0.001 or on
    the last epoch. Returns (mrn, encoded, test_metrics).
    """
    num_epochs = model_pars['num_epochs']
    loss_history = []
    for epoch in range(num_epochs):
        print("Epoch {0} of {1}".format(epoch, num_epochs))
        mrn, encoded, loss_mean = train(model, optimizer, loss_fn,
                                        data_iterator)
        print("Mean loss: {0}, epoch {1}".format(loss_mean, epoch))
        loss_history.append(loss_mean)

        last_epoch = epoch == (num_epochs - 1)
        is_best = loss_mean < 0.001
        if is_best or last_epoch:
            # One row per encoded vector.
            with open(experiment_folder + '/TRencoded_vect.csv', 'w') as f:
                out = csv.writer(f, delimiter=',')
                for vec in encoded:
                    out.writerow(vec)
            # One MRN per row, aligned with the encodings file.
            with open(experiment_folder + '/TRmrns.csv', 'w') as f:
                out = csv.writer(f, delimiter=',')
                for record in mrn:
                    out.writerow([record])
            with open(experiment_folder + '/TRmetrics.txt', 'w') as f:
                out = csv.writer(f, delimiter='\t')
                out.writerow(["Mean loss:", loss_mean])
            # Per-epoch loss history accumulated so far.
            with open(experiment_folder + '/TRlosses.txt', 'w') as f:
                out = csv.writer(f, delimiter=',')
                out.writerow(["Epoch", "loss"])
                for ep, loss_val in enumerate(loss_history):
                    out.writerow([ep, loss_val])
            print("-- Found new best at epoch {0}".format(epoch))
            utils.save_best_model(model, experiment_folder)

    print("Evaluating the model...")
    mrn, encoded, test_metrics = evaluate(model, loss_fn, data_iterator,
                                          metrics, best_eval=True)
    return mrn, encoded, test_metrics
def on_after_epoch(model, df_hist, images, epoch, saveEpoch):
    """End-of-epoch hook (stage 2): persist model state and log results.

    Saves the best model, writes a periodic checkpoint, logs the history
    dataframe, and pushes stage-2 losses and sample images to the board
    writer.
    """
    # Best-so-far snapshot plus periodic checkpoint.
    utils.save_best_model(MODEL_PATH, model, df_hist)
    utils.checkpoint_model(MODEL_PATH, model, epoch, saveEpoch)
    # Text log, then board losses and images for this epoch.
    utils.log_hist(logger, df_hist)
    utils.write_on_board_losses_stg2(writer, df_hist)
    utils.write_on_board_images_stg2(writer, images, epoch)
def train(model, optim, sche, db, opt, exp_id):
    """Train a neural decision forest and track the best validation MAE.

    Args:
        model: the model to be trained
        optim: pytorch optimizer to be used
        sche: learning-rate scheduler stepped on the validation MAE
        db: prepared torch dataset object with 'train'/'eval' split lists
        opt: command line input from the user
        exp_id: experiment id (index into the split lists)

    Returns:
        (model, best_MAE, last_MAE): best_MAE is a one-element list with
        the minimum MAE for this experiment; last_MAE is the MAE from the
        final epoch's evaluation (or the best MAE seen if no final-epoch
        evaluation ran).
    """
    best_model_dir = os.path.join(opt.save_dir, str(exp_id))
    if not os.path.exists(best_model_dir):
        os.makedirs(best_model_dir)

    # (For FG-NET only) carry out leave-one-out validation according to
    # the list length
    assert len(db['train']) == len(db['eval'])

    # record for each training experiment
    best_MAE = []
    train_set = db['train'][exp_id]
    eval_set = db['eval'][exp_id]
    eval_loss, min_MAE, _ = evaluate(model, eval_set, opt)
    # BUGFIX: last_MAE was only assigned inside the periodic-eval branch
    # of the final epoch; if that branch never ran (opt.eval disabled, or
    # the batch count never hitting eval_every on the last epoch), the
    # return below raised NameError. Fall back to the initial/best MAE.
    last_MAE = min_MAE

    # in drop out mode, each time only leaf nodes of one tree is updated
    if opt.dropout:
        current_tree = 0

    # save training and validation history
    if opt.history:
        train_loss_history = []
        eval_loss_history = []

    for epoch in range(1, opt.epochs + 1):
        # At each epoch, train the neural decision forest and update the
        # leaf node distribution separately.
        model.train()
        train_loader = torch.utils.data.DataLoader(
            train_set, batch_size=opt.batch_size, shuffle=True,
            num_workers=opt.num_threads)

        for batch_idx, batch in enumerate(train_loader):
            data = batch['image']
            target = batch['age']
            target = target.view(len(target), -1)
            if opt.cuda:
                with torch.no_grad():
                    # move to GPU
                    data, target = data.cuda(), target.cuda()

            # erase all computed gradient
            optim.zero_grad()
            # forward pass to get prediction
            prediction, reg_loss = model(data)
            loss = F.mse_loss(prediction, target) + reg_loss
            # compute gradient in the computational graph
            loss.backward()
            # update parameters in the model
            optim.step()

            # logging
            if batch_idx % opt.report_every == 0:
                logging.info(
                    'Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f} '.format(
                        epoch, batch_idx * opt.batch_size, len(train_set),
                        100. * batch_idx / len(train_loader),
                        loss.data.item()))

            # record loss
            if opt.history:
                train_loss_history.append(
                    (epoch, batch_idx, loss.data.item()))

            # Update the leaf node estimation
            if (opt.leaf_node_type == 'simple'
                    and batch_idx % opt.update_every == 0):
                logging.info("Epoch %d : Update leaf node prediction"
                             % (epoch))
                target_batches = prepare_batches(model, train_set, opt)
                # Update label prediction for each tree
                logging.info("Update leaf node prediction...")
                for _ in range(opt.label_iter_time):
                    # prepare features from the last feature layer; some
                    # cache is also stored in the forest for leaf nodes
                    if opt.dropout:
                        model.forest.trees[
                            current_tree].update_label_distribution(
                                target_batches)
                        current_tree = (current_tree + 1) % opt.n_tree
                    else:
                        for tree in model.forest.trees:
                            tree.update_label_distribution(target_batches)
                # release cache
                for tree in model.forest.trees:
                    del tree.mu_cache
                    tree.mu_cache = []

            if (opt.eval and batch_idx != 0
                    and batch_idx % opt.eval_every == 0):
                # evaluate model
                eval_loss, MAE, CS = evaluate(model, eval_set, opt)
                # update learning rate
                sche.step(MAE.data.item())
                # record the final MAE
                if epoch == opt.epochs:
                    last_MAE = MAE
                # record the best MAE
                if MAE < min_MAE:
                    min_MAE = MAE
                    # save the best model
                    model_name = opt.model_type + train_set.name
                    best_model_path = os.path.join(best_model_dir,
                                                   model_name)
                    utils.save_best_model(model.cpu(), best_model_path)
                    model.cuda()
                # update log
                # NOTE(review): the original source's indentation was
                # unrecoverable here; logging on every evaluation (not
                # only on a new best) matches the (current, best) pair
                # being logged — confirm against the original repo.
                utils.update_log(
                    best_model_dir,
                    (str(MAE.data.item()), str(min_MAE.data.item())),
                    str(CS))
                if opt.history:
                    eval_loss_history.append(
                        (epoch, batch_idx, eval_loss, MAE))
                # reset to training mode
                model.train()

    best_MAE.append(min_MAE.data.item())
    if opt.history:
        utils.save_history(np.array(train_loss_history),
                           np.array(eval_loss_history), opt)
    logging.info('Training finished.')
    return model, best_MAE, last_MAE
input_target_key=global_params["input_target_key"], writer=train_writer) strong = config["inference"]["prediction_type"] == "strong" valid_loss, valid_score = eval_one_epoch( model, loaders["valid"], criterion, device, input_key=global_params["input_key"], input_target_key=global_params["input_target_key"], epoch=epoch, writer=valid_writer, strong=strong) best_score, updated = utils.save_best_model( model, checkpoints_dir, valid_score, prev_metric=best_score) if updated: _metrics["best"] = {"lwlrap": best_score, "loss": valid_loss, "epoch": epoch + 1} _metrics["last"] = {"lwlrap": valid_score, "loss": valid_loss, "epoch": epoch + 1} _metrics[f"epoch_{epoch + 1}"] = {"lwlrap": valid_score, "loss": valid_loss} utils.save_json(_metrics, checkpoints_dir / "_metrics.json") logger.info( f"{epoch + 1}/{global_params['num_epochs']} * Epoch {epoch + 1} " f"(train): lwlrap={train_score:.4f} | loss={train_loss:.4f}") logger.info( f"{epoch + 1}/{global_params['num_epochs']} * Epoch {epoch + 1} " f"(valid): lwlrap={valid_score:.4f} | loss={valid_loss:.4f}") logger.info(
def on_after_epoch(model, df_hist, epoch):
    """End-of-epoch hook (stage 1): save the best model and log metrics.

    Persists the best model, logs the history dataframe, and pushes
    stage-1 losses to the board writer.
    """
    # Best-so-far snapshot, then text log and board losses.
    utils.save_best_model(MODEL_PATH, model, df_hist)
    utils.log_hist(logger, df_hist)
    utils.write_on_board_losses_stg1(writer, df_hist)