def train_and_evaluate(model, train_dataloader, val_dataloader, optimizer,
                       loss_fn, metrics, params, model_dir, restore_file=None):
    """Train the model and evaluate every epoch.

    Args:
        model: (torch.nn.Module) the neural network
        train_dataloader: (DataLoader) fetches training data
        val_dataloader: (DataLoader) fetches validation data
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        metrics: (dict) functions that compute a metric using the output and labels of each batch
        params: (Params) hyperparameters
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) optional - name of file to restore from (without its extension .pth.tar)
    """
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_val_acc = 0.0

    # learning rate schedulers for different models
    if params.model_version == "resnet18":
        scheduler = StepLR(optimizer, step_size=150, gamma=0.1)
    elif params.model_version == "cnn":
        # for cnn models, num_epochs is always < 100, so with step_size=100 this
        # scheduler effectively never fires
        scheduler = StepLR(optimizer, step_size=100, gamma=0.2)

    for epoch in range(params.num_epochs):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        train(model, optimizer, loss_fn, train_dataloader, metrics, params)

        # step the learning rate schedule once per epoch, after the optimizer updates
        scheduler.step()

        # Evaluate for one epoch on validation set
        val_metrics = evaluate(model, loss_fn, val_dataloader, metrics, params)

        val_acc = val_metrics['accuracy']
        is_best = val_acc >= best_val_acc

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir)

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir, "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir, "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
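# All snippets in this section lean on a small `utils` module for checkpointing.
# Below is a minimal sketch of what `utils.save_checkpoint` / `utils.load_checkpoint`
# could look like, inferred only from how they are called here; several projects
# below pass extra keyword arguments (flag, epoch, save_last, checkdir) to their
# own variants, so treat this as an assumption, not the actual implementation.
import os
import shutil
import torch


def save_checkpoint(state, is_best, checkpoint):
    """Save `state` to checkpoint/last.pth.tar; copy it to best.pth.tar if is_best."""
    if not os.path.exists(checkpoint):
        os.makedirs(checkpoint)
    filepath = os.path.join(checkpoint, 'last.pth.tar')
    torch.save(state, filepath)
    if is_best:
        shutil.copyfile(filepath, os.path.join(checkpoint, 'best.pth.tar'))


def load_checkpoint(checkpoint, model, optimizer=None):
    """Load model (and optionally optimizer) state from a checkpoint file."""
    if not os.path.exists(checkpoint):
        raise FileNotFoundError("File doesn't exist {}".format(checkpoint))
    checkpoint = torch.load(checkpoint)
    model.load_state_dict(checkpoint['state_dict'])
    if optimizer:
        optimizer.load_state_dict(checkpoint['optim_dict'])
    return checkpoint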
def train_and_evaluate(model, train_dataloader, val_dataloader, optimizer,
                       loss_fn, metrics, params, model_dir, restore_file=None,
                       scheduler=None):
    """Train the model and evaluate every epoch.

    Args:
        model: (torch.nn.Module) the neural network
        train_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches training data
        val_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches validation data
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch
        params: (Params) hyperparameters
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) optional - name of file to restore from (without its extension .pth.tar)
        scheduler: optional learning rate scheduler, stepped once per epoch
    """
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from %s", restore_path)
        utils.load_checkpoint(restore_path, model,
                              optimizer if params.optim_restore else None)

    best_val_score = 0.0

    for epoch in range(params.num_epochs):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        train(model, optimizer, loss_fn, train_dataloader, metrics, params, epoch)

        # step the learning rate schedule once per epoch, after the optimizer updates
        if scheduler is not None:
            scheduler.step()

        # Evaluate for one epoch on validation set
        val_metrics = evaluate(model, loss_fn, val_dataloader, metrics, params)

        val_score = val_metrics['macro_f1']
        is_best = val_score >= best_val_score

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir)

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best macro_f1")
            best_val_score = val_score

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir, "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir, "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
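# Hypothetical usage of the variant above: the caller constructs the scheduler
# and passes it in, instead of the loop hard-coding one per model version. The
# step_size/gamma values are illustrative only, and the surrounding objects
# (model, dataloaders, optimizer, ...) are assumed to be in scope.
from torch.optim.lr_scheduler import StepLR

scheduler = StepLR(optimizer, step_size=50, gamma=0.1)
train_and_evaluate(model, train_dataloader, val_dataloader, optimizer, loss_fn,
                   metrics, params, model_dir, restore_file=None,
                   scheduler=scheduler)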
def train_and_evaluate(model, train_loader, test_loader, optimizer, criterion,
                       accuracy, model_dir, args):
    start_epoch = 0
    best_acc = 0.0

    # learning rate scheduler
    scheduler = MultiStepLR(optimizer, milestones=args.schedule, gamma=0.1)

    # TensorboardX setup
    writer = SummaryWriter(log_dir=model_dir)

    # Select the best checkpoint by top-1 accuracy (set False to use top-5)
    choose_accTop1 = True

    # Pre-allocate one metrics slot per epoch for export
    result_train_metrics = list(range(args.num_epochs))
    result_test_metrics = list(range(args.num_epochs))

    # If training was interrupted, resume from the last checkpoint
    if args.resume:
        logging.info('Resuming from checkpoint..')
        resumePath = os.path.join(args.resume, 'last.pth')
        assert os.path.isfile(resumePath), 'Error: no checkpoint file found!'

        checkpoint = torch.load(resumePath)
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optim_dict'])
        # resume from the last epoch
        start_epoch = checkpoint['epoch']
        # fast-forward the scheduler to the resumed epoch
        for _ in range(start_epoch):
            scheduler.step()
        if choose_accTop1:
            best_acc = checkpoint['test_accTop1']
        else:
            best_acc = checkpoint['test_accTop5']
        result_train_metrics = torch.load(os.path.join(args.resume, 'train_metrics'))
        result_test_metrics = torch.load(os.path.join(args.resume, 'test_metrics'))

    for epoch in range(start_epoch, args.num_epochs):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, args.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        train_metrics = train(train_loader, model, optimizer, criterion, accuracy, args)

        # step the learning rate schedule once per epoch, after the optimizer updates
        scheduler.step()

        writer.add_scalar('Train/Loss', train_metrics['train_loss'], epoch + 1)
        writer.add_scalar('Train/AccTop1', train_metrics['train_accTop1'], epoch + 1)
        writer.add_scalar('Train/AccTop5', train_metrics['train_accTop5'], epoch + 1)

        # Evaluate for one epoch on the test set
        test_metrics = evaluate(test_loader, model, criterion, accuracy, args)

        # Track the metric used to pick the best model
        if choose_accTop1:
            test_acc = test_metrics['test_accTop1']
        else:
            test_acc = test_metrics['test_accTop5']

        writer.add_scalar('Test/Loss', test_metrics['test_loss'], epoch + 1)
        writer.add_scalar('Test/AccTop1', test_metrics['test_accTop1'], epoch + 1)
        writer.add_scalar('Test/AccTop5', test_metrics['test_accTop5'], epoch + 1)

        result_train_metrics[epoch] = train_metrics
        result_test_metrics[epoch] = test_metrics

        # Save latest train/test metrics
        torch.save(result_train_metrics, os.path.join(model_dir, 'train_metrics'))
        torch.save(result_test_metrics, os.path.join(model_dir, 'test_metrics'))

        # Save latest model weights, optimizer and accuracy
        last_path = os.path.join(model_dir, 'last.pth')
        torch.save(
            {
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict(),
                'epoch': epoch + 1,
                'test_accTop1': test_metrics['test_accTop1'],
                'test_accTop5': test_metrics['test_accTop5']
            }, last_path)

        # If best_eval, best_save_path
        is_best = test_acc >= best_acc
        if is_best:
            logging.info("- Found better accuracy")
            best_acc = test_acc
            # Save best metrics in a json file in the model directory
            test_metrics['epoch'] = epoch + 1
            utils.save_dict_to_json(test_metrics,
                                    os.path.join(model_dir, "test_best_metrics.json"))
            # Save model and optimizer
            shutil.copyfile(last_path, os.path.join(model_dir, 'best.pth'))

    writer.close()
model = getattr(model_cfg, args.model)(num_classes=num_classes)
if torch.cuda.device_count() > 1:
    # spread the model across all visible GPUs instead of a hard-coded list
    model = nn.DataParallel(model,
                            device_ids=list(range(torch.cuda.device_count()))).to(device)
else:
    model = model.to(device)
num_params = sum(p.numel() for p in model.parameters()) / 1e6
logging.info('Total params: %.2fM' % num_params)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
accuracy = utils.accuracy
optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9,
                      nesterov=True, weight_decay=args.wd)

# Train the model
logging.info("Starting training for {} epoch(s)".format(args.num_epochs))
train_and_evaluate(model, train_loader, test_loader, optimizer, criterion,
                   accuracy, model_dir, args)
logging.info('Total time: {:.2f} minutes'.format((time.time() - begin_time) / 60.0))

state['Total params'] = num_params
params_json_path = os.path.join(model_dir, "parameters.json")  # save parameters
utils.save_dict_to_json(state, params_json_path)
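# `utils.accuracy` above is assumed to return top-1/top-5 accuracy, as suggested
# by the accTop1/accTop5 metric names in the loop that consumes it. A common
# sketch operating on torch tensors -- not necessarily the exact implementation:
def accuracy(output, target, topk=(1, 5)):
    """Compute precision@k over a batch for each k in topk."""
    maxk = max(topk)
    batch_size = target.size(0)
    # indices of the top-maxk predictions per sample: (batch, maxk) -> (maxk, batch)
    _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append((correct_k * 100.0 / batch_size).item())
    return res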
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_Dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkdir=args.exp_dir)

# Write performance and args to json
prfs_name = os.path.basename(args.exp_dir) + '_prfs.json'
prfs_path = os.path.join(args.exp_dir, prfs_name)
with open(prfs_path, 'w') as fout:
    json.dump(output_dict, fout, indent=4)

#%% Test
if args.save_model:
    pth_dir = 'bioqa/exps/psci/list/test'  # note: defined but unused in this block
    utils.load_checkpoint(os.path.join(args.exp_dir, 'best.pth.tar'), model)
    test_scores = valid_fn_list(model, test_loader, tokenizer, device,
                                args.num_answer, args.ans_thres)
    save_path = os.path.join(args.exp_dir, "test_scores.json")
    utils.save_dict_to_json(test_scores, save_path)
    print('[Test] loss: {0:.3f} | f1: {1:.2f}% | prec: {2:.2f}% | rec: {3:.2f}%\n'
          .format(test_scores['loss'], test_scores['f1'] * 100,
                  test_scores['prec'] * 100, test_scores['rec'] * 100))

#%% plot
# utils.plot_prfs(prfs_path)
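# `utils.save_dict_to_json` appears throughout these snippets. A minimal sketch,
# assuming the dict holds float-castable metric values; the real helper may
# handle more types:
import json


def save_dict_to_json(d, json_path):
    """Save a dict of float-castable values to a json file."""
    with open(json_path, 'w') as f:
        d = {k: float(v) for k, v in d.items()}
        json.dump(d, f, indent=4)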
def main_train_and_evaluate(model, train_data, val_data, optimizer, loss_fn,
                            params, model_dir, restore_file=None, tb_writer=None,
                            device='cpu', save_each_epoch=False, evol_val=True):
    """Train the model and evaluate every epoch.

    Args:
        model: (torch.nn.Module) the neural network
        train_data: (dict) training data with keys 'data' and 'labels'
        val_data: (dict) validation data with keys 'data' and 'labels'
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        params: (Params) hyperparameters/arguments
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) optional - name of file to restore from (without its extension .pth.tar)
        tb_writer: (SummaryWriter) tensorboard writer
        device: (string) cpu or cuda device
        save_each_epoch: (bool) save model parameters after each epoch if set to True
        evol_val: (bool) if True, record validation metrics across epochs
    """
    if save_each_epoch:
        utils.save_checkpoint(
            {
                'epoch': 0,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=False,
            checkpoint=model_dir,
            save_last=False,
            save_each=save_each_epoch)

    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        checkpoint_dict = utils.load_checkpoint(restore_path, model, optimizer)
        epoch_start_ind = checkpoint_dict['epoch']
    else:
        epoch_start_ind = 0

    if params.score_to_select == 'loss':
        best_val_score = np.inf
    else:
        # if accuracy is used, it starts at 0.0 and the best value is compared with >=
        best_val_score = 0.0

    prog_val = []

    for epoch in range(epoch_start_ind, params.num_epochs):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        train_metrics_mean = train_model(model, optimizer, loss_fn, train_data,
                                         params, epoch=epoch, device=device)

        # Evaluate for one epoch on validation set
        print('starting to evaluate')
        val_metrics = evaluate(model, loss_fn, val_data, params, device=device)

        val_score = val_metrics[params.score_to_select]
        if params.score_to_select == 'loss':
            is_best = val_score <= best_val_score
        else:
            is_best = val_score >= best_val_score

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir,
            save_last=True,
            save_each=save_each_epoch)

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best {}".format(params.score_to_select))
            best_val_score = val_score

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir, "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir, "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)

        if evol_val:
            prog_val.append(val_metrics)

        if tb_writer:
            tb_writer.add_scalars('Loss', {
                'train': train_metrics_mean['loss'],
                'val': val_metrics['loss']
            }, epoch)
            tb_writer.add_scalars('Val/nDCG', {
                key: val_metrics[key] for key in ['nDCG1', 'nDCG5', 'nDCG10']
            }, epoch)
            tb_writer.add_scalars('Val/P', {
                key: val_metrics[key] for key in ['P1', 'P5', 'P10']
            }, epoch)

        print('Epoch: {} | Validation loss: {}'.format(epoch, val_metrics['loss']),
              flush=True)

    if evol_val:
        pickle.dump(prog_val,
                    open(os.path.join(model_dir, 'val_metrics_s.pkl'), 'wb'))

    logging.info('done training and validation.')
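# The Val/nDCG and Val/P scalars above are ranking metrics (nDCG@k, precision@k).
# A minimal numpy sketch of nDCG@k under the standard definition, assuming the
# relevance labels are already sorted by predicted score -- not necessarily the
# project's own implementation:
import numpy as np


def ndcg_at_k(relevance_sorted_by_score, k):
    """nDCG@k for one query: DCG of the predicted ranking over DCG of the ideal one."""
    rel = np.asarray(relevance_sorted_by_score, dtype=float)[:k]
    discounts = 1.0 / np.log2(np.arange(2, rel.size + 2))
    dcg = float(np.sum(rel * discounts))
    ideal = np.sort(np.asarray(relevance_sorted_by_score, dtype=float))[::-1][:k]
    idcg = float(np.sum(ideal * discounts[:ideal.size]))
    return dcg / idcg if idcg > 0 else 0.0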
def train_evaluate_model(word_attn, sent_attn, data_train, data_val,
                         word_optimizer, sent_optimizer, params, model_dir,
                         restore_file, vocab_to_index):
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        utils.log("Restoring parameters from {}".format(restore_path), logger)
        utils.load_checkpoint(restore_path, word_attn, word_optimizer, spinn=True)
        utils.load_checkpoint(restore_path, sent_attn, sent_optimizer, spinn=False)

    best_val_acc = 0.0

    for epoch in range(params.num_epochs):
        # Run one epoch
        utils.log("Epoch {}/{}".format(epoch + 1, params.num_epochs), logger)

        # run the model over one epoch (one full pass over the training set)
        num_steps = (params.train_size + 1) // params.batch_size
        train_model(data_train, word_attn, sent_attn, word_optimizer,
                    sent_optimizer, params, num_steps, vocab_to_index)

        utils.log("-----Starting Evaluation-----", logger)
        num_steps = (params.val_size + 1) // params.batch_size
        val_metrics = evaluate(data_val, word_attn, sent_attn, params, num_steps,
                               vocab_to_index)

        val_acc = val_metrics['accuracy']
        is_best = val_acc >= best_val_acc

        # Save weights of both attention networks in one checkpoint
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'Word_State_dict': word_attn.state_dict(),
                'Word_Optim_dict': word_optimizer.state_dict(),
                'Sent_State_dict': sent_attn.state_dict(),
                'Sent_Optim_dict': sent_optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir)

        if is_best:
            utils.log("- Found new best accuracy", logger)
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir, "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir, "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
        print('[Valid] loss: {0:.3f} | f1: {1:.2f}% | prec: {2:.2f}% | rec: {3:.2f}%\n'
              .format(valid_scores['loss'], valid_scores['f1'] * 100,
                      valid_scores['prec'] * 100, valid_scores['rec'] * 100))

        # Update output dictionary
        output_dict['prfs']['train_' + str(epoch + 1)] = train_scores
        output_dict['prfs']['valid_' + str(epoch + 1)] = valid_scores

        # Save scores
        # if valid_scores['loss'] < min_valid_loss:
        #     min_valid_loss = valid_scores['loss']
        is_best = (valid_scores['f1'] > max_valid_f1)
        if is_best:
            max_valid_f1 = valid_scores['f1']
            utils.save_dict_to_json(
                valid_scores, os.path.join(args.exp_dir, 'best_val_scores.json'))

            # Save model
            if args.save_model:
                utils.save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'state_dict': model.state_dict(),
                        'optim_Dict': optimizer.state_dict()
                    },
                    is_best=is_best,
                    checkdir=args.exp_dir)

        # Early stopping (disabled in this variant)
        # if (valid_scores['loss'] - min_valid_loss > args.stop_c1) and \
        #         (max_valid_f1 - valid_scores['f1'] > args.stop_c2):
        #     n_worse += 1
def train_and_evaluate(model, trainloader, validloader, optimizer, criterion,
                       metrics, params, model_dir, restore_file=None):
    """Train the model and evaluate every epoch."""
    # Reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_valid_acc = 0.0

    if params.model_version == "resnet18":
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=150, gamma=0.1)
    elif params.model_version == "cnn":
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.2)

    for epoch in range(params.num_epochs):
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        train(model, optimizer, criterion, trainloader, metrics, params)
        scheduler.step()

        valid_metrics = evaluate(model, criterion, validloader, metrics, params)

        valid_acc = valid_metrics['accuracy']
        is_best = valid_acc >= best_valid_acc

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir)

        if is_best:
            logging.info("- Found new best accuracy")
            best_valid_acc = valid_acc

            # Save best validation metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir, "metrics_valid_best_weights.json")
            utils.save_dict_to_json(valid_metrics, best_json_path)

        # Save latest valid metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir, "metrics_valid_last_weights.json")
        utils.save_dict_to_json(valid_metrics, last_json_path)


# if __name__ == '__main__':
#     # Load the parameters from json file
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--model_dir', default='experiments/baseline_standalone',
#                         help='Directory containing params.json')
#     parser.add_argument('--restore_file', default=None,
#                         help='Optional, name of the file in --model_dir containing weights '
#                              'to reload before training')  # 'best' or 'train'
#     args = parser.parse_args()
#     json_path = os.path.join(args.model_dir, 'params.json')
#     assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path)
#     params = utils.Params(json_path)
#
#     # use GPU if available
#     params.cuda = torch.cuda.is_available()
#
#     # Set the random seed for reproducible experiments
#     random.seed(230)
#     torch.manual_seed(230)
#     if params.cuda:
#         torch.cuda.manual_seed(230)
#
#     # Set the logger
#     utils.set_logger(os.path.join(args.model_dir, 'train.log'))
#
#     # Create the input data pipeline
#     logging.info("Loading the datasets...")
#
#     # fetch dataloaders, considering full-set vs. sub-set scenarios
#     if params.subset_percent < 1.0:
#         trainloader = datautils.fetch_subset_dataloader('train', params)
#     else:
#         trainloader = datautils.fetch_dataloader('train', params)
#
#     testloader = datautils.fetch_dataloader('test', params)
#
#     logging.info("- done.")
#
#     model = resnet.ResNet18().cuda() if params.cuda else resnet.ResNet18()
#     optimizer = optim.SGD(model.parameters(), lr=params.learning_rate,
#                           momentum=0.9, weight_decay=5e-4)
#     # fetch loss function and metrics
#     loss_fn = utils.loss_function
#     metrics = utils.metrics
#
#     # Train the model
#     logging.info("Starting training for {} epoch(s)".format(params.num_epochs))
#     train_and_evaluate(model, trainloader, testloader, optimizer, loss_fn,
#                        metrics, params, args.model_dir, args.restore_file)
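# `utils.Params(json_path)` in the commented main block above implies a small
# hyperparameter container backed by a json file. A minimal sketch, assuming a
# flat key/value json; the actual class may carry extra helpers such as save():
import json


class Params:
    """Load hyperparameters from a json file and expose them as attributes."""

    def __init__(self, json_path):
        with open(json_path) as f:
            self.__dict__.update(json.load(f))

# Usage: params = Params('experiments/baseline_standalone/params.json')
# then params.num_epochs, params.learning_rate, etc.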
data = json.load(f)
best_val_acc = data['accuracy']
f.close()

counter = 0  # epochs since the last improvement; `patience` is assumed to be
             # defined in the surrounding context, which is not shown here

for epoch in range(args.max_epochs):
    train(train_set, train_set2, model, args, 'train')
    val_acc = val(val_set, val_set2, model, args, 'val')
    val_metrics = {'accuracy': val_acc}

    is_best = val_acc >= best_val_acc
    utils.save_checkpoint(
        {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'optim_dict': optimizer.state_dict()
        },
        is_best=is_best,
        checkpoint=args.model_dir)

    if is_best:
        logging.info('- Found new best accuracy')
        counter = 0  # reset counter
        best_val_acc = val_acc

        best_json_path = os.path.join(args.model_dir, 'val_best_weights.json')
        utils.save_dict_to_json(val_metrics, best_json_path)
    else:
        counter += 1
        if counter > patience:
            logging.info('- No improvement in a while, stopping training...')
            break  # stop training once patience is exceeded

    last_json_path = os.path.join(args.model_dir, 'val_last_weights.json')
    utils.save_dict_to_json(val_metrics, last_json_path)
def train_and_evaluate(model: nn.Module,
                       train_loader: DataLoader,
                       test_loader: DataLoader,
                       optimizer: optim.Optimizer,
                       loss_fn,
                       params: utils.Params,
                       restore_file: str = None) -> None:
    '''Train the model and evaluate every epoch.

    Args:
        model: (torch.nn.Module) the Deep AR model
        train_loader: load train data and labels
        test_loader: load test data and labels
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes outputs and labels per timestep, and then computes the loss for the batch
        params: (Params) hyperparameters
        restore_file: (string) optional- name of file to restore from (without its extension .pth.tar)
    '''
    # reload weights from restore_file if specified
    restore_epoch = 0
    if restore_file is not None:
        restore_path = os.path.join(params.model_dir, restore_file + '.pth.tar')
        logger.info('Restoring parameters from {}'.format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)
        # restore_file is expected to end in the epoch number, e.g. 'epoch_7'
        restore_epoch = int(restore_file[-2:].replace('_', '')) + 1
        logger.info('Restoring epoch: {}'.format(restore_epoch))

    logger.info('Begin training and evaluation')

    # initialize the early_stopping object
    early_stopping = EarlyStopping(patience=25, verbose=True, delta=0.0001,
                                   folder=params.model_dir)

    best_json_path = os.path.join(params.model_dir, 'metrics_test_best_weights.json')
    if os.path.exists(best_json_path):
        with open(best_json_path) as json_file:
            best_test_ND = json.load(json_file)['ND']
    else:
        best_test_ND = float('inf')
    early_stopping.best_score = best_test_ND

    train_len = len(train_loader)
    ND_summary = np.zeros(params.num_epochs)
    loss_summary = np.zeros(train_len * params.num_epochs)

    for epoch in range(restore_epoch, params.num_epochs):
        logger.info('Epoch {}/{}'.format(epoch + 1, params.num_epochs))
        loss_summary[epoch * train_len:(epoch + 1) * train_len] = train(
            model, optimizer, loss_fn, train_loader, test_loader, params, epoch)
        test_metrics = evaluate(model, loss_fn, test_loader, params, epoch,
                                sample=args.sampling)

        ND_summary[epoch] = test_metrics['ND']
        is_best = ND_summary[epoch] <= best_test_ND

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            epoch=epoch,
            is_best=is_best,
            checkpoint=params.model_dir)

        if is_best:
            logger.info('- Found new best ND')
            best_test_ND = ND_summary[epoch]
            utils.save_dict_to_json(test_metrics, best_json_path)

        logger.info('Current Best ND is: %.5f' % best_test_ND)

        utils.plot_all_epoch(ND_summary[:epoch + 1], args.dataset + '_ND',
                             params.plot_dir)
        utils.plot_all_epoch(loss_summary[:(epoch + 1) * train_len],
                             args.dataset + '_loss', params.plot_dir)

        last_json_path = os.path.join(params.model_dir,
                                      'metrics_test_last_weights.json')
        utils.save_dict_to_json(test_metrics, last_json_path)

        # early_stopping needs the validation metric to check if it has decreased;
        # if it has, it makes a checkpoint of the current model
        logger.info('ND : %.5f ' % test_metrics['ND'])
        early_stopping(test_metrics['ND'], model)
        if early_stopping.early_stop:
            logger.info('Early stopping')
            break

    # # load the last checkpoint with the best model
    # model.load_state_dict(torch.load('checkpoint.pt'))

    if args.save_best:
        f = open('./param_search.txt', 'w')
        f.write('-----------\n')
        list_of_params = args.search_params.split(',')
        print_params = ''
        for param in list_of_params:
            param_value = getattr(params, param)
            print_params += f'{param}: {param_value:.2f}, '
        print_params = print_params[:-2]  # drop trailing ', '
        f.write(print_params + '\n')
        f.write('Best ND: ' + str(best_test_ND) + '\n')
        logger.info(print_params)
        logger.info(f'Best ND: {best_test_ND}')
        f.close()
        utils.plot_all_epoch(ND_summary, print_params + '_ND',
                             location=params.plot_dir)
        utils.plot_all_epoch(loss_summary, print_params + '_loss',
                             location=params.plot_dir)
def train_and_evaluate2(model: nn.Module,
                        train_loader: DataLoader,
                        test_loader: DataLoader,
                        optimizer: optim.Optimizer,
                        params: utils.Params,
                        loss_fn=None,
                        restore_file: str = None,
                        args=None,
                        idx=None):
    '''Train the model and evaluate every epoch.

    Args:
        model: (torch.nn.Module) the Deep AR model
        train_loader: load train data and labels
        test_loader: load test data and labels
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes outputs and labels per timestep, and then computes the loss for the batch
        params: (Params) hyperparameters
        restore_file: (string) optional- name of file to restore from (without its extension .pth.tar)
    '''
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(params.model_dir, restore_file + '.pth.tar')
        logger.info('Restoring parameters from {}'.format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    logger.info('begin training and evaluation')
    best_test_ND = float('inf')

    # File to save first results
    out_file = os.path.join('experiments', args.model_name, 'train_results.csv')
    if not os.path.isfile(out_file):
        with open(out_file, 'w') as of_connection:
            writer = csv.writer(of_connection)
            # Write the headers to the file
            writer.writerow(['iteration', 'epoch', 'test_metric', 'train_loss'])

    train_len = len(train_loader)
    ND_summary = np.zeros(params.num_epochs)
    loss_summary = np.zeros(train_len * params.num_epochs)

    # initialize the early_stopping object
    early_stopping = EarlyStopping(patience=5, verbose=True, delta=0.0001,
                                   folder=params.model_dir)

    for epoch in range(params.num_epochs):
        logger.info('Epoch {}/{}'.format(epoch + 1, params.num_epochs))
        loss_summary[epoch * train_len:(epoch + 1) * train_len] = train(
            model, optimizer, loss_fn, train_loader, test_loader, params,
            args.sampling, epoch)
        test_metrics = evaluate(model, loss_fn, test_loader, params, epoch,
                                sample=args.sampling)

        # NaN never compares equal to itself, so guard with np.isnan instead of ==
        if np.isnan(test_metrics['rou50']):
            test_metrics['rou50'] = 100

        ND_summary[epoch] = test_metrics['rou50']
        is_best = ND_summary[epoch] <= best_test_ND

        # Save weights (epoch pinned to 0 to prevent extra per-epoch model savings)
        utils.save_checkpoint(
            {
                'epoch': 0,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            epoch=0,
            is_best=is_best,
            checkpoint=params.model_dir)

        if is_best:
            logger.info('- Found new best ND')
            best_test_ND = ND_summary[epoch]
            best_json_path = os.path.join(params.model_dir,
                                          'metrics_test_best_weights.json')
            utils.save_dict_to_json(test_metrics, best_json_path)

        logger.info('Current Best loss is: %.5f' % best_test_ND)

        # if args.plot_figure:
        #     utils.plot_all_epoch(ND_summary[:epoch + 1], args.dataset + '_ND', params.plot_dir)
        #     utils.plot_all_epoch(loss_summary[:(epoch + 1) * train_len], args.dataset + '_loss', params.plot_dir)

        last_json_path = os.path.join(params.model_dir,
                                      'metrics_test_last_weights.json')
        utils.save_dict_to_json(test_metrics, last_json_path)

        # Append this epoch's results to the csv file; the last entry is the
        # final batch loss of the epoch just completed
        with open(out_file, 'a') as of_connection:
            writer = csv.writer(of_connection)
            writer.writerow([idx, epoch + 1, test_metrics,
                             loss_summary[(epoch + 1) * train_len - 1]])

        logger.info('Loss_summary: %s' %
                    loss_summary[epoch * train_len:(epoch + 1) * train_len])

        # early_stopping needs the validation metric to check if it has decreased;
        # if it has, it makes a checkpoint of the current model
        logger.info('test_metrics[rou50]: %.5f ' % test_metrics['rou50'])
        early_stopping(test_metrics['rou50'], model)
        if early_stopping.early_stop:
            logger.info('Early stopping')
            break

    with open(best_json_path) as json_file:
        best_metrics = json.load(json_file)

    return best_metrics, test_metrics
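# Both DeepAR-style loops above drive an `EarlyStopping` object constructed as
# EarlyStopping(patience, verbose, delta, folder), called with (score, model),
# and exposing .best_score and .early_stop. A minimal sketch consistent with
# that interface, assuming lower scores are better (ND / rou50); the project's
# own class may differ in details such as the checkpoint filename:
import os
import torch


class EarlyStopping:
    """Stop training when the monitored score has not improved for `patience` epochs."""

    def __init__(self, patience=7, verbose=False, delta=0.0, folder='.'):
        self.patience = patience
        self.verbose = verbose
        self.delta = delta
        self.folder = folder
        self.best_score = float('inf')
        self.counter = 0
        self.early_stop = False

    def __call__(self, score, model):
        if score < self.best_score - self.delta:
            # improvement: remember the score and checkpoint the model
            self.best_score = score
            self.counter = 0
            torch.save(model.state_dict(), os.path.join(self.folder, 'checkpoint.pt'))
        else:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True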
def train_and_evaluate(netG, netD, train_dataloader, val_dataloader, optimG,
                       optimD, loss_fn, metrics, params, model_dir,
                       restore_file=None, cuda_id=0):
    # reload the saved best generator/discriminator weights if specified
    if restore_file is not None:
        restore_path_g = os.path.join(model_dir, 'best_g' + '.pth.tar')
        restore_path_d = os.path.join(model_dir, 'best_d' + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path_g))
        utils.load_checkpoint(restore_path_g, netG, optimG)
        utils.load_checkpoint(restore_path_d, netD, optimD)

    best_val_psnr = 0.0

    for epoch in range(params.num_epochs):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        train(netG, netD, optimG, optimD, loss_fn, train_dataloader, metrics,
              params, cuda_id)

        # Evaluate for one epoch on validation set
        val_metrics = evaluate(netG, netD, loss_fn, val_dataloader, metrics,
                               params, cuda_id)

        val_psnr = val_metrics['PSNR']
        is_best = val_psnr >= best_val_psnr

        # Save weights of the generator
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': netG.state_dict(),
                'optim_dict': optimG.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir,
            flag='G')

        # Save weights of the discriminator
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': netD.state_dict(),
                'optim_dict': optimD.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir,
            flag='D')

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best PSNR")
            best_val_psnr = val_psnr

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir, "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir, "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)

        # periodically dump the (module-level) generator/discriminator loss curves
        if epoch % 100 == 0 and epoch > 99:
            plt.figure()
            plt.plot(global_loss_g)
            plt.savefig(str(epoch) + " epoch_g.jpg")
            plt.figure()
            plt.plot(global_loss_d)
            plt.savefig(str(epoch) + " epoch_d.jpg")

    plt.figure()
    plt.plot(global_loss_g)
    plt.savefig("final loss_g.jpg")
    plt.figure()
    plt.plot(global_loss_d)
    plt.savefig("final loss_d.jpg")
def train_and_evaluate(model, train_dataloader, val_dataloader, optimizer,
                       loss_fn, metrics, params, model_dir, restore_file=None,
                       cuda_id=0):
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_val_psnr = 0.0

    # logger for per-step training curves
    logger = Logger('./logs')
    # note: gamma=1 means the learning rate never decays; the scheduler is kept
    # only as a placeholder and is never stepped below
    scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=1)

    for epoch in range(params.num_epochs):
        # Run one epoch
        # scheduler.step()
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        train(model, optimizer, loss_fn, train_dataloader, metrics, params,
              logger, epoch, cuda_id)

        # Evaluate for one epoch on validation set
        val_metrics = evaluate(model, loss_fn, val_dataloader, metrics, params,
                               cuda_id)

        val_psnr = val_metrics['PSNR']
        is_best = val_psnr >= best_val_psnr

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir)

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best PSNR")
            best_val_psnr = val_psnr

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir, "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir, "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)

    plt.figure()
    plt.plot(global_loss)
    plt.savefig("final loss.jpg")
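# The two loops above select the best checkpoint by PSNR. The standard definition
# is PSNR = 10 * log10(MAX^2 / MSE), sketched here for image tensors scaled to
# [0, 1]; the project's own metric function may handle ranges differently:
import torch


def psnr(output, target, max_val=1.0):
    """Peak signal-to-noise ratio in dB between two image batches."""
    mse = torch.mean((output - target) ** 2)
    if mse == 0:
        return float('inf')  # identical images
    return 10.0 * torch.log10(max_val ** 2 / mse).item()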
def train_evaluate(model, train_iterator, valid_iterator, criterion, optimizer,
                   metrics, args, restore_file=None):
    """Train the model, evaluate every epoch, and early-stop on loss/f1 criteria."""
    if not os.path.exists(args.exp_dir):
        os.makedirs(args.exp_dir)

    if restore_file is not None:
        restore_path = os.path.join(args.exp_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}...".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    # For early stopping
    n_worse = 0
    min_valid_loss = float('inf')
    max_valid_f1 = -float('inf')

    # Create args and output dictionary (for json output)
    output_dict = {'args': vars(args), 'prfs': {}}

    for epoch in range(args.num_epochs):
        train_scores = train(model, train_iterator, criterion, optimizer,
                             metrics, args.threshold)
        valid_scores = evaluate(model, valid_iterator, criterion, metrics,
                                args.threshold)

        # Update output dictionary
        output_dict['prfs']['train_' + str(epoch + 1)] = train_scores
        output_dict['prfs']['valid_' + str(epoch + 1)] = valid_scores

        # Save scores: track the running best loss and f1
        if valid_scores['loss'] < min_valid_loss:
            min_valid_loss = valid_scores['loss']
        if valid_scores['f1'] > max_valid_f1:
            max_valid_f1 = valid_scores['f1']

        # "best" means both loss and f1 are within the stopping tolerances of
        # their running optima
        is_best = (valid_scores['loss'] - min_valid_loss <= args.stop_c1) and \
                  (max_valid_f1 - valid_scores['f1'] <= args.stop_c2)
        if is_best:
            utils.save_dict_to_json(
                valid_scores, os.path.join(args.exp_dir, 'best_val_scores.json'))

            # Save model
            if args.save_model:
                utils.save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'state_dict': model.state_dict(),
                        'optim_Dict': optimizer.state_dict()
                    },
                    is_best=is_best,
                    checkdir=args.exp_dir)

        # Save the latest valid scores in exp_dir
        # utils.save_dict_to_json(valid_scores, os.path.join(exp_dir, 'last_val_scores.json'))

        print("\n\nEpoch {}/{}...".format(epoch + 1, args.num_epochs))
        print('\n[Train] loss: {0:.3f} | acc: {1:.2f}% | f1: {2:.2f}% | recall: {3:.2f}% | precision: {4:.2f}% | specificity: {5:.2f}%'
              .format(train_scores['loss'], train_scores['accuracy'] * 100,
                      train_scores['f1'] * 100, train_scores['recall'] * 100,
                      train_scores['precision'] * 100,
                      train_scores['specificity'] * 100))
        print('[Val] loss: {0:.3f} | acc: {1:.2f}% | f1: {2:.2f}% | recall: {3:.2f}% | precision: {4:.2f}% | specificity: {5:.2f}%\n'
              .format(valid_scores['loss'], valid_scores['accuracy'] * 100,
                      valid_scores['f1'] * 100, valid_scores['recall'] * 100,
                      valid_scores['precision'] * 100,
                      valid_scores['specificity'] * 100))

        # Early stopping: count epochs where both loss and f1 are outside tolerance
        if (valid_scores['loss'] - min_valid_loss > args.stop_c1) and \
                (max_valid_f1 - valid_scores['f1'] > args.stop_c2):
            n_worse += 1
            if n_worse == args.stop_p:
                print("Early stopping")
                break

    # Write performance and args to json
    prfs_name = os.path.basename(args.exp_dir) + '_prfs.json'
    prfs_path = os.path.join(args.exp_dir, prfs_name)
    with open(prfs_path, 'w') as fout:
        json.dump(output_dict, fout, indent=4)
def train_and_evaluate_kd(model, teacher_model, trainloader, validloader,
                          optimizer, criterion_kd, metrics, params, model_dir,
                          restore_file=None):
    """Train the student model with knowledge distillation and evaluate every epoch.

    :param model: (torch.nn.Module) the student network
    :param teacher_model: (torch.nn.Module) the pre-trained teacher network
    :param params: (Params) hyperparameters
    :param model_dir: (string) directory containing config, weights and log
    :param restore_file: (string) - file to restore (without its extension .pth.tar)
    """
    # Reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_valid_acc = 0.0

    # TensorBoard logger setup
    # board_logger = utils.Board_logger(os.path.join(model_dir, 'board_logs'))

    # Learning rate schedulers for different models:
    if params.model_version == "resnet18_distill":
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=150, gamma=0.1)
    elif params.model_version == "cnn_distill":
        # For cnn models, num_epochs is always < 100, so with step_size=100 this
        # scheduler effectively never fires
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1)

    for epoch in range(params.num_epochs):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # Compute number of batches in one epoch (one full pass over the training set)
        train_kd(model, teacher_model, optimizer, criterion_kd, trainloader,
                 metrics, params)

        # step the learning rate schedule once per epoch, after the optimizer updates
        scheduler.step()

        # Evaluate for one epoch on validation set
        valid_metrics = evaluate_kd(model, validloader, metrics, params)

        valid_acc = valid_metrics['accuracy']
        is_best = valid_acc >= best_valid_acc

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir)

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_valid_acc = valid_acc

            # Save best valid metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir, "metrics_valid_best_weights.json")
            utils.save_dict_to_json(valid_metrics, best_json_path)

        # Save latest valid metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir, "metrics_valid_last_weights.json")
        utils.save_dict_to_json(valid_metrics, last_json_path)

        # ============ TensorBoard logging: uncomment below to turn it on ============
        # # (1) Log the scalar values
        # info = {'valid accuracy': valid_acc}
        # for tag, value in info.items():
        #     board_logger.scalar_summary(tag, value, epoch + 1)
        # # (2) Log values and gradients of the parameters (histogram)
        # for tag, value in model.named_parameters():
        #     tag = tag.replace('.', '/')
        #     board_logger.histo_summary(tag, value.data.cpu().numpy(), epoch + 1)
        #     board_logger.histo_summary(tag + '/grad', value.grad.data.cpu().numpy(), epoch + 1)
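# `criterion_kd` above is the distillation loss. A common formulation (Hinton-style
# soft targets) is sketched below, assuming `params` carries `alpha` and
# `temperature` hyperparameters; the actual criterion consumed by train_kd may differ:
import torch.nn as nn
import torch.nn.functional as F


def loss_fn_kd(outputs, labels, teacher_outputs, params):
    """Blend KL divergence to softened teacher logits with hard-label cross-entropy."""
    alpha = params.alpha
    T = params.temperature
    # KL term on temperature-softened distributions, scaled by T^2 to keep
    # gradient magnitudes comparable across temperatures
    soft_loss = nn.KLDivLoss(reduction='batchmean')(
        F.log_softmax(outputs / T, dim=1),
        F.softmax(teacher_outputs / T, dim=1)) * (alpha * T * T)
    hard_loss = F.cross_entropy(outputs, labels) * (1.0 - alpha)
    return soft_loss + hard_loss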
def train_and_evaluate(model, train_dataloader, val_dataloader, optimizer,
                       criterion, metrics, params, model_dir, restore_file=None):
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_val_acc = 0.0
    best_val_metrics = []
    learning_rate_0 = params.learning_rate
    train_acc_series = []
    val_acc_series = []
    train_loss_series = []

    for epoch in range(params.num_epochs):
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # train model
        train_metrics = train(model, train_dataloader, optimizer, criterion,
                              metrics, params)

        # learning rate exponential decay; note this only updates the value on
        # `params` -- it takes effect only if train() reads params.learning_rate
        # and applies it to the optimizer
        params.learning_rate = learning_rate_0 * np.exp(-params.exp_decay_k * epoch)

        # evaluate
        val_metrics = evaluate(model, criterion, val_dataloader, metrics, params)

        # find accuracy from validation dataset
        val_acc = val_metrics['accuracy']
        is_best = val_acc >= best_val_acc

        # save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir)

        # save accuracy / loss to arrays for plotting
        train_acc_series.append(train_metrics['accuracy'])
        val_acc_series.append(val_metrics['accuracy'])
        train_loss_series.append(train_metrics['loss'])

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc
            best_val_metrics = val_metrics

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir, "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir, "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)

        print('******************************************')

    # plot visualized performance
    visualize.plot_train_val_accuracy(train_acc_series, val_acc_series)
    visualize.plot_loss(train_loss_series)
    # save best validation F1 score plot
    visualize.plot_individual_label_f1score(best_val_metrics)
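# If train() does not itself consume params.learning_rate, the decayed value in
# the loop above has no effect on the optimizer. A direct way to apply it (a
# sketch, not part of the original code; `optimizer` and `params` are the
# surrounding loop's objects) is to write it into the parameter groups each epoch:
for param_group in optimizer.param_groups:
    param_group['lr'] = params.learning_rate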