@torch.no_grad()  # no gradients are needed during evaluation
def test_epoch(test_loader, model, test_meter, cur_epoch):
    """Evaluates the model on the test set."""
    # Enable eval mode
    model.eval()
    test_meter.iter_tic()
    for cur_iter, (inputs, labels) in enumerate(test_loader):
        # Transfer the data to the current GPU device
        inputs, labels = inputs.cuda(), labels.cuda(non_blocking=True)
        # Compute the predictions
        preds = model(inputs)
        # Compute the errors
        top1_err, top5_err = mu.topk_errors(preds, labels, [1, 5])
        # Combine the errors across the GPUs
        if cfg.NUM_GPUS > 1:
            top1_err, top5_err = du.scaled_all_reduce([top1_err, top5_err])
        # Copy the errors from GPU to CPU (sync point)
        top1_err, top5_err = top1_err.item(), top5_err.item()
        test_meter.iter_toc()
        # Update and log stats
        test_meter.update_stats(top1_err, top5_err, inputs.size(0) * cfg.NUM_GPUS)
        test_meter.log_iter_stats(cur_epoch, cur_iter)
        test_meter.iter_tic()
    # Log epoch stats
    test_meter.log_epoch_stats(cur_epoch)
    test_meter.reset()
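
# For reference, a minimal sketch of what mu.topk_errors could look like. This
# is an assumption, not the project's actual implementation: it presumes preds
# are raw (N, C) logits, labels are (N,) class indices, and errors are returned
# as percentages, which matches how the callers above consume the results.
import torch

def topk_errors_sketch(preds, labels, ks):
    """Computes the top-k error for each k in ks (hypothetical helper)."""
    # Indices of the max(ks) highest-scoring classes per sample: (N, max_k)
    _, top_inds = torch.topk(preds, max(ks), dim=1, largest=True, sorted=True)
    # Reshape to (max_k, N) so row i holds everyone's i-th best prediction
    top_inds = top_inds.t()
    correct = top_inds.eq(labels.view(1, -1).expand_as(top_inds))
    # A sample counts as correct for k if its label appears in the top k rows
    topks_correct = [correct[:k, :].reshape(-1).float().sum() for k in ks]
    # Convert correct counts into error percentages
    return [(1.0 - x / preds.size(0)) * 100.0 for x in topks_correct]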
def train_epoch(train_loader, model, loss_fun, optimizer, train_meter,
                cur_epoch, writer_train=None, params=0, flops=0, is_master=False):
    """Performs one epoch of training."""
    # Shuffle the data
    loader.shuffle(train_loader, cur_epoch)
    # Update the learning rate
    lr = optim.get_epoch_lr(cur_epoch)
    optim.set_lr(optimizer, lr)
    # Enable training mode
    model.train()
    train_meter.iter_tic()
    for cur_iter, (inputs, labels) in enumerate(train_loader):
        # Transfer the data to the current GPU device
        inputs, labels = inputs.cuda(), labels.cuda(non_blocking=True)
        # Perform the forward pass
        preds = model(inputs)
        # Compute the loss
        loss = loss_fun(preds, labels)
        # Perform the backward pass
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters
        optimizer.step()
        # Compute the errors
        top1_err, top5_err = mu.topk_errors(preds, labels, [1, 5])
        # Combine the stats across the GPUs
        if cfg.NUM_GPUS > 1:
            loss, top1_err, top5_err = du.scaled_all_reduce(
                [loss, top1_err, top5_err])
        # Copy the stats from GPU to CPU (sync point)
        loss, top1_err, top5_err = loss.item(), top1_err.item(), top5_err.item()
        train_meter.iter_toc()
        # Update and log stats
        train_meter.update_stats(top1_err, top5_err, loss, lr,
                                 inputs.size(0) * cfg.NUM_GPUS)
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()
    # Log epoch stats
    train_meter.log_epoch_stats(cur_epoch, writer_train, params, flops,
                                is_master=is_master)
    train_meter.reset()
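
# A plausible sketch of du.scaled_all_reduce, assuming it sums each tensor
# across processes with torch.distributed and scales by 1/num_gpus so the sum
# becomes an average; the project's real helper may differ in details. Requires
# an initialized process group.
import torch.distributed as dist

def scaled_all_reduce_sketch(tensors, num_gpus):
    """All-reduces the tensors and scales them by 1/num_gpus (hypothetical)."""
    # Queue all reductions asynchronously
    reductions = [dist.all_reduce(t, async_op=True) for t in tensors]
    # Wait for every reduction to finish, so the results are synchronized
    for reduction in reductions:
        reduction.wait()
    # Scale the per-process sums into averages
    for t in tensors:
        t.mul_(1.0 / num_gpus)
    return tensors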
@torch.no_grad()  # replaces the redundant nested no_grad blocks in the loop
def test_epoch(test_loader, model, ssl_model, test_meter, cur_epoch):
    """Evaluates the classifier on SSL representations of the test set."""
    if torch.cuda.is_available():
        model.cuda()
        ssl_model.cuda()
    # Enable eval mode
    model.eval()
    ssl_model.eval()
    test_meter.iter_tic()
    misclassifications = 0.0
    total_samples = 0.0
    # Make the SSL model return its penultimate features alongside the logits
    ssl_model.penultimate_active = True
    for cur_iter, (inputs, labels) in enumerate(test_loader):
        # Transfer the data to the current GPU device
        inputs, labels = inputs.cuda(), labels.cuda(non_blocking=True)
        # Ensure the inputs are FloatTensors, matching the model weights
        inputs = inputs.type(torch.cuda.FloatTensor)
        # Get representations from the SSL model
        inputs, _ = ssl_model(inputs)
        # Compute the predictions
        preds = model(inputs)
        # Compute the errors (top4_err is currently unused)
        top1_err, top4_err = mu.topk_errors(preds, labels, [1, 4])
        # Combine the errors across the GPUs
        # if cfg.NUM_GPUS > 1:
        #     # scaled_all_reduce returns a list
        #     top1_err = du.scaled_all_reduce([top1_err])[0]
        # Copy the errors from GPU to CPU (sync point)
        top1_err = top1_err.item()
        # Multiply by the number of GPUs, as top1_err is scaled by 1/NUM_GPUS
        misclassifications += top1_err * inputs.size(0) * cfg.NUM_GPUS
        total_samples += inputs.size(0) * cfg.NUM_GPUS
        test_meter.iter_toc()
        # Update and log stats
        test_meter.update_stats(top1_err=top1_err,
                                mb_size=inputs.size(0) * cfg.NUM_GPUS)
        test_meter.log_iter_stats(cur_epoch, cur_iter)
        test_meter.iter_tic()
    # Log epoch stats
    test_meter.log_epoch_stats(cur_epoch)
    test_meter.reset()
    return misclassifications / total_samples
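
# The ssl_model above is assumed to expose a penultimate_active flag that makes
# its forward pass return (features, logits) instead of just logits, which is
# why test_epoch unpacks `inputs, _ = ssl_model(inputs)`. A hypothetical sketch
# of that pattern:
import torch.nn as nn

class SSLWrapperSketch(nn.Module):
    """Hypothetical SSL model exposing its penultimate representations."""

    def __init__(self, backbone, head):
        super().__init__()
        self.backbone = backbone  # produces penultimate features
        self.head = head          # projection/classification head
        self.penultimate_active = False

    def forward(self, x):
        feats = self.backbone(x)
        out = self.head(feats)
        if self.penultimate_active:
            return feats, out
        return out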
@torch.no_grad()  # no gradients are needed during evaluation
def eval_epoch(test_loader, model, test_meter, cur_epoch, writer_eval=None,
               params=0, flops=0, is_master=False):
    """Evaluates the model on the test set."""
    # Enable eval mode
    model.eval()
    test_meter.iter_tic()
    for cur_iter, (inputs, labels) in enumerate(test_loader):
        # Transfer the data to the current GPU device
        inputs, labels = inputs.cuda(), labels.cuda(non_blocking=True)
        # Compute the predictions
        preds = model(inputs)
        # Compute the errors
        top1_err, top5_err = mu.topk_errors(preds, labels, [1, 5])
        # Combine the errors across the GPUs
        if cfg.NUM_GPUS > 1:
            top1_err, top5_err = du.scaled_all_reduce([top1_err, top5_err])
        # Copy the errors from GPU to CPU (sync point)
        top1_err, top5_err = top1_err.item(), top5_err.item()
        test_meter.iter_toc()
        # Update and log stats
        test_meter.update_stats(top1_err, top5_err, inputs.size(0) * cfg.NUM_GPUS)
        test_meter.log_iter_stats(cur_epoch, cur_iter)
        test_meter.iter_tic()
    # Log epoch stats
    test_meter.log_epoch_stats(cur_epoch, writer_eval, params, flops, model,
                               is_master=is_master)
    stats = test_meter.get_epoch_stats(cur_epoch)
    test_meter.reset()
    # Save the model's relational graph together with its top-1 error
    if cfg.RGRAPH.SAVE_GRAPH:
        adj_dict = nu.model2adj(model)
        adj_dict = {**adj_dict, 'top1_err': stats['top1_err']}
        graph_dir = '{}/graphs/{}'.format(cfg.OUT_DIR, cfg.RGRAPH.SEED_TRAIN)
        os.makedirs(graph_dir, exist_ok=True)
        np.savez('{}/{}.npz'.format(graph_dir, cur_epoch), **adj_dict)
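
# The .npz files written above can be read back later, e.g. to correlate graph
# structure with accuracy. A usage sketch; the loader below is a hypothetical
# helper, not part of the project (numpy is already imported in this module):
import numpy as np

def load_saved_graph_sketch(npz_path):
    """Loads a graph checkpoint saved by eval_epoch (hypothetical helper)."""
    data = np.load(npz_path)
    # The top-1 error is stored alongside the adjacency arrays
    top1_err = float(data['top1_err'])
    adj_dict = {k: data[k] for k in data.files if k != 'top1_err'}
    return adj_dict, top1_err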
def train_epoch(train_loader, model, loss_fun, optimizer, train_meter, cur_epoch,
                cfg, clf_iter_count, clf_change_lr_iter, clf_max_iter):
    """Performs one epoch of training."""
    global plot_it_x_values
    global plot_it_y_values
    # Shuffle the data; with multiple GPUs the distributed sampler handles it
    # loader.shuffle(train_loader, cur_epoch)
    if cfg.NUM_GPUS > 1:
        train_loader.sampler.set_epoch(cur_epoch)
    # Update the learning rate (LR schedules are currently only supported for SGD)
    lr = optim.get_epoch_lr(cfg, cur_epoch)
    if cfg.OPTIM.TYPE == "sgd":
        optim.set_lr(optimizer, lr)
    if torch.cuda.is_available():
        model.cuda()
    # Enable training mode
    model.train()
    # Note the start time in the timer class defined in utils/timer.py
    train_meter.iter_tic()
    len_train_loader = len(train_loader)
    for cur_iter, (inputs, labels) in enumerate(train_loader):
        # Ensure the inputs are FloatTensors, matching the model weights
        inputs = inputs.type(torch.cuda.FloatTensor)
        inputs, labels = inputs.cuda(), labels.cuda(non_blocking=True)
        # Perform the forward pass
        preds = model(inputs)
        # Compute the loss
        loss = loss_fun(preds, labels)
        # Perform the backward pass
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters
        optimizer.step()
        # Compute the errors (top5_err is currently unused)
        top1_err, top5_err = mu.topk_errors(preds, labels, [1, 5])
        # Combine the stats across the GPUs
        # if cfg.NUM_GPUS > 1:
        #     # Average the loss and errors across GPUs; wait() is called on the
        #     # reductions, so we are ensured to obtain synchronized results
        #     loss, top1_err = du.scaled_all_reduce([loss, top1_err])
        # Copy the stats from GPU to CPU (sync point)
        loss, top1_err = loss.item(), top1_err.item()
        # Only the master process should write the logs used for plotting
        # (the du.is_master_proc() check is currently disabled)
        if True:
            # Record the loss every 19 iterations, skipping iteration 0
            if cur_iter != 0 and cur_iter % 19 == 0:
                # cur_epoch starts at 0, so this is the global iteration index
                plot_it_x_values.append(cur_epoch * len_train_loader + cur_iter)
                plot_it_y_values.append(loss)
                save_plot_values([plot_it_x_values, plot_it_y_values],
                                 ["plot_it_x_values.npy", "plot_it_y_values.npy"],
                                 out_dir=cfg.EPISODE_DIR, isDebug=False)
                # Plot the loss curve
                plot_arrays(x_vals=plot_it_x_values, y_vals=plot_it_y_values,
                            x_name="Iterations", y_name="Loss",
                            dataset_name=cfg.DATASET.NAME,
                            out_dir=cfg.EPISODE_DIR)
        # Measure the time elapsed since iter_tic was called before this loop
        train_meter.iter_toc()
        # Update and log stats
        train_meter.update_stats(top1_err=top1_err, loss=loss, lr=lr,
                                 mb_size=inputs.size(0) * cfg.NUM_GPUS)
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()
    # Log epoch stats
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
    return loss, clf_iter_count
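
# A hypothetical driver showing how the pieces above fit together. Everything
# it receives (cfg, loaders, models, meters, loss_fun, optimizer) is assumed to
# be constructed elsewhere in the project; the clf_* counters are placeholders.
def train_and_evaluate_sketch(cfg, train_loader, test_loader, model, ssl_model,
                              loss_fun, optimizer, train_meter, test_meter):
    clf_iter_count, clf_change_lr_iter, clf_max_iter = 0, 0, 0  # placeholders
    for cur_epoch in range(cfg.OPTIM.MAX_EPOCH):
        # One pass over the training set
        loss, clf_iter_count = train_epoch(
            train_loader, model, loss_fun, optimizer, train_meter, cur_epoch,
            cfg, clf_iter_count, clf_change_lr_iter, clf_max_iter)
        # One pass over the test set, on representations from the SSL model
        test_err = test_epoch(test_loader, model, ssl_model, test_meter,
                              cur_epoch)
        print('Epoch {}: loss {:.4f}, test error {:.2f}'.format(
            cur_epoch, loss, test_err))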