def train_epoch(loader, model, loss_fun, optimizer, scaler, meter, cur_epoch):
    """Performs one epoch of training on NPU devices."""
    # Note: this apex-based variant scales the loss via amp; the scaler arg is unused here
    # Shuffle the data
    data_loader.shuffle(loader, cur_epoch)
    # Update the learning rate
    lr = optim.get_epoch_lr(cur_epoch)
    optim.set_lr(optimizer, lr)
    # Enable training mode
    model.train()
    meter.reset()
    meter.iter_tic()
    for cur_iter, (inputs, labels) in enumerate(loader):
        # Transfer the data to the current NPU device
        inputs = inputs.npu()
        labels = labels.to(torch.int32).npu()
        # Keep a copy of the integer labels for computing the errors below
        p_labels = labels[:]
        # Convert labels to smoothed one-hot vector
        labels_one_hot = net.smooth_one_hot_labels(labels).npu()
        # Apply mixup to the batch (no effect if mixup alpha is 0)
        inputs, labels_one_hot, labels = net.mixup(inputs, labels_one_hot)
        # Perform the forward pass and compute the loss
        preds = model(inputs)
        loss = loss_fun(preds, labels_one_hot)
        torch.npu.current_stream().synchronize()
        # Perform the backward pass (apex-style loss scaling) and update the parameters
        optimizer.zero_grad()
        torch.npu.current_stream().synchronize()
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        torch.npu.current_stream().synchronize()
        optimizer.step()
        torch.npu.current_stream().synchronize()
        # Compute the errors against the original (pre-mixup) labels
        top1_err, top5_err = meters.topk_errors(preds, p_labels, [1, 5])
        # Combine the stats across the GPUs (no reduction if 1 GPU used)
        # loss, top1_err, top5_err = dist.scaled_all_reduce([loss, top1_err, top5_err])
        # Copy the stats from GPU to CPU (sync point)
        loss, top1_err, top5_err = loss.item(), top1_err.item(), top5_err.item()
        meter.iter_toc()
        # Update and log stats
        mb_size = inputs.size(0) * cfg.NUM_GPUS
        meter.update_stats(top1_err, top5_err, loss, lr, mb_size)
        meter.log_iter_stats(cur_epoch, cur_iter)
        meter.iter_tic()
    # Log epoch stats
    meter.log_epoch_stats(cur_epoch)
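# The `net` helpers used above are defined elsewhere. A minimal sketch of what
# they might look like, assuming pycls-style config keys (cfg.MODEL.NUM_CLASSES,
# cfg.TRAIN.LABEL_SMOOTHING, cfg.TRAIN.MIXUP_ALPHA); treat this as illustrative,
# not this repo's actual implementation.
import numpy as np
import torch

def smooth_one_hot_labels(labels):
    """Convert integer labels to label-smoothed one-hot vectors."""
    n_classes, smooth = cfg.MODEL.NUM_CLASSES, cfg.TRAIN.LABEL_SMOOTHING
    neg_val = smooth / n_classes
    pos_val = 1.0 - smooth + neg_val
    one_hot = torch.full((labels.shape[0], n_classes), neg_val,
                         dtype=torch.float, device=labels.device)
    one_hot.scatter_(1, labels.long().view(-1, 1), pos_val)
    return one_hot

def mixup(inputs, labels_one_hot):
    """Apply mixup to a minibatch (no effect if mixup alpha is 0)."""
    alpha = cfg.TRAIN.MIXUP_ALPHA
    if alpha > 0:
        m = np.random.beta(alpha, alpha)
        perm = torch.randperm(labels_one_hot.shape[0])
        inputs = m * inputs + (1.0 - m) * inputs[perm, :]
        labels_one_hot = m * labels_one_hot + (1.0 - m) * labels_one_hot[perm, :]
    # Also return hard labels (argmax) to match the unpacking in train_epoch
    return inputs, labels_one_hot, labels_one_hot.argmax(1)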
def train_epoch(train_loader, model, loss_fun, optimizer, train_meter, cur_epoch):
    """Performs one epoch of training."""
    # Shuffle the data
    loader.shuffle(train_loader, cur_epoch)
    # Update the learning rate
    lr = optim.get_epoch_lr(cur_epoch)
    optim.set_lr(optimizer, lr)
    # Enable training mode
    model.train()
    train_meter.reset()
    train_meter.iter_tic()
    for cur_iter, (inputs, labels) in enumerate(train_loader):
        # Transfer the data to the current GPU device
        inputs, labels = inputs.cuda(), labels.cuda(non_blocking=True)
        # Perform the forward pass
        preds = model(inputs)
        # Compute the loss
        loss = loss_fun(preds, labels)
        # Perform the backward pass
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters
        optimizer.step()
        # Compute the errors
        top1_err, top5_err = meters.topk_errors(preds, labels, [1, 5])
        # Combine the stats across the GPUs (no reduction if 1 GPU used)
        loss, top1_err, top5_err = dist.scaled_all_reduce([loss, top1_err, top5_err])
        # Copy the stats from GPU to CPU (sync point)
        loss, top1_err, top5_err = loss.item(), top1_err.item(), top5_err.item()
        train_meter.iter_toc()
        # Update and log stats
        mb_size = inputs.size(0) * cfg.NUM_GPUS
        train_meter.update_stats(top1_err, top5_err, loss, lr, mb_size)
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()
    # Log epoch stats
    train_meter.log_epoch_stats(cur_epoch)
    print(cfg.OUT_DIR)
    if not hasattr(cfg, 'search_epoch'):
        stats = train_meter.get_epoch_stats(cur_epoch)
        stats = {k: v for k, v in stats.items() if isinstance(v, (int, float))}
        summary_dict2txtfig(stats, prefix='train', step=cur_epoch,
                            textlogger=textlogger, save_fig_sec=60)
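# How a train_epoch of this shape is typically driven. This is a hedged sketch:
# the setup_* helpers are hypothetical placeholders, not functions from this repo.
def train_model():
    model, loss_fun, optimizer = setup_model()          # hypothetical
    train_loader = setup_train_loader()                 # hypothetical
    train_meter = setup_train_meter(len(train_loader))  # hypothetical
    for cur_epoch in range(cfg.OPTIM.MAX_EPOCH):
        train_epoch(train_loader, model, loss_fun, optimizer, train_meter, cur_epoch)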
def train_epoch(loader, model, ema, loss_fun, optimizer, scaler, meter, cur_epoch):
    """Performs one epoch of training."""
    # Shuffle the data
    data_loader.shuffle(loader, cur_epoch)
    # Update the learning rate
    lr = optim.get_epoch_lr(cur_epoch)
    optim.set_lr(optimizer, lr)
    # Enable training mode
    model.train()
    ema.train()
    meter.reset()
    meter.iter_tic()
    for cur_iter, (inputs, labels) in enumerate(loader):
        # Transfer the data to the current GPU device
        inputs, labels = inputs.cuda(), labels.cuda(non_blocking=True)
        # Convert labels to smoothed one-hot vector
        labels_one_hot = net.smooth_one_hot_labels(labels)
        # Apply mixup to the batch (no effect if mixup alpha is 0)
        inputs, labels_one_hot, labels = net.mixup(inputs, labels_one_hot)
        # Perform the forward pass and compute the loss
        with amp.autocast(enabled=cfg.TRAIN.MIXED_PRECISION):
            preds = model(inputs)
            loss = loss_fun(preds, labels_one_hot)
        # Perform the backward pass and update the parameters
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        # Update ema weights
        net.update_model_ema(model, ema, cur_epoch, cur_iter)
        # Compute the errors
        top1_err, top5_err = meters.topk_errors(preds, labels, [1, 5])
        # Combine the stats across the GPUs (no reduction if 1 GPU used)
        loss, top1_err, top5_err = dist.scaled_all_reduce([loss, top1_err, top5_err])
        # Copy the stats from GPU to CPU (sync point)
        loss, top1_err, top5_err = loss.item(), top1_err.item(), top5_err.item()
        meter.iter_toc()
        # Update and log stats
        mb_size = inputs.size(0) * cfg.NUM_GPUS
        meter.update_stats(top1_err, top5_err, loss, lr, mb_size)
        meter.log_iter_stats(cur_epoch, cur_iter)
        meter.iter_tic()
    # Log epoch stats
    meter.log_epoch_stats(cur_epoch)
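# `net.update_model_ema` is not shown in this section. A generic sketch of what
# an EMA update of this form could reduce to; the cfg key and the first-epoch
# copy rule are assumptions for illustration, not necessarily what this repo does.
def update_model_ema(model, ema, cur_epoch, cur_iter):
    """Update the exponential moving average of the model weights."""
    alpha = cfg.OPTIM.EMA_ALPHA  # assumed config key, e.g. 1e-5
    # During the first epoch, copy the weights instead of averaging
    # (cur_iter could additionally gate an update period; omitted here)
    alpha = 1.0 if cur_epoch == 0 else alpha
    with torch.no_grad():
        for p_ema, p in zip(ema.parameters(), model.parameters()):
            p_ema.mul_(1.0 - alpha).add_(p, alpha=alpha)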
def train_epoch(train_loader, model, loss_fun, optimizer, train_meter, cur_epoch):
    """Performs one epoch of training."""
    # Shuffle the data
    loader.shuffle(train_loader, cur_epoch)
    # Update the learning rate
    lr = optim.get_epoch_lr(cur_epoch)
    optim.set_lr(optimizer, lr)
    # Enable training mode
    model.train()
    train_meter.iter_tic()
    for cur_iter, (inputs, labels) in enumerate(train_loader):
        # Transfer the data to the current GPU device
        inputs, labels = inputs.cuda(), labels.cuda(non_blocking=True)
        # Perform the forward pass
        preds = model(inputs)
        # Compute the loss
        loss = loss_fun(preds, labels)
        # Perform the backward pass
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters
        optimizer.step()
        # Compute the errors
        top1_err, top5_err = meters.topk_errors(preds, labels, [1, 5])
        # Combine the stats across the GPUs
        if cfg.NUM_GPUS > 1:
            loss, top1_err, top5_err = dist.scaled_all_reduce([loss, top1_err, top5_err])
        # Copy the stats from GPU to CPU (sync point)
        loss, top1_err, top5_err = loss.item(), top1_err.item(), top5_err.item()
        train_meter.iter_toc()
        # Update and log stats
        train_meter.update_stats(top1_err, top5_err, loss, lr,
                                 inputs.size(0) * cfg.NUM_GPUS)
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()
    # Log epoch stats
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
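# `meters.topk_errors` returns top-k error percentages for the batch. A
# self-contained sketch of the standard implementation pattern (illustrative):
def topk_errors(preds, labels, ks):
    """For each k in ks, return the top-k error (in percent) of the batch."""
    _, top_inds = torch.topk(preds, max(ks), dim=1, largest=True, sorted=True)
    # (batch, max_k) -> (max_k, batch), then compare against repeated labels
    top_inds = top_inds.t()
    correct = top_inds.eq(labels.view(1, -1).expand_as(top_inds))
    errors = []
    for k in ks:
        n_correct = correct[:k, :].reshape(-1).float().sum()
        errors.append((1.0 - n_correct / preds.size(0)) * 100.0)
    return errors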
def train_epoch(train_loader, model, loss_fun, optimizer, train_meter, cur_epoch):
    """Performs one epoch of training."""
    # Update drop path prob for NAS
    if cfg.MODEL.TYPE == "nas":
        m = model.module if cfg.NUM_GPUS > 1 else model
        m.set_drop_path_prob(cfg.NAS.DROP_PROB * cur_epoch / cfg.OPTIM.MAX_EPOCH)
    # Shuffle the data
    loader.shuffle(train_loader, cur_epoch)
    # Update the learning rate per epoch
    if not cfg.OPTIM.ITER_LR:
        lr = optim.get_epoch_lr(cur_epoch)
        optim.set_lr(optimizer, lr)
    # Enable training mode
    model.train()
    train_meter.iter_tic()
    for cur_iter, (inputs, labels) in enumerate(train_loader):
        # Update the learning rate per iter (fractional epoch)
        if cfg.OPTIM.ITER_LR:
            lr = optim.get_epoch_lr(cur_epoch + cur_iter / len(train_loader))
            optim.set_lr(optimizer, lr)
        # Transfer the data to the current GPU device
        inputs, labels = inputs.cuda(), labels.cuda(non_blocking=True)
        # Perform the forward pass
        preds = model(inputs)
        # Compute the loss (NAS models return an extra auxiliary-head prediction)
        if isinstance(preds, tuple):
            loss = loss_fun(preds[0], labels) + cfg.NAS.AUX_WEIGHT * loss_fun(preds[1], labels)
            preds = preds[0]
        else:
            loss = loss_fun(preds, labels)
        # Perform the backward pass
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters
        optimizer.step()
        # Compute the errors
        if cfg.TASK == "col":
            # Colorization: flatten per-pixel predictions
            preds = preds.permute(0, 2, 3, 1)
            preds = preds.reshape(-1, preds.size(3))
            labels = labels.reshape(-1)
            mb_size = inputs.size(0) * inputs.size(2) * inputs.size(3) * cfg.NUM_GPUS
        else:
            mb_size = inputs.size(0) * cfg.NUM_GPUS
        if cfg.TASK == "seg":
            # top1_err is in fact inter; top5_err is in fact union
            top1_err, top5_err = meters.inter_union(preds, labels, cfg.MODEL.NUM_CLASSES)
        else:
            ks = [1, min(5, cfg.MODEL.NUM_CLASSES)]  # rot only has 4 classes
            top1_err, top5_err = meters.topk_errors(preds, labels, ks)
        # Combine the stats across the GPUs (no reduction if 1 GPU used)
        loss, top1_err, top5_err = dist.scaled_all_reduce([loss, top1_err, top5_err])
        # Copy the stats from GPU to CPU (sync point)
        loss = loss.item()
        if cfg.TASK == "seg":
            top1_err, top5_err = top1_err.cpu().numpy(), top5_err.cpu().numpy()
        else:
            top1_err, top5_err = top1_err.item(), top5_err.item()
        train_meter.iter_toc()
        # Update and log stats
        train_meter.update_stats(top1_err, top5_err, loss, lr, mb_size)
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()
    # Log epoch stats
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
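# `dist.scaled_all_reduce` averages the stats across processes. A sketch of the
# usual pattern (no-op for a single GPU; queue asynchronous reductions, wait,
# then scale in place), hedged as an illustration of the convention:
def scaled_all_reduce(tensors):
    """All-reduce the tensors and scale by 1/NUM_GPUS; no-op for 1 GPU."""
    if cfg.NUM_GPUS == 1:
        return tensors
    # Queue all reductions asynchronously, then wait for them to finish
    reductions = [torch.distributed.all_reduce(t, async_op=True) for t in tensors]
    for reduction in reductions:
        reduction.wait()
    for t in tensors:
        t.mul_(1.0 / cfg.NUM_GPUS)
    return tensors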
def search_epoch(train_loader, model, loss_fun, optimizer, train_meter, cur_epoch):
    """Performs one epoch of differentiable architecture search."""
    m = model.module if cfg.NUM_GPUS > 1 else model
    # Shuffle the data
    loader.shuffle(train_loader[0], cur_epoch)
    loader.shuffle(train_loader[1], cur_epoch)
    # Update the learning rate per epoch
    if not cfg.OPTIM.ITER_LR:
        lr = optim.get_epoch_lr(cur_epoch)
        optim.set_lr(optimizer[0], lr)
    # Enable training mode
    model.train()
    train_meter.iter_tic()
    trainB_iter = iter(train_loader[1])
    for cur_iter, (inputs, labels) in enumerate(train_loader[0]):
        # Update the learning rate per iter
        if cfg.OPTIM.ITER_LR:
            lr = optim.get_epoch_lr(cur_epoch + cur_iter / len(train_loader[0]))
            optim.set_lr(optimizer[0], lr)
        # Transfer the data to the current GPU device
        inputs, labels = inputs.cuda(), labels.cuda(non_blocking=True)
        # Update the architecture parameters on the second data split
        if cur_epoch + cur_iter / len(train_loader[0]) >= cfg.OPTIM.ARCH_EPOCH:
            try:
                inputsB, labelsB = next(trainB_iter)
            except StopIteration:
                trainB_iter = iter(train_loader[1])
                inputsB, labelsB = next(trainB_iter)
            inputsB, labelsB = inputsB.cuda(), labelsB.cuda(non_blocking=True)
            optimizer[1].zero_grad()
            loss = m._loss(inputsB, labelsB)
            loss.backward()
            optimizer[1].step()
        # Perform the forward pass
        preds = model(inputs)
        # Compute the loss
        loss = loss_fun(preds, labels)
        # Perform the backward pass
        optimizer[0].zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        # Update the parameters
        optimizer[0].step()
        # Compute the errors
        if cfg.TASK == "col":
            preds = preds.permute(0, 2, 3, 1)
            preds = preds.reshape(-1, preds.size(3))
            labels = labels.reshape(-1)
            mb_size = inputs.size(0) * inputs.size(2) * inputs.size(3) * cfg.NUM_GPUS
        else:
            mb_size = inputs.size(0) * cfg.NUM_GPUS
        if cfg.TASK == "seg":
            # top1_err is in fact inter; top5_err is in fact union
            top1_err, top5_err = meters.inter_union(preds, labels, cfg.MODEL.NUM_CLASSES)
        else:
            ks = [1, min(5, cfg.MODEL.NUM_CLASSES)]  # rot only has 4 classes
            top1_err, top5_err = meters.topk_errors(preds, labels, ks)
        # Combine the stats across the GPUs (no reduction if 1 GPU used)
        loss, top1_err, top5_err = dist.scaled_all_reduce([loss, top1_err, top5_err])
        # Copy the stats from GPU to CPU (sync point)
        loss = loss.item()
        if cfg.TASK == "seg":
            top1_err, top5_err = top1_err.cpu().numpy(), top5_err.cpu().numpy()
        else:
            top1_err, top5_err = top1_err.item(), top5_err.item()
        train_meter.iter_toc()
        # Update and log stats
        train_meter.update_stats(top1_err, top5_err, loss, lr, mb_size)
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()
    # Log epoch stats
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
    # Log genotype
    genotype = m.genotype()
    logger.info("genotype = %s", genotype)
    logger.info(F.softmax(m.net_.alphas_normal, dim=-1))
    logger.info(F.softmax(m.net_.alphas_reduce, dim=-1))
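# search_epoch expects `optimizer` to be a pair: optimizer[0] for the network
# weights w and optimizer[1] for the architecture parameters alpha, updated
# alternately as in first-order DARTS. A hypothetical construction of that pair;
# the "alphas" name filter, learning rates, and hyperparameters below are
# assumptions for illustration only.
def build_search_optimizers(model):
    w_params = [p for n, p in model.named_parameters() if "alphas" not in n]
    a_params = [p for n, p in model.named_parameters() if "alphas" in n]
    return [
        torch.optim.SGD(w_params, lr=0.025, momentum=0.9, weight_decay=3e-4),
        torch.optim.Adam(a_params, lr=3e-4, betas=(0.5, 0.999), weight_decay=1e-3),
    ]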
def train_epoch(train_loader, model, loss_fun, optimizer, train_meter, cur_epoch,
                cfg, clf_iter_count, clf_change_lr_iter, clf_max_iter):
    """Performs one epoch of training."""
    global plot_epoch_xvalues
    global plot_epoch_yvalues
    global plot_it_x_values
    global plot_it_y_values
    # Shuffle the data
    if cfg.NUM_GPUS > 1:
        train_loader.sampler.set_epoch(cur_epoch)
    # Update the learning rate (LR schedules are currently supported only for SGD)
    lr = optim.get_epoch_lr(cfg, cur_epoch)
    if cfg.OPTIM.TYPE == "sgd":
        optim.set_lr(optimizer, lr)
    if torch.cuda.is_available():
        model.cuda()
    # Enable training mode
    model.train()
    # Note the start time in the timer class defined in utils/timer.py
    train_meter.iter_tic()
    len_train_loader = len(train_loader)
    for cur_iter, (inputs, labels) in enumerate(train_loader):
        # Ensure inputs are float tensors, matching the model weights
        inputs = inputs.type(torch.cuda.FloatTensor)
        inputs, labels = inputs.cuda(), labels.cuda(non_blocking=True)
        # Perform the forward pass
        preds = model(inputs)
        # Compute the loss
        loss = loss_fun(preds, labels)
        # Perform the backward pass
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters
        optimizer.step()
        # Compute the errors
        top1_err, top5_err = mu.topk_errors(preds, labels, [1, 5])
        # Combining the stats across GPUs (du.scaled_all_reduce) is disabled here;
        # copy the stats from GPU to CPU (sync point)
        loss, top1_err = loss.item(), top1_err.item()
        # Log the loss every 19 iterations for plotting
        # (originally only the master process wrote these logs)
        if True:
            if cur_iter != 0 and cur_iter % 19 == 0:
                # cur_epoch starts at 0, so this is a global iteration index
                plot_it_x_values.append(cur_epoch * len_train_loader + cur_iter)
                plot_it_y_values.append(loss)
                save_plot_values([plot_it_x_values, plot_it_y_values],
                                 ["plot_it_x_values.npy", "plot_it_y_values.npy"],
                                 out_dir=cfg.EPISODE_DIR, isDebug=False)
                # Plot loss graphs
                plot_arrays(x_vals=plot_it_x_values, y_vals=plot_it_y_values,
                            x_name="Iterations", y_name="Loss",
                            dataset_name=cfg.DATASET.NAME, out_dir=cfg.EPISODE_DIR)
        # Measure the time elapsed since iter_tic above
        train_meter.iter_toc()
        train_meter.update_stats(top1_err=top1_err, loss=loss, lr=lr,
                                 mb_size=inputs.size(0) * cfg.NUM_GPUS)
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()
    # Log epoch stats
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
    return loss, clf_iter_count
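# `save_plot_values` persists the logged loss curves. A hypothetical minimal
# version of such a helper (the real one may also handle debug modes and more):
import os
import numpy as np

def save_plot_values(arrays, names, out_dir, isDebug=False):
    """Save each list of plot values as a .npy file under out_dir."""
    for arr, name in zip(arrays, names):
        np.save(os.path.join(out_dir, name), np.asarray(arr))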
def train_epoch_pseudo(train_loader, model, loss_fun, optimizer, train_meter, cur_epoch):
    """Performs one epoch of semi-supervised training."""
    # Update drop path prob for NAS
    if cfg.MODEL.TYPE == "nas":
        m = model.module if cfg.NUM_GPUS > 1 else model
        m.set_drop_path_prob(cfg.NAS.DROP_PROB * cur_epoch / cfg.OPTIM.MAX_EPOCH)
    # Update the learning rate per epoch
    if not cfg.OPTIM.ITER_LR:
        lr = optim.get_epoch_lr(cur_epoch)
        optim.set_lr(optimizer, lr)
    # Enable training mode
    model.train()
    train_meter.iter_tic()
    # Iterate for as long as the longer of the two loaders, reshuffling and
    # restarting either one whenever it is exhausted
    max_iter = max(len(train_loader[0]), len(train_loader[1]))
    loader.shuffle(train_loader[0], cur_epoch)
    loader.shuffle(train_loader[1], cur_epoch)
    label_iter = iter(train_loader[0])
    unlabel_iter = iter(train_loader[1])
    # Per-sample loss for masking out low-confidence pseudo-labels
    criteria_u = nn.CrossEntropyLoss(reduction='none').cuda()
    for cur_iter in range(max_iter):
        try:
            label_im, _, labels = next(label_iter)
        except StopIteration:
            loader.shuffle(train_loader[0], cur_epoch)
            label_iter = iter(train_loader[0])
            label_im, _, labels = next(label_iter)
        try:
            unlabel_im1, unlabel_im2, _ = next(unlabel_iter)
        except StopIteration:
            loader.shuffle(train_loader[1], cur_epoch)
            unlabel_iter = iter(train_loader[1])
            unlabel_im1, unlabel_im2, _ = next(unlabel_iter)
        # Update the learning rate per iter
        if cfg.OPTIM.ITER_LR:
            lr = optim.get_epoch_lr(cur_epoch + cur_iter / max_iter)
            optim.set_lr(optimizer, lr)
        # Transfer the data to the current GPU device
        label_im, labels = label_im.cuda(), labels.cuda(non_blocking=True)
        unlabel_im1, unlabel_im2 = unlabel_im1.cuda(), unlabel_im2.cuda()
        # Forward all images in a single batch, then split the logits back
        imgs = torch.cat([label_im, unlabel_im1, unlabel_im2], dim=0)
        logits = model(imgs)
        logits_label = logits[:len(labels)]
        logits_unlabel1, logits_unlabel2 = torch.split(logits[len(labels):], unlabel_im1.shape[0])
        # Supervised loss on the labeled images
        loss_label = loss_fun(logits_label, labels)
        # Generate pseudo-labels and a confidence mask from the first view
        with torch.no_grad():
            probs = torch.softmax(logits_unlabel1, dim=1)
            scores, lbs_u_guess = torch.max(probs, dim=1)
            mask = scores.ge(cfg.TRAIN.PSD_THRESHOLD).float()
        # Unsupervised loss: plain pseudo-labeling ('psd') applies it to the same
        # view; FixMatch-style ('fix') applies it to the second (strong) view
        if cfg.TASK == 'psd':
            loss_unlabel = (criteria_u(logits_unlabel1, lbs_u_guess) * mask).mean()
        elif cfg.TASK == 'fix':
            loss_unlabel = (criteria_u(logits_unlabel2, lbs_u_guess) * mask).mean()
        else:
            loss_unlabel = 0
        loss = loss_label + loss_unlabel
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Compute the errors on the labeled images
        mb_size = label_im.size(0) * cfg.NUM_GPUS
        ks = [1, min(5, cfg.MODEL.NUM_CLASSES)]  # rot only has 4 classes
        top1_err, top5_err = meters.topk_errors(logits_label, labels, ks)
        # Combine the stats across the GPUs (no reduction if 1 GPU used);
        # only the supervised loss is logged
        loss_label, top1_err, top5_err = dist.scaled_all_reduce([loss_label, top1_err, top5_err])
        # Copy the stats from GPU to CPU (sync point)
        loss, top1_err, top5_err = loss_label.item(), top1_err.item(), top5_err.item()
        train_meter.iter_toc()
        # Update and log stats
        train_meter.update_stats(top1_err, top5_err, loss, lr, mb_size)
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()
    # Log epoch stats
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
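# The confidence-masked pseudo-label loss above follows the FixMatch recipe:
# pseudo-labels come from one view, the loss is applied to the same or a strongly
# augmented view, and low-confidence samples are masked out. A standalone toy
# example (all shapes and the threshold value are made up):
import torch
import torch.nn as nn

logits_weak = torch.randn(4, 10)    # predictions used to generate pseudo-labels
logits_strong = torch.randn(4, 10)  # predictions the loss is applied to
threshold = 0.95                    # plays the role of cfg.TRAIN.PSD_THRESHOLD

with torch.no_grad():
    probs = torch.softmax(logits_weak, dim=1)
    scores, pseudo_labels = torch.max(probs, dim=1)
    mask = scores.ge(threshold).float()  # 1 for confident samples, else 0

criteria_u = nn.CrossEntropyLoss(reduction="none")
loss_unlabel = (criteria_u(logits_strong, pseudo_labels) * mask).mean()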