def validate(config, testloader, model, writer_dict):
    model.eval()
    ave_loss = AverageMeter()
    nums = config.MODEL.NUM_OUTPUTS
    # One confusion matrix per model output head.
    confusion_matrix = np.zeros(
        (config.DATASET.NUM_CLASSES, config.DATASET.NUM_CLASSES, nums))
    with torch.no_grad():
        for idx, batch in enumerate(testloader):
            image, label, _, _ = batch
            size = label.size()
            image = image.cuda()
            label = label.long().cuda()

            losses, pred = model(image, label)
            if not isinstance(pred, (list, tuple)):
                pred = [pred]
            for i, x in enumerate(pred):
                # Upsample each prediction to label resolution before scoring.
                x = F.interpolate(
                    input=x, size=size[-2:],
                    mode='bilinear', align_corners=config.MODEL.ALIGN_CORNERS
                )
                confusion_matrix[..., i] += get_confusion_matrix(
                    label,
                    x,
                    size,
                    config.DATASET.NUM_CLASSES,
                    config.TRAIN.IGNORE_LABEL
                )

            if idx % 10 == 0:
                print(idx)

            loss = losses.mean()
            if dist.is_distributed():
                reduced_loss = reduce_tensor(loss)
            else:
                reduced_loss = loss
            ave_loss.update(reduced_loss.item())

    if dist.is_distributed():
        # Sum the confusion matrices from all ranks.
        confusion_matrix = torch.from_numpy(confusion_matrix).cuda()
        reduced_confusion_matrix = reduce_tensor(confusion_matrix)
        confusion_matrix = reduced_confusion_matrix.cpu().numpy()

    for i in range(nums):
        pos = confusion_matrix[..., i].sum(1)
        res = confusion_matrix[..., i].sum(0)
        tp = np.diag(confusion_matrix[..., i])
        IoU_array = (tp / np.maximum(1.0, pos + res - tp))
        mean_IoU = IoU_array.mean()
        if dist.get_rank() <= 0:
            logging.info('{} {} {}'.format(i, IoU_array, mean_IoU))

    writer = writer_dict['writer']
    global_steps = writer_dict['valid_global_steps']
    writer.add_scalar('valid_loss', ave_loss.average(), global_steps)
    writer.add_scalar('valid_mIoU', mean_IoU, global_steps)
    writer_dict['valid_global_steps'] = global_steps + 1
    return ave_loss.average(), mean_IoU, IoU_array
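
# Hedged sketch (not this repo's actual implementation): `reduce_tensor`, used
# in validate() above, is assumed to average a tensor across distributed ranks.
# A minimal version is an all-reduce sum followed by a divide; the name
# `reduce_tensor_sketch` is hypothetical.
def reduce_tensor_sketch(tensor):
    import torch.distributed as torch_dist
    reduced = tensor.clone()
    # Sum the tensor over all processes, then divide by the world size so
    # every rank ends up holding the mean value.
    torch_dist.all_reduce(reduced, op=torch_dist.ReduceOp.SUM)
    reduced /= torch_dist.get_world_size()
    return reduced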
def get_sampler(dataset):
    from utils.distributed import is_distributed
    if is_distributed():
        from torch.utils.data.distributed import DistributedSampler
        return DistributedSampler(dataset)
    else:
        return None
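
# Illustrative usage sketch (names are assumptions, not from this file):
# get_sampler() is meant to be passed to a DataLoader so that each distributed
# rank sees a disjoint shard of the dataset. shuffle must stay False whenever
# an explicit sampler is supplied; the config keys mirror common HRNet-style
# settings but are not confirmed here.
def build_testloader_sketch(config, test_dataset):
    from torch.utils.data import DataLoader
    test_sampler = get_sampler(test_dataset)
    return DataLoader(
        test_dataset,
        batch_size=config.TEST.BATCH_SIZE_PER_GPU,
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True,
        sampler=test_sampler)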
def train(config, epoch, num_epoch, epoch_iters, base_lr,
          num_iters, trainloader, optimizer, model, writer_dict):
    # Training
    model.train()

    scaler = GradScaler()
    batch_time = AverageMeter()
    ave_loss = AverageMeter()
    ave_acc = AverageMeter()
    tic = time.time()
    cur_iters = epoch * epoch_iters
    writer = writer_dict['writer']
    global_steps = writer_dict['train_global_steps']

    for i_iter, batch in enumerate(trainloader, 0):
        images, labels, _, _ = batch
        images = images.cuda()
        labels = labels.long().cuda()

        # Forward pass under mixed precision.
        with autocast():
            losses, _, acc = model(images, labels)
        loss = losses.mean()
        acc = acc.mean()

        if dist.is_distributed():
            reduced_loss = reduce_tensor(loss)
        else:
            reduced_loss = loss

        # Backward pass with gradient scaling; scaler.step()/update() replace
        # the plain loss.backward() / optimizer.step() pair.
        model.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # measure elapsed time
        batch_time.update(time.time() - tic)
        tic = time.time()

        # update average loss
        ave_loss.update(reduced_loss.item())
        ave_acc.update(acc.item())

        lr = adjust_learning_rate(optimizer, base_lr, num_iters,
                                  i_iter + cur_iters)

        if i_iter % config.PRINT_FREQ == 0 and dist.get_rank() == 0:
            msg = 'Epoch: [{}/{}] Iter:[{}/{}], Time: {:.2f}, ' \
                  'lr: {}, Loss: {:.6f}, Acc:{:.6f}'.format(
                      epoch, num_epoch, i_iter, epoch_iters,
                      batch_time.average(),
                      [x['lr'] for x in optimizer.param_groups],
                      ave_loss.average(), ave_acc.average())
            logging.info(msg)

    writer.add_scalar('train_loss', ave_loss.average(), global_steps)
    writer_dict['train_global_steps'] = global_steps + 1
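
# Hedged sketch: adjust_learning_rate(), called in train() above, is assumed to
# follow the polynomial ("poly") decay schedule common in semantic segmentation:
# lr = base_lr * (1 - cur_iter / max_iter) ** power. Both the decay form and the
# conventional exponent 0.9 are assumptions, not taken from this file.
def adjust_learning_rate_sketch(optimizer, base_lr, max_iters, cur_iters,
                                power=0.9):
    lr = base_lr * ((1.0 - float(cur_iters) / max_iters) ** power)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr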
def __init__(self, opt: Opt, shared=None):
    # Must call _get_init_model() first so that paths are updated if necessary
    # (e.g., a .dict file)
    init_model, is_finetune = self._get_init_model(opt, shared)
    opt['rank_candidates'] = True
    self._set_candidate_variables(opt)
    super().__init__(opt, shared)

    states: Dict[str, Any]
    if shared:
        states = {}
    else:
        # Note: we cannot change the type of metrics ahead of time, so you
        # should correctly initialize to floats or ints here
        self.criterion = self.build_criterion()
        self.model = self.build_model()
        if self.model is None or self.criterion is None:
            raise AttributeError(
                'build_model() and build_criterion() need to return the model '
                'or criterion'
            )
        train_params = trainable_parameters(self.model)
        total_params = total_parameters(self.model)
        print(
            f"Total parameters: {total_params:,d} ({train_params:,d} trainable)"
        )

        if self.fp16:
            self.model = self.model.half()
        if init_model:
            print('Loading existing model parameters from ' + init_model)
            states = self.load(init_model)
        else:
            states = {}

        if self.use_cuda:
            if self.model_parallel:
                self.model = PipelineHelper().make_parallel(self.model)
            else:
                self.model.cuda()
            if self.data_parallel:
                self.model = torch.nn.DataParallel(self.model)
            self.criterion.cuda()

    self.rank_top_k = opt.get('rank_top_k', -1)

    # Set fixed and vocab candidates if applicable
    self.set_fixed_candidates(shared)
    self.set_vocab_candidates(shared)

    if shared:
        # We don't use get here because hasattr is used on optimizer later.
        if 'optimizer' in shared:
            self.optimizer = shared['optimizer']
    elif self._should_initialize_optimizer():
        # only build an optimizer if we're training
        optim_params = [p for p in self.model.parameters() if p.requires_grad]
        self.init_optim(
            optim_params, states.get('optimizer'), states.get('optimizer_type')
        )
        self.build_lr_scheduler(states, hard_reset=is_finetune)

    if shared is None and is_distributed():
        device_ids = None if self.model_parallel else [self.opt['gpu']]
        self.model = torch.nn.parallel.DistributedDataParallel(
            self.model, device_ids=device_ids, broadcast_buffers=False
        )
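
# Hedged sketch: trainable_parameters() / total_parameters(), used in __init__
# above, are assumed to simply sum element counts over the model's parameters.
# The _sketch suffix marks these stand-ins as hypothetical, not the library's
# actual helpers.
def total_parameters_sketch(model):
    # Count every parameter element in the model.
    return sum(p.numel() for p in model.parameters())

def trainable_parameters_sketch(model):
    # Count only parameters that will receive gradients.
    return sum(p.numel() for p in model.parameters() if p.requires_grad)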