def loss_labels(self, outputs, targets, indices, num_boxes, log=True):
    """Classification loss (NLL).

    targets dicts must contain the key "labels" containing a tensor of
    dim [nb_target_boxes].

    Args:
        outputs: dict of model outputs; must contain 'pred_logits'
            (assumed shape (batch, num_queries, num_classes + 1) — TODO confirm).
        targets: list of per-image target dicts with a "labels" tensor.
        indices: per-image (src_idx, tgt_idx) matching pairs from the matcher.
        num_boxes: unused here; kept for a uniform loss-function signature.
        log: if True, also report the classification error percentage.
    """
    assert 'pred_logits' in outputs
    src_logits = outputs['pred_logits']

    # Flattened (batch_idx, query_idx) positions of the matched predictions.
    idx = self._get_src_permutation_idx(indices)
    # Ground-truth class for each matched prediction, permuted to match `idx`.
    target_classes_o = torch.cat(
        [t["labels"][J] for t, (_, J) in zip(targets, indices)])
    # Every unmatched query gets the "no-object" class (self.num_classes).
    target_classes = torch.full(src_logits.shape[:2],
                                self.num_classes,
                                dtype=torch.int64,
                                device=src_logits.device)
    target_classes[idx] = target_classes_o

    # cross_entropy expects the class dimension second, hence the transpose;
    # self.empty_weight re-weights classes (presumably down-weighting
    # no-object — verify against its construction).
    loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes,
                              self.empty_weight)
    losses = {'loss_ce': loss_ce}
    if log:
        # TODO this should probably be a separate loss, not hacked in this one here
        losses['class_error'] = 100 - accuracy(src_logits[idx],
                                               target_classes_o)[0]
    return losses
def test(load=True, logger=None, epoch=None):
    """Evaluate `net` on `test_loader` and log accuracy and pruning stats.

    Uses module-level state: net, save_dir, config, accm, cent_fn,
    test_loader, accuracy.

    Args:
        load: if True, restore the checkpoint from `save_dir` first.
        logger: logger to write results to; a fresh file logger writing
            `test.log` is created when None.
        epoch: epoch number forwarded to the accuracy meter's log line.
    """
    if load:
        ckpt = torch.load(os.path.join(save_dir, 'model.tar'))
        net.load_state_dict(ckpt['state_dict'])
    if logger is None:
        logging.basicConfig(level=logging.INFO)
        logger = logging.getLogger(config)
        logger.addHandler(
            logging.FileHandler(os.path.join(save_dir, 'test.log'), mode='w'))
    net.eval()
    net.reset_dep()
    accm.reset()
    # Fix: inference-only loop — run under no_grad so autograd graphs are
    # not built, saving memory and time during evaluation.
    with torch.no_grad():
        for x, y in test_loader:
            x = x.cuda()
            y = y.cuda()
            outs = net(x)
            cent = cent_fn(outs, y)
            accm.update([cent.item(), accuracy(outs, y)])
    logger.info(accm.info(header='test', epoch=epoch))
    logger.info('reg {:.4f}'.format(net.get_reg_dep().item()))
    logger.info('pruned size {}'.format(str(net.get_pruned_size())))
    logger.info('pruned size (dep) {}'.format(str(net.get_pruned_size_dep())))
    logger.info('speedup in flops {:.4f}'.format(net.get_speedup_dep()))
    logger.info('memory saving {:.4f}\n'.format(net.get_memory_saving_dep()))
def train():
    """Fine-tune a pretrained network with a sparsity/dependency regularizer.

    Uses module-level state: args, save_dir, config, net, optimizer,
    scheduler, train_loader, accm, cent_fn, freeze_batch_norm, accuracy,
    and the test() function.
    """
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    # Persist the full argument set for reproducibility.
    with open(os.path.join(save_dir, 'args.txt'), 'w') as f:
        for v in vars(args):
            f.write('{}: {}\n'.format(v, getattr(args, v)))
    # Warm-start from the pretrained checkpoint; strict=False tolerates
    # missing/extra keys (e.g. newly added regularizer parameters).
    ckpt = torch.load(os.path.join(args.pretrain_dir, 'model.tar'))
    net.load_state_dict(ckpt['state_dict'], strict=False)
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(config)
    logger.addHandler(
        logging.FileHandler(os.path.join(save_dir, 'train.log'), mode='w'))
    logger.info(str(args) + '\n')
    for epoch in range(1, args.num_epochs + 1):
        accm.reset()
        # NOTE(review): scheduler.step() at the start of the epoch, before any
        # optimizer.step(), is the legacy (pre-PyTorch-1.1) ordering — confirm
        # this is the intended schedule.
        scheduler.step()
        line = 'epoch {} starts with lr'.format(epoch)
        for pg in optimizer.param_groups:
            line += ' {:.3e}'.format(pg['lr'])
        logger.info(line)
        net.train()
        if args.freeze_bn:
            freeze_batch_norm(net)
        for x, y in train_loader:
            x = x.cuda()
            y = y.cuda()
            optimizer.zero_grad()
            outs = net(x)
            cent = cent_fn(outs, y)
            reg = net.get_reg().cuda()
            # Total objective = cross entropy + gamma-weighted regularizer.
            loss = cent + args.gamma * reg
            loss.backward()
            optimizer.step()
            accm.update([cent.item(), accuracy(outs, y)])
        line = accm.info(header='train', epoch=epoch)
        if epoch % args.eval_freq == 0:
            logger.info(line)
            test(load=False, logger=logger, epoch=epoch)
        else:
            logger.info(line + '\n')
        if epoch % args.save_freq == 0:
            torch.save({'state_dict': net.state_dict()},
                       os.path.join(save_dir, 'model.tar'))
    # Final evaluation and checkpoint after the last epoch.
    test(load=False)
    torch.save({'state_dict': net.state_dict()},
               os.path.join(save_dir, 'model.tar'))
def test(self, epoch):
    """Run one full evaluation pass over the test set.

    Logs per-batch progress, prints the final top-1/top-5 averages on the
    master rank, writes TensorBoard scalars and graph summaries, and
    returns the average top-1 accuracy.
    """
    meter_time = AverageMeter('Time', ':6.3f')
    meter_loss = AverageMeter('Loss', ':.4e')
    meter_top1 = AverageMeter('Acc@1', ':6.2f')
    meter_top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(len(self.test_loader),
                             [meter_time, meter_loss, meter_top1, meter_top5],
                             prefix='Test: ')

    # switch to test mode
    self.model.eval()

    with torch.no_grad():
        tick = time.time()
        for step, (images, target) in enumerate(self.test_loader):
            images = images.cuda()
            target = target.cuda()

            # forward pass (secondary model output is unused at test time)
            output, _ = self.model(images)
            loss = self.criterion(output, target)

            # record loss and top-1/top-5 accuracy, weighted by batch size
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            batch = images.size(0)
            meter_loss.update(loss.item(), batch)
            meter_top1.update(acc1[0], batch)
            meter_top5.update(acc5[0], batch)

            # per-batch wall time
            meter_time.update(time.time() - tick)
            tick = time.time()

            if step % self.args.print_freq == 0 and self.args.local_rank == 0:
                progress.display(step)

    if self.args.local_rank == 0:
        print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'.format(
            top1=meter_top1, top5=meter_top5))
        self.writer.add_scalar('Test/Avg_Loss', meter_loss.avg, epoch + 1)
        self.writer.add_scalar('Test/Avg_Top1', meter_top1.avg, epoch + 1)
        self.writer.add_scalar('Test/Avg_Top5', meter_top5.avg, epoch + 1)
        self.summary_graph_adj(self.writer, epoch + 1)
        self.summary_graph_histogram(self.writer, epoch + 1)

    return meter_top1.avg
def test(load=True, logger=None, epoch=None):
    """Evaluate `net` on `test_loader`, logging mean loss and accuracy.

    Uses module-level state: net, save_dir, config, accm, cent_fn,
    test_loader, accuracy.

    Args:
        load: if True, restore the checkpoint from `save_dir` first.
        logger: logger to write results to; a fresh file logger writing
            `test.log` is created when None.
        epoch: epoch number forwarded to the accuracy meter's log line.
    """
    if load:
        ckpt = torch.load(os.path.join(save_dir, 'model.tar'))
        net.load_state_dict(ckpt['state_dict'])
    if logger is None:
        logging.basicConfig(level=logging.INFO)
        logger = logging.getLogger(config)
        logger.addHandler(
            logging.FileHandler(os.path.join(save_dir, 'test.log'), mode='w'))
    net.eval()
    accm.reset()
    # Fix: inference-only loop — run under no_grad so autograd graphs are
    # not built, saving memory and time during evaluation.
    with torch.no_grad():
        for x, y in test_loader:
            x = x.cuda()
            y = y.cuda()
            outs = net(x)
            cent = cent_fn(outs, y)
            accm.update([cent.item(), accuracy(outs, y)])
    logger.info(accm.info(header='test', epoch=epoch) + '\n')
def _infer(self, max_iters, loader):
    """Average cross-entropy loss and top-1/top-5 accuracy over `loader`.

    Args:
        max_iters: expected iteration count (only checked by the commented
            assert; otherwise unused).
        loader: iterable yielding (input, target) CUDA-movable batches.

    Returns:
        (mean_loss, mean_acc1, mean_acc5), each averaged with batch-size
        weighting so a short final batch is handled correctly.
    """
    # assert max_iters == len(loader)
    self.model.eval()
    with torch.no_grad():
        tot = 0
        sum_loss, sum_acc1, sum_acc5 = 0., 0., 0.
        for inp, tar in loader:
            inp, tar = inp.cuda(), tar.cuda()
            bs = tar.shape[0]
            logits = self.model(inp)
            # Fix: dropped the deprecated `.data` accessor — autograd is
            # already disabled by no_grad, so `.data` was redundant and its
            # use is discouraged (it can silently detach from version
            # tracking elsewhere).
            loss, (acc1, acc5) = F.cross_entropy(logits, tar), accuracy(
                logits, tar)
            # Weight every metric by the batch size before summing.
            sum_loss += loss.item() * bs
            sum_acc1 += acc1 * bs
            sum_acc5 += acc5 * bs
            tot += bs
        if self.dist_training:
            pass  # todo: dist — metrics are not yet reduced across ranks
    return sum_loss / tot, sum_acc1 / tot, sum_acc5 / tot
def train_epoch(self, epoch):
    """Train `self.model` for one epoch over `self.train_loader`.

    Tracks timing/loss/accuracy meters, optionally adds an L2 penalty on
    graph edge weights and an auxiliary-head loss, clips gradients, keeps
    an exponential moving average of the loss, and logs to TensorBoard.
    """
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(len(self.train_loader),
                             [batch_time, data_time, losses, top1, top5],
                             prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    self.model.train()

    end = time.time()
    for i, (images, target) in enumerate(self.train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        images = images.cuda()
        target = target.cuda()

        # compute output
        self.optimizer.zero_grad()
        logits, logits_aux = self.model(images)
        loss = self.criterion(logits, target)
        if self.args.graph_wd > 0:
            # Separate weight decay applied only to the (activated) graph
            # edge-weight parameters.
            graph_params = [
                v for k, v in self.model.named_parameters()
                if 'graph_weights' in k and v.requires_grad
            ]
            graph_l2 = 0
            for v in graph_params:
                graph_l2 += (self.model.edge_act(v)**2).sum()
            loss += 0.5 * graph_l2 * self.args.graph_wd
        if self.args.auxiliary:
            # Auxiliary classifier head, weighted into the main loss.
            loss_aux = self.criterion(logits_aux, target)
            loss += self.args.auxiliary_weight * loss_aux
        loss.backward()
        if self.args.grad_clip > 0:
            nn.utils.clip_grad_norm_(self.model.parameters(),
                                     self.args.grad_clip)
        self.optimizer.step()

        # measure accuracy and record loss
        acc1, acc5 = accuracy(logits, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        # EMA of the loss, seeded on the very first batch of the first epoch.
        self.moving_loss = loss.item() if epoch == self.args.start_epoch and i == 0 else \
            (1. - self.mu) * self.moving_loss + self.mu * loss.item()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # NOTE(review): original formatting was collapsed; the TensorBoard
        # writes are assumed to live inside this print_freq/master-rank
        # guard — confirm against the original layout.
        if i % self.args.print_freq == 0 and self.args.local_rank == 0:
            progress.display(i)
            niter = epoch * len(self.train_loader) + i
            self.writer.add_scalar('Train/Sec_per_batch', batch_time.avg, niter)
            self.writer.add_scalar('Train/Avg_Loss', losses.avg, niter)
            self.writer.add_scalar('Train/Avg_Top1', top1.avg, niter)
            self.writer.add_scalar('Train/Avg_Top5', top5.avg, niter)
            self.writer.add_scalar('Train/Moving_Loss', self.moving_loss, niter)
np.expand_dims(samples[i][0], 0).transpose(0, 3, 1, 2)), mx.nd.array( np.expand_dims(samples[i][1], 0).transpose(0, 3, 1, 2)) ], label=[mx.nd.array([np.expand_dims(samples[i][2], 0)])]) mod.forward_backward(dbatch) mod.update() # Save checkpoint and result mod.save_checkpoint(prefix=checkpoint_path, epoch=k, save_optimizer_states=True) np.save(checkpoint_path + '_predict_{}.npy'.format(k), label) # Evaluation score, density = misc.F1_score(label, gt) acc = misc.accuracy(label, gt) logging.info( "Epoch : %d, F1-score : %.4f, accuracy: %.4f, Density : %.4f" % (k, score, acc, density)) if flag_n == False: args.t1 += 0.05 logging.info("update t1:{}".format(args.t1)) if flag_p == False: args.t2 -= 0.01 logging.info("update t2:{}".format(args.t2))
def _train_with_aug(self,
                    max_iters,
                    loader,
                    max_ep,
                    op_cfg,
                    sc_cfg,
                    sync_mid,
                    lsmooth,
                    save_mode='best',
                    prefix='pre'):
    """Train `self.model` for `max_ep` epochs with periodic testing and
    checkpointing; in multi-rank runs, optionally sync all ranks to the
    median-accuracy rank's weights at the end.

    Args:
        max_iters: iterations per epoch (trusted; the len(loader) check is
            commented out).
        loader: training data loader; batches are (inp, tar) or (inp, tar, _).
        max_ep: number of epochs to run.
        op_cfg, sc_cfg: optimizer / LR-scheduler configs for create_op_sc.
        sync_mid: if True (and world_size > 1), reload the median rank's
            checkpoint on every rank after training.
        lsmooth: use self.criterion (presumably label smoothing — confirm)
            instead of plain F.cross_entropy.
        save_mode: 'best' saves on new best test acc; 'last' saves every test.
        prefix: tag used in log lines, checkpoint names, and TB scalar names.

    Returns:
        dict with final 'model' / 'op' state dicts and 'last_iter'.

    NOTE(review): the original formatting was collapsed onto single lines;
    block nesting below is reconstructed from data dependencies (e.g. the
    checkpoint-save code must follow `is_best`) — confirm against history.
    """
    # assert max_iters == len(loader)
    self.model.train()
    max_it = max_iters
    max_global_it = max_ep * max_it
    train_log_freq = max_it // 10
    # Test less often in the first 75% of training, more often late.
    test_freqs = [self.test_freq * 32, self.test_freq]
    speed = AverageMeter(max_it)
    tr_loss, tr_acc1, tr_acc5 = AverageMeter(train_log_freq), AverageMeter(
        train_log_freq), AverageMeter(train_log_freq)
    op, sc = self.create_op_sc(self.model,
                               op_cfg,
                               sc_cfg,
                               iters_per_epoch=max_it)
    op: Optimizer
    sc: LRScheduler
    best_acc1 = 0
    start_train_t = time.time()
    crit = self.criterion if lsmooth else F.cross_entropy
    for ep in range(max_ep):
        # Zero-padded epoch label for aligned log output.
        ep_str = f'%{len(str(max_ep))}d' % (ep + 1)
        is_late = int(ep >= 0.75 * max_ep)
        test_freq = test_freqs[is_late]
        if ep % 32 == 0:
            self.lg.info(f'==> at {self.exp_root}')
        last_t = time.time()
        for it, tup in enumerate(loader):
            # Loader may yield an extra element (e.g. an index) — drop it.
            if len(tup) == 3:
                inp, tar, _ = tup
            else:
                inp, tar = tup
            it_str = f'%{len(str(max_it))}d' % (it + 1)
            global_it = ep * max_it + it
            data_t = time.time()
            if global_it == 1:
                # One-off rank-ordered print to sanity-check dist comms.
                for i in range(self.dist.world_size):
                    if self.dist.rank == i:
                        print(f'rk[{i:2d}] dist test')
                    self.dist.barrier()
            inp, tar = inp.cuda(), tar.cuda()
            cuda_t = time.time()
            logits = self.model(inp)
            loss = crit(logits, tar)
            tr_loss.update(loss.item())
            op.zero_grad()
            loss.backward()
            if self.dist_training:
                pass
            # total_norm: pre-clip grad norm; -233 is a sentinel for "no clip".
            if self.model_grad_clip is not None:
                total_norm = torch.nn.utils.clip_grad_norm_(
                    self.model.parameters(), self.model_grad_clip)
            else:
                total_norm = -233
            clipped_norm = torch.cat([
                p.grad.data.view(-1) for p in self.model.parameters()
            ]).abs_().norm()
            sc.step()  # sc.step() before op.step()
            lr = sc.get_lr()[0]
            # Effective LR after clipping (meaningless when total_norm == -233).
            clipped_lr = lr * (clipped_norm / total_norm)
            op.step()
            acc1, acc5 = accuracy(logits, tar)
            op_t = time.time()
            # Accuracy meters weighted relative to a 128-sample reference batch.
            total_r = tar.shape[0] / 128
            tr_acc1.update(val=acc1, num=total_r)
            tr_acc5.update(val=acc5, num=total_r)
            if global_it % test_freq == 0 or global_it == max_global_it - 1:
                test_loss, test_acc1, test_acc5 = self.test()
                test_t = time.time()
                # self.test() switched to eval mode — switch back.
                self.model.train()
                is_best = test_acc1 >= best_acc1
                best_acc1 = max(test_acc1, best_acc1)
                if self.dist.is_master() and it + 1 == max_it:
                    remain_time, finish_time = speed.time_preds(
                        max_global_it - global_it - 1)
                    self.lg.info(
                        f'ep[{ep_str}/{max_ep}], it[{it_str}/{max_it}]:'
                        f' tr-err1[{100-tr_acc1.last:5.2f}] ({100-tr_acc1.avg:5.2f}),'
                        f' tr-loss[{tr_loss.last:.4f}] ({tr_loss.avg:.4f}),'
                        f' te-err1[{100-test_acc1:5.2f}],'
                        f' te-loss[{test_loss:.4f}],\n'
                        f' data[{data_t-last_t:.3f}],'
                        f' cuda[{cuda_t-data_t:.3f}],'
                        f' bp[{op_t-cuda_t:.3f}],'
                        f' te[{test_t-op_t:.3f}]'
                        f' rem-t[{remain_time}] ({finish_time})'
                        f' lr[{lr:.4g}] ({clipped_lr:.4g})')
                state = {
                    'model': self.model.state_dict(),
                    'op': op.state_dict(),
                    'last_iter': global_it,
                }
                model_ckpt_path = os.path.join(
                    self.ckpt_root,
                    f'rk{self.dist.rank}_{prefix}_{save_mode}.pth.tar')
                if save_mode == 'best' and is_best:
                    self.lg.info(
                        f'==> saving best model ckpt (err{100-test_acc1:.3f}) at {os.path.abspath(model_ckpt_path)}...'
                    )
                    torch.save(state, model_ckpt_path)
                elif save_mode == 'last':
                    torch.save(state, model_ckpt_path)
            speed.update(time.time() - last_t)
            last_t = time.time()
    if self.dist.world_size > 1:
        # Pick the rank whose final test accuracy is the median; save its
        # checkpoint and (optionally) sync every rank to it.
        test_loss, test_acc1, test_acc5 = self.test()
        acc1_ts: torch.Tensor = sync_vals(self.dist, test_acc1, None)
        mid_rank = acc1_ts.argsort()[self.dist.world_size // 2].item()
        mid_ckpt_path = os.path.join(
            self.ckpt_root,
            f'midrk{mid_rank}_{prefix}_enderr{100-acc1_ts[mid_rank].item():.2f}.pth.tar'
        )
        if self.dist.rank == mid_rank:
            torch.save(
                {
                    'model': self.model.state_dict(),
                    'op': op.state_dict(),
                }, mid_ckpt_path)
        self.dist.barrier()
        if sync_mid:
            mid_ckpt = torch.load(mid_ckpt_path, map_location='cpu')
            self.model.load_state_dict(mid_ckpt['model'])
            op.load_state_dict(mid_ckpt['op'])
        best_errs: torch.Tensor = sync_vals(self.dist, 100 - best_acc1, None)
        best_err: float = best_errs.mean().item()
        self.lg.info(
            f'==> {prefix}-training finished, mid rank={mid_rank},'
            f' total time cost: {(time.time()-start_train_t)/60:.2f} min,'
            f' test err @1: mean={best_err:.3f}')
    else:
        best_err = 100 - best_acc1
        self.lg.info(
            f'==> {prefix}-training finished,'
            f' total time cost: {(time.time()-start_train_t)/60:.2f} min,'
            f' test err @1: {100-best_acc1:.3f}')
    # Log the final best error at both ends of the epoch axis so it renders
    # as a horizontal reference line in TensorBoard.
    [
        self.meta_tb_lg.add_scalar(f'{prefix}_best_err', best_err, t)
        for t in [0, max_ep]
    ]
    [
        self.g_tb_lg.add_scalar(f'{prefix}_best_err', best_err, t)
        for t in [0, max_ep]
    ]
    return {
        'model': self.model.state_dict(),
        'op': op.state_dict(),
        'last_iter': max_global_it
    }