def train(self, epoch, print_step=100):
    """Run one training epoch, logging Top1/Top5 accuracy and mean loss
    every `print_step` mini-batches."""
    msglogger.info("Epoch: {}".format(epoch))
    self.model.train()
    classerr = tnt.ClassErrorMeter(accuracy=True, topk=(1, 5))
    running_loss = 0.0
    for step, (batch_x, batch_y) in enumerate(self.trainloader):
        batch_x, batch_y = batch_x.to(self.device), batch_y.to(self.device)
        self.optimizer.zero_grad()
        preds = self.model(batch_x)
        batch_loss = self.criterion(preds, batch_y)
        batch_loss.backward()
        self.optimizer.step()
        running_loss += batch_loss.item()
        classerr.add(preds.detach(), batch_y)
        if (step + 1) % print_step == 0:
            top1, top5 = classerr.value()
            msglogger.info(
                '[%d / %d] ==> Top1: %.3f Top5: %.3f Loss: %.3f\n',
                step + 1, len(self.trainloader),
                top1, top5, running_loss / (step + 1))
def test(model, criterion):
    """Evaluate `model` on the module-level test loader.

    When `dump_act` is set to an integer, only that single batch is
    evaluated (activation-dump mode). Returns (top1, top5, mean loss).
    """
    dump_act = 2
    correct = 0
    total = 0
    classerr = tnt.ClassErrorMeter(accuracy=True, topk=(1, 5))
    losses = {'objective_loss': tnt.AverageValueMeter()}
    with torch.no_grad():
        for batch_idx, (images, labels) in enumerate(testloader):
            # Either process every batch (dump_act is None) or just the
            # one selected for dumping.
            should_run = dump_act is None or batch_idx == dump_act
            if not should_run:
                continue
            images, labels = images.cuda(), labels.cuda()
            # dump_to_npy(name= 'input.activation.int8.'+str(batch_idx), tensor=images)
            outputs = model(images)
            classerr.add(outputs.data, labels)
            loss = criterion(outputs, labels)
            losses['objective_loss'].add(loss.item())
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            if total % 1000 == 0:
                print('[{0}] accuracy {1}%'.format(total, str(correct / total * 100)))
    acc = correct / total
    print('Accuracy of the network on the 10000 test images: %d %%' % (100 * acc))
    top1, top5 = classerr.value()[0], classerr.value()[1]
    print("Top1 = %.3f, Top5 = %.3f, loss = %.3f\n" % (top1, top5, losses["objective_loss"].mean))
    return top1, top5, losses['objective_loss'].mean
def run_epoch(stage, state, data_loader):
    """Run one full pass over `data_loader`.

    stage = 'train' or 'test' or 'val' or anything; only 'train' enables
    gradient updates (and steps the LR scheduler at epoch end).
    Returns {'loss': mean_loss, 'acc': accuracy}.
    """
    if stage == 'train':
        state.model.train()
    else:
        state.model.eval()
    pbar = tqdm(total=len(data_loader), leave=False)
    _loss = meter.AverageValueMeter()
    _acc = meter.ClassErrorMeter(accuracy=True)
    _conf = meter.ConfusionMeter(k=10, normalized=True)
    for batch_idx, (data, target) in enumerate(data_loader):
        data, target = data.to(state.args.device), target.to(state.args.device)
        if stage == 'train':
            state.optimizer.zero_grad()
        output = state.model(data)
        loss = F.nll_loss(output, target)
        if stage == 'train':
            loss.backward()
            state.optimizer.step()
        state.writer.add_scalar(stage + '/loss-iter', loss.mean(),
                                (batch_idx + state.epoch * len(data_loader)))  # * data.size()[0]
        _loss.add(loss.mean().item())
        _acc.add(output, target)
        _conf.add(output, target)
        if batch_idx % state.args.pbar_interval == 0:
            pbar.desc = '{:6s}'.format(stage)
            # BUG FIX: AverageValueMeter.value() returns a (mean, std) tuple
            # and ClassErrorMeter.value() returns a list -- neither can be
            # formatted with '{:.4f}'. Use the scalar mean / top-1 value.
            pbar.postfix = 'Loss {:.4f} Acc {:.4f}%'.format(_loss.value()[0], _acc.value()[0])
            pbar.update(state.args.pbar_interval)
    if stage == 'train':
        state.scheduler.step()
    pbar.close()
    # if stage != 'train' or 'train_test' not in stage:
    state.epoch_pbar.desc += ' {:6s}: loss {:.4f}, Acc {:.4f}% |'.format(
        stage, _loss.value()[0], _acc.value()[0])
    state.epoch_pbar.update()
    # if stage!='train':
    state.writer.add_scalar(stage + '/avg_loss-epoch', _loss.value()[0], state.epoch)
    state.writer.add_scalar(stage + '/avg_acc-epoch', _acc.value()[0], state.epoch)
    state.writer.add_heatmap(stage + '/conf_matrix-epoch', _conf.value(), state.epoch,
                             y_title=data_loader.dataset.classes,
                             x_title=data_loader.dataset.classes)
    result = {'loss': _loss.value()[0], 'acc': _acc.value()[0]}
    return result
def testClassErrorMeteri_batch1(self):
    """A single, correct un-batched prediction must give zero top-1 error."""
    error_meter = meter.ClassErrorMeter(topk=[1])
    prediction = torch.tensor([1, 0, 0])
    if hasattr(torch, "arange"):
        label = torch.arange(0, 1)
    else:
        label = torch.range(0, 0)  # legacy fallback for very old torch
    error_meter.add(prediction, label)
    self.assertEqual(error_meter.value(), [0], "All should be correct")
def train(train_loader, model, criterion, optimizer, epoch, loggers, args):
    """Train `model` for one epoch over `train_loader`.

    Logs Top1/Top5 accuracy and mean loss every `args.print_freq` steps.
    Returns the per-step accuracy history as a list of [top1, top5] pairs.
    """
    # Both meters receive the same value here, so 'Overall Loss' and
    # 'Objective Loss' stay identical in this implementation.
    losses = OrderedDict([('Overall Loss', meter.AverageValueMeter()),
                          ('Objective Loss', meter.AverageValueMeter())])
    classerr = meter.ClassErrorMeter(accuracy=True, topk=(1, 5))
    batch_time = meter.AverageValueMeter()
    data_time = meter.AverageValueMeter()
    total_samples = len(train_loader.sampler)
    batch_size = train_loader.batch_size
    steps_per_epoch = math.ceil(total_samples / batch_size)
    msglogger.info("{} samples ({} per mini-batch)".format(
        total_samples, batch_size))
    model.train()
    acc_stats = []
    end = time.time()
    for train_step, (inputs, target) in enumerate(train_loader):
        # Time spent waiting on the data loader.
        data_time.add(time.time() - end)
        inputs, target = inputs.to(args.device), target.to(args.device)
        output = model(inputs)
        loss = criterion(output, target)
        classerr.add(output.data, target)
        acc_stats.append([classerr.value(1), classerr.value(5)])
        losses['Objective Loss'].add(loss.item())
        losses['Overall Loss'].add(loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        batch_time.add(time.time() - end)
        steps_completed = train_step + 1
        if steps_completed % args.print_freq == 0:
            errs = OrderedDict()
            errs['Top1'] = classerr.value(1)
            errs['Top5'] = classerr.value(5)
            stats_dict = OrderedDict()
            for loss_name, loss_value in losses.items():
                stats_dict[loss_name] = loss_value.mean
            stats_dict.update(errs)
            stats_dict['LR'] = optimizer.param_groups[0]['lr']
            stats_dict['Time'] = batch_time.mean
            # NOTE(review): `stats` is assembled but never passed to a
            # logger in this variant -- only msglogger output is emitted.
            stats = ('Performance/Training/', stats_dict)
            msglogger.info(
                'Train epoch: %d [%5d/%5d] Top1: %.3f Top5: %.3f Loss: %.3f',
                epoch, steps_completed, steps_per_epoch, errs['Top1'],
                errs['Top5'], losses['Objective Loss'].mean)
        end = time.time()
    return acc_stats
def _validate(data_loader, model, criterion, loggers, print_freq, epoch=-1):
    """Execute the validation/test loop.

    Returns (top1, top5, mean objective loss) over the whole loader.
    """
    losses = {'objective_loss': tnt.AverageValueMeter()}
    classerr = tnt.ClassErrorMeter(accuracy=True, topk=(1, 5))
    batch_time = tnt.AverageValueMeter()
    # if nclasses <= 10:
    #     # Log the confusion matrix only if the number of classes is small
    #     confusion = tnt.ConfusionMeter(10)
    total_samples = len(data_loader.sampler)
    batch_size = data_loader.batch_size
    total_steps = total_samples / batch_size
    msglogger.info('%d samples (%d per mini-batch)', total_samples, batch_size)
    # Switch to evaluation mode
    model.eval()
    end = time.time()
    for validation_step, (inputs, target) in enumerate(data_loader):
        with PytorchNoGrad():
            # BUG FIX: `async` became a reserved keyword in Python 3.7 (a
            # SyntaxError here); PyTorch renamed the argument to
            # `non_blocking` with identical semantics.
            target = target.cuda(non_blocking=True)
            input_var = get_inference_var(inputs)
            target_var = get_inference_var(target)
            # compute output
            output = model(input_var)
            loss = criterion(output, target_var)
            # measure accuracy and record loss
            losses['objective_loss'].add(loss.item())
            classerr.add(output.data, target)
            # if confusion:
            #     confusion.add(output.data, target)
            # measure elapsed time
            batch_time.add(time.time() - end)
            end = time.time()
            steps_completed = (validation_step + 1)
            if steps_completed % print_freq == 0:
                stats = ('', OrderedDict([('Loss', losses['objective_loss'].mean),
                                          ('Top1', classerr.value(1)),
                                          ('Top5', classerr.value(5))]))
                distiller.log_training_progress(stats, None, epoch, steps_completed,
                                                total_steps, print_freq, loggers)
    msglogger.info('==> Top1: %.3f Top5: %.3f Loss: %.3f\n',
                   classerr.value()[0], classerr.value()[1],
                   losses['objective_loss'].mean)
    # if confusion:
    #     msglogger.info('==> Confusion:\n%s', str(confusion.value()))
    return classerr.value(1), classerr.value(5), losses['objective_loss'].mean
def test(model, dataloader, num_workers, batch_size, resultpath):
    """Evaluate `model` on `dataloader` and persist metrics to `resultpath`.

    Metrics collected:
      1. Accuracy: fraction of correctly predicted samples (test_acc).
      2. Per-class average precision (test_ap).
      3. AUC over the positive-class probability. By the dataset's file
         naming, label 0 is negative and 1 is positive.
      4. Confusion matrix, from which sensitivity/specificity etc. can be
         derived (test_conf).

    Returns the (test_acc, test_ap, test_auc) meter objects.
    """
    print("num test = {}".format(len(dataloader.dataset)))
    # Whole-test-set accuracy
    test_acc = meter.ClassErrorMeter(topk=[1], accuracy=True)
    # Per-class precision
    test_ap = meter.APMeter()
    # AUC expects the predicted probability of the positive class
    test_auc = meter.AUCMeter()
    # Confusion matrix
    test_conf = meter.ConfusionMeter(k=2, normalized=False)
    result_writer = ResultsWriter(str(resultpath), overwrite=False)
    with torch.no_grad():
        for inputs, labels in tqdm(dataloader, desc="Test"):
            # inputs: [B, C, H, W]
            inputs = inputs.cuda() if torch.cuda.is_available() else inputs
            # labels: [B] (class indices) -- TODO confirm against loader
            labels = labels.cuda() if torch.cuda.is_available() else labels
            # outputs: [B, num_classes]
            outputs = model(inputs)
            # Update metrics
            pred_proc = F.softmax(outputs.detach(), dim=1)
            test_acc.add(pred_proc, labels.detach())
            test_ap.add(pred_proc, labels.detach())
            # BUG FIX: was `test.auc.add(pred_proc[:1], ...)` -- a NameError
            # (`test` has no `auc` attribute) and `[:1]` slices the first ROW.
            # Column 1 holds the probability of class 1 (positive/diseased).
            test_auc.add(pred_proc[:, 1], labels.detach())
            test_conf.add(pred_proc, labels.detach())
    # Persist results so evaluate.py can compute stats and plot
    result_writer.update(
        "test", {
            "acc": test_acc.value(),
            "ap": test_ap.value(),
            "test_auc": test_auc.value()[0],
            "test_tpr": test_auc.value()[1],
            "test_fpr": test_auc.value()[2],
            "test_conf": test_conf.value()
        })
    return test_acc, test_ap, test_auc
def __init__(self, name=None, n_classes=2):
    """Set up the logging directory `log/<name>` and evaluation meters.

    name: run identifier, used as the log sub-directory name. Although it
        defaults to None, os.path.join requires a string in practice.
    n_classes: number of classes for the confusion matrix.
    """
    self.name = name
    self.n_classes = n_classes
    self.path = os.path.join('log', name)
    self.conf_mtr = meter.ConfusionMeter(n_classes)
    self.auc_mtr = meter.AUCMeter()
    self.err_mtr = meter.ClassErrorMeter(topk=[1], accuracy=True)
    saveMkdir(self.path)  # ensure the log directory exists
    self.fp = open(os.path.join(self.path, 'res.log'), 'w')
    # Accumulators for per-sample scores/labels, shape (N, 1), grown later.
    self.y_scores = np.array([], dtype=np.float32).reshape(0, 1)
    self.y_true = np.array([], dtype=np.float32).reshape(0, 1)
def testClassErrorMeter(self):
    """Top-1 error is 0% for a perfect batch, then 50% overall after a
    second batch in which every prediction is wrong (3 of 6 correct)."""
    mtr = meter.ClassErrorMeter(topk=[1])
    output = torch.eye(3)
    # torch.range is deprecated (and removed in recent PyTorch); use
    # arange when available -- same guard as the batch1 test in this file.
    if hasattr(torch, "arange"):
        target = torch.arange(0, 3)
    else:
        target = torch.range(0, 2)
    mtr.add(output, target)
    err = mtr.value()
    self.assertEqual(err, [0], "All should be correct")
    target[0] = 1
    target[1] = 0
    target[2] = 0
    mtr.add(output, target)
    err = mtr.value()
    self.assertEqual(err, [50.0], "Half should be correct")
def _validate(data_loader, model, criterion, loggers, args, epoch=-1):
    """Execute the validation loop; returns (top1, top5, mean loss).

    Early-exit validation is not implemented and raises ValueError when
    args.earlyexit_thresholds is set.
    """
    losses = {'objective_loss': meter.AverageValueMeter()}
    classerr = meter.ClassErrorMeter(accuracy=True, topk=(1, 5))
    # (removed leftover debug print of the `meter` module type)
    if args.earlyexit_thresholds:
        raise ValueError('Error: earlyexit function has not been completed')
    batch_time = meter.AverageValueMeter()
    total_samples = len(data_loader.sampler)
    batch_size = data_loader.batch_size
    total_steps = total_samples / batch_size
    msglogger.info("{} samples ({} per mini-batch)".format(
        total_samples, batch_size))
    model.eval()
    end = time.time()
    for validation_step, (inputs, target) in enumerate(data_loader):
        with torch.no_grad():
            inputs, target = inputs.to(args.device), target.to(args.device)
            output = model(inputs)
            loss = criterion(output, target)
            losses['objective_loss'].add(loss.item())
            classerr.add(output.data, target)
            batch_time.add(time.time() - end)
            end = time.time()
            steps_completed = validation_step + 1
            if steps_completed % args.print_freq == 0:
                if not args.earlyexit_thresholds:
                    # NOTE(review): `stats` is assembled but never logged.
                    stats = ('', OrderedDict([('Loss', losses['objective_loss'].mean),
                                              ('Top1', classerr.value(1)),
                                              ('Top5', classerr.value(5))]))
                    msglogger.info("Validation epoch: %d [%d/%d]", epoch,
                                   validation_step, total_steps)
                else:
                    pass
    if not args.earlyexit_thresholds:
        msglogger.info(
            '==> Validation epoch: %d Top1: %.3f Top5: %.3f Loss: %.3f',
            epoch, classerr.value()[0], classerr.value()[1],
            losses['objective_loss'].mean)
    return classerr.value(1), classerr.value(
        5), losses['objective_loss'].mean
def validate(model, criterion, data_loader, args):
    """Run a no-grad evaluation pass over `data_loader` and return the
    top-1 accuracy (the meter is configured for top-1 and top-5)."""
    accuracy = tnt.ClassErrorMeter(accuracy=True, topk=(1, 5))
    model.eval()
    if args.cpu == True:
        model = model.cpu()
    for batch, labels in data_loader:
        with torch.no_grad():
            if args.cpu == True:
                batch, labels = batch.cpu(), labels.cpu()
            else:
                batch, labels = batch.to('cuda'), labels.to('cuda')
            predictions = model(batch)
            accuracy.add(predictions.data, labels)
    return accuracy.value(1)
def _validate(data_loader, model, criterion, loggers, args, epoch=-1):
    """Execute the validation/test loop.

    Returns (top1, top5, mean objective loss). When args.display_confusion
    is set, also logs a confusion matrix over args.num_classes classes.
    """
    losses = {'objective_loss': tnt.AverageValueMeter()}
    classerr = tnt.ClassErrorMeter(accuracy=True, topk=(1, 5))
    batch_time = tnt.AverageValueMeter()
    total_samples = len(data_loader.sampler)
    batch_size = data_loader.batch_size
    if args.display_confusion:
        confusion = tnt.ConfusionMeter(args.num_classes)
    total_steps = total_samples / batch_size
    msglogger.info('%d samples (%d per mini-batch)', total_samples, batch_size)
    # Switch to evaluation mode
    model.eval()
    end = time.time()
    for validation_step, (inputs, target) in enumerate(data_loader):
        with torch.no_grad():
            inputs, target = inputs.to(args.device), target.to(args.device)
            # compute output from model
            output = model(inputs)
            # compute loss
            loss = criterion(output, target)
            # measure accuracy and record loss
            losses['objective_loss'].add(loss.item())
            classerr.add(output.data, target)
            if args.display_confusion:
                confusion.add(output.data, target)
            # measure elapsed time
            batch_time.add(time.time() - end)
            end = time.time()
            steps_completed = (validation_step + 1)
            if steps_completed % args.print_freq == 0:
                stats = ('', OrderedDict([('Loss', losses['objective_loss'].mean),
                                          ('Top1', classerr.value(1)),
                                          ('Top5', classerr.value(5))]))
                distiller.log_training_progress(stats, None, epoch, steps_completed,
                                                total_steps, args.print_freq, loggers)
    if args.display_confusion:
        msglogger.info('==> Confusion:\n%s\n', str(confusion.value()))
    return classerr.value(1), classerr.value(5), losses['objective_loss'].mean
def test(self, epoch, print_step=100):
    """Evaluate on the test loader, checkpoint the model, and return top-1
    accuracy.

    A checkpoint `ckpt.pth` is written under ./logs/<log_time>/checkpoint;
    when accuracy beats self.best_acc the file is named `best.pth` instead.
    """
    self.model.eval()
    classerr = tnt.ClassErrorMeter(accuracy=True, topk=(1, 5))
    test_loss = 0
    # NOTE(review): `correct` and `total` are never updated -- leftovers
    # from a manual accuracy computation replaced by the meter.
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(self.testloader):
            inputs, targets = inputs.to(self.device), targets.to(
                self.device)
            outputs = self.model(inputs)
            loss = self.criterion(outputs, targets)
            test_loss += loss.item()
            classerr.add(outputs.detach(), targets)
            if ((batch_idx + 1) % print_step) == 0:
                msglogger.info(
                    '[%d / %d] ==> Top1: %.3f Top5: %.3f Loss: %.3f\n',
                    batch_idx + 1, len(self.testloader),
                    classerr.value()[0], classerr.value()[1],
                    test_loss / (batch_idx + 1))
    # Save checkpoint.
    acc = classerr.value()[0]  # top-1 accuracy
    save_path = './logs/' + self.log_time + '/checkpoint/ckpt.pth'
    if acc > self.best_acc:
        # New best: save under a distinct name and update the record.
        save_path = './logs/' + self.log_time + '/checkpoint/best.pth'
        self.best_acc = acc
    print('Saving..')
    state = {
        'net': self.model.state_dict(),
        'acc': acc,
        'epoch': epoch,
    }
    if not os.path.isdir('./logs/' + self.log_time + '/checkpoint'):
        os.mkdir('./logs/' + self.log_time + '/checkpoint')
    torch.save(state, save_path)
    return acc
def val(model, dataloader, criterion):
    """Validate `model` on `dataloader`, plotting pruning diagnostics to visdom.

    Returns (top-1 accuracy, mean validation loss).
    """
    # In multi-GPU mode (opt.gpus > 1) the real module lives under model.module.
    model.eval() if opt.gpus <= 1 else model.module.eval()
    loss_meter = meter.AverageValueMeter()
    accuracy_meter = meter.ClassErrorMeter(accuracy=True)
    for ii, data in enumerate(dataloader):
        input_, label = data
        input_, label = input_.to(device), label.to(device)
        score = model(input_)
        accuracy_meter.add(score.data.squeeze(), label.long())
        loss = criterion(score, label)
        loss_meter.add(loss.cpu().data)
    # Plot the activated-neuron count for each layer.
    for (i, num) in enumerate(model.get_activated_neurons() if opt.gpus <= 1
                              else model.module.get_activated_neurons()):
        vis.plot("val_layer/{}".format(i), num)
    # Histogram the gate distributions: hard-sigmoid or sigmoid of k * phi.
    for (i, z_phi) in enumerate(model.z_phis()):
        if opt.hardsigmoid:
            vis.hist("hard_sigmoid(phi)/{}".format(i),
                     F.hardtanh(opt.k * z_phi / 7. + .5, 0, 1).cpu().detach().numpy())
        else:
            vis.hist("sigmoid(phi)/{}".format(i),
                     torch.sigmoid(opt.k * z_phi).cpu().detach().numpy())
    vis.plot("prune_rate", model.prune_rate() if opt.gpus <= 1
             else model.module.prune_rate())
    return accuracy_meter.value()[0], loss_meter.value()[0]
def _validate(data_loader, model, criterion, loggers, args, epoch=-1):
    """Execute the validation/test loop.

    Supports Early Exit: when args.earlyexit_thresholds is set, per-exit
    loss/error meters are kept on `args` and the final stats come from
    earlyexit_validate_stats. Returns (top1, top5, mean loss) -- taken
    from the last exit in Early Exit mode.
    """
    losses = {'objective_loss': tnt.AverageValueMeter()}
    classerr = tnt.ClassErrorMeter(accuracy=True, topk=(1, 5))
    if args.earlyexit_thresholds:
        # for Early Exit, we have a list of errors and losses for each of the exits.
        args.exiterrors = []
        args.losses_exits = []
        for exitnum in range(args.num_exits):
            args.exiterrors.append(tnt.ClassErrorMeter(accuracy=True, topk=(1, 5)))
            args.losses_exits.append(tnt.AverageValueMeter())
        args.exit_taken = [0] * args.num_exits
    batch_time = tnt.AverageValueMeter()
    total_samples = len(data_loader.sampler)
    batch_size = data_loader.batch_size
    if args.display_confusion:
        confusion = tnt.ConfusionMeter(args.num_classes)
    total_steps = total_samples / batch_size
    msglogger.info('%d samples (%d per mini-batch)', total_samples, batch_size)
    # Switch to evaluation mode
    model.eval()
    end = time.time()
    for validation_step, (inputs, target) in enumerate(data_loader):
        with torch.no_grad():
            inputs, target = inputs.to(args.device), target.to(args.device)
            # compute output from model
            output = model(inputs)
            if not args.earlyexit_thresholds:
                # compute loss
                loss = criterion(output, target)
                # measure accuracy and record loss
                losses['objective_loss'].add(loss.item())
                classerr.add(output.data, target)
                if args.display_confusion:
                    confusion.add(output.data, target)
            else:
                earlyexit_validate_loss(output, target, criterion, args)
            # measure elapsed time
            batch_time.add(time.time() - end)
            end = time.time()
            steps_completed = (validation_step + 1)
            if steps_completed % args.print_freq == 0:
                if not args.earlyexit_thresholds:
                    stats = ('', OrderedDict([('Loss', losses['objective_loss'].mean),
                                              ('Top1', classerr.value(1)),
                                              ('Top5', classerr.value(5))]))
                else:
                    stats_dict = OrderedDict()
                    stats_dict['Test'] = validation_step
                    for exitnum in range(args.num_exits):
                        la_string = 'LossAvg' + str(exitnum)
                        stats_dict[la_string] = args.losses_exits[exitnum].mean
                        # Because of the nature of ClassErrorMeter, if an exit is never taken during the batch,
                        # then accessing the value(k) will cause a divide by zero. So we'll build the OrderedDict
                        # accordingly and we will not print for an exit error when that exit is never taken.
                        if args.exit_taken[exitnum]:
                            t1 = 'Top1_exit' + str(exitnum)
                            t5 = 'Top5_exit' + str(exitnum)
                            stats_dict[t1] = args.exiterrors[exitnum].value(1)
                            stats_dict[t5] = args.exiterrors[exitnum].value(5)
                    stats = ('Performance/Validation/', stats_dict)
                distiller.log_training_progress(stats, None, epoch, steps_completed,
                                                total_steps, args.print_freq, loggers)
    if not args.earlyexit_thresholds:
        msglogger.info('==> Top1: %.3f Top5: %.3f Loss: %.3f\n',
                       classerr.value()[0], classerr.value()[1],
                       losses['objective_loss'].mean)
        if args.display_confusion:
            msglogger.info('==> Confusion:\n%s\n', str(confusion.value()))
        return classerr.value(1), classerr.value(5), losses['objective_loss'].mean
    else:
        total_top1, total_top5, losses_exits_stats = earlyexit_validate_stats(args)
        return total_top1, total_top5, losses_exits_stats[args.num_exits - 1]
def train_epoch(epoch, data_loader, model, criterion, optimizer, opt, vis, trainlogwindow):
    """Train `model` for one epoch, logging loss/accuracy/mAP/top-k to visdom.

    Also checkpoints the model every opt.MODEL.CKP_DURING epochs (only on
    CUDA device 0). Returns (mean loss, mAP) for the epoch.
    """
    print('train at epoch {}'.format(epoch))
    model.train()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    accuracies = AverageMeter()
    mmap = meter.mAPMeter()
    top = meter.ClassErrorMeter(topk=[1, 3, 5], accuracy=True)
    mmap.reset()
    top.reset()
    end_time = time.time()
    for i, (inputs, targets) in enumerate(data_loader):
        data_time.update(time.time() - end_time)
        targets = targets.cuda()
        # Multi-crop / multi-stream inputs arrive as a list of tensors.
        if type(inputs) is list:
            inputs = [Variable(inputs[ii]).cuda() for ii in range(len(inputs))]
        else:
            inputs = inputs.cuda()
        #inputs, targets_a, targets_b, lam = mixup_data(inputs, targets, opt.DATASET.ALPHA, True)
        #inputs, targets_a, targets_b = Variable(inputs), Variable(targets_a), Variable(targets_b)
        inputs = Variable(inputs)
        #print(targets)
        targets = Variable(targets)
        outputs, context = model(inputs)
        #loss_func = mixup_criterion(targets_a, targets_b, lam)
        #loss = loss_func(criterion, outputs)
        loss = criterion(outputs, targets)
        #print(outputs.shape)
        #print(targets)
        acc = calculate_accuracy(outputs, targets)
        # The mAP meter needs one-hot encoded targets.
        one_hot = torch.zeros_like(outputs).cuda().scatter_(
            1, targets.view(-1, 1), 1)
        mmap.add(outputs.detach(), one_hot.detach())
        top.add(outputs.detach(), targets.detach())
        losses.update(loss.data.item(), targets.detach().size(0))
        accuracies.update(acc, targets.detach().size(0))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        batch_time.update(time.time() - end_time)
        end_time = time.time()
        vis.text(
            "gpu{}, epoch: {},batch:{},iter: {},loss: {},acc:{},lr: {}\n".format(torch.cuda.current_device(), epoch, i + 1, (epoch - 1) * len(data_loader) + (i + 1), losses.val, \
            accuracies.val, optimizer.param_groups[0]['lr']), win=trainlogwindow, append=True)
        print('Epoch: [{0}][{1}/{2}]\t'
              'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
              'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
              'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
              'Acc {acc.val:.3f} ({acc.avg:.3f})\t'
              'mmap {mmap}\t'
              'top1 3 5: {top}\t'.format(epoch, i + 1, len(data_loader),
                                         batch_time=batch_time,
                                         data_time=data_time, loss=losses,
                                         acc=accuracies, mmap=mmap.value(),
                                         top=top.value()))
    vis.text(
        "total:\n gpu:{} epoch: {},loss: {},lr: {}, accu:{},mAP:{}, top135 {}\n"
        .format(torch.cuda.current_device(), epoch, losses.avg,
                optimizer.param_groups[0]['lr'], accuracies.avg, mmap.value(),
                top.value()), win=trainlogwindow, append=True)
    if torch.cuda.current_device() == 0:
        print("saveing ckp ########################################")
        # Periodic checkpoint of model + optimizer state.
        if epoch % opt.MODEL.CKP_DURING == 0:
            save_file_path = os.path.join(opt.MODEL.RESULT, opt.MODEL.NAME,
                                          'save_{}.pth'.format(epoch))
            if not os.path.exists(
                    os.path.join(opt.MODEL.RESULT, opt.MODEL.NAME)):
                os.makedirs(os.path.join(opt.MODEL.RESULT, opt.MODEL.NAME))
            states = {
                'epoch': epoch + 1,
                'arch': opt.MODEL.NAME,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            torch.save(states, save_file_path)
    return losses.avg, mmap.value()
def train(train_loader, model, criterion, optimizer, epoch,
          compression_scheduler, loggers, args):
    """Training loop for one epoch.

    Integrates with a compression scheduler (minibatch callbacks and loss
    augmentation), an optional knowledge-distillation policy (args.kd_policy)
    and optional Early-Exit loss weighting. Returns the per-step [top1, top5]
    accuracy history.
    """
    losses = OrderedDict([(OVERALL_LOSS_KEY, tnt.AverageValueMeter()),
                          (OBJECTIVE_LOSS_KEY, tnt.AverageValueMeter())])
    classerr = tnt.ClassErrorMeter(accuracy=True, topk=(1, 5))
    batch_time = tnt.AverageValueMeter()
    data_time = tnt.AverageValueMeter()
    # For Early Exit, we define statistics for each exit
    # So exiterrors is analogous to classerr for the non-Early Exit case
    if args.earlyexit_lossweights:
        args.exiterrors = []
        for exitnum in range(args.num_exits):
            args.exiterrors.append(tnt.ClassErrorMeter(accuracy=True, topk=(1, 5)))
    total_samples = len(train_loader.sampler)
    batch_size = train_loader.batch_size
    steps_per_epoch = math.ceil(total_samples / batch_size)
    msglogger.info('Training epoch: %d samples (%d per mini-batch)',
                   total_samples, batch_size)
    # Switch to train mode
    model.train()
    acc_stats = []
    end = time.time()
    for train_step, (inputs, target) in enumerate(train_loader):
        # Measure data loading time
        data_time.add(time.time() - end)
        inputs, target = inputs.to(args.device), target.to(args.device)
        # Execute the forward phase, compute the output and measure loss
        if compression_scheduler:
            compression_scheduler.on_minibatch_begin(epoch, train_step,
                                                     steps_per_epoch, optimizer)
        if not hasattr(args, 'kd_policy') or args.kd_policy is None:
            output = model(inputs)
        else:
            # Knowledge distillation: the policy wraps the forward pass.
            output = args.kd_policy.forward(inputs)
        if not args.earlyexit_lossweights:
            loss = criterion(output, target)
            # Measure accuracy
            classerr.add(output.data, target)
            acc_stats.append([classerr.value(1), classerr.value(5)])
        else:
            # Measure accuracy and record loss
            loss = earlyexit_loss(output, target, criterion, args)
        # Record loss
        losses[OBJECTIVE_LOSS_KEY].add(loss.item())
        if compression_scheduler:
            # Before running the backward phase, we allow the scheduler to modify the loss
            # (e.g. add regularization loss)
            agg_loss = compression_scheduler.before_backward_pass(
                epoch, train_step, steps_per_epoch, loss,
                optimizer=optimizer, return_loss_components=True)
            loss = agg_loss.overall_loss
            losses[OVERALL_LOSS_KEY].add(loss.item())
            # Track each named loss component with its own meter.
            for lc in agg_loss.loss_components:
                if lc.name not in losses:
                    losses[lc.name] = tnt.AverageValueMeter()
                losses[lc.name].add(lc.value.item())
        else:
            losses[OVERALL_LOSS_KEY].add(loss.item())
        # Compute the gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        if compression_scheduler:
            compression_scheduler.before_parameter_optimization(
                epoch, train_step, steps_per_epoch, optimizer)
        optimizer.step()
        if compression_scheduler:
            compression_scheduler.on_minibatch_end(epoch, train_step,
                                                   steps_per_epoch, optimizer)
        # measure elapsed time
        batch_time.add(time.time() - end)
        steps_completed = (train_step + 1)
        if steps_completed % args.print_freq == 0:
            # Log some statistics
            errs = OrderedDict()
            if not args.earlyexit_lossweights:
                errs['Top1'] = classerr.value(1)
                errs['Top5'] = classerr.value(5)
            else:
                # for Early Exit case, the Top1 and Top5 stats are computed for each exit.
                for exitnum in range(args.num_exits):
                    errs['Top1_exit' + str(exitnum)] = args.exiterrors[exitnum].value(1)
                    errs['Top5_exit' + str(exitnum)] = args.exiterrors[exitnum].value(5)
            stats_dict = OrderedDict()
            for loss_name, meter in losses.items():
                stats_dict[loss_name] = meter.mean
            stats_dict.update(errs)
            stats_dict['LR'] = optimizer.param_groups[0]['lr']
            stats_dict['Time'] = batch_time.mean
            stats = ('Performance/Training/', stats_dict)
            params = model.named_parameters() if args.log_params_histograms else None
            distiller.log_training_progress(stats, params, epoch, steps_completed,
                                            steps_per_epoch, args.print_freq, loggers)
        end = time.time()
    return acc_stats
def train(train_loader, model, criterion, optimizer, epoch,
          compression_scheduler, loggers, args):
    """Training-with-compression loop for one epoch.

    For each training step in epoch:
        compression_scheduler.on_minibatch_begin(epoch)
        output = model(input)
        loss = criterion(output, target)
        compression_scheduler.before_backward_pass(epoch)
        loss.backward()
        compression_scheduler.before_parameter_optimization(epoch)
        optimizer.step()
        compression_scheduler.on_minibatch_end(epoch)
    """
    def _log_training_progress():
        # Log some statistics
        errs = OrderedDict()
        if not early_exit_mode(args):
            errs['Top1'] = classerr.value(1)
            errs['Top5'] = classerr.value(5)
        else:
            # For Early Exit case, the Top1 and Top5 stats are computed for each exit.
            for exitnum in range(args.num_exits):
                errs['Top1_exit' + str(exitnum)] = args.exiterrors[exitnum].value(1)
                errs['Top5_exit' + str(exitnum)] = args.exiterrors[exitnum].value(5)
        stats_dict = OrderedDict()
        for loss_name, meter in losses.items():
            stats_dict[loss_name] = meter.mean
        stats_dict.update(errs)
        stats_dict['LR'] = optimizer.param_groups[0]['lr']
        stats_dict['Time'] = batch_time.mean
        stats = ('Performance/Training/', stats_dict)
        params = model.named_parameters() if args.log_params_histograms else None
        distiller.log_training_progress(stats, params, epoch, steps_completed,
                                        steps_per_epoch, args.print_freq, loggers)

    OVERALL_LOSS_KEY = 'Overall Loss'
    OBJECTIVE_LOSS_KEY = 'Objective Loss'
    losses = OrderedDict([(OVERALL_LOSS_KEY, tnt.AverageValueMeter()),
                          (OBJECTIVE_LOSS_KEY, tnt.AverageValueMeter())])
    classerr = tnt.ClassErrorMeter(accuracy=True, topk=(1, 5))
    batch_time = tnt.AverageValueMeter()
    data_time = tnt.AverageValueMeter()
    # For Early Exit, we define statistics for each exit, so
    # `exiterrors` is analogous to `classerr` in the non-Early Exit case
    if early_exit_mode(args):
        args.exiterrors = []
        for exitnum in range(args.num_exits):
            args.exiterrors.append(tnt.ClassErrorMeter(accuracy=True, topk=(1, 5)))
    total_samples = len(train_loader.sampler)
    batch_size = train_loader.batch_size
    steps_per_epoch = math.ceil(total_samples / batch_size)
    msglogger.info('Training epoch: %d samples (%d per mini-batch)',
                   total_samples, batch_size)
    # Switch to train mode
    model.train()
    acc_stats = []
    end = time.time()
    for train_step, (inputs, target) in enumerate(train_loader):
        # Measure data loading time
        data_time.add(time.time() - end)
        inputs, target = inputs.to(args.device), target.to(args.device)
        # Execute the forward phase, compute the output and measure loss
        if compression_scheduler:
            compression_scheduler.on_minibatch_begin(epoch, train_step,
                                                     steps_per_epoch, optimizer)
        if not hasattr(args, 'kd_policy') or args.kd_policy is None:
            output = model(inputs)
        else:
            output = args.kd_policy.forward(inputs)
        if not early_exit_mode(args):
            # Handle loss calculation for inception models separately due to auxiliary outputs
            # if user turned off auxiliary classifiers by hand, then loss should be calculated normally,
            # so, we have this check to ensure we only call this function when output is a tuple
            if models.is_inception(args.arch) and isinstance(output, tuple):
                loss = inception_training_loss(output, target, criterion, args)
            else:
                loss = criterion(output, target)
            # Measure accuracy
            # For inception models, we only consider accuracy of main classifier
            if isinstance(output, tuple):
                classerr.add(output[0].detach(), target)
            else:
                classerr.add(output.detach(), target)
            acc_stats.append([classerr.value(1), classerr.value(5)])
        else:
            # Measure accuracy and record loss
            classerr.add(output[args.num_exits - 1].detach(), target)  # add the last exit (original exit)
            loss = earlyexit_loss(output, target, criterion, args)
        # Record loss
        losses[OBJECTIVE_LOSS_KEY].add(loss.item())
        if compression_scheduler:
            # Before running the backward phase, we allow the scheduler to modify the loss
            # (e.g. add regularization loss)
            agg_loss = compression_scheduler.before_backward_pass(
                epoch, train_step, steps_per_epoch, loss,
                optimizer=optimizer, return_loss_components=True)
            loss = agg_loss.overall_loss
            losses[OVERALL_LOSS_KEY].add(loss.item())
            for lc in agg_loss.loss_components:
                if lc.name not in losses:
                    losses[lc.name] = tnt.AverageValueMeter()
                losses[lc.name].add(lc.value.item())
        else:
            losses[OVERALL_LOSS_KEY].add(loss.item())
        # Compute the gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        if compression_scheduler:
            compression_scheduler.before_parameter_optimization(
                epoch, train_step, steps_per_epoch, optimizer)
        optimizer.step()
        if compression_scheduler:
            compression_scheduler.on_minibatch_end(epoch, train_step,
                                                   steps_per_epoch, optimizer)
        # measure elapsed time
        batch_time.add(time.time() - end)
        steps_completed = (train_step + 1)
        if steps_completed % args.print_freq == 0:
            _log_training_progress()
        end = time.time()
    #return acc_stats
    # NOTE: this breaks previous behavior, which returned a history of (top1, top5) values
    # NOTE(review): the third element is the AverageValueMeter object itself,
    # not its mean -- callers must read `.mean` from it.
    return classerr.value(1), classerr.value(5), losses[OVERALL_LOSS_KEY]
def train(train_loader, model, criterion, optimizer, epoch,
          compression_scheduler, loggers, print_freq, log_params_hist):
    """Training loop for one epoch.

    The compression scheduler's minibatch callbacks run around the
    forward/backward pass, and any regularizer loss it computes is added
    to the objective loss before backprop.
    """
    losses = {
        'objective_loss': tnt.AverageValueMeter(),
        'regularizer_loss': tnt.AverageValueMeter()
    }
    if compression_scheduler is None:
        # Initialize the regularizer loss to zero
        losses['regularizer_loss'].add(0)
    classerr = tnt.ClassErrorMeter(accuracy=True, topk=(1, 5))
    batch_time = tnt.AverageValueMeter()
    data_time = tnt.AverageValueMeter()
    total_samples = len(train_loader.sampler)
    batch_size = train_loader.batch_size
    steps_per_epoch = math.ceil(total_samples / batch_size)
    msglogger.info('Training epoch: %d samples (%d per mini-batch)',
                   total_samples, batch_size)
    # Switch to train mode
    model.train()
    end = time.time()
    for train_step, (inputs, target) in enumerate(train_loader):
        # Measure data loading time
        data_time.add(time.time() - end)
        # BUG FIX: `async` became a reserved keyword in Python 3.7 (a
        # SyntaxError here); PyTorch renamed the argument to `non_blocking`
        # with identical semantics.
        target = target.cuda(non_blocking=True)
        input_var = torch.autograd.Variable(inputs)
        target_var = torch.autograd.Variable(target)
        # Execute the forward phase, compute the output and measure loss
        if compression_scheduler:
            compression_scheduler.on_minibatch_begin(epoch, train_step, steps_per_epoch)
        output = model(input_var)
        loss = criterion(output, target_var)
        # Measure accuracy and record loss
        classerr.add(output.data, target)
        losses['objective_loss'].add(loss.item())
        if compression_scheduler:
            # Before running the backward phase, we add any regularization loss computed by the scheduler
            regularizer_loss = compression_scheduler.before_backward_pass(
                epoch, train_step, steps_per_epoch, loss)
            loss += regularizer_loss
            losses['regularizer_loss'].add(regularizer_loss.item())
        # Compute the gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if compression_scheduler:
            compression_scheduler.on_minibatch_end(epoch, train_step, steps_per_epoch)
        # measure elapsed time
        batch_time.add(time.time() - end)
        steps_completed = (train_step + 1)
        if steps_completed % print_freq == 0:
            # Log some statistics
            lr = optimizer.param_groups[0]['lr']
            # FIX: prefix was misspelled 'Peformance/Training/'; corrected to
            # match the 'Performance/Training/' prefix used by the other
            # training loops in this file.
            stats = ('Performance/Training/',
                     OrderedDict([('Loss', losses['objective_loss'].mean),
                                  ('Reg Loss', losses['regularizer_loss'].mean),
                                  ('Top1', classerr.value(1)),
                                  ('Top5', classerr.value(5)),
                                  ('LR', lr),
                                  ('Time', batch_time.mean)]))
            distiller.log_training_progress(
                stats,
                model.named_parameters() if log_params_hist else None,
                epoch, steps_completed, steps_per_epoch, print_freq, loggers)
        end = time.time()
def val_epoch(epoch, data_loader, model, criterion, opt, vis, vallogwindow):
    """Run one validation epoch and log metrics to a visdom text window.

    Computes loss, accuracy, top-1/3/5 accuracy, per-class AP and mAP over
    the whole validation set, printing running stats per batch.

    Returns:
        (average loss, mAP value) for the epoch.
    """
    print('validation at epoch {}'.format(epoch))

    # Inference mode: freezes dropout / batch-norm statistics.
    model.eval()

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    accuracies = AverageMeter()
    mmap = meter.mAPMeter()     # mean average precision over all classes
    AP = meter.APMeter()        # per-class average precision
    top = meter.ClassErrorMeter(topk=[1, 3, 5], accuracy=True)  # top-1/3/5 accuracy
    mmap.reset()
    AP.reset()
    top.reset()

    end_time = time.time()
    for i, (inputs, targets) in enumerate(data_loader):
        data_time.update(time.time() - end_time)

        # Inputs may arrive as a list of tensors (e.g. multiple clips/crops)
        # or as a single tensor; move everything to the GPU either way.
        if type(inputs) is list:
            inputs = [Variable(inputs[ii].cuda()) for ii in range(len(inputs))]
        else:
            inputs = Variable(inputs.cuda())
        targets = targets.cuda()

        with torch.no_grad():
            #inputs = Variable(inputs)
            targets = Variable(targets)
            # Model returns predictions plus a context tensor (unused here
            # except by the commented-out keypoint visualization below).
            outputs, context = model(inputs)
            #if i %5==0:
            #    for jj in range(num):
            #        org_img = inverse_normalize(inputs[0,jj,:,:,:].detach().cpu().numpy())
            #        show_keypoint(org_img, context[0].detach().cpu(),vis=vis,title = str(jj+1))
            loss = criterion(outputs, targets)
            acc = calculate_accuracy(outputs, targets)

            losses.update(loss.data.item(), targets.detach().size(0))
            accuracies.update(acc, targets.detach().size(0))

            # One-hot encode targets for the AP/mAP meters (they expect a
            # target matrix, not class indices).
            one_hot = torch.zeros_like(outputs).cuda().scatter_(1, targets.view(-1, 1), 1)
            mmap.add(outputs.detach(), one_hot.detach())
            top.add(outputs.detach(), targets.detach())
            AP.add(outputs.detach(), one_hot.detach())

        batch_time.update(time.time() - end_time)
        end_time = time.time()

        print('Epoch: [{0}][{1}/{2}]\t'
              'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
              'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
              'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
              'Acc {acc.val:.3f} ({acc.avg:.3f})\t'
              'mmap {mmap}\t'
              'top1 3 5: {top}\t'.format(
                  epoch, i + 1, len(data_loader), batch_time=batch_time,
                  data_time=data_time, loss=losses, acc=accuracies,
                  mmap=mmap.value(), top=top.value()))

    # Append a summary line for this epoch to the persistent visdom log window.
    vis.text("gpu:{}, epoch: {},loss: {},accu:{},mAP:{}, top135 {}\nAP:{}".format(
        torch.cuda.current_device(), epoch, losses.avg, accuracies.avg,
        mmap.value(), top.value(), AP.value()), win=vallogwindow, append=True)
    #exit()
    #if epoch==10:
    #    exit()
    return losses.avg, mmap.value()
def train(train_loader, model, criterion, optimizer, epoch, compression_scheduler, loggers, args):
    """Training loop for one epoch.

    Distiller-style training loop extended with an experimental sin^2
    weight-quantization regularizer: for each of the five layers
    (conv1/conv2/fc1/fc2/fc3), a term that is minimized when weights sit on
    the DoReFa-style quantization grid implied by the learned bit-width
    parameters model.module.B1..B5, plus a penalty (freq_loss) on the total
    bit-width.  The regularizer is applied only on the non-early-exit path.
    """
    losses = OrderedDict([(OVERALL_LOSS_KEY, tnt.AverageValueMeter()),
                          (OBJECTIVE_LOSS_KEY, tnt.AverageValueMeter())])
    classerr = tnt.ClassErrorMeter(accuracy=True, topk=(1, 5))
    batch_time = tnt.AverageValueMeter()
    data_time = tnt.AverageValueMeter()

    # For Early Exit, we define statistics for each exit
    # So exiterrors is analogous to classerr for the non-Early Exit case
    if args.earlyexit_lossweights:
        args.exiterrors = []
        for exitnum in range(args.num_exits):
            args.exiterrors.append(tnt.ClassErrorMeter(accuracy=True, topk=(1, 5)))

    total_samples = len(train_loader.sampler)
    batch_size = train_loader.batch_size
    steps_per_epoch = math.ceil(total_samples / batch_size)
    msglogger.info('Training epoch: %d samples (%d per mini-batch)', total_samples, batch_size)
    # Optionally train on only a fraction of the epoch (early break below).
    epoch_frac = args.partial_epoch
    steps_per_frac_epoch = math.ceil((total_samples*epoch_frac) / batch_size)

    # Switch to train mode
    model.train()
    end = time.time()
    for train_step, (inputs, target) in enumerate(train_loader):
        # Measure data loading time
        data_time.add(time.time() - end)
        inputs, target = inputs.to('cuda'), target.to('cuda')
        if train_step == steps_per_frac_epoch:
            break

        # Execute the forward phase, compute the output and measure loss
        if compression_scheduler:
            compression_scheduler.on_minibatch_begin(epoch, train_step, steps_per_epoch, optimizer)

        if args.kd_policy is None:
            output = model(inputs)
        else:
            # Knowledge-distillation policy wraps the forward pass.
            output = args.kd_policy.forward(inputs)

        if not args.earlyexit_lossweights:
            # ------------------------------------------------------------------ AHMED edit sin2-reg - April19
            """ adding sin2 regularization here"""
            # Hard-coded per-layer bit-widths; only referenced by the
            # commented-out variants below (live code uses learned B1..B5).
            qbits_dict = {}
            sin2_reg_loss = 0
            #print('weights:', (model.module.conv2.weight.size()))
            bw = 3
            qbits_dict['conv1'] = bw
            qbits_dict['conv2'] = bw
            qbits_dict['fc1'] = bw
            qbits_dict['fc2'] = bw
            qbits_dict['fc3'] = bw
            # ---------------------
            #kernel = model.module.features[0].float_weight
            kernel1 = model.module.conv1.weight
            kernel2 = model.module.conv2.weight
            kernel3 = model.module.fc1.weight
            kernel4 = model.module.fc2.weight
            kernel5 = model.module.fc3.weight
            # One-shot snapshot of all layer weights at a specific step (999)
            # for offline analysis of the weight distributions.
            last_epoch = 999
            if (train_step == last_epoch):
                w1 = kernel1.data.cpu().numpy()
                w2 = kernel2.data.cpu().numpy()
                w3 = kernel3.data.cpu().numpy()
                w4 = kernel4.data.cpu().numpy()
                w5 = kernel5.data.cpu().numpy()
                np.save('weights_sin2Reg/cifar10_L1_weights'+str(last_epoch), w1)
                np.save('weights_sin2Reg/cifar10_L2_weights'+str(last_epoch), w2)
                np.save('weights_sin2Reg/cifar10_L3_weights'+str(last_epoch), w3)
                np.save('weights_sin2Reg/cifar10_L4_weights'+str(last_epoch), w4)
                np.save('weights_sin2Reg/cifar10_L5_weights'+str(last_epoch), w5)
                print('++++saving weights+++++++++++++++++++++++++++')
            # ---------------------
            # ----------------------------------
            # DoReFa-style grid: step size derived from bit-width, half-step
            # shift so grid points coincide with zeros of sin().
            q = 2
            power = 2
            step = 1/(2**(q)-0.5)  # dorefa
            shift = step/2
            #step = 1/(2**(q)-1) # wrpn
            #shift = 0
            #amplitude = (np.sin(pi*(weight+step/2)/(step)))**2

            # Per-layer sin^2 terms, each using that layer's learned bit-width.
            step = 1/(2**(model.module.B1.clone())-0.5)  # dorefa
            #step = 1/(2**(5)-0.5) # dorefa
            shift = step/2
            #kernel = model.module.conv1.float_weight
            kernel = model.module.conv1.weight
            #sin2_func_1 = torch.mean(torch.pow(torch.sin(pi*kernel/(2**(-(qbits_dict['conv1']))-1)),2))
            sin2_func_1 = torch.mean((torch.sin(pi*(kernel+shift)/(step)))**power)  # dorefa
            #print(sin2_func_1.data[0])

            step = 1/(2**(model.module.B2.clone())-0.5)  # dorefa
            #step = 1/(2**(3)-0.5) # dorefa
            shift = step/2
            #kernel = model.module.conv2.float_weight
            kernel = model.module.conv2.weight
            #sin2_func_2 = torch.mean(torch.pow(torch.sin(pi*kernel/(2**(-(qbits_dict['conv2']))-1)),2))
            sin2_func_2 = torch.mean(torch.pow(torch.sin(pi*(kernel+shift)/step), power))  # dorefa

            step = 1/(2**(model.module.B3.clone())-0.5)  # dorefa
            #step = 1/(2**(3)-0.5) # dorefa
            shift = step/2
            #kernel = model.module.fc1.float_weight
            kernel = model.module.fc1.weight
            #sin2_func_3 = torch.mean(torch.pow(torch.sin(pi*kernel/(2**(-(qbits_dict['fc1']))-1)),2))
            sin2_func_3 = torch.mean(torch.pow(torch.sin(pi*(kernel+shift)/step), power))  # dorefa

            step = 1/(2**(model.module.B4.clone())-0.5)  # dorefa
            #step = 1/(2**(3)-0.5) # dorefa
            shift = step/2
            #kernel = model.module.fc2.float_weight
            kernel = model.module.fc2.weight
            #sin2_func_4 = torch.mean(torch.pow(torch.sin(pi*kernel/(2**(-(qbits_dict['fc2']))-1)),2))
            sin2_func_4 = torch.mean(torch.pow(torch.sin(pi*(kernel+shift)/step), power))  # dorefa

            step = 1/(2**(model.module.B5.clone())-0.5)  # dorefa
            #step = 1/(2**(4)-0.5) # dorefa
            shift = step/2
            #kernel = model.module.fc3.float_weight
            kernel = model.module.fc3.weight
            #sin2_func_5 = torch.mean(torch.pow(torch.sin(pi*kernel/(2**(-(qbits_dict['fc3']))-1)),2))
            sin2_func_5 = torch.mean(torch.pow(torch.sin(pi*(kernel+shift)/step), power))  # dorefa
            # ----------------------------------
            sin2_reg_loss = sin2_func_1 + sin2_func_2 + sin2_func_3 + sin2_func_4 + sin2_func_5
            # Penalize the summed learned bit-widths (pushes toward fewer bits).
            freq_loss = model.module.B1 + model.module.B2 + model.module.B3 + model.module.B4 + model.module.B5
            #sin2_reg_loss = sin2_func_1 + sin2_func_3 + sin2_func_4
            #loss = criterion(output, target)
            """ settings 0 """
            #if train_step > 100:
            #    lambda_q = 1
            #    lambda_f = 0.05
            #else:
            #    lambda_q = 0
            #    lambda_f = 0
            """ settings 1 """
            #lambda_q = (1/torch.exp(torch.tensor(4.0))).to('cuda')*torch.exp(torch.tensor(4*int(epoch)/1000)).to('cuda')# rising1
            #lambda_f = 0.05
            #lambda_qp = (1/np.exp(4))*torch.exp(torch.from_numpy(np.array(4*epoch/500))).cpu().numpy().data # rising1
            #lambda_fp = lambda_f
            """ settings 2: step-like lambda """
            # Smooth tanh step ramps the regularizer weight up around 20% of
            # the run; f2 (ramp-down at 80%) is computed but unused here.
            r = 0.2*args.epochs
            d = 0.8*args.epochs
            s = 20
            f1 = 0.5 * (1+torch.tanh(torch.tensor((epoch-r)/s).to('cuda')));
            f2 = 0.5 * (1+torch.tanh(torch.tensor((epoch-d)/s).to('cuda')));
            lambda_q = f1
            #lambda_f_value = 0.02*(f1-f2)
            lambda_f = 0.03
            reg_loss = lambda_q * sin2_reg_loss
            loss = criterion(output, target) + reg_loss + (lambda_f * freq_loss)
            #print('sin2_reg_LOSS:', sin2_reg_loss.data[0])
            #print('total_LOSS:', loss.data[0])
            #print('MODEL:', (model.state_dict()))
            # ------------------------------------------------------------------ AHMED edit sin2-reg - April19
            # Measure accuracy and record loss
            classerr.add(output.data, target)
        else:
            # Measure accuracy and record loss
            loss = earlyexit_loss(output, target, criterion, args)
        losses[OBJECTIVE_LOSS_KEY].add(loss.item())
        #print('sin2_reg_LOSS:', sin2_reg_loss.data[0])

        if compression_scheduler:
            # Before running the backward phase, we allow the scheduler to modify the loss
            # (e.g. add regularization loss)
            agg_loss = compression_scheduler.before_backward_pass(epoch, train_step, steps_per_epoch, loss,
                                                                  optimizer=optimizer, return_loss_components=True)
            loss = agg_loss.overall_loss
            losses[OVERALL_LOSS_KEY].add(loss.item())
            for lc in agg_loss.loss_components:
                if lc.name not in losses:
                    losses[lc.name] = tnt.AverageValueMeter()
                losses[lc.name].add(lc.value.item())

        # Compute the gradient and do SGD step
        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        optimizer.step()
        if compression_scheduler:
            compression_scheduler.on_minibatch_end(epoch, train_step, steps_per_epoch, optimizer)

        # measure elapsed time
        batch_time.add(time.time() - end)
        steps_completed = (train_step+1)

        if steps_completed % args.print_freq == 0:
            # Log some statistics
            errs = OrderedDict()
            if not args.earlyexit_lossweights:
                errs['Top1'] = classerr.value(1)
                errs['Top5'] = classerr.value(5)
            else:
                # for Early Exit case, the Top1 and Top5 stats are computed for each exit.
                for exitnum in range(args.num_exits):
                    errs['Top1_exit' + str(exitnum)] = args.exiterrors[exitnum].value(1)
                    errs['Top5_exit' + str(exitnum)] = args.exiterrors[exitnum].value(5)

            stats_dict = OrderedDict()
            for loss_name, meter in losses.items():
                stats_dict[loss_name] = meter.mean
            stats_dict.update(errs)
            stats_dict['LR'] = optimizer.param_groups[0]['lr']
            stats_dict['Time'] = batch_time.mean
            stats = ('Peformance/Training/', stats_dict)

            params = model.named_parameters() if args.log_params_histograms else None
            distiller.log_training_progress(stats, params, epoch, steps_completed, steps_per_epoch,
                                            args.print_freq, loggers)
        end = time.time()

    # Post-epoch debug dump of conv1 weights and the learned bit-widths.
    kernel = model.module.conv1.weight
    #kernel = model.module.conv1.float_weight
    print('00000000000000000000')
    w1 = kernel.data.cpu().numpy()
    np.save('w1_cifar', w1)
    # NOTE(review): `.data[0]` is legacy (<0.4) scalar indexing, and `reg_loss`
    # is unbound if the early-exit branch ran -- verify before relying on this.
    print('======================================', reg_loss.data[0])
    print('learned bitwidths', model.module.B1.data.cpu().numpy()[0], model.module.B2.data.cpu().numpy()[0],
          model.module.B3.data.cpu().numpy()[0], model.module.B4.data.cpu().numpy()[0],
          model.module.B5.data.cpu().numpy()[0])
from torch.autograd import Variable as V from torchnet import meter from config.config import cfg from util.visualize import Visualizer from util.show_masked_image import show_masked_image from mmcv.runner import save_checkpoint, load_checkpoint import cv2 from util.show_masked_image import tensor_to_np import numpy as np #cfg.merge_from_file("config/un_att_pascal_0001.yaml") cfg.freeze() # 冻结参数 vis = Visualizer("newvis", port=8097) AP = meter.APMeter() mAP = meter.mAPMeter() top3 = meter.ClassErrorMeter(topk=[1, 3, 5], accuracy=True) Loss_meter = meter.AverageValueMeter() os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2" num = 30 def visualize_func(result): pass def inverse_normalize(img): #if opt.caffe_pretrain: # img = img + (np.array([122.7717, 115.9465, 102.9801]).reshape(3, 1, 1)) # return img[::-1, :, :] # approximate un-normalize for visualize
def main():
    """Build model and optimizer, optionally resume from a checkpoint, and
    train with a torchnet Engine, checkpointing and JSON-logging per epoch."""
    if not os.path.exists(opt.save):
        os.mkdir(opt.save)

    # Model factory returns (model fn, parameter dict, buffer/stat dict).
    if opt.scat > 0:
        model, params, stats = models.__dict__[opt.model](N=opt.N, J=opt.scat)
    else:
        model, params, stats = models.__dict__[opt.model]()

    def create_optimizer(opt, lr):
        # SGD over the raw parameter tensors held in `params`.
        print('creating optimizer with lr = %f' % lr)
        return torch.optim.SGD(params.values(), lr, opt.momentum,
                               weight_decay=opt.weightDecay)

    def get_iterator(mode):
        # mode=True -> training set (shuffled), mode=False -> test set.
        ds = create_dataset(opt, mode)
        return ds.parallel(batch_size=opt.batchSize, shuffle=mode,
                           num_workers=opt.nthread, pin_memory=False)

    optimizer = create_optimizer(opt, opt.lr)

    iter_test = get_iterator(False)
    iter_train = get_iterator(True)

    if opt.scat > 0:
        scat = Scattering(M=opt.N, N=opt.N, J=opt.scat, pre_pad=False).cuda()

    epoch = 0
    if opt.resume != '':
        resumeFile = opt.resume
        if not resumeFile.endswith('pt7'):
            # 'latest.pt7' is an indirection file pointing at the real checkpoint.
            resumeFile = torch.load(opt.resume + '/latest.pt7')['latest_file']
        state_dict = torch.load(resumeFile)
        epoch = state_dict['epoch']
        params_tensors, stats = state_dict['params'], state_dict['stats']
        # BUGFIX: dict.iteritems() is Python 2 only (AttributeError on Python 3);
        # this function already uses .items() elsewhere.
        for k, v in params.items():
            v.data.copy_(params_tensors[k])
        optimizer.load_state_dict(state_dict['optimizer'])
        print('model was restored from epoch:', epoch)

    print('\nParameters:')
    print(pd.DataFrame([(key, v.size(), torch.typename(v.data))
                        for key, v in params.items()]))
    print('\nAdditional buffers:')
    print(pd.DataFrame([(key, v.size(), torch.typename(v))
                        for key, v in stats.items()]))

    n_parameters = sum(
        [p.numel() for p in list(params.values()) + list(stats.values())])
    print('\nTotal number of parameters: %f' % n_parameters)

    meter_loss = meter.AverageValueMeter()
    # accuracy=False: the meter tracks error rates (hence 100 - value at logging).
    classacc = meter.ClassErrorMeter(topk=[1, 5], accuracy=False)
    timer_data = meter.TimeMeter('s')
    timer_sample = meter.TimeMeter('s')
    timer_train = meter.TimeMeter('s')
    timer_test = meter.TimeMeter('s')

    def h(sample):
        # Engine closure: forward one batch, return (loss, output).
        inputs = sample[0].cuda()
        if opt.scat > 0:
            inputs = scat(inputs)
        inputs = Variable(inputs)
        targets = Variable(sample[1].cuda().long())
        # sample[2] is the train/test flag appended by on_sample.
        if sample[2]:
            model.train()
        else:
            model.eval()
        y = torch.nn.parallel.data_parallel(model, inputs, np.arange(opt.ngpu).tolist())
        return F.cross_entropy(y, targets), y

    def log(t, state):
        # Periodically checkpoint params/optimizer and append a JSON log line.
        if (t['epoch'] > 0 and t['epoch'] % opt.frequency_save == 0):
            # BUGFIX: iteritems() -> items(); torch.save needs a binary-mode
            # file object under Python 3 ('w' -> 'wb').
            torch.save(
                dict(params={k: v.data.cpu() for k, v in params.items()},
                     stats=stats,
                     optimizer=state['optimizer'].state_dict(),
                     epoch=t['epoch']),
                open(os.path.join(opt.save, 'epoch_%i_model.pt7' % t['epoch']), 'wb'))
            torch.save(
                dict(latest_file=os.path.join(opt.save, 'epoch_%i_model.pt7' % t['epoch'])),
                open(os.path.join(opt.save, 'latest.pt7'), 'wb'))

        z = vars(opt).copy()
        z.update(t)
        logname = os.path.join(opt.save, 'log.txt')
        with open(logname, 'a') as f:
            f.write('json_stats: ' + json.dumps(z) + '\n')
        print(z)

    def on_sample(state):
        global data_time
        data_time = timer_data.value()
        timer_sample.reset()
        # Append the train/test flag so h() can switch model mode.
        state['sample'].append(state['train'])

    def on_forward(state):
        # Per-batch top-1/top-5 from the delta of the meter's running sums.
        prev_sum5 = classacc.sum[5]
        prev_sum1 = classacc.sum[1]
        classacc.add(state['output'].data, torch.LongTensor(state['sample'][1]))
        # BUGFIX: `.data[0]` is legacy (<0.4) scalar indexing and raises on
        # modern PyTorch; use .item().
        meter_loss.add(state['loss'].item())
        next_sum5 = classacc.sum[5]
        next_sum1 = classacc.sum[1]
        n = state['output'].data.size(0)
        curr_top5 = 100.0 * (next_sum5 - prev_sum5) / n
        curr_top1 = 100.0 * (next_sum1 - prev_sum1) / n
        sample_time = timer_sample.value()
        timer_data.reset()
        if (state['train']):
            txt = 'Train:'
        else:
            txt = 'Test'
        if (state['t'] % opt.frequency_print == 0 and state['t'] > 0):
            print(
                '%s [%i,%i/%i] ; loss: %.3f (%.3f) ; acc5: %.2f (%.2f) ; acc1: %.2f (%.2f) ; data %.3f ; time %.3f' %
                (txt, state['epoch'], state['t'] % len(state['iterator']),
                 len(state['iterator']), state['loss'].item(),
                 meter_loss.value()[0], curr_top5, classacc.value(5),
                 curr_top1, classacc.value(1), data_time, sample_time))

    def on_start(state):
        state['epoch'] = epoch

    def on_start_epoch(state):
        classacc.reset()
        meter_loss.reset()
        timer_train.reset()
        state['iterator'] = iter_train
        epoch = state['epoch'] + 1
        # Step-wise LR decay at the epochs listed in the global `epoch_step`.
        if epoch in epoch_step:
            print('changing LR')
            lr = state['optimizer'].param_groups[0]['lr']
            state['optimizer'] = create_optimizer(opt, lr * opt.lr_decay_ratio)

    def on_end_epoch(state):
        if (state['t'] % opt.frequency_test == 0 and state['t'] > 0):
            train_loss = meter_loss.value()
            train_acc = classacc.value()
            train_time = timer_train.value()
            # Reset meters so engine.test() fills them with test statistics.
            meter_loss.reset()
            classacc.reset()
            timer_test.reset()
            engine.test(h, iter_test)
            log(
                {
                    "train_loss": train_loss[0],
                    "train_acc": 100 - train_acc[0],
                    "test_loss": meter_loss.value()[0],
                    "test_acc": 100 - classacc.value()[0],
                    "epoch": state['epoch'],
                    "n_parameters": n_parameters,
                    "train_time": train_time,
                    "test_time": timer_test.value(),
                }, state)

    engine = Engine()
    engine.hooks['on_sample'] = on_sample
    engine.hooks['on_forward'] = on_forward
    engine.hooks['on_start_epoch'] = on_start_epoch
    engine.hooks['on_end_epoch'] = on_end_epoch
    engine.hooks['on_start'] = on_start
    engine.train(h, iter_train, opt.epochs, optimizer)
def _validate(data_group, model, criterion, device):
    """Evaluate `model` on the module-level ``dataloaders[data_group]`` split.

    Args:
        data_group: key into the global `dataloaders` dict (e.g. 'val'/'test').
        model: network to evaluate (switched to eval mode here).
        criterion: loss function.
        device: device the batches are moved to.

    Returns:
        (top1 accuracy, top5 accuracy, mean objective loss).
    """
    # Early-exit support and a confusion-matrix display may be added later;
    # this first version handles the plain single-exit path only.
    classerr = tnt.ClassErrorMeter(accuracy=True, topk=[1, 5])
    losses = {'objective_loss': tnt.AverageValueMeter()}
    batch_time = tnt.AverageValueMeter()

    total_samples = len(dataloaders[data_group].sampler)
    batch_size = dataloaders[data_group].batch_size
    total_steps = total_samples / batch_size

    # Turn into evaluation mode.
    model.eval()

    end = time.time()
    with torch.no_grad():
        for validation_step, data in enumerate(dataloaders[data_group]):
            inputs = data[0].to(device)
            labels = data[1].to(device)

            output = model(inputs)
            loss = criterion(output, labels)

            # Measure accuracy and record loss.
            losses['objective_loss'].add(loss.item())
            classerr.add(output.detach(), labels)

            batch_time.add(time.time() - end)
            end = time.time()
            # BUGFIX: this statement was duplicated before and after the timer
            # update; compute it once.  Also use an int literal (was `200.`).
            steps_completed = (validation_step + 1)

            if steps_completed % 200 == 0:
                print('Test [{:5d}/{:5d}] \033[0;37;41mLoss {:.5f}\033[0'
                      '\033[0;37;42m\tTop1 {:.5f} Top5 {:.5f}\033[m'
                      '\tTime {:.5f}.'.format(steps_completed, int(total_steps),
                                              losses['objective_loss'].mean,
                                              classerr.value(1),
                                              classerr.value(5),
                                              batch_time.mean))

    print('==> \033[0;37;42mTop1 {:.5f} Top5 {:.5f}\033[m'
          '\033[0;37;41m\tLoss: {:.5f}\n\033[m.'.format(
              classerr.value(1), classerr.value(5),
              losses['objective_loss'].mean))

    return classerr.value(1), classerr.value(5), losses['objective_loss'].mean
def train(train_loader, model, criterion, optimizer, epoch, compression_scheduler, loggers, print_freq, log_params_hist,
          teacher_model=None, temperature_distillation=2, weight_distillation_loss=0.7):
    """Training loop for one epoch. If teacher_model is not None, distillation will be used.

    Standard distiller training loop; when a teacher is supplied, the loss is
    a convex combination of the task loss and a temperature-softened KL
    distillation loss against the teacher's outputs.

    Args:
        train_loader: iterable of (inputs, target) mini-batches.
        model: student network to train.
        criterion: task loss function.
        optimizer: optimizer; first param group's LR is logged.
        epoch: current epoch index.
        compression_scheduler: optional distiller CompressionScheduler.
        loggers: distiller logging backends.
        print_freq: log statistics every N completed steps.
        log_params_hist: if True, also pass parameters for histogram logging.
        teacher_model: optional frozen teacher network for distillation.
        temperature_distillation: softmax temperature for distillation.
        weight_distillation_loss: weight of the distillation term in the loss.
    """
    losses = {'objective_loss': tnt.AverageValueMeter(),
              'regularizer_loss': tnt.AverageValueMeter()}
    if compression_scheduler is None:
        # Initialize the regularizer loss to zero
        losses['regularizer_loss'].add(0)

    if teacher_model is not None:
        softmax_function = nn.Softmax(dim=1).cuda()
        log_softmax_function = nn.LogSoftmax(dim=1).cuda()
        kldiv_loss = nn.KLDivLoss(size_average=False).cuda()  # see https://github.com/pytorch/pytorch/issues/6622

    classerr = tnt.ClassErrorMeter(accuracy=True, topk=(1, 5))
    batch_time = tnt.AverageValueMeter()
    data_time = tnt.AverageValueMeter()

    total_samples = len(train_loader.sampler)
    batch_size = train_loader.batch_size
    steps_per_epoch = math.ceil(total_samples / batch_size)
    msglogger.info('Training epoch: %d samples (%d per mini-batch)', total_samples, batch_size)

    # Switch to train mode
    model.train()
    end = time.time()
    for train_step, (inputs, target) in enumerate(train_loader):
        # Measure data loading time
        data_time.add(time.time() - end)

        # BUGFIX: `async` is a reserved keyword since Python 3.7; PyTorch
        # renamed the argument to `non_blocking` (same semantics).
        target = target.cuda(non_blocking=True)
        input_var = torch.autograd.Variable(inputs)
        target_var = torch.autograd.Variable(target)

        # Execute the forward phase, compute the output and measure loss
        if compression_scheduler:
            compression_scheduler.on_minibatch_begin(epoch, train_step, steps_per_epoch, optimizer)
        output = model(input_var)
        loss = criterion(output, target_var)

        if teacher_model is not None:
            # Teacher forward runs without gradients; KL between softened
            # student and teacher distributions, normalized per sample.
            with PytorchNoGrad():
                input_var_teacher = get_inference_var(inputs)
                output_teacher = teacher_model(input_var_teacher)
            loss_distilled = (temperature_distillation**2) * kldiv_loss(
                log_softmax_function(output / temperature_distillation),
                softmax_function(output_teacher / temperature_distillation)) / output.size(0)
            loss = weight_distillation_loss*loss_distilled + (1-weight_distillation_loss)*loss

        # Measure accuracy and record loss
        classerr.add(output.data, target)
        losses['objective_loss'].add(loss.item())

        if compression_scheduler:
            # Before running the backward phase, we add any regularization loss
            # computed by the scheduler
            regularizer_loss = compression_scheduler.before_backward_pass(
                epoch, train_step, steps_per_epoch, loss, optimizer)
            loss += regularizer_loss
            losses['regularizer_loss'].add(regularizer_loss.item())

        # Compute the gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if compression_scheduler:
            compression_scheduler.on_minibatch_end(epoch, train_step, steps_per_epoch, optimizer)

        # measure elapsed time
        batch_time.add(time.time() - end)
        steps_completed = (train_step+1)

        if steps_completed % print_freq == 0:
            # Log some statistics
            lr = optimizer.param_groups[0]['lr']
            # NOTE: 'Peformance' typo kept as-is -- downstream loggers may key on it.
            stats = ('Peformance/Training/',
                     OrderedDict([('Loss', losses['objective_loss'].mean),
                                  ('Reg Loss', losses['regularizer_loss'].mean),
                                  ('Top1', classerr.value(1)),
                                  ('Top5', classerr.value(5)),
                                  ('LR', lr),
                                  ('Time', batch_time.mean)]))
            distiller.log_training_progress(
                stats,
                model.named_parameters() if log_params_hist else None,
                epoch, steps_completed, steps_per_epoch, print_freq, loggers)
        end = time.time()
def light_train_with_distiller(model, criterion, optimizer, compress_scheduler, device, epoch=1):
    """Train `model` for one epoch under a distiller compression schedule.

    Uses the module-level ``dataloaders['train']`` / ``dataset_sizes`` and, for
    the periodic sparsity report, the module-level ``net`` and ``summary``.

    Args:
        model: network to train (switched to train mode here).
        criterion: loss function.
        optimizer: optimizer; first param group's LR is printed.
        compress_scheduler: optional distiller CompressionScheduler (may be falsy).
        device: device the batches are moved to.
        epoch: current epoch index, forwarded to scheduler callbacks.

    Returns:
        (top1 accuracy, top5 accuracy, overall-loss AverageValueMeter).
    """
    total_samples = dataset_sizes['train']
    batch_size = dataloaders["train"].batch_size
    steps_per_epoch = math.ceil(total_samples / batch_size)

    classerr = tnt.ClassErrorMeter(accuracy=True, topk=[1, 5])  # It seems that binary can not use top5 accuracy (topk=[1,5]).
    batch_time = tnt.AverageValueMeter()
    data_time = tnt.AverageValueMeter()

    OVERALL_LOSS_KEY = 'Overall Loss'
    OBJECTIVE_LOSS_KEY = 'Objective Loss'
    losses = OrderedDict([(OVERALL_LOSS_KEY, tnt.AverageValueMeter()),
                          (OBJECTIVE_LOSS_KEY, tnt.AverageValueMeter())])

    model.train()
    acc_stats = []
    end = time.time()
    for train_step, data in enumerate(dataloaders["train"], 0):
        inputs = data[0].to(device)
        labels = data[1].to(device)

        if compress_scheduler:
            compress_scheduler.on_minibatch_begin(epoch, train_step, steps_per_epoch, optimizer)

        output = model(inputs)
        loss = criterion(output, labels)

        # Early-exit mode is deliberately not supported in this first version.
        classerr.add(output.detach(), labels)
        acc_stats.append([classerr.value(1), classerr.value(5)])
        losses[OBJECTIVE_LOSS_KEY].add(loss.item())

        if compress_scheduler:
            # Let the scheduler fold regularization terms into the loss and
            # report them as named components.
            agg_loss = compress_scheduler.before_backward_pass(
                epoch, train_step, steps_per_epoch, loss,
                optimizer=optimizer, return_loss_components=True)
            # BUGFIX: `loss = agg_loss.overall_loss` was assigned twice
            # (around a dead string-literal block); once is enough.
            loss = agg_loss.overall_loss
            losses[OVERALL_LOSS_KEY].add(loss.item())
            for lc in agg_loss.loss_components:
                if lc.name not in losses:
                    losses[lc.name] = tnt.AverageValueMeter()
                losses[lc.name].add(lc.value.item())
        else:
            losses[OVERALL_LOSS_KEY].add(loss.item())

        optimizer.zero_grad()
        loss.backward()
        if compress_scheduler:
            compress_scheduler.before_parameter_optimization(
                epoch, train_step, steps_per_epoch, optimizer)
        optimizer.step()
        if compress_scheduler:
            compress_scheduler.on_minibatch_end(epoch, train_step, steps_per_epoch, optimizer)

        batch_time.add(time.time() - end)
        steps_completed = (train_step + 1)

        if steps_completed % 1000 == 0:
            print(
                'Epoch: [{}][{:5d}/{:5d}] \033[0;37;41mOverall Loss {:.5f} Objective Loss {:.5f}\033[0m'
                '\033[0;37;42m\tTop 1 {:.5f} Top 5 {:.5f}\033[0m'
                '\033[0;37;40m\tLR {:.5f} Time {:.5f}\033[0m.'.format(
                    epoch, steps_completed, int(steps_per_epoch),
                    losses['Overall Loss'].mean, losses['Objective Loss'].mean,
                    classerr.value(1), classerr.value(5),
                    optimizer.param_groups[0]['lr'], batch_time.mean))
            # Sparsity report reads the module-level `net` and `summary`.
            t, total = summary.weights_sparsity_tbl_summary(
                net, return_total_sparsity=True)
            print('Total sparsity: {:0.2f}\n'.format(total))
            #df = summary.masks_sparsity_tbl_summary(net, compress_scheduler)
            #print(df)
        end = time.time()

    return classerr.value(1), classerr.value(5), losses[OVERALL_LOSS_KEY]
def train(args, model, dataloaders, criterion, optimizer, scheduler, logger, epochs=25, is_inception=False):
    """Train and validate a classifier with checkpointing and best-model copy.

    Args:
        args: parsed command-line arguments.
        model: model to be trained.
        dataloaders: dict mapping phase ("train"/"val") to a DataLoader.
        criterion: loss function.
        optimizer: optimizer used for training.
        scheduler: learning-rate scheduler (stepped once per training epoch).
        logger: log handler.
        epochs: number of training epochs.
        is_inception: whether model is an Inception net with aux logits.
    """
    epochs = epochs or args.epochs

    # Checkpoint directory depends on the transfer-learning mode.
    if args.pretrained and args.feature:
        mode = "feature_extractor"  # pretrained=True, feature=True
    elif args.pretrained and not args.feature:
        mode = "fine_tuning"        # pretrained=True, feature=False
    else:
        mode = "from_scratch"       # pretrained=False, feature=False

    # Checkpoint path and best-accuracy model path.
    model_path = Path(args.output) / args.arch / mode / "model.pt"
    best_modelpath = Path(args.output) / args.arch / mode / "bestmodel.pt"

    # Resume from checkpoint if one exists.
    if (model_path.exists()):
        state = torch.load(str(model_path))
        epoch = state["epoch"]
        model.load_state_dict(state["model"])
        best_acc = state["best_acc"]
        logger.info("Loading epoch {} checkpoint ...".format(epoch))
        print("Restored model, epoch {}".format(epoch))
    else:
        epoch = 0
        # BUGFIX: accuracy is maximized, so start from 0.0; the original
        # float('inf') made `acc > best_acc` never true, so the best model
        # was never saved.
        best_acc = 0.0

    # Checkpoint writer; call as save(ep).
    save = lambda epoch: torch.save({
        "model": model.state_dict(),
        "epoch": epoch,
        "best_acc": best_acc,
    }, str(model_path))

    # Training metrics.
    running_loss_meter = meter.AverageValueMeter()           # mean loss
    # running_acc_meter = meter.mAPMeter()                   # mean AP over classes
    running_acc_meter = meter.ClassErrorMeter(topk=[1], accuracy=True)  # top-1 accuracy
    time_meter = meter.TimeMeter(unit=True)                  # wall-clock training time

    # Results file.
    resultpath = Path(args.output) / args.arch / mode / "train_result.pkl"
    result_writer = ResultsWriter(resultpath, overwrite=False)

    for epoch in range(epoch, epochs):
        print("Epoch {}/{}".format(epoch, epochs-1))
        print("-" * 10)

        # Each epoch has a training and a validation phase.
        for phase in ["train", "val"]:
            if phase == "train":
                model.train()   # Set model to training mode
            else:
                model.eval()    # Set model to evaluate mode

            # Reset per-phase meters.
            running_loss_meter.reset()
            running_acc_meter.reset()
            random.seed(args.seed)

            # BUGFIX: a DataLoader exposes `.dataset` (singular), not `.datasets`.
            tq = tqdm.tqdm(total=len(dataloaders[phase].dataset))
            tq.set_description("{} for Epoch {}/{}".format(phase, epoch+1, epochs))
            try:
                # Iterate over data.
                for inputs, labels in dataloaders[phase]:
                    inputs = inputs.cuda() if torch.cuda.is_available() else inputs
                    labels = labels.cuda() if torch.cuda.is_available() else labels

                    # Zero the parameter gradients.
                    optimizer.zero_grad()

                    # Forward; track history only in train.
                    with torch.set_grad_enabled(phase == "train"):
                        # Inception trains against main + auxiliary logits.
                        if is_inception and phase == "train":
                            # BUGFIX: `inpus` -> `inputs` (NameError).
                            outputs, aux_outputs = model(inputs)
                            loss1 = criterion(outputs, labels)
                            loss2 = criterion(aux_outputs, labels)
                            loss = loss1 + 0.4 * loss2
                        else:
                            outputs = model(inputs)
                            loss = criterion(outputs, labels)

                        # Backward + optimize only in the training phase.
                        if phase == "train":
                            loss.backward()
                            optimizer.step()

                    tq.update(inputs.size(0))
                    # Per-step meter updates.
                    running_loss_meter.add(loss.item())
                    # BUGFIX: `output` -> `outputs` (NameError).
                    running_acc_meter.add(F.softmax(outputs.detach(), dim=1), labels.detach())

                # LR schedule is advanced once per training epoch.
                if phase == "train":
                    scheduler.step()
                    save(epoch+1)

                tq.close()
                # BUGFIX: ClassErrorMeter.value() returns a list; take top-1
                # so the {:.4f} format (and the comparison below) work.
                print("{} Loss: {:.4f} Acc: {:.4f}".format(
                    phase, running_loss_meter.value()[0], running_acc_meter.value()[0]))

                # Copy the best model.
                if phase == "val" and running_acc_meter.value()[0] > best_acc:
                    best_acc = running_acc_meter.value()[0]
                    shutil.copy(str(model_path), str(best_modelpath))

                # Record epoch-level loss/acc (not per-step values).
                logger.info("\n phase: {phase}, epoch: {epoch}, lr: {lr}, loss: {loss}, acc: {acc}".format(
                    phase=phase, epoch=epoch+1, lr=scheduler.get_lr(),
                    loss=running_loss_meter.value()[0],
                    acc=running_acc_meter.value()[0]))
                result_writer.update(epoch, {"phase": phase,
                                             "loss": running_loss_meter.value()[0],
                                             "acc": running_acc_meter.value()[0]})
            except KeyboardInterrupt:
                tq.close()
                # BUGFIX: `saving snapshot` was a bare (unquoted) name -> SyntaxError.
                print("Ctrl+C, saving snapshot")
                save(epoch)
        print()

    time_elapsed = time_meter.value()
    # BUGFIX: split elapsed seconds into minutes and seconds (the original
    # printed the same raw value for both fields).
    print("Training complete in {:.0f}m {:.0f}s".format(time_elapsed // 60, time_elapsed % 60))
    print("Best val Acc: {:.4f}".format(best_acc))
def main():
    """Restore a scattering-network model from a checkpoint and evaluate it
    on the test set with a torchnet Engine, printing per-batch statistics."""
    model, params, stats = models.__dict__[opt.model](N=opt.N, J=opt.scat)
    iter_test = get_iterator(False, opt)
    scat = Scattering(M=opt.N, N=opt.N, J=opt.scat, pre_pad=False).cuda()

    epoch = 0
    if opt.resume != '':
        resumeFile = opt.resume
        if not resumeFile.endswith('pt7'):
            # 'latest.pt7' is an indirection file pointing at the real checkpoint.
            resumeFile = torch.load(opt.resume + '/latest.pt7')['latest_file']
        state_dict = torch.load(resumeFile)
        model.load_state_dict(state_dict['state_dict'])
        print('model was restored from epoch:', epoch)

    print('\nParameters:')
    print(
        pd.DataFrame([(key, v.size(), torch.typename(v.data))
                      for key, v in params.items()]))
    print('\nAdditional buffers:')
    print(
        pd.DataFrame([(key, v.size(), torch.typename(v))
                      for key, v in stats.items()]))

    n_parameters = sum(
        [p.numel() for p in list(params.values()) + list(stats.values())])
    print('\nTotal number of parameters: %f' % n_parameters)

    meter_loss = meter.AverageValueMeter()
    # accuracy=False: the meter reports error rates (hence 'err5'/'err1' below).
    classacc = meter.ClassErrorMeter(topk=[1, 5], accuracy=False)
    timer_data = meter.TimeMeter('s')
    timer_sample = meter.TimeMeter('s')
    timer_train = meter.TimeMeter('s')
    timer_test = meter.TimeMeter('s')

    def h(sample):
        # Engine closure: forward one batch, return (loss, output).
        inputs = sample[0].cuda()
        if opt.scat > 0:
            inputs = scat(inputs)
        inputs = Variable(inputs)
        targets = Variable(sample[1].cuda().long())
        # sample[2] is the train/test flag appended by on_sample.
        if sample[2]:
            model.train()
        else:
            model.eval()

        # y = model.forward(inputs)
        y = torch.nn.parallel.data_parallel(model, inputs, np.arange(opt.ngpu).tolist())
        return F.cross_entropy(y, targets), y

    def on_sample(state):
        global data_time
        data_time = timer_data.value()
        timer_sample.reset()
        # Append the train/test flag so h() can switch model mode.
        state['sample'].append(state['train'])

    def on_forward(state):
        # Per-batch top-1/top-5 errors from the delta of the meter's running sums.
        prev_sum5 = classacc.sum[5]
        prev_sum1 = classacc.sum[1]
        classacc.add(state['output'].data, torch.LongTensor(state['sample'][1]))
        # NOTE(review): `.data[0]` is legacy PyTorch (<0.4) scalar indexing;
        # on modern PyTorch this raises and should be `.item()` -- verify
        # against the pinned torch version.
        meter_loss.add(state['loss'].data[0])
        next_sum5 = classacc.sum[5]
        next_sum1 = classacc.sum[1]
        n = state['output'].data.size(0)
        curr_top5 = 100.0 * (next_sum5 - prev_sum5) / n
        curr_top1 = 100.0 * (next_sum1 - prev_sum1) / n
        sample_time = timer_sample.value()
        timer_data.reset()
        if (state['train']):
            txt = 'Train:'
        else:
            txt = 'Test'
        print(
            '%s [%i,%i/%i] ; loss: %.3f (%.3f) ; err5: %.2f (%.2f) ; err1: %.2f (%.2f) ; data %.3f ; time %.3f' %
            (txt, state['epoch'], state['t'] % len(state['iterator']),
             len(state['iterator']), state['loss'].data[0],
             meter_loss.value()[0], curr_top5, classacc.value(5), curr_top1,
             classacc.value(1), data_time, sample_time))

    def on_start(state):
        state['epoch'] = epoch

    def on_start_epoch(state):
        classacc.reset()
        meter_loss.reset()
        timer_train.reset()
        # NOTE(review): this binds a *local* `epoch`; it does not update the
        # enclosing variable (would need `nonlocal`) -- looks like a latent
        # leftover from the training script; confirm it is intentional.
        epoch = state['epoch'] + 1

    def on_end_epoch(state):
        # Snapshot train metrics (unused here), then reset meters so
        # engine.test() fills them with test statistics.
        train_loss = meter_loss.value()
        train_acc = classacc.value()
        train_time = timer_train.value()
        meter_loss.reset()
        classacc.reset()
        timer_test.reset()
        engine.test(h, iter_test)

    engine = Engine()
    engine.hooks['on_sample'] = on_sample
    engine.hooks['on_forward'] = on_forward
    engine.hooks['on_start_epoch'] = on_start_epoch
    engine.hooks['on_end_epoch'] = on_end_epoch
    engine.hooks['on_start'] = on_start
    # Evaluation only: run a single test pass and print final error rates.
    engine.test(h, iter_test)
    print(classacc.value())
def train(**kwargs):
    """Train the model selected by the global `opt` configuration.

    kwargs override fields of `opt` via `opt.parse`. Relies on module-level
    globals: `opt`, `dataset`, `models`, `Visualizer`, `meter`, `val`,
    `current_time`, `setup_seed`. Sets globals `device` and `vis`.
    """
    global device, vis
    if opt.seed is not None:
        setup_seed(opt.seed)
    config_str = opt.parse(kwargs)
    device = torch.device("cuda" if opt.use_gpu else "cpu")
    vis = Visualizer(opt.log_dir, opt.model, current_time, opt.title_note)
    # log all configs
    vis.log('config', config_str)

    # load data set (batch size scales with the number of GPUs)
    train_loader, val_loader, num_classes = getattr(dataset, opt.dataset)(
        opt.batch_size * opt.gpus)

    # load model
    model = getattr(models, opt.model)(
        lambas=opt.lambas, num_classes=num_classes,
        weight_decay=opt.weight_decay).to(device)
    if opt.gpus > 1:
        model = nn.DataParallel(model)

    # loss = cross entropy plus the model's own (L0/sparsity) regularizer
    def criterion(output, target_var):
        loss = nn.CrossEntropyLoss().to(device)(output, target_var)
        reg_loss = (model.regularization() if opt.gpus <= 1
                    else model.module.regularization())
        total_loss = (loss + reg_loss).to(device)
        return total_loss

    # load optimizer and scheduler
    if opt.optimizer == 'adam':
        optimizer = torch.optim.Adam(
            model.parameters() if opt.gpus <= 1 else model.module.parameters(),
            opt.lr)
        # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=opt.lr_decay, patience=15)
        scheduler = None
        print('Optimizer: Adam, lr={}'.format(opt.lr))
    elif opt.optimizer == 'momentum':
        optimizer = torch.optim.SGD(
            model.parameters() if opt.gpus <= 1 else model.module.parameters(),
            opt.lr, momentum=opt.momentum, nesterov=True)
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=opt.schedule_milestone, gamma=opt.lr_decay)
        # BUGFIX: original format string lacked the second placeholder, so
        # the momentum value was never printed.
        print('Optimizer: Momentum, lr={}, momentum={}'.format(opt.lr, opt.momentum))
    else:
        print('No optimizer')
        return

    loss_meter = meter.AverageValueMeter()
    accuracy_meter = meter.ClassErrorMeter(accuracy=True)

    # create checkpoints dir
    directory = '{}/{}_{}'.format(opt.checkpoints_dir, opt.model, current_time)
    if not os.path.exists(directory):
        os.makedirs(directory)

    total_steps = 0
    epoch_range = (range(opt.start_epoch, opt.max_epoch) if opt.verbose
                   else tqdm(range(opt.start_epoch, opt.max_epoch)))
    for epoch in epoch_range:
        model.train() if opt.gpus <= 1 else model.module.train()
        loss_meter.reset()
        accuracy_meter.reset()
        for ii, (input_, target) in enumerate(train_loader):
            input_, target = input_.to(device), target.to(device)
            optimizer.zero_grad()
            # NOTE(review): the model forward also receives the target —
            # presumably needed by this project's models; confirm.
            score = model(input_, target)
            loss = criterion(score, target)
            loss.backward()
            optimizer.step()
            loss_meter.add(loss.cpu().data)
            accuracy_meter.add(score.data, target.data)

            e_fl, e_l0 = (model.get_exp_flops_l0() if opt.gpus <= 1
                          else model.module.get_exp_flops_l0())
            vis.plot('stats_comp/exp_flops', e_fl, total_steps)
            vis.plot('stats_comp/exp_l0', e_l0, total_steps)
            total_steps += 1

            if (model.beta_ema if opt.gpus <= 1 else model.module.beta_ema) > 0.:
                model.update_ema() if opt.gpus <= 1 else model.module.update_ema()

            if ii % opt.print_freq == opt.print_freq - 1:
                vis.plot('train/loss', loss_meter.value()[0])
                vis.plot('train/accuracy', accuracy_meter.value()[0])
                if opt.verbose:
                    print("epoch:{epoch},lr:{lr},loss:{loss:.2f},train_acc:{train_acc:.2f}"
                          .format(epoch=epoch, loss=loss_meter.value()[0],
                                  train_acc=accuracy_meter.value()[0],
                                  lr=optimizer.param_groups[0]['lr']))

        # save model
        if epoch % 10 == 0 or epoch == opt.max_epoch - 1:
            torch.save(model.state_dict(), directory + '/{}.model'.format(epoch))

        # validate model
        val_accuracy, val_loss = val(model, val_loader, criterion)
        vis.plot('val/loss', val_loss)
        vis.plot('val/accuracy', val_accuracy)

        # update lr
        if scheduler is not None:
            # BUGFIX: original tested `isinstance(optimizer, ReduceLROnPlateau)`,
            # which is never true (optimizer is not a scheduler), so the
            # plateau branch was unreachable. Test the scheduler instead.
            if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                scheduler.step(val_loss)
            else:
                scheduler.step(epoch)

        if opt.verbose:
            print("epoch:{epoch},lr:{lr},loss:{loss:.2f},val_acc:{val_acc:.2f},prune_rate:{pr:.2f}"
                  .format(epoch=epoch, loss=loss_meter.value()[0],
                          val_acc=val_accuracy,
                          lr=optimizer.param_groups[0]['lr'],
                          pr=model.prune_rate() if opt.gpus <= 1
                          else model.module.prune_rate()))
        for (i, num) in enumerate(model.get_expected_activated_neurons()
                                  if opt.gpus <= 1
                                  else model.module.get_expected_activated_neurons()):
            vis.plot("Training_layer/{}".format(i), num)
        vis.plot('lr', optimizer.param_groups[0]['lr'])
def test(models, weights, gpu_ids, iterator, topk, num_classes, enviroment='main'):
    """Evaluate a weighted softmax ensemble and plot per-class AP / top-k error.

    Args:
        models: list of torch models forming the ensemble (mutated in place
            when wrapped in DataParallel).
        weights: per-model weights applied to softmaxed outputs.
        gpu_ids: list of GPU ids, or None to stay on CPU.
        iterator: yields indexable samples where sample[0] is input and
            sample[1] is target.
        topk: iterable of k values for top-k error reporting.
        num_classes: number of classes, used for AP meters and plotting.
        enviroment: visdom environment name (typo kept — public parameter
            name, renaming would break keyword callers).
    """
    print(
        '=========================Start Testing at {}==========================='
        .format(time.strftime('%c')))
    # TODO: serialization
    classerr_meters = [meter.ClassErrorMeter(topk) for i in models]
    ap_meters = [APMeter(num_classes) for i in models]
    # multiple gpu support
    if gpu_ids is not None:
        for i in range(len(models)):
            models[i].cuda(gpu_ids[0])
            models[i] = torch.nn.DataParallel(models[i], device_ids=gpu_ids)
    # set eval() to freeze running mean and running var
    for m in models:
        m.eval()
    with torch.no_grad():
        for sample in tqdm(iterator):
            # wrap data
            for i in range(2):
                if gpu_ids is not None:
                    # BUGFIX: Tensor.cuda() is not in-place; the original
                    # discarded the returned GPU tensor, leaving the data on
                    # CPU. (Assumes sample is a mutable list — TODO confirm.)
                    sample[i] = sample[i].cuda(gpu_ids[0], non_blocking=True)
            ipt, target = sample[0], sample[1]
            opt = None
            # Running weighted ensemble: meter i observes the partial
            # ensemble of the first i+1 models.
            for i in range(len(models)):
                # dim=1 pins the legacy implicit softmax dim for 2-D
                # (batch, classes) outputs.
                if opt is None:
                    opt = weights[i] * functional.softmax(models[i](ipt), dim=1)
                else:
                    opt += weights[i] * functional.softmax(models[i](ipt), dim=1)
                classerr_meters[i].add(opt.data, target.data)
                ap_meters[i].add(opt.data, target.data)
    # sorting w.r.t the first weak learner
    index = numpy.argsort(ap_meters[0].value())
    classerrs = []
    for k in topk:
        # loop variable renamed from `meter` to avoid shadowing the module
        classerrs.append([m.value(k) for m in classerr_meters])
    ap = [m.value()[index] for m in ap_meters]
    ap = numpy.stack(ap)
    x = [
        numpy.linspace(0, num_classes, num=num_classes, endpoint=False)
        for i in ap_meters
    ]
    x = numpy.stack(x)
    vis = visdom.Visdom(server='http://localhost', env=enviroment)
    vis.line(X=x.transpose(), Y=ap.transpose(), opts={'title': 'Class AP'})
    for i in range(len(topk)):
        vis.line(numpy.asarray(classerrs[i]),
                 opts={'title': 'Class Top {} Error'.format(topk[i])})
    print(
        '========================Testing Down at {} ==========================='
        .format(time.strftime('%c')))
    print('******************Top {} Error: {}*****************'.format(
        topk, classerrs))