def train_one_epoch_mixup(model, train_loader, criterion, optimizer, epoch,
                          lr_scheduler, logger, loss_record, scaler, args):
    """Run one mixup training epoch using a gradient scaler and autocast.

    Every batch is moved to ``args.gpu``, mixed by ``mixup_data`` with
    ``args.mixup_alpha``, and optimised through ``scaler``. The learning-rate
    scheduler is stepped once per batch. A progress line is logged every
    ``args.log_interval`` iterations (never on iteration 0).

    Returns nothing; progress is reported through ``logger`` and
    ``loss_record``.
    """
    loss_record.reset()
    epoch_start = time.time()
    model.train()

    log_template = ('Epoch {}, Node {}, GPU {}, Iter {}, {}:{:.5}, '
                    '{} samples/s, lr: {:.5}.')

    for step, (data, labels) in enumerate(train_loader):
        data = data.cuda(args.gpu, non_blocking=True)
        labels = labels.cuda(args.gpu, non_blocking=True)
        mixed, labels_a, labels_b, lam = mixup_data(data, labels,
                                                    args.mixup_alpha)

        optimizer.zero_grad()
        with autocast(args.mix_precision_training):
            outputs = model(mixed)
            loss = mixup_criterion(criterion, outputs, labels_a, labels_b, lam)

        # Scaled backward pass, then the scaler drives the optimizer step.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        loss_record.update(loss)
        lr_scheduler.step()

        if step % args.log_interval != 0 or step == 0:
            continue

        meter_name = loss_record.name
        meter_value = loss_record.get()
        # Throughput is averaged over the whole epoch so far.
        throughput = int((step * args.batch_size) //
                         (time.time() - epoch_start))
        logger.info(
            log_template.format(epoch, args.world, args.gpu, step, meter_name,
                                meter_value, throughput,
                                lr_scheduler.learning_rate))
def train(net, train_loader, optimizer, criterion):
    """Train ``net`` for one pass over ``train_loader``.

    When ``params['mixup']`` is truthy the batch is mixed with a fixed alpha
    of 0.1 and the loss is computed with ``mixup_criterion``; otherwise the
    plain ``criterion`` is used. Accuracy is always measured against the
    original (unmixed) labels.

    Returns:
        Tuple ``(average loss, average accuracy)`` taken from the two
        ``AverageMeter`` instances.
    """
    loss_meter = AverageMeter()
    acc_meter = AverageMeter()
    net.train()
    mixup_alpha = 0.1

    for x, y in tqdm(train_loader):
        x = x.to(device)
        y = y.to(device)

        use_mixup = params['mixup']
        if use_mixup:
            x, y_a, y_b, lam = mixup_data(x, y, mixup_alpha)

        logits = net(x)
        if use_mixup:
            loss = mixup_criterion(criterion, logits, y_a, y_b, lam)
        else:
            loss = criterion(logits, y)
        batch_acc = cal_acc(logits, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_meter.update(loss.item())
        acc_meter.update(batch_acc.item())

    return loss_meter.avg, acc_meter.avg
def train_mixup():
    """Train with mixup from ``resume_epoch`` up to ``epochs``.

    Mixup uses ``args.mixup_alpha`` until ``args.mixup_off_epoch`` is reached
    (0 means "never switch off"); afterwards alpha is 0. The backward pass
    goes through ``amp.scale_loss``. The scheduler is stepped per batch, a
    summary line is logged per epoch, and ``test(epoch)`` runs after every
    epoch.
    """
    if args.mixup_off_epoch == 0:
        mixup_off_epoch = epochs
    else:
        mixup_off_epoch = args.mixup_off_epoch

    iter_template = 'Epoch {}, Iter {}, {}:{:.5}, {} samples/s.'
    epoch_template = 'Train Epoch {}: {}:{:.5}, {} samples/s, lr:{:.5}'

    for epoch in range(resume_epoch, epochs):
        train_sampler.set_epoch(epoch)
        loss_record.reset()
        if epoch < mixup_off_epoch:
            alpha = args.mixup_alpha
        else:
            alpha = 0
        epoch_start = time.time()
        model.train()

        for step, (data, labels) in enumerate(train_data):
            data = data.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)
            mixed, labels_a, labels_b, lam = mixup_data(data, labels, alpha)

            optimizer.zero_grad()
            outputs = model(mixed)
            loss = mixup_criterion(Loss, outputs, labels_a, labels_b, lam)
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            optimizer.step()

            loss_record.update(loss)
            lr_scheduler.step()

            if step % args.log_interval != 0 or step == 0:
                continue
            meter_name = loss_record.name
            meter_value = loss_record.get()
            throughput = int((step * batch_size) //
                             (time.time() - epoch_start))
            logger.info(
                iter_template.format(epoch, step, meter_name, meter_value,
                                     throughput))

        # Per-epoch summary, then evaluation.
        train_speed = int(num_training_samples // (time.time() - epoch_start))
        logger.info(
            epoch_template.format(epoch, loss_record.name, loss_record.get(),
                                  train_speed, lr_scheduler.learning_rate))
        test(epoch)
def train(epoch):
    """Train the student network for one epoch with knowledge distillation.

    The student (``snet``) and ``decoder`` are put in train mode; the teacher
    (``tnet``) is only run forward. The loss mixes the classification loss,
    the KD loss and, depending on ``args.distillation`` ('KD', 'DE', 'AS',
    'DEAS', 'SSDEAS'), a decoder style loss and/or a block-2 similarity loss.

    Args:
        epoch: Current epoch index; also drives the step learning-rate decay.

    Returns:
        Tuple ``(mean classification loss, accuracy %, mean precision %,
        mean F1 %)`` over the epoch. With ``args.augmentation`` the three
        metrics are blended from the two mixup target sets using the lambda
        of the last batch.

    Raises:
        Exception: If ``args.distillation`` is not a supported name.
        ValueError: If ``trainloader`` yields no batches.
    """

    def _scores(conf):
        """Return (accuracy, mean per-class precision, mean F1) of ``conf``."""
        diag = np.diag(conf)
        accuracy = diag.sum() / conf.sum()
        precision = diag / (conf.sum(axis=1) + 1e-10)
        recall = diag / (conf.sum(axis=0) + 1e-10)
        mean_precision = sum(precision) / len(precision)
        f1 = (2 * precision * recall / (precision + recall + 1e-10)).mean()
        return accuracy, mean_precision, f1

    print('\nEpoch: %d' % epoch)
    snet.train()
    decoder.train()
    train_cls_loss = 0
    conf_mat = np.zeros((NUM_CLASSES, NUM_CLASSES))
    conf_mat_a = np.zeros((NUM_CLASSES, NUM_CLASSES))
    conf_mat_b = np.zeros((NUM_CLASSES, NUM_CLASSES))

    if epoch > learning_rate_decay_start and learning_rate_decay_start >= 0:
        frac = (epoch - learning_rate_decay_start) // learning_rate_decay_every
        decay_factor = learning_rate_decay_rate**frac
        current_lr = args.lr * decay_factor
        utils.set_lr(optimizer, current_lr)  # set the decayed rate
    else:
        current_lr = args.lr
    print('learning_rate: %s' % str(current_lr))

    batch_idx = -1
    for batch_idx, (img_teacher, img_student, target) in enumerate(trainloader):
        if args.cuda:
            img_teacher = img_teacher.cuda(non_blocking=True)
            img_student = img_student.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

        optimizer.zero_grad()

        if args.augmentation:
            # NOTE(review): teacher and student inputs are mixed by two
            # independent mixup_data calls; only the student's targets and
            # lambda are used below -- confirm this is intended.
            img_teacher, _, _, _ = mixup_data(img_teacher, target, 0.6)
            img_teacher = Variable(img_teacher)
            (img_student, student_target_a, student_target_b,
             student_lam) = mixup_data(img_student, target, 0.6)
            img_student, student_target_a, student_target_b = map(
                Variable, (img_student, student_target_a, student_target_b))
        else:
            img_teacher, img_student, target = Variable(img_teacher), Variable(
                img_student), Variable(target)

        rb1_s, rb2_s, rb3_s, mimic_s, out_s = snet(img_student)
        rb1_t, rb2_t, rb3_t, mimic_t, out_t = tnet(img_teacher)

        if args.augmentation:
            cls_loss = mixup_criterion(Cls_crit, out_s, student_target_a,
                                       student_target_b, student_lam)
        else:
            cls_loss = Cls_crit(out_s, target)
        kd_loss = KD_T_crit(out_t, out_s)

        mode = args.distillation
        if mode not in ('KD', 'DE', 'AS', 'DEAS', 'SSDEAS'):
            raise Exception('Invalid distillation name...')

        # 'SSDEAS' keeps the classification/KD terms in the graph but
        # weights them by zero.
        if mode == 'SSDEAS':
            loss = 0 * cls_loss + 0 * kd_loss
        else:
            loss = 0.2 * cls_loss + 0.8 * kd_loss
        if mode in ('DE', 'DEAS', 'SSDEAS'):
            new_rb1_s = decoder(rb1_s)
            decoder_loss = losses.styleLoss(img_teacher, new_rb1_s.cuda(),
                                            MSE_crit)
            loss = loss + 0.1 * decoder_loss
        if mode in ('AS', 'DEAS', 'SSDEAS'):
            rb2_loss = losses.Absdiff_Similarity(rb2_t, rb2_s).cuda()
            loss = loss + 0.9 * rb2_loss

        loss.backward()
        utils.clip_gradient(optimizer, 0.1)
        optimizer.step()
        train_cls_loss += cls_loss.item()

        # Only accumulate counts here; the metrics depend solely on the
        # final matrices, so they are derived once after the loop.
        if args.augmentation:
            conf_mat_a += losses.confusion_matrix(out_s, student_target_a,
                                                  NUM_CLASSES)
            conf_mat_b += losses.confusion_matrix(out_s, student_target_b,
                                                  NUM_CLASSES)
        else:
            conf_mat += losses.confusion_matrix(out_s, target, NUM_CLASSES)

    if batch_idx < 0:
        raise ValueError('trainloader produced no batches')

    if args.augmentation:
        acc_a, mAP_a, F1_score_a = _scores(conf_mat_a)
        acc_b, mAP_b, F1_score_b = _scores(conf_mat_b)
        # Blend with the lambda of the last batch.
        acc = student_lam * acc_a + (1 - student_lam) * acc_b
        mAP = student_lam * mAP_a + (1 - student_lam) * mAP_b
        F1_score = student_lam * F1_score_a + (1 - student_lam) * F1_score_b
    else:
        acc, mAP, F1_score = _scores(conf_mat)

    return train_cls_loss / (batch_idx + 1), 100. * acc, 100. * mAP, 100 * F1_score
def train_base(model, cost, optimizer, train_loader, test_loader, args):
    """Train ``model`` epoch by epoch and evaluate it after every epoch.

    :param model: the model to train
    :param cost: the loss function
    :param optimizer: the optimizer
    :param train_loader: loader for the training data
    :param test_loader: loader for the test data
    :param args: configuration namespace (``model_root``, ``model_name``,
        ``log_root``, ``log_name``, ``train_epoch``, ``save_interval``,
        ``lrcos``, ``lr_warmup``, ``retrain``, ``mixup``, ``mixup_alpha``)
    :return: None; checkpoints and a CSV log are written to disk
    """
    # Print the training configuration.
    print(args)

    # Start the timer and create the output directories.
    start = time.time()
    os.makedirs(args.model_root, exist_ok=True)
    os.makedirs(args.log_root, exist_ok=True)
    models_dir = args.model_root + '/' + args.model_name
    log_dir = args.log_root + '/' + args.log_name

    # Record the configuration at the top of the log.
    with open(log_dir, 'a+', newline='') as f:
        csv.writer(f).writerows(vars(args).items())

    # Optional cosine decay, optionally preceded by warmup. Warmup wraps the
    # cosine scheduler, so it is only built when cosine decay is enabled
    # (the schedulers are also only stepped under ``args.lrcos`` below).
    if args.lrcos:
        print("lrcos is using")
        cos_scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=args.train_epoch, eta_min=0)
        if args.lr_warmup:
            scheduler_warmup = GradualWarmupScheduler(
                args, optimizer, multiplier=1, after_scheduler=cos_scheduler)

    # Training state.
    epoch_num = args.train_epoch  # number of epochs to train
    save_interval = args.save_interval  # checkpoint every N epochs
    train_loss = 0
    epoch = 0
    accuracy_best = 0

    # When resuming, restore the previously saved training state.
    if args.retrain:
        if not os.path.exists(models_dir):
            print("no trained model")
        else:
            state_read = torch.load(models_dir)
            model.load_state_dict(state_read['model_state'])
            optimizer.load_state_dict(state_read['optim_state'])
            epoch = state_read['Epoch']
            print("retaining")

    while epoch < epoch_num:
        progress = tqdm(train_loader,
                        desc="Epoch {}/{}".format(epoch, epoch_num))
        for data, target in progress:
            if torch.cuda.is_available():
                data, target = data.cuda(), target.cuda()

            if args.mixup:
                # Feed the *mixed* batch to the model so that the inputs
                # match the mixed labels used by the loss.
                data, labels_a, labels_b, lam = mixup_data(
                    data, target, alpha=args.mixup_alpha)

            optimizer.zero_grad()  # gradients would otherwise accumulate
            output = model(data)  # forward pass

            if args.mixup:
                loss = mixup_criterion(cost, output, labels_a, labels_b, lam)
            else:
                loss = cost(output, target)
            train_loss += loss.item()
            loss.backward()  # backward pass
            optimizer.step()  # parameter update

        # Evaluate the model.
        accuracy_test = calc_accuracy(model, loader=test_loader)
        if accuracy_test > accuracy_best:
            accuracy_best = accuracy_test
        mean_loss = train_loss / len(train_loader)
        print(
            "Epoch {}, loss={:.5f}, accuracy_test={:.5f}, accuracy_best={:.5f}".format(
                epoch, mean_loss, accuracy_test, accuracy_best))
        train_loss = 0

        if args.lrcos:
            if args.lr_warmup:
                scheduler_warmup.step(epoch=epoch)
            else:
                cos_scheduler.step(epoch=epoch)
        if epoch < 20:
            print(epoch, optimizer.param_groups[0]['lr'])

        # Save model and optimizer state.
        if epoch % save_interval == 0:
            train_state = {
                "Epoch": epoch,
                "model_state": model.state_dict(),
                "optim_state": optimizer.state_dict(),
                "args": args
            }
            torch.save(train_state, models_dir)

        # Append this epoch's results to the log.
        with open(log_dir, 'a+', newline='') as f:
            csv.writer(f).writerow([mean_loss, accuracy_test, accuracy_best])

        epoch = epoch + 1

    train_duration_sec = int(time.time() - start)
    print("training is end", train_duration_sec)
def train(train_loader, model, criterion, optimizer, epoch, args, TB):
    """Train a two-input model for one epoch and report to ``TB``.

    Each batch is a dict with an ``'image'`` pair and a ``'label'``; the
    label is reshaped by ``labelshaper``. When ``args.mixup > 0`` the pair is
    concatenated on dim 1, mixed once by ``mixup_data`` and split back into
    two single-channel tensors.

    Args:
        train_loader: Iterable of batch dicts.
        model: Network called as ``model(left, right)``.
        criterion: Loss function.
        optimizer: Optimizer stepped once per batch.
        epoch: Epoch index used for display and TensorBoard tags.
        args: Namespace providing ``multi``, ``mixup``, ``gpu`` and
            ``print_freq``.
        TB: Logger with ``update(tag, value, step)``.
    """
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    progress = ProgressMeter(len(train_loader),
                             [batch_time, data_time, losses, top1],
                             prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()

    # Per-batch targets/outputs kept for the epoch-level AUC.
    all_targets, all_outputs = [], []

    end = time.time()
    for i, data in enumerate(train_loader):
        images = data['image']
        target = labelshaper(data['label'], args.multi)

        # measure data loading time
        data_time.update(time.time() - end)

        if args.gpu is not None:
            left_, right_ = images
            images = [
                left_.cuda(args.gpu, non_blocking=True),
                right_.cuda(args.gpu, non_blocking=True)
            ]
            target = target.cuda(args.gpu, non_blocking=True)

        if args.mixup > 0:
            # Mix exactly once, after the device transfer, so the images
            # correspond to the labels_a/labels_b/lam used by the loss.
            left_, right_ = images
            mixed_images, labels_a, labels_b, lam = mixup_data(
                torch.cat((left_, right_), dim=1), target, args.mixup)
            images = [
                mixed_images[:, 0].unsqueeze(1),
                mixed_images[:, 1].unsqueeze(1)
            ]

        # compute output
        output = model(*images)
        if args.mixup > 0:
            loss = mixup_criterion(criterion, output, labels_a.cuda(),
                                   labels_b.cuda(), lam)
        else:
            loss = criterion(output, target)

        # measure accuracy and record loss
        acc1 = accuracy(output, target, topk=(1, ))
        losses.update(loss.item(), target.size(0))
        top1.update(acc1[0], target.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)

        # Detach so the stored outputs do not keep each batch's autograd
        # graph alive for the whole epoch.
        all_targets.append(target)
        all_outputs.append(output.detach())

    # TensorBoard update
    TB.update('Accuracy/Train', top1.avg, epoch)
    TB.update('Loss/Train', losses.avg, epoch)
    if all_outputs:
        targets = torch.cat(all_targets)
        outputs = torch.cat(all_outputs)
        TB.update('AUC/Train', auc(outputs, targets), epoch)
def train_pixel_supervise(model, cost, optimizer, train_loader, test_loader,
                          args):
    """Train ``model`` with a pixel-wise loss; no separate test pass is run.

    The mean training loss of each epoch is the selection criterion: whenever
    it improves, the bare model weights are saved to ``<name>.pth``. The full
    training state (epoch, model, optimizer, args) is saved to ``<name>.pt``
    every ``args.save_interval`` epochs, and that file is what
    ``args.retrain`` resumes from.

    :param model: the model to train
    :param cost: the loss function
    :param optimizer: the optimizer
    :param train_loader: loader for the training data
    :param test_loader: unused by this function
    :param args: configuration namespace (``model_root``, ``log_root``,
        ``name``, ``train_epoch``, ``save_interval``, ``lrcos``,
        ``lr_warmup``, ``retrain``, ``mixup``, ``mixup_alpha``)
    :return: None; checkpoints and a CSV log are written to disk
    """
    print(args)

    # Initialize and open timer
    start = time.time()
    os.makedirs(args.model_root, exist_ok=True)
    os.makedirs(args.log_root, exist_ok=True)
    best_model_path = os.path.join(args.model_root, args.name + '.pth')
    checkpoint_path = args.model_root + '/' + args.name + '.pt'
    log_dir = args.log_root + '/' + args.name + '.csv'

    # save args
    with open(log_dir, 'a+', newline='') as f:
        csv.writer(f).writerows(vars(args).items())

    # Cosine learning rate decay, optionally wrapped by warmup. Warmup needs
    # the cosine scheduler, so it is only built when cosine decay is on.
    if args.lrcos:
        print("lrcos is using")
        cos_scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=args.train_epoch, eta_min=0)
        if args.lr_warmup:
            scheduler_warmup = GradualWarmupScheduler(
                args, optimizer, multiplier=1, after_scheduler=cos_scheduler)

    # Training initialization
    epoch_num = args.train_epoch
    save_interval = args.save_interval
    train_loss = 0
    epoch = 0
    loss_best = 1e4

    if args.retrain:
        # Resume from the full-state checkpoint; the '.pth' file only holds
        # a bare state_dict and has no 'model_state'/'optim_state' keys.
        if not os.path.exists(checkpoint_path):
            print("no trained model")
        else:
            state_read = torch.load(checkpoint_path)
            model.load_state_dict(state_read['model_state'])
            optimizer.load_state_dict(state_read['optim_state'])
            epoch = state_read['Epoch']
            print("retaining")

    # Train
    while epoch < epoch_num:
        progress = tqdm(train_loader,
                        desc="Epoch {}/{}".format(epoch, epoch_num))
        for data, target in progress:
            target = torch.unsqueeze(target, dim=1)
            if torch.cuda.is_available():
                data, target = data.cuda(), target.cuda()

            if args.mixup:
                # Feed the *mixed* batch to the model so that the inputs
                # match the mixed targets used by the loss.
                data, labels_a, labels_b, lam = mixup_data(
                    data, target, alpha=args.mixup_alpha)

            optimizer.zero_grad()
            output = model(data)

            if args.mixup:
                loss = mixup_criterion(cost, output, labels_a, labels_b, lam)
            else:
                loss = cost(output, target)
            train_loss += loss.item()
            loss.backward()
            optimizer.step()

        # No test pass: the training loss doubles as the evaluation metric.
        mean_loss = train_loss / len(train_loader)
        if mean_loss < loss_best:
            loss_best = mean_loss
            torch.save(model.state_dict(), best_model_path)
        print("Epoch {}, loss={:.5f}".format(
            epoch,
            mean_loss,
        ))
        train_loss = 0

        if args.lrcos:
            if args.lr_warmup:
                scheduler_warmup.step(epoch=epoch)
            else:
                cos_scheduler.step(epoch=epoch)
        if epoch < 20:
            print(epoch, optimizer.param_groups[0]['lr'])

        # save model and optimizer state
        if epoch % save_interval == 0:
            train_state = {
                "Epoch": epoch,
                "model_state": model.state_dict(),
                "optim_state": optimizer.state_dict(),
                "args": args
            }
            torch.save(train_state, checkpoint_path)

        # save log: one row per epoch with the mean training loss
        with open(log_dir, 'a+', newline='') as f:
            csv.writer(f).writerow([mean_loss])

        epoch = epoch + 1

    train_duration_sec = int(time.time() - start)
    print("training is end", train_duration_sec)
def train(epoch):
    """Train ``net`` for one epoch, optionally with mixup augmentation.

    Args:
        epoch: Current epoch index; also drives the step learning-rate decay.

    Returns:
        Tuple ``(mean loss, accuracy %, mean precision %, mean F1 %)`` over
        the epoch. With ``args.augmentation`` the three metrics are blended
        from the two mixup target sets using the lambda of the last batch.

    Raises:
        ValueError: If ``trainloader`` yields no batches.
    """

    def _scores(conf):
        """Return (accuracy, mean per-class precision, mean F1) of ``conf``."""
        diag = np.diag(conf)
        accuracy = diag.sum() / conf.sum()
        precision = diag / (conf.sum(axis=1) + 1e-10)
        recall = diag / (conf.sum(axis=0) + 1e-10)
        mean_precision = sum(precision) / len(precision)
        f1 = (2 * precision * recall / (precision + recall + 1e-10)).mean()
        return accuracy, mean_precision, f1

    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0
    conf_mat = np.zeros((NUM_CLASSES, NUM_CLASSES))
    conf_mat_a = np.zeros((NUM_CLASSES, NUM_CLASSES))
    conf_mat_b = np.zeros((NUM_CLASSES, NUM_CLASSES))

    if epoch > learning_rate_decay_start and learning_rate_decay_start >= 0:
        frac = (epoch - learning_rate_decay_start) // learning_rate_decay_every
        decay_factor = learning_rate_decay_rate**frac
        current_lr = args.lr * decay_factor
        utils.set_lr(optimizer, current_lr)  # set the decayed rate
    else:
        current_lr = args.lr
    print('learning_rate: %s' % str(current_lr))

    batch_idx = -1
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        optimizer.zero_grad()

        if args.augmentation:
            inputs, targets_a, targets_b, lam = mixup_data(
                inputs, targets, 0.6)
            inputs, targets_a, targets_b = map(Variable,
                                               (inputs, targets_a, targets_b))
        else:
            inputs, targets = Variable(inputs), Variable(targets)

        # The network returns five values; only the last one is used here.
        _, _, _, _, outputs = net(inputs)

        if args.augmentation:
            loss = mixup_criterion(criterion, outputs, targets_a, targets_b,
                                   lam)
        else:
            loss = criterion(outputs, targets)

        loss.backward()
        utils.clip_gradient(optimizer, 0.1)
        optimizer.step()
        train_loss += loss.item()

        # Only accumulate counts here; the metrics depend solely on the
        # final matrices, so they are derived once after the loop.
        if args.augmentation:
            conf_mat_a += losses.confusion_matrix(outputs, targets_a,
                                                  NUM_CLASSES)
            conf_mat_b += losses.confusion_matrix(outputs, targets_b,
                                                  NUM_CLASSES)
        else:
            conf_mat += losses.confusion_matrix(outputs, targets, NUM_CLASSES)

    if batch_idx < 0:
        raise ValueError('trainloader produced no batches')

    if args.augmentation:
        acc_a, mAP_a, F1_score_a = _scores(conf_mat_a)
        acc_b, mAP_b, F1_score_b = _scores(conf_mat_b)
        # Blend with the lambda of the last batch.
        acc = lam * acc_a + (1 - lam) * acc_b
        mAP = lam * mAP_a + (1 - lam) * mAP_b
        F1_score = lam * F1_score_a + (1 - lam) * F1_score_b
    else:
        acc, mAP, F1_score = _scores(conf_mat)

    return train_loss / (batch_idx + 1), 100. * acc, 100. * mAP, 100 * F1_score
def train_one_epoch(self, train_loader, val_loader):
    """Train for one epoch, then evaluate on ``val_loader``.

    Exactly one augmentation is applied per batch, chosen in this order:
    ``self.mix_up``, ``self.cutMix``, ``self.fmix``; otherwise the batch is
    used as is. Training accuracy is counted against the original labels.

    Args:
        train_loader: Loader yielding ``(img, label)`` training batches.
        val_loader: Loader yielding ``(img, label)`` validation batches.

    Returns:
        Tuple ``(train_loss, train_acc, val_loss, val_acc)``. Each value is a
        sum over batches divided by the size of the loader's dataset.
    """
    self.model.train()
    train_loss_sum, train_acc_sum = 0.0, 0.0
    for img, label in train_loader:
        img, label = img.to(DEVICE), label.to(DEVICE)
        width, height = img.size(-1), img.size(-2)
        self.optimizer.zero_grad()

        if self.mix_up:
            img, labels_a, labels_b, lam = mixup_data(img, label, alpha=0.2)
            output = self.model(img)
            loss = mixup_criterion(self.criteration, output, labels_a,
                                   labels_b, lam)
        elif self.cutMix:
            img, targets = cutmix(img, label)
            target_a, target_b, lam = targets
            output = self.model(img)
            loss = self.criteration(output, target_a) * lam + self.criteration(
                output, target_b) * (1. - lam)
        elif self.fmix:
            data, target = fmix(img,
                                label,
                                alpha=1.,
                                decay_power=3.,
                                shape=(width, height))
            targets, shuffled_targets, lam = target
            output = self.model(data)
            loss = self.criteration(output, targets) * lam + self.criteration(
                output, shuffled_targets) * (1 - lam)
        else:
            output = self.model(img)
            loss = self.criteration(output, label)

        loss.backward()
        _, preds = torch.max(output.data, 1)
        train_acc_sum += (preds == label).sum().item()
        train_loss_sum += loss.item()
        self.optimizer.step()

    train_loss = train_loss_sum / len(train_loader.dataset)
    train_acc = train_acc_sum / len(train_loader.dataset)

    val_acc_sum = 0.0
    valid_loss_sum = 0
    self.model.eval()
    # No gradients are needed for validation; skipping graph construction
    # saves memory and time without changing the computed values.
    with torch.no_grad():
        for val_img, val_label in val_loader:
            val_img, val_label = val_img.to(DEVICE), val_label.to(DEVICE)
            val_output = self.model(val_img)
            _, preds = torch.max(val_output.data, 1)
            val_acc_sum += (preds == val_label).sum().item()
            loss = self.criteration(val_output, val_label)
            valid_loss_sum += loss.item()

    val_acc = val_acc_sum / len(val_loader.dataset)
    val_loss = valid_loss_sum / len(val_loader.dataset)
    return train_loss, train_acc, val_loss, val_acc
def train(epoch):
    """Train the student for one epoch with a selectable distillation loss.

    ``args.model`` picks the feature-distillation method and ``args.stage``
    picks which residual-block features take part: 'Block1' uses block 1
    only, 'Block2' uses block 2 only, anything else uses both. For the
    standard methods the loss is::

        alpha * cls + beta * kd [+ gamma * L(block1)] [+ delta * L(block2)]

    'CD' replaces the KD term with ``other.KDLossv2``; 'FAKD', 'VKD' and
    'RAD' have their own compositions.

    Args:
        epoch: Current epoch index; also drives the step learning-rate decay.

    Returns:
        Tuple ``(mean classification loss, accuracy %, mean precision %,
        mean F1 %)`` over the epoch. With ``args.augmentation`` the three
        metrics are blended from the two mixup target sets using the lambda
        of the last batch.

    Raises:
        Exception: If ``args.model`` is not a supported name.
        ValueError: If ``trainloader`` yields no batches.
    """
    # Methods exposed by ``other`` as plain functions: other.X(t, s).cuda()
    functional_losses = ('Fitnet', 'AT', 'NST', 'PKT', 'AB')
    # Methods exposed as modules built per call: other.X().cuda()(t, s)
    module_losses = ('CCKD', 'RKD', 'SP', 'FT', 'CD')
    # Methods backed by pre-built, trainable per-block networks.
    learned_losses = ('VID', 'OFD', 'AFDS')
    standard_losses = functional_losses + module_losses + learned_losses

    def _scores(conf):
        """Return (accuracy, mean per-class precision, mean F1) of ``conf``."""
        diag = np.diag(conf)
        accuracy = diag.sum() / conf.sum()
        precision = diag / (conf.sum(axis=1) + 1e-10)
        recall = diag / (conf.sum(axis=0) + 1e-10)
        mean_precision = sum(precision) / len(precision)
        f1 = (2 * precision * recall / (precision + recall + 1e-10)).mean()
        return accuracy, mean_precision, f1

    def _feature_loss(block, feat_t, feat_s, both_stages):
        """Return the ``args.model`` feature loss for block 1 or block 2."""
        name = args.model
        if name in functional_losses:
            return getattr(other, name)(feat_t, feat_s).cuda()
        if name in module_losses:
            return getattr(other, name)().cuda()(feat_t, feat_s)
        if name == 'VID':
            net = VID_NET1 if block == 1 else VID_NET2
        elif name == 'OFD':
            net = OFD_NET1 if block == 1 else OFD_NET2
            if block == 1 and both_stages:
                # Kept as-is: only the two-stage path moves OFD_NET1 to CUDA.
                net = net.cuda()
        else:  # 'AFDS'
            net = AFD_NET1 if block == 1 else AFD_NET2
        return net(feat_t, feat_s)

    print('\nEpoch: %d' % epoch)
    snet.train()
    if args.model == 'VID':
        VID_NET1.train()
        VID_NET2.train()
    elif args.model == 'OFD':
        OFD_NET1.train()
        OFD_NET2.train()
    elif args.model in ('AFD', 'AFDS'):
        # The loss dispatch below uses the name 'AFDS'; accept it here too so
        # the AFD networks are actually switched to train mode.
        AFD_NET1.train()
        AFD_NET2.train()

    train_cls_loss = 0
    conf_mat = np.zeros((NUM_CLASSES, NUM_CLASSES))
    conf_mat_a = np.zeros((NUM_CLASSES, NUM_CLASSES))
    conf_mat_b = np.zeros((NUM_CLASSES, NUM_CLASSES))

    if epoch > learning_rate_decay_start and learning_rate_decay_start >= 0:
        frac = (epoch - learning_rate_decay_start) // learning_rate_decay_every
        decay_factor = learning_rate_decay_rate**frac
        current_lr = args.lr * decay_factor
        utils.set_lr(optimizer, current_lr)  # set the decayed rate
    else:
        current_lr = args.lr
    print('learning_rate: %s' % str(current_lr))

    batch_idx = -1
    for batch_idx, (img_teacher, img_student, target) in enumerate(trainloader):
        if args.cuda:
            img_teacher = img_teacher.cuda(non_blocking=True)
            img_student = img_student.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

        optimizer.zero_grad()

        if args.augmentation:
            # NOTE(review): teacher and student inputs are mixed by two
            # independent mixup_data calls; only the student's targets and
            # lambda are used below -- confirm this is intended.
            img_teacher, _, _, _ = mixup_data(img_teacher, target, 0.6)
            img_teacher = Variable(img_teacher)
            (img_student, student_target_a, student_target_b,
             student_lam) = mixup_data(img_student, target, 0.6)
            img_student, student_target_a, student_target_b = map(
                Variable, (img_student, student_target_a, student_target_b))
        else:
            img_teacher, img_student, target = Variable(img_teacher), Variable(
                img_student), Variable(target)

        rb1_s, rb2_s, rb3_s, mimic_s, out_s = snet(img_student)
        rb1_t, rb2_t, rb3_t, mimic_t, out_t = tnet(img_teacher)

        if args.augmentation:
            cls_loss = mixup_criterion(Cls_crit, out_s, student_target_a,
                                       student_target_b, student_lam)
        else:
            cls_loss = Cls_crit(out_s, target)
        kd_loss = KD_T_crit(out_t, out_s)

        model_name = args.model
        if model_name in standard_losses:
            use_block1 = args.stage != 'Block2'
            use_block2 = args.stage != 'Block1'
            both = use_block1 and use_block2
            if model_name == 'CD':
                # Channel Distillation uses its own KD term (CD+GKD+CE).
                soft_loss = other.KDLossv2(args.T).cuda()(out_t, out_s, target)
            else:
                soft_loss = kd_loss
            loss = args.alpha * cls_loss + args.beta * soft_loss
            if use_block1:
                loss = loss + args.gamma * _feature_loss(
                    1, rb1_t, rb1_s, both)
            if use_block2:
                loss = loss + args.delta * _feature_loss(
                    2, rb2_t, rb2_s, both)
        elif model_name == 'FAKD':
            # FAKD (DS+TS+SA): no classification/KD term, no temperature.
            FAKD_DT_loss = other.FAKD_DT().cuda()(out_t, out_s, target,
                                                  NUM_CLASSES)
            if args.stage == 'Block1':
                FAKD_SA1_loss = other.FAKD_SA().cuda()(rb1_t, rb1_s)
                loss = args.alpha * FAKD_DT_loss + args.gamma * FAKD_SA1_loss
            elif args.stage == 'Block2':
                FAKD_SA2_loss = other.FAKD_SA().cuda()(rb2_t, rb2_s)
                # NOTE(review): block 2 is weighted by gamma here but by
                # delta in the two-stage branch below -- confirm intended.
                loss = args.alpha * FAKD_DT_loss + args.gamma * FAKD_SA2_loss
            else:
                FAKD_SA1_loss = other.FAKD_SA().cuda()(rb1_t, rb1_s)
                FAKD_SA2_loss = other.FAKD_SA().cuda()(rb2_t, rb2_s)
                loss = (args.alpha * FAKD_DT_loss +
                        args.gamma * FAKD_SA1_loss +
                        args.delta * FAKD_SA2_loss)
        elif model_name == 'VKD':
            # VKD: similarity distillation plus online triplet loss per block.
            stage_blocks = (
                (args.stage != 'Block2', rb1_t, rb1_s),
                (args.stage != 'Block1', rb2_t, rb2_s),
            )
            loss = args.alpha * cls_loss + args.beta * kd_loss
            for enabled, feat_t, feat_s in stage_blocks:
                if not enabled:
                    continue
                similarity_loss = other.VKD_SimilarityDistillationLoss().cuda()(
                    feat_t, feat_s)
                triplet_loss = other.VKD_OnlineTripletLoss().cuda()(feat_s,
                                                                    target)
                loss = (loss + args.gamma * similarity_loss +
                        args.delta * triplet_loss)
        elif model_name == 'RAD':
            # RAD: squared distance between teacher and student mimic outputs.
            distance = mimic_t - mimic_s
            RAD_loss = torch.pow(distance, 2).sum(dim=(0, 1), keepdim=False)
            loss = RAD_loss + cls_loss
        else:
            raise Exception('Invalid model name...')

        loss.backward()
        utils.clip_gradient(optimizer, 0.1)
        optimizer.step()
        train_cls_loss += cls_loss.item()

        # Only accumulate counts here; the metrics depend solely on the
        # final matrices, so they are derived once after the loop.
        if args.augmentation:
            conf_mat_a += losses.confusion_matrix(out_s, student_target_a,
                                                  NUM_CLASSES)
            conf_mat_b += losses.confusion_matrix(out_s, student_target_b,
                                                  NUM_CLASSES)
        else:
            conf_mat += losses.confusion_matrix(out_s, target, NUM_CLASSES)

    if batch_idx < 0:
        raise ValueError('trainloader produced no batches')

    if args.augmentation:
        acc_a, mAP_a, F1_score_a = _scores(conf_mat_a)
        acc_b, mAP_b, F1_score_b = _scores(conf_mat_b)
        # Blend with the lambda of the last batch.
        acc = student_lam * acc_a + (1 - student_lam) * acc_b
        mAP = student_lam * mAP_a + (1 - student_lam) * mAP_b
        F1_score = student_lam * F1_score_a + (1 - student_lam) * F1_score_b
    else:
        acc, mAP, F1_score = _scores(conf_mat)

    return train_cls_loss / (batch_idx + 1), 100. * acc, 100. * mAP, 100 * F1_score