def train(epoch):
    global acc, acc_train
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    if epoch > lr_dec_start and lr_dec_start >= 0:
        frac = (epoch - lr_dec_start) // lr_dec_every
        dec_frac = lr_dec_rate ** frac  # e.g. 0.9 ** (epoch // 5)
        curr_lr = lr * dec_frac
        utils.set_lr(optimizer, curr_lr)
    else:
        curr_lr = lr
    for batch_idx, (inputs, targets) in enumerate(tr_loader):
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        optimizer.zero_grad()
        inputs, targets = Variable(inputs), Variable(targets)
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        utils.clip_gradient(optimizer, 0.1)
        optimizer.step()
        train_loss += float(loss.item())
        _, pred = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += pred.eq(targets.data).cpu().sum()
        if epoch == total_epoch - 1:
            class_mat[0] = calcl_mat(pred.cpu().numpy(), targets.data.cpu().numpy(), class_mat[0])
        print("training... batch:{} loss={} acc={}%({}/{})".format(
            batch_idx, train_loss / (batch_idx + 1), 100. * correct / total, correct, total))
    acc_train = 100. * correct / total
    hists[0].append(train_loss / (batch_idx + 1))
    hists[1].append(acc_train)
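# Nearly every snippet in this collection calls utils.set_lr(...) and utils.clip_gradient(...)
# from its local `utils` module. Those helper bodies are not included here; the following is a
# minimal sketch of how such helpers are commonly written -- an assumption for readability, not
# this repository's verbatim code.

def set_lr(optimizer, lr):
    """Overwrite the learning rate of every parameter group in the optimizer."""
    for group in optimizer.param_groups:
        group['lr'] = lr

def clip_gradient(optimizer, grad_clip):
    """Clamp every parameter gradient element-wise to [-grad_clip, grad_clip]."""
    for group in optimizer.param_groups:
        for param in group['params']:
            if param.grad is not None:
                param.grad.data.clamp_(-grad_clip, grad_clip)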
def train(epoch):
    print('\nEpoch: %d' % epoch)
    global Train_acc
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    if epoch > learning_rate_decay_start and learning_rate_decay_start >= 0:
        frac = (epoch - learning_rate_decay_start) // learning_rate_decay_every
        decay_factor = learning_rate_decay_rate ** frac
        current_lr = opt.lr * decay_factor
        utils.set_lr(optimizer, current_lr)  # set the decayed rate
    else:
        current_lr = opt.lr
    print('learning_rate: %s' % str(current_lr))
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        optimizer.zero_grad()
        if opt.mixup:
            inputs, targets_a, targets_b, lam = utils.mixup_data(inputs, targets, 0.6, True)
            inputs, targets_a, targets_b = map(Variable, (inputs, targets_a, targets_b))
        else:
            inputs, targets = Variable(inputs), Variable(targets)
        outputs = net(inputs)
        if opt.mixup:
            loss = utils.mixup_criterion(criterion, outputs, targets_a, targets_b, lam)
        else:
            loss = criterion(outputs, targets)
        loss.backward()
        utils.clip_gradient(optimizer, 0.1)
        optimizer.step()
        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        if opt.mixup:
            correct += (lam * predicted.eq(targets_a.data).cpu().sum().float()
                        + (1 - lam) * predicted.eq(targets_b.data).cpu().sum().float())
        else:
            correct += predicted.eq(targets.data).cpu().sum()
        utils.progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                           % (train_loss / (batch_idx + 1), 100. * float(correct) / float(total), correct, total))
    Train_acc = 100. * float(correct) / float(total)
    return train_loss / (batch_idx + 1), Train_acc
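# The mixup branch above relies on utils.mixup_data / utils.mixup_criterion, whose bodies are not
# shown. They are assumed to follow the standard mixup recipe (Zhang et al., 2018, reference
# implementation); the sketch below mirrors the call signatures used above and is an assumption,
# not this repository's exact code.

import numpy as np
import torch

def mixup_data(x, y, alpha=1.0, use_cuda=True):
    """Return mixed inputs, the two sets of targets, and the mixing coefficient lambda."""
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1.0
    batch_size = x.size(0)
    index = torch.randperm(batch_size).cuda() if use_cuda else torch.randperm(batch_size)
    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam

def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Loss on mixed samples: the same convex combination applied to the two target sets."""
    return lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)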
def train(epoch):
    print('\nEpoch: %d' % epoch)
    global Train_acc
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    if epoch > learning_rate_decay_start and learning_rate_decay_start >= 0:
        frac = (epoch - learning_rate_decay_start) // learning_rate_decay_every
        decay_factor = learning_rate_decay_rate ** frac
        current_lr = opt.lr * decay_factor
        utils.set_lr(optimizer, current_lr)  # set the decayed rate
    else:
        current_lr = opt.lr
    print('learning_rate: %s' % str(current_lr))
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        bs, c, h, w = np.shape(inputs)
        inputs = torch.Tensor(inputs)
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        optimizer.zero_grad()
        inputs, targets = Variable(inputs), Variable(targets)
        # extract features from both backbones and concatenate them for the classifier
        conTensor1 = net1.features(inputs).view(bs, -1)
        conTensor2 = layers(inputs, net2).view(bs, -1)
        if use_cuda:
            conTensor1, conTensor2 = conTensor1.cuda(), conTensor2.cuda()
        resTensor = torch.cat((conTensor1, conTensor2), 1)
        resTensor = Variable(resTensor)
        outputs = netClassifier(resTensor)
        loss = criterion(outputs, targets)
        loss.backward()
        utils.clip_gradient(optimizer, 0.1)
        optimizer.step()
        train_loss += loss.data.item()
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum()
    Train_acc = float(100. * correct) / float(total)
    Ta = float(Train_acc)
    temp_train_acc.append(Ta)
    temp_train_loss.append(train_loss)
def train(epoch):
    print('\nEpoch: %d' % epoch)
    print("Start Training!")
    global Train_acc
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    # learning rate decay
    if epoch > learning_rate_decay_start and learning_rate_decay_start >= 0:
        frac = (epoch - learning_rate_decay_start) // learning_rate_decay_every
        decay_factor = learning_rate_decay_rate ** frac
        current_lr = opt.lr * decay_factor
        utils.set_lr(optimizer, current_lr)  # set the decayed rate
    else:
        current_lr = opt.lr
    print('learning_rate: %s' % str(current_lr))
    # Iterate over the batches: number of iterations = 28708 (training-set size) / batch_size.
    # batch_idx counts the iterations; loss and accuracy are accumulated over them.
    iter_num = 0
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        optimizer.zero_grad()
        inputs, targets = Variable(inputs), Variable(targets)
        outputs = net(inputs)  # forward pass
        loss = criterion(outputs, targets)
        loss.backward()  # backward pass
        utils.clip_gradient(optimizer, 0.1)
        optimizer.step()
        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)  # add batch_size samples each iteration
        correct += predicted.eq(targets.data).cpu().sum()
        # After the last batch this is the accuracy for the whole epoch
        # (the full dataset has passed through the network).
        Train_acc = int(correct) / int(total)
        iter_num += 1
    train_loss = train_loss / iter_num
    train_loss_list.append(train_loss)
    train_acc_list.append(Train_acc)
    print("Train Accuracy:", Train_acc * 100, "%")
    print("Train Loss:", train_loss)
def train(epoch):
    print('\nEpoch: %d' % epoch)
    global Train_acc
    global lr1
    net.train()
    train_loss = 0
    f_loss = 0.0
    correct = 0
    total = 0
    current_lr1 = 0.0
    if epoch > learning_rate_decay_start and learning_rate_decay_start >= 0:
        frac = (epoch - learning_rate_decay_start) // learning_rate_decay_every
        decay_factor = learning_rate_decay_rate ** frac
        decay_factor1 = 0.95 ** frac
        current_lr = opt.lr * decay_factor
        current_lr1 = lr1 * decay_factor1
        utils.set_lr(optimizer, current_lr)  # set the decayed rate
        utils.set_lr(optimzer4center, current_lr1)
    else:
        current_lr = opt.lr
        current_lr1 = lr1
    print('learning_rate: %s' % str(current_lr))
    print('learning_rate1: %s' % str(current_lr1))
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        optimizer.zero_grad()
        optimzer4center.zero_grad()
        inputs, targets = Variable(inputs), Variable(targets)
        ip1, outputs = net(inputs)
        loss = nllloss(outputs, targets) + loss_weight * centerloss(targets, ip1)
        loss.backward()
        utils.clip_gradient(optimizer, 0.1)
        optimizer.step()
        optimzer4center.step()
        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum()
        f_loss = float(train_loss) / float(batch_idx + 1)
        utils.progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                           % (train_loss / (batch_idx + 1), 100.0 * float(correct) / float(total), correct, total))
    Train_acc = 100.0 * float(correct) / float(total)
    training_loss.append(f_loss)
    training_acc.append(Train_acc)
def train(epoch):
    print('\nEpoch: %d' % epoch)
    global Train_acc
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    conf_mat = np.zeros((NUM_CLASSES, NUM_CLASSES))
    if epoch > learning_rate_decay_start and learning_rate_decay_start >= 0:
        frac = (epoch - learning_rate_decay_start) // learning_rate_decay_every
        decay_factor = learning_rate_decay_rate ** frac
        current_lr = opt.lr * decay_factor
        utils.set_lr(optimizer, current_lr)  # set the decayed rate
    else:
        current_lr = opt.lr
    print('learning_rate: %s' % str(current_lr))
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        optimizer.zero_grad()
        inputs, targets = Variable(inputs), Variable(targets)
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        utils.clip_gradient(optimizer, 0.1)
        optimizer.step()
        train_loss += loss.item()
        conf_mat += utils.confusion_matrix(outputs, targets, NUM_CLASSES)
        acc = sum([conf_mat[i, i] for i in range(conf_mat.shape[0])]) / conf_mat.sum()
        uacc_per_class = [conf_mat[i, i] / conf_mat[i].sum() for i in range(conf_mat.shape[0])]
        unweighted_acc = sum(uacc_per_class) / len(uacc_per_class)
        prec_per_class = [conf_mat[i, i] / conf_mat[:, i].sum() for i in range(conf_mat.shape[0])]
        average_precision = sum(prec_per_class) / len(prec_per_class)
        utils.progress_bar(batch_idx, len(trainloader),
                           'Loss: %.3f | Acc: %.3f%% | unweighted_Acc: %.3f%%'
                           % (train_loss / (batch_idx + 1), 100. * acc, 100. * unweighted_acc))
    Train_acc = 100. * acc
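# The confusion-matrix variant above accumulates utils.confusion_matrix(outputs, targets, NUM_CLASSES)
# per batch; per-class recall uses row sums and per-class precision uses column sums, which implies
# rows = true class and columns = predicted class. A minimal sketch of such a helper under that
# assumption (not this repository's verbatim code):

import numpy as np
import torch

def confusion_matrix(outputs, targets, num_classes):
    """Confusion matrix for one batch, rows = true class, columns = predicted class."""
    _, preds = torch.max(outputs.data, 1)
    mat = np.zeros((num_classes, num_classes))
    for t, p in zip(targets.data.cpu().numpy(), preds.cpu().numpy()):
        mat[t, p] += 1
    return mat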
def train(epoch):
    print('\nEpoch: %d' % epoch)
    global Train_acc
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    if epoch > learning_rate_decay_start and learning_rate_decay_start >= 0:
        frac = (epoch - learning_rate_decay_start) // learning_rate_decay_every
        decay_factor = learning_rate_decay_rate ** frac
        current_lr = opt.lr * decay_factor
        utils.set_lr(optimizer, current_lr)  # set the decayed rate
    else:
        current_lr = opt.lr
    print('learning_rate: %s' % str(current_lr))
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        # process the weights, including binarization
        bin_op.binarization()
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        optimizer.zero_grad()
        inputs, targets = Variable(inputs), Variable(targets)
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        # restore weights
        bin_op.restore()
        bin_op.updateBinaryGradWeight()
        if opt.sr:
            updateBN()
        utils.clip_gradient(optimizer, 0.1)
        optimizer.step()
        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum()
        utils.progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                           % (train_loss / (batch_idx + 1), 100. * float(correct) / total, correct, total))
    Train_acc = 100. * float(correct) / total
def train(epoch):
    print('\nEpoch: %d' % epoch)
    global Train_acc
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    if epoch > learning_rate_decay_start and learning_rate_decay_start >= 0:
        frac = (epoch - learning_rate_decay_start) // learning_rate_decay_every
        decay_factor = learning_rate_decay_rate ** frac
        current_lr = opt.lr * decay_factor
        utils.set_lr(optimizer, current_lr)  # set the decayed rate
    else:
        current_lr = opt.lr
    print('learning_rate: %s' % str(current_lr))
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        bs, c, h, w = np.shape(inputs)
        spl = int(bs / 2)
        # perturb the first half of the batch, keep the second half clean
        input1 = inputs[0:spl]
        label1 = targets[0:spl]
        input2 = inputs[spl:bs]
        label2 = targets[spl:bs]
        input1 = perturb(input1, label1, 0.2, net)
        input1 = torch.Tensor(input1)
        inputs = np.vstack((input1, input2))
        inputs = torch.Tensor(inputs)
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        optimizer.zero_grad()
        inputs, targets = Variable(inputs), Variable(targets)
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        utils.clip_gradient(optimizer, 0.1)
        optimizer.step()
        train_loss += loss.data.item()
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum()
    Train_acc = 100. * correct / total
    if (epoch + 1) % 10 == 0:
        Ta = float(Train_acc)
        temp_train_acc.append(Ta)
        temp_train_loss.append(train_loss)
def train(epoch):
    print('\nEpoch: %d' % epoch)
    global Train_acc
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    if epoch > learning_rate_decay_start and learning_rate_decay_start >= 0:
        frac = (epoch - learning_rate_decay_start) // learning_rate_decay_every
        decay_factor = learning_rate_decay_rate ** frac
        current_lr = opt.lr * decay_factor
        utils.set_lr(optimizer, current_lr)  # set the decayed rate
    else:
        current_lr = opt.lr
    print('learning_rate: %s' % str(current_lr))
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        ori = inputs
        # generate adversarial samples
        with ctx_noparamgrad_and_eval(net):
            inputs = adversary.perturb(inputs, targets)
        # concatenate with clean samples
        inputs = torch.cat((ori, inputs), 0)
        targets = torch.cat((targets, targets), 0)
        optimizer.zero_grad()
        inputs, targets = Variable(inputs), Variable(targets)
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        utils.clip_gradient(optimizer, 0.1)
        optimizer.step()
        train_loss += loss.item()  # loss.data[0] is no longer supported in modern PyTorch
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum()
        utils.progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                           % (train_loss / (batch_idx + 1), 100. * correct / total, correct, total))
    Train_acc = 100. * correct / total
    Train_loss = train_loss / len(trainloader.dataset)
    return Train_acc, Train_loss
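# The adversarial-training loop above assumes an `adversary` object with a .perturb(inputs, targets)
# method and the ctx_noparamgrad_and_eval context manager; both names match the AdverTorch API.
# A plausible setup is sketched below (the attack choice and epsilon values are assumptions, not
# taken from the source; `net` is the globally defined model used above).

import torch.nn as nn
from advertorch.attacks import LinfPGDAttack
from advertorch.context import ctx_noparamgrad_and_eval

adversary = LinfPGDAttack(
    net, loss_fn=nn.CrossEntropyLoss(reduction="sum"),
    eps=8.0 / 255, nb_iter=7, eps_iter=2.0 / 255,
    rand_init=True, clip_min=0.0, clip_max=1.0, targeted=False)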
def train(epoch):
    print('\nEpoch: %d' % epoch)
    global Train_acc
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    err0 = 0.005  # perturbation size for FGSM
    if epoch > learning_rate_decay_start and learning_rate_decay_start >= 0:
        frac = (epoch - learning_rate_decay_start) // learning_rate_decay_every
        decay_factor = learning_rate_decay_rate ** frac
        current_lr = opt.lr * decay_factor
        utils.set_lr(optimizer, current_lr)  # set the decayed rate
    else:
        current_lr = opt.lr
    print('learning_rate: %s' % str(current_lr))
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        if use_cuda:
            inputs, targets = inputs.cuda(1), targets.cuda(1)
        optimizer.zero_grad()
        inputs, targets = Variable(inputs, requires_grad=True), Variable(targets)
        # loss on the clean batch
        outputs = net(inputs)
        loss = alpha * criterion(outputs, targets)
        loss.backward()
        train_loss += loss.data.item()
        del loss
        # FGSM: perturb the inputs along the sign of their gradient
        inputs_prime = inputs + err0 * torch.sign(inputs.grad)
        del inputs
        outputs_prime = net(inputs_prime)
        loss2 = (1 - alpha) * criterion(outputs_prime, targets)
        loss2.backward()
        utils.clip_gradient(optimizer, 0.1)
        optimizer.step()
        train_loss += loss2.data.item()
        del loss2
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum().numpy()
        utils.progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                           % (train_loss / (batch_idx + 1), 100. * correct / total, correct, total))
    Train_acc = 100. * correct / total
def train(epoch):
    if detail:
        print('\nEpoch: %d' % epoch)
    global Train_acc
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    if epoch > learning_rate_decay_start and learning_rate_decay_start >= 0:
        frac = (epoch - learning_rate_decay_start) // learning_rate_decay_every
        decay_factor = learning_rate_decay_rate ** frac
        current_lr = opt.lr * decay_factor
        utils.set_lr(optimizer, current_lr)  # set the decayed rate
    else:
        current_lr = opt.lr
    if detail:
        print('learning_rate: %s' % str(current_lr))
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        optimizer.zero_grad()
        inputs, targets = Variable(inputs), Variable(targets)
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        utils.clip_gradient(optimizer, 0.1)
        optimizer.step()
        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum()
        if detail:
            utils.progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                               % (train_loss / (batch_idx + 1), 100. * correct / total, correct, total))
    Train_acc = 100. * correct / total
def train(epoch, start_time):
    t = datetime.datetime.now()
    t_show = datetime.datetime.strftime(t, '%Y-%m-%d %H:%M:%S')
    print('Epoch: %d\t%s\t%s' % (epoch, t_show, utils.cal_run_time(start_time)))
    global Train_acc
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    if epoch > learning_rate_decay_start and learning_rate_decay_start >= 0:
        frac = (epoch - learning_rate_decay_start) // learning_rate_decay_every
        decay_factor = learning_rate_decay_rate ** frac
        current_lr = opt.lr * decay_factor
        utils.set_lr(optimizer, current_lr)  # set the decayed rate
    else:
        current_lr = opt.lr
    print('learning_rate: %.6f' % current_lr)
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        optimizer.zero_grad()
        inputs, targets = Variable(inputs), Variable(targets)
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        utils.clip_gradient(optimizer, 0.1)
        optimizer.step()
        train_loss += loss.item()  # accumulate a Python float rather than a tensor
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum()
        utils.progress_bar(batch_idx, len(train_loader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                           % (train_loss / (batch_idx + 1), 100 * float(correct) / total, correct, total))
    Train_acc = 100 * float(correct) / total
def train(epoch):
    print('\nEpoch: %d' % epoch)
    global Train_acc  # global variable that stores the training accuracy
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    # Learning-rate decay controls how quickly we shrink the step size used to learn from mistakes.
    # Large steps late in training can cause drastic changes that actually worsen accuracy; as the
    # network becomes more knowledgeable it should trust its current weights more, so the rate is lowered.
    if epoch > learning_rate_decay_start and learning_rate_decay_start >= 0:
        frac = (epoch - learning_rate_decay_start) // learning_rate_decay_every
        decay_factor = learning_rate_decay_rate ** frac
        current_lr = opt.lr * decay_factor
        utils.set_lr(optimizer, current_lr)  # set the decayed rate
    else:
        current_lr = opt.lr
    print('learning_rate: %s' % str(current_lr))
    for batch_idx, (inputs, targets) in enumerate(trainloader):  # a 'batch' is the group of images handled in one step
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        optimizer.zero_grad()  # clear the gradients accumulated in the previous step
        inputs, targets = Variable(inputs), Variable(targets)  # wrap the inputs and labels (targets)
        outputs = net(inputs)  # forward pass: the network's predictions for this batch
        loss = criterion(outputs, targets)  # the 'loss' measures how far the predictions are from the labels
        loss.backward()  # backpropagation: compute the gradient of the loss w.r.t. every weight and bias
        utils.clip_gradient(optimizer, 0.1)  # clip the gradient so a single batch cannot cause a huge jump
        # Take one optimizer step: move every weight and bias along the gradient direction, with a
        # step size set by the (decayed) learning rate. The gradient only describes the direction of
        # steepest descent at the current point, so the step is an extrapolation; that is why the step
        # size is reduced as training progresses.
        optimizer.step()
        train_loss += loss.item()  # add this batch's loss to the running total
        _, predicted = torch.max(outputs.data, 1)  # predicted class = index of the largest output
        total += targets.size(0)  # number of samples seen so far
        correct += predicted.eq(targets.data).cpu().sum()  # number of correct predictions so far
        # Show a progress bar with the current average loss and accuracy.
        utils.progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                           % (train_loss / (batch_idx + 1), 100. * correct / total, correct, total))
    Train_acc = 100. * correct / total
def train(epoch):
    print('\nEpoch: %d' % epoch)
    global Train_acc
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    if epoch > learning_rate_decay_start and learning_rate_decay_start >= 0:
        frac = (epoch - learning_rate_decay_start) // learning_rate_decay_every
        decay_factor = learning_rate_decay_rate ** frac
        current_lr = opt.lr * decay_factor  # exponential decay: the learning rate shrinks as the epoch count grows
        utils.set_lr(optimizer, current_lr)  # set the decayed rate
    else:
        current_lr = opt.lr
    print('learning_rate: %s' % str(current_lr))
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        optimizer.zero_grad()
        inputs, targets = Variable(inputs), Variable(targets)
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()  # backpropagation
        utils.clip_gradient(optimizer, 0.1)  # gradient clipping
        optimizer.step()  # update the parameters with SGD
        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)  # max value of each row and its index (the predicted class)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum()
        # average loss over the batches seen so far; the training set has 28709 / 128 batches in total
        print('Epoch: %d | Batch_index: %d | Loss: %.3f | Acc: %.3f%% (%d/%d)'
              % (epoch, batch_idx, train_loss / (batch_idx + 1), 100. * correct / total, correct, total))
    Train_acc = 100. * correct / total
def train(epoch):
    print('\nEpoch: %d' % epoch)
    if epoch > 80:
        frac = (epoch - 80) // 5
        decay_factor = 0.9 ** frac
        current_lr = opt.lr * decay_factor
        utils.set_lr(optimizer, current_lr)  # set the decayed rate
    else:
        current_lr = opt.lr
    print('learning_rate: %s' % str(current_lr))
    train_loss = 0
    total_loss = 0
    total = 0
    correct = 0
    for train_x, train_y in train_data:
        train_x, train_y = train_x.cuda(), train_y.cuda()
        train_x, train_y = Variable(train_x), Variable(train_y)
        optimizer.zero_grad()
        output = model(train_x)
        loss = criterion(output, train_y)
        loss.backward()
        utils.clip_gradient(optimizer, 0.1)
        optimizer.step()
        train_loss += loss.item()
        _, predicted = torch.max(output.data, 1)
        total += train_y.size(0)
        correct += predicted.eq(train_y.data).cpu().sum().item()
    train_acc = 100 * correct / total
    print("Train_acc : %0.3f" % train_acc)
    # Note: train_acc is a percentage (at most 100), so this checkpoint is never written as the condition stands.
    if int(train_acc) == 2000:
        torch.save(model.state_dict(), './raf_train.t7')
print('Start Triplet Similarity Training.')
errors_real_D = []
errors_fake_D = []
errors_D = []
errors_G = []
losses_triplet = []
loss_record = 0.
best_loss = 10000.
margin_change_count = 0
gen_iter = 0
bat_start = 0
for it in range(n_iters):
    print('\rTriplet iter %05d' % (it + 1), end='')
    if it in triplet_lrs.keys():
        set_lr(triplet_lrs[it], optimizer)
    if it in triplet_margins.keys():
        margin = triplet_margins[it]
    # Triplet similarity measuring
    anchor, positive, negative = grab_triplet_batch(
        X, Y, person_cam_index, datareader.num_class, batch_size)
    x = Variable(torch.from_numpy(anchor.astype(float)).float().cuda())
    anc_feat = model.extract(x)
    x = Variable(torch.from_numpy(positive.astype(float)).float().cuda())
    pos_feat = model.extract(x)
    x = Variable(torch.from_numpy(negative.astype(float)).float().cuda())
    neg_feat = model.extract(x)
weight_saver.restore()

# Progress bar
tpbar = tqdm(unit="batches", ncols=100, total=args.num_iterations)
interval_cost = 0.0

# Declare lists for logging metrics
if args.logfile is not None:
    train_result = []
    test_result = []
    err_result = []

# Iterating over the training set
for step in range(args.num_iterations):
    feed_dict = fill_feed_dict(input_or_ph_ops=input_ops_train,
                               dataset=train_set,
                               learning_rate_ph=lr_ph,
                               learning_rate_val=set_lr(base_lr, step, learning_schedule, gamma),
                               step=step)
    # Mean batch cost
    output = train_function(feed_dict=feed_dict)

    # Update progress bar
    tpbar.update(1)
    tpbar.set_description("Training {:0.4f}".format(output[()]))
    interval_cost += output[()]

    # Every epoch print test set metrics
    if (step + 1) % args.iter_interval == 0 and step > 0:
        # Call loop_eval to calculate metric over test set
        eval_losses = loop_eval(valid_set, input_ops_valid, metric_names,
                                eval_function, en_top5)
        tqdm.write(
def run(self, total_steps): """ Runs PPO Args: total_steps (int): total number of environment steps to run for """ N = self.num_workers T = self.worker_steps E = self.opt_epochs A = self.venv.action_space.n while self.taken_steps < total_steps: progress = self.taken_steps / total_steps obs, rewards, masks, actions, steps = self.interact() ob_shape = obs.size()[2:] ep_reward = self.test() self.reward_histr.append(ep_reward) self.steps_histr.append(self.taken_steps) # statistic logic group_size = len(self.steps_histr) // self.plot_points if self.plot_reward and len(self.steps_histr) % ( self.plot_points * 10) == 0 and group_size >= 10: x_means, _, y_means, y_stds = \ mean_std_groups(np.array(self.steps_histr), np.array(self.reward_histr), group_size) fig = plt.figure() fig.set_size_inches(8, 6) plt.ticklabel_format(axis='x', style='sci', scilimits=(-2, 6)) plt.errorbar(x_means, y_means, yerr=y_stds, ecolor='xkcd:blue', fmt='xkcd:black', capsize=5, elinewidth=1.5, mew=1.5, linewidth=1.5) plt.title('Training progress') plt.xlabel('Total steps') plt.ylabel('Episode reward') plt.savefig(self.plot_path, dpi=200) plt.clf() plt.close() plot_timer = 0 # TEMP upgrade to support recurrence # compute advantages, returns with GAE obs_ = obs.view(((T + 1) * N, ) + ob_shape) obs_ = Variable(obs_) _, values = self.policy(obs_) values = values.view(T + 1, N, 1) advantages, returns = gae(rewards, masks, values, self.gamma, self.lambd) self.policy_old.load_state_dict(self.policy.state_dict()) for e in range(E): self.policy.zero_grad() MB = steps // self.minibatch_steps b_obs = Variable(obs[:T].view((steps, ) + ob_shape)) b_rewards = Variable(rewards.view(steps, 1)) b_masks = Variable(masks.view(steps, 1)) b_actions = Variable(actions.view(steps, 1)) b_advantages = Variable(advantages.view(steps, 1)) b_returns = Variable(returns.view(steps, 1)) b_inds = np.arange(steps) np.random.shuffle(b_inds) for start in range(0, steps, self.minibatch_steps): mb_inds = b_inds[start:start + self.minibatch_steps] mb_inds = cuda_if( torch.from_numpy(mb_inds).long(), self.cuda) mb_obs, mb_rewards, mb_masks, mb_actions, mb_advantages, mb_returns = \ [arr[mb_inds] for arr in [b_obs, b_rewards, b_masks, b_actions, b_advantages, b_returns]] mb_pis, mb_vs = self.policy(mb_obs) mb_pi_olds, mb_v_olds = self.policy_old(mb_obs) mb_pi_olds, mb_v_olds = mb_pi_olds.detach( ), mb_v_olds.detach() losses = self.objective(self.clip_func(progress), mb_pis, mb_vs, mb_pi_olds, mb_v_olds, mb_actions, mb_advantages, mb_returns) policy_loss, value_loss, entropy_loss = losses loss = policy_loss + value_loss * self.value_coef + entropy_loss * self.entropy_coef set_lr(self.optimizer, self.lr_func(progress)) self.optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm(self.policy.parameters(), self.max_grad_norm) self.optimizer.step() self.taken_steps += steps print(self.taken_steps)
def main(): if not torch.cuda.is_available(): logging.info('no gpu device available') sys.exit(1) np.random.seed(args.seed) #torch.cuda.set_device(args.gpu) cudnn.benchmark = True torch.manual_seed(args.seed) cudnn.enabled=True torch.cuda.manual_seed(args.seed) logging.info('gpu device num = %d' % args.ngpu) logging.info("args = %s", args) genotype = eval("genotypes.%s" % args.arch) model = Network(args.init_channels, CLASSES, args.layers, args.auxiliary, genotype, args.residual_wei, args.shrink_channel) if args.parallel: model = nn.DataParallel(model).cuda() #model = nn.parallel.DistributedDataParallel(model).cuda() else: model = model.cuda() logging.info("param size = %fMB", utils.count_parameters_in_MB(model)) criterion = nn.CrossEntropyLoss() criterion = criterion.cuda() criterion_smooth = CrossEntropyLabelSmooth(CLASSES, args.label_smooth) criterion_smooth = criterion_smooth.cuda() optimizer = torch.optim.SGD( #model.parameters(), utils.set_group_weight(model, args.bn_no_wd, args.bias_no_wd), args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay ) resume = os.path.join(args.save, 'checkpoint.pth.tar') if os.path.exists(resume): print("=> loading checkpoint %s" % resume) #checkpoint = torch.load(resume) checkpoint = torch.load(resume, map_location = lambda storage, loc: storage.cuda(0)) args.start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) #optimizer.load_state_dict(checkpoint['optimizer']) optimizer.state_dict()['state'] = checkpoint['optimizer']['state'] print('=> loaded checkpoint epoch %d' % args.start_epoch) if args.start_epoch >= args.epochs: print('training finished') sys.exit(0) traindir = os.path.join(args.data, 'train') validdir = os.path.join(args.data, 'val') normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) train_data = dset.ImageFolder( traindir, transforms.Compose([ transforms.RandomResizedCrop(args.image_size), transforms.RandomHorizontalFlip(), transforms.ColorJitter( brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1), transforms.ToTensor(), normalize, ])) valid_data = dset.ImageFolder( validdir, transforms.Compose([ transforms.Resize(int((256.0 / 224) * args.image_size)), transforms.CenterCrop(args.image_size), transforms.ToTensor(), normalize, ])) train_queue = torch.utils.data.DataLoader( train_data, batch_size=args.batch_size, shuffle=True, pin_memory=True, num_workers=nworker) valid_queue = torch.utils.data.DataLoader( valid_data, batch_size=args.batch_size, shuffle=False, pin_memory=True, num_workers=nworker) best_acc_top1 = 0 for epoch in range(args.start_epoch, args.epochs): if args.lr_strategy == 'cos': lr = utils.set_lr(optimizer, epoch, args.epochs, args.learning_rate) #elif args.lr_strategy == 'step': # scheduler.step() # lr = scheduler.get_lr()[0] logging.info('epoch %d lr %e', epoch, lr) if args.parallel: model.module.drop_path_prob = args.drop_path_prob * epoch / args.epochs else: model.drop_path_prob = args.drop_path_prob * epoch / args.epochs train_acc, train_obj = train(train_queue, model, criterion_smooth, optimizer, epoch) logging.info('train_acc %f', train_acc) utils.save_checkpoint({ 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'best_acc_top1': train_acc, 'optimizer' : optimizer.state_dict(), }, False, args.save) #if epoch >= args.early_stop: # break valid_acc_top1, valid_acc_top5, valid_obj = infer(valid_queue, model, criterion) logging.info('valid_acc_top1 %f', valid_acc_top1) logging.info('valid_acc_top5 %f', valid_acc_top5)
def run(net, loader, optimizer, scheduler, tracker, train=False, has_answers=True, prefix='', epoch=0): """ Run an epoch over the given loader """ assert not (train and not has_answers) if train: net.train() tracker_class, tracker_params = tracker.MovingMeanMonitor, { 'momentum': 0.99 } else: net.eval() tracker_class, tracker_params = tracker.MeanMonitor, {} answ = [] idxs = [] accs = [] # set learning rate decay policy if epoch < len(config.gradual_warmup_steps ) and config.schedule_method == 'warm_up': utils.set_lr(optimizer, config.gradual_warmup_steps[epoch]) utils.print_lr(optimizer, prefix, epoch) elif (epoch in config.lr_decay_epochs ) and train and config.schedule_method == 'warm_up': utils.decay_lr(optimizer, config.lr_decay_rate) utils.print_lr(optimizer, prefix, epoch) else: utils.print_lr(optimizer, prefix, epoch) loader = tqdm(loader, desc='{} E{:03d}'.format(prefix, epoch), ncols=0) loss_tracker = tracker.track('{}_loss'.format(prefix), tracker_class(**tracker_params)) acc_tracker = tracker.track('{}_acc'.format(prefix), tracker_class(**tracker_params)) for v, q, a, b, idx, v_mask, q_mask, q_len in loader: var_params = { 'requires_grad': False, } v = Variable(v.cuda(), **var_params) q = Variable(q.cuda(), **var_params) a = Variable(a.cuda(), **var_params) b = Variable(b.cuda(), **var_params) q_len = Variable(q_len.cuda(), **var_params) v_mask = Variable(v_mask.cuda(), **var_params) q_mask = Variable(q_mask.cuda(), **var_params) out = net(v, b, q, v_mask, q_mask, q_len) if has_answers: answer = utils.process_answer(a) loss = utils.calculate_loss(answer, out, method=config.loss_method) acc = utils.batch_accuracy(out, answer).data.cpu() if train: optimizer.zero_grad() loss.backward() # print gradient if config.print_gradient: utils.print_grad([(n, p) for n, p in net.named_parameters() if p.grad is not None]) # clip gradient clip_grad_norm_(net.parameters(), config.clip_value) optimizer.step() if (config.schedule_method == 'batch_decay'): scheduler.step() else: # store information about evaluation of this minibatch _, answer = out.data.cpu().max(dim=1) answ.append(answer.view(-1)) if has_answers: accs.append(acc.view(-1)) idxs.append(idx.view(-1).clone()) if has_answers: loss_tracker.append(loss.item()) acc_tracker.append(acc.mean()) fmt = '{:.4f}'.format loader.set_postfix(loss=fmt(loss_tracker.mean.value), acc=fmt(acc_tracker.mean.value)) if not train: answ = list(torch.cat(answ, dim=0)) if has_answers: accs = list(torch.cat(accs, dim=0)) else: accs = [] idxs = list(torch.cat(idxs, dim=0)) #print('{} E{:03d}:'.format(prefix, epoch), ' Total num: ', len(accs)) #print('{} E{:03d}:'.format(prefix, epoch), ' Average Score: ', float(sum(accs) / len(accs))) return answ, accs, idxs
# Progress bar
tpbar = tqdm(unit="batches", ncols=100, total=args.num_iterations)

# Set interval cost to 0.0
interval_cost = 0.0

# Declare lists for logging metrics
if args.logfile is not None:
    train_result = []
    test_result = []
    err_result = []

# Iterating over the training set
for step, data in enumerate(train_set):
    data['iteration'] = step
    # Dictionary for training
    feed_dict = {input_ph[k]: data[k] for k in input_ph.keys()}
    # Learning Schedule
    feed_dict[lr_ph] = set_lr(base_lr, step, learning_schedule, gamma)
    # Mean batch cost
    output = train_function(feed_dict=feed_dict)

    # Update progress bar
    tpbar.update(1)
    tpbar.set_description("Training {:0.4f}".format(output[()]))
    interval_cost += output[()]

    # Every epoch print test set metrics
    if (step + 1) % args.iter_interval == 0 and step > 0:
        # Call loop_eval to calculate metric over test set
        eval_losses = loop_eval(valid_set, input_ph, metric_names,
                                eval_function, en_top5)
        tqdm.write(
            "Interval {interval} Iteration {iteration} complete. "
            "Avg Train Cost {cost:0.4f} Test Metrics:{tcost}".format(
                interval=step // args.iter_interval,
def train(args, model_id, tb): torch.manual_seed(args.seed) np.random.seed(args.seed) train_data = MedicalEasyEnsembleDataloader(args.train_data, args.class_id, args.batch_size, True, args.num_workers) val_data = MedicalEasyEnsembleDataloader(args.val_data, args.class_id, args.batch_size, False, args.num_workers) if os.path.exists(args.w2v_file): embedding = utils.load_embedding(args.w2v_file, vocab_size=args.vocab_size, embedding_size=args.embedding_size) else: embedding = None if args.model_type == 'lstm': model = models.LSTMModel(args, embedding) elif args.model_type == 'conv': model = models.ConvModel(args, embedding) elif args.model_type == 'char': model = models.CharCNNModel(args, embedding) elif args.model_type == 'base': model = models.BaseModel(args, embedding) else: raise NotImplementedError if os.path.isfile( os.path.join(args.checkpoint_path, str(args.class_id), "%s_%s" % (args.model_type, args.type_suffix), "model_%d.pth" % model_id)): print("Load %d class %s type %dth model from previous step" % (args.class_id, args.model_type, model_id)) model.load_state_dict( torch.load( os.path.join(args.checkpoint_path, str(args.class_id), "%s_%s" % (args.model_type, args.type_suffix), "model_%d.pth" % model_id))) iteration = 0 model = model.cuda(args.device) model.train() optimizer = utils.build_optimizer(args, model) loss_func = MultiBceLoss() cur_worse = 1000 bad_times = 0 for epoch in range(args.epochs): if epoch >= args.start_epoch: factor = (epoch - args.start_epoch) // args.decay_every decay_factor = args.decay_rate**factor current_lr = args.lr * decay_factor utils.set_lr(optimizer, current_lr) # if epoch != 0 and epoch % args.sample_every == 0: # train_data.re_sample() for i, data in enumerate(train_data): tmp = [ _.cuda(args.device) if isinstance(_, torch.Tensor) else _ for _ in data ] report_ids, sentence_ids, sentence_lengths, output_vec = tmp optimizer.zero_grad() loss = loss_func(model(sentence_ids, sentence_lengths), output_vec) loss.backward() train_loss = loss.item() optimizer.step() iteration += 1 if iteration % args.print_every == 0: print("iter %d epoch %d loss: %.3f" % (iteration, epoch, train_loss)) if iteration % args.save_every == 0: torch.save( model.state_dict(), os.path.join(args.checkpoint_path, str(args.class_id), "%s_%s" % (args.model_type, args.type_suffix), "model_%d.pth" % model_id)) with open(os.path.join(args.checkpoint_path, str(args.class_id), "config.json"), 'w', encoding='utf-8') as config_f: json.dump(vars(args), config_f, indent=2) with open(os.path.join( args.checkpoint_path, str(args.class_id), "%s_%s" % (args.model_type, args.type_suffix), "config.json"), 'w', encoding='utf-8') as config_f: json.dump(vars(args), config_f, indent=2) if iteration % args.val_every == 0: val_loss = eval_model(model, loss_func, val_data, epoch) tb.add_scalar("model_%d val_loss" % model_id, val_loss, iteration) if val_loss > cur_worse: print("Bad Time Appear") cur_worse = val_loss bad_times += 1 else: cur_worse = val_loss bad_times = 0 if bad_times > args.patient: print('Early Stop !!!!') return if iteration % args.loss_log_every == 0: tb.add_scalar("model_%d train_loss" % model_id, loss.item(), iteration) print("The train finished")
def train(epoch): print('\nEpoch: %d' % epoch) snet.train() if args.model == 'VID': VID_NET1.train() VID_NET2.train() elif args.model == 'OFD': OFD_NET1.train() OFD_NET2.train() elif args.model == 'AFD': AFD_NET1.train() AFD_NET2.train() else: pass train_loss = 0 train_cls_loss = 0 conf_mat = np.zeros((NUM_CLASSES, NUM_CLASSES)) conf_mat_a = np.zeros((NUM_CLASSES, NUM_CLASSES)) conf_mat_b = np.zeros((NUM_CLASSES, NUM_CLASSES)) if epoch > learning_rate_decay_start and learning_rate_decay_start >= 0: frac = (epoch - learning_rate_decay_start) // learning_rate_decay_every decay_factor = learning_rate_decay_rate**frac current_lr = args.lr * decay_factor utils.set_lr(optimizer, current_lr) # set the decayed rate else: current_lr = args.lr print('learning_rate: %s' % str(current_lr)) for batch_idx, (img_teacher, img_student, target) in enumerate(trainloader): if args.cuda: img_teacher = img_teacher.cuda(non_blocking=True) img_student = img_student.cuda(non_blocking=True) target = target.cuda(non_blocking=True) optimizer.zero_grad() if args.augmentation: img_teacher, teacher_target_a, teacher_target_b, teacher_lam = mixup_data( img_teacher, target, 0.6) img_teacher, teacher_target_a, teacher_target_b = map( Variable, (img_teacher, teacher_target_a, teacher_target_b)) img_student, student_target_a, student_target_b, student_lam = mixup_data( img_student, target, 0.6) img_student, student_target_a, student_target_b = map( Variable, (img_student, student_target_a, student_target_b)) else: img_teacher, img_student, target = Variable(img_teacher), Variable( img_student), Variable(target) rb1_s, rb2_s, rb3_s, mimic_s, out_s = snet(img_student) rb1_t, rb2_t, rb3_t, mimic_t, out_t = tnet(img_teacher) if args.augmentation: cls_loss = mixup_criterion(Cls_crit, out_s, student_target_a, student_target_b, student_lam) else: cls_loss = Cls_crit(out_s, target) kd_loss = KD_T_crit(out_t, out_s) if args.model == 'Fitnet': #FITNETS: Hints for Thin Deep Nets if args.stage == 'Block1': Fitnet1_loss = other.Fitnet(rb1_t, rb1_s).cuda() loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * Fitnet1_loss elif args.stage == 'Block2': Fitnet2_loss = other.Fitnet(rb2_t, rb2_s).cuda() loss = args.alpha * cls_loss + args.beta * kd_loss + args.delta * Fitnet2_loss else: Fitnet1_loss = other.Fitnet(rb1_t, rb1_s).cuda() Fitnet2_loss = other.Fitnet(rb2_t, rb2_s).cuda() loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * Fitnet1_loss + args.delta * Fitnet2_loss elif args.model == 'AT': # An activation-based attention transfer with the sum of absolute values raised to the power of 2. 
#Paying More Attention to Attention: Improving the Performance of Convolutional Neural Networks via Attention Transfer if args.stage == 'Block1': AT1_loss = other.AT(rb1_t, rb1_s).cuda() loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * AT1_loss elif args.stage == 'Block2': AT2_loss = other.AT(rb2_t, rb2_s).cuda() loss = args.alpha * cls_loss + args.beta * kd_loss + args.delta * AT2_loss else: AT1_loss = other.AT(rb1_t, rb1_s).cuda() AT2_loss = other.AT(rb2_t, rb2_s).cuda() loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * AT1_loss + args.delta * AT2_loss elif args.model == 'NST': # NST (poly) #Like What You Like: Knowledge Distill via Neuron Selectivity Transfer if args.stage == 'Block1': NST1_loss = other.NST(rb1_t, rb1_s).cuda() loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * NST1_loss elif args.stage == 'Block2': NST2_loss = other.NST(rb2_t, rb2_s).cuda() loss = args.alpha * cls_loss + args.beta * kd_loss + args.delta * NST2_loss else: NST1_loss = other.NST(rb1_t, rb1_s).cuda() NST2_loss = other.NST(rb2_t, rb2_s).cuda() loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * NST1_loss + args.delta * NST2_loss elif args.model == 'PKT': # PKT #Learning Deep Representations with Probabilistic Knowledge Transfer if args.stage == 'Block1': PKT1_loss = other.PKT(rb1_t, rb1_s).cuda() loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * PKT1_loss elif args.stage == 'Block2': PKT2_loss = other.PKT(rb2_t, rb2_s).cuda() loss = args.alpha * cls_loss + args.beta * kd_loss + args.delta * PKT2_loss else: PKT1_loss = other.PKT(rb1_t, rb1_s).cuda() PKT2_loss = other.PKT(rb2_t, rb2_s).cuda() loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * PKT1_loss + args.delta * PKT2_loss elif args.model == 'AB': # AB #Knowledge Transfer via Distillation of Activation Boundaries Formed by Hidden Neurons if args.stage == 'Block1': AB1_loss = other.AB(rb1_t, rb1_s).cuda() loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * AB1_loss elif args.stage == 'Block2': AB2_loss = other.AB(rb2_t, rb2_s).cuda() loss = args.alpha * cls_loss + args.beta * kd_loss + args.delta * AB2_loss else: AB1_loss = other.AB(rb1_t, rb1_s).cuda() AB2_loss = other.AB(rb2_t, rb2_s).cuda() loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * AB1_loss + args.delta * AB2_loss elif args.model == 'CCKD': # #Correlation Congruence for Knowledge Distillation if args.stage == 'Block1': CCKD1_loss = other.CCKD().cuda()(rb1_t, rb1_s) loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * CCKD1_loss elif args.stage == 'Block2': CCKD2_loss = other.CCKD().cuda()(rb2_t, rb2_s) loss = args.alpha * cls_loss + args.beta * kd_loss + args.delta * CCKD2_loss else: CCKD1_loss = other.CCKD().cuda()(rb1_t, rb1_s) CCKD2_loss = other.CCKD().cuda()(rb2_t, rb2_s) loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * CCKD1_loss + args.delta * CCKD2_loss elif args.model == 'RKD': # RKD-DA #Relational Knowledge Disitllation if args.stage == 'Block1': RKD1_loss = other.RKD().cuda()(rb1_t, rb1_s) loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * RKD1_loss elif args.stage == 'Block2': RKD2_loss = other.RKD().cuda()(rb2_t, rb2_s) loss = args.alpha * cls_loss + args.beta * kd_loss + args.delta * RKD2_loss else: RKD1_loss = other.RKD().cuda()(rb1_t, rb1_s) RKD2_loss = other.RKD().cuda()(rb2_t, rb2_s) loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * RKD1_loss + args.delta * RKD2_loss elif args.model == 'SP': # SP 
#Similarity-Preserving Knowledge Distillation if args.stage == 'Block1': SP1_loss = other.SP().cuda()(rb1_t, rb1_s) loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * SP1_loss elif args.stage == 'Block2': SP2_loss = other.SP().cuda()(rb2_t, rb2_s) loss = args.alpha * cls_loss + args.beta * kd_loss + args.delta * SP2_loss else: SP1_loss = other.SP().cuda()(rb1_t, rb1_s) SP2_loss = other.SP().cuda()(rb2_t, rb2_s) loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * SP1_loss + args.delta * SP2_loss elif args.model == 'VID': # VID-I #Variational Information Distillation for Knowledge Transfer if args.stage == 'Block1': VID1_loss = VID_NET1(rb1_t, rb1_s) loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * VID1_loss elif args.stage == 'Block2': VID2_loss = VID_NET2(rb2_t, rb2_s) loss = args.alpha * cls_loss + args.beta * kd_loss + args.delta * VID2_loss else: VID1_loss = VID_NET1(rb1_t, rb1_s) VID2_loss = VID_NET2(rb2_t, rb2_s) loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * VID1_loss + args.delta * VID2_loss elif args.model == 'OFD': # OFD #A Comprehensive Overhaul of Feature Distillation if args.stage == 'Block1': OFD1_loss = OFD_NET1(rb1_t, rb1_s) loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * OFD1_loss elif args.stage == 'Block2': OFD2_loss = OFD_NET2(rb2_t, rb2_s) loss = args.alpha * cls_loss + args.beta * kd_loss + args.delta * OFD2_loss else: OFD1_loss = OFD_NET1.cuda()(rb1_t, rb1_s) OFD2_loss = OFD_NET2(rb2_t, rb2_s) loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * OFD1_loss + args.delta * OFD2_loss elif args.model == 'AFDS': # #Pay Attention to Features, Transfer Learn Faster CNNs if args.stage == 'Block1': AFD1_loss = AFD_NET1(rb1_t, rb1_s) loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * AFD1_loss elif args.stage == 'Block2': AFD2_loss = AFD_NET2(rb2_t, rb2_s) loss = args.alpha * cls_loss + args.beta * kd_loss + args.delta * AFD2_loss else: AFD1_loss = AFD_NET1(rb1_t, rb1_s) AFD2_loss = AFD_NET2(rb2_t, rb2_s) loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * AFD1_loss + args.delta * AFD2_loss elif args.model == 'FT': # #Paraphrasing Complex Network: Network Compression via Factor Transfer if args.stage == 'Block1': FT1_loss = other.FT().cuda()(rb1_t, rb1_s) loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * FT1_loss elif args.stage == 'Block2': FT2_loss = other.FT().cuda()(rb2_t, rb2_s) loss = args.alpha * cls_loss + args.beta * kd_loss + args.delta * FT2_loss else: FT1_loss = other.FT().cuda()(rb1_t, rb1_s) FT2_loss = other.FT().cuda()(rb2_t, rb2_s) loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * FT1_loss + args.delta * FT2_loss elif args.model == 'CD': # CD+GKD+CE #Channel Distillation: Channel-Wise Attention for Knowledge Distillation if args.stage == 'Block1': kd_loss_v2 = other.KDLossv2(args.T).cuda()(out_t, out_s, target) CD1_loss = other.CD().cuda()(rb1_t, rb1_s) loss = args.alpha * cls_loss + args.beta * kd_loss_v2 + args.gamma * CD1_loss elif args.stage == 'Block2': kd_loss_v2 = other.KDLossv2(args.T).cuda()(out_t, out_s, target) CD2_loss = other.CD().cuda()(rb2_t, rb2_s) loss = args.alpha * cls_loss + args.beta * kd_loss_v2 + args.delta * CD2_loss else: kd_loss_v2 = other.KDLossv2(args.T).cuda()(out_t, out_s, target) CD1_loss = other.CD().cuda()(rb1_t, rb1_s) CD2_loss = other.CD().cuda()(rb2_t, rb2_s) loss = args.alpha * cls_loss + args.beta * kd_loss_v2 + args.gamma * CD1_loss + args.delta * CD2_loss elif 
args.model == 'FAKD': # DS+TS+SA #FAKD: Feature-Affinity Based Knowledge Distillation for Efficient Image Super-Resolution if args.stage == 'Block1': FAKD_DT_loss = other.FAKD_DT().cuda()(out_t, out_s, target, NUM_CLASSES) FAKD_SA1_loss = other.FAKD_SA().cuda()(rb1_t, rb1_s) loss = args.alpha * FAKD_DT_loss + args.gamma * FAKD_SA1_loss # No T elif args.stage == 'Block2': FAKD_DT_loss = other.FAKD_DT().cuda()(out_t, out_s, target, NUM_CLASSES) FAKD_SA2_loss = other.FAKD_SA().cuda()(rb2_t, rb2_s) loss = args.alpha * FAKD_DT_loss + args.gamma * FAKD_SA2_loss else: FAKD_DT_loss = other.FAKD_DT().cuda()(out_t, out_s, target, NUM_CLASSES) FAKD_SA1_loss = other.FAKD_SA().cuda()(rb1_t, rb1_s) FAKD_SA2_loss = other.FAKD_SA().cuda()(rb2_t, rb2_s) loss = args.alpha * FAKD_DT_loss + args.gamma * FAKD_SA1_loss + args.delta * FAKD_SA2_loss elif args.model == 'VKD': # #Robust Re-Identification by Multiple Views Knowledge Distillation if args.stage == 'Block1': VKD_Similarity1_loss = other.VKD_SimilarityDistillationLoss( ).cuda()(rb1_t, rb1_s) VKD_OnlineTriplet1_loss = other.VKD_OnlineTripletLoss().cuda()( rb1_s, target) loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * VKD_Similarity1_loss \ + args.delta * VKD_OnlineTriplet1_loss elif args.stage == 'Block2': VKD_Similarity2_loss = other.VKD_SimilarityDistillationLoss( ).cuda()(rb2_t, rb2_s) VKD_OnlineTriplet2_loss = other.VKD_OnlineTripletLoss().cuda()( rb2_s, target) loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * VKD_Similarity2_loss \ + args.delta * VKD_OnlineTriplet2_loss else: VKD_Similarity1_loss = other.VKD_SimilarityDistillationLoss( ).cuda()(rb1_t, rb1_s) VKD_OnlineTriplet1_loss = other.VKD_OnlineTripletLoss().cuda()( rb1_s, target) VKD_Similarity2_loss = other.VKD_SimilarityDistillationLoss( ).cuda()(rb2_t, rb2_s) VKD_OnlineTriplet2_loss = other.VKD_OnlineTripletLoss().cuda()( rb2_s, target) loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * VKD_Similarity1_loss \ + args.delta * VKD_OnlineTriplet1_loss + args.gamma * VKD_Similarity2_loss \ + args.delta * VKD_OnlineTriplet2_loss elif args.model == 'RAD': # RAD: Resolution-Adapted Distillation # Efficient Low-Resolution Face Recognition via Bridge Distillation distance = mimic_t - mimic_s RAD_loss = torch.pow(distance, 2).sum(dim=(0, 1), keepdim=False) loss = RAD_loss + cls_loss else: raise Exception('Invalid model name...') loss.backward() utils.clip_gradient(optimizer, 0.1) optimizer.step() train_loss += loss.item() train_cls_loss += cls_loss.item() if args.augmentation: conf_mat_a += losses.confusion_matrix(out_s, student_target_a, NUM_CLASSES) acc_a = sum([conf_mat_a[i, i] for i in range(conf_mat_a.shape[0]) ]) / conf_mat_a.sum() precision_a = np.array([ conf_mat_a[i, i] / (conf_mat_a[i].sum() + 1e-10) for i in range(conf_mat_a.shape[0]) ]) recall_a = np.array([ conf_mat_a[i, i] / (conf_mat_a[:, i].sum() + 1e-10) for i in range(conf_mat_a.shape[0]) ]) mAP_a = sum(precision_a) / len(precision_a) F1_score_a = (2 * precision_a * recall_a / (precision_a + recall_a + 1e-10)).mean() conf_mat_b += losses.confusion_matrix(out_s, student_target_b, NUM_CLASSES) acc_b = sum([conf_mat_b[i, i] for i in range(conf_mat_b.shape[0]) ]) / conf_mat_b.sum() precision_b = np.array([ conf_mat_b[i, i] / (conf_mat_b[i].sum() + 1e-10) for i in range(conf_mat_b.shape[0]) ]) recall_b = np.array([ conf_mat_b[i, i] / (conf_mat_b[:, i].sum() + 1e-10) for i in range(conf_mat_b.shape[0]) ]) mAP_b = sum(precision_b) / len(precision_b) F1_score_b = (2 * precision_b * recall_b / 
(precision_b + recall_b + 1e-10)).mean() acc = student_lam * acc_a + (1 - student_lam) * acc_b mAP = student_lam * mAP_a + (1 - student_lam) * mAP_b F1_score = student_lam * F1_score_a + (1 - student_lam) * F1_score_b else: conf_mat += losses.confusion_matrix(out_s, target, NUM_CLASSES) acc = sum([conf_mat[i, i] for i in range(conf_mat.shape[0])]) / conf_mat.sum() precision = [ conf_mat[i, i] / (conf_mat[i].sum() + 1e-10) for i in range(conf_mat.shape[0]) ] mAP = sum(precision) / len(precision) recall = [ conf_mat[i, i] / (conf_mat[:, i].sum() + 1e-10) for i in range(conf_mat.shape[0]) ] precision = np.array(precision) recall = np.array(recall) f1 = 2 * precision * recall / (precision + recall + 1e-10) F1_score = f1.mean() #utils.progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% | mAP: %.3f%% | F1: %.3f%%' #% (train_loss/(batch_idx+1), 100.*acc, 100.* mAP, 100.* F1_score)) return train_cls_loss / (batch_idx + 1), 100. * acc, 100. * mAP, 100 * F1_score
def train_model(model, dataloaders, criterion, optimizer, start_epoch, num_epochs=args.epochs): ''' Train model model: Model dataloaders: dataloader dict: {train: , val: } criterion: Loss function optimizer: Optimizer for training num_epochs: Number of epochs to train Out: Best model, val_acc_history ''' since = time.time() val_acc_history = [] lr = args.lr best_model_wts = copy.deepcopy(model.state_dict()) best_acc = 0.0 learning_rate_decay_start = args.lr_decay_start learning_rate_decay_every = args.lr_decay_every learning_rate_decay_rate = args.lr_decay_rate for epoch in range(start_epoch, num_epochs): print('Epoch {}/{}'.format(epoch, num_epochs - 1)) print("-" * 10) if epoch > learning_rate_decay_start and learning_rate_decay_every > 0: frac = (epoch - learning_rate_decay_start) // learning_rate_decay_every decay_factor = learning_rate_decay_rate**frac current_lr = lr * decay_factor set_lr(optimizer, current_lr) print("Learning rate: ", current_lr) for phase in ["train", "val"]: if phase == "train": model.train() else: model.eval() running_loss = 0.0 running_corrects = 0 for inputs, labels in dataloaders[phase]: t = inputs.size(0) if phase == "val": bs, ncrops, c, h, w = np.shape(inputs) inputs = inputs.view(-1, c, h, w) #(bs*n_crops, c, h, w) inputs = inputs.to(device) labels = labels.to(device) optimizer.zero_grad() with torch.set_grad_enabled(phase == 'train'): outputs = model(inputs) if phase == "val": outputs = outputs.view(bs, ncrops, -1).mean(1) loss = criterion(outputs, labels) _, preds = torch.max(outputs, 1) if phase == 'train': loss.backward() clip_gradient(optimizer, 0.1) optimizer.step() running_loss += loss.item() * t running_corrects += torch.sum(preds == labels.data) epoch_loss = running_loss / (dataloader_length[phase]) epoch_acc = running_corrects.double() / (dataloader_length[phase]) print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc)) if phase == 'val' and epoch_acc > best_acc: best_acc = epoch_acc best_model_wts = copy.deepcopy(model.state_dict()) if phase == 'val': val_acc_history.append(epoch_acc) #save_checkpoint(epoch, best_model_wts, optimizer) print() time_elapsed = time.time() - since print('Training complete in {:.0f}m {:.0f}s'.format( time_elapsed // 60, time_elapsed % 60)) print('Best val Acc: {:4f}'.format(best_acc)) model.load_state_dict(best_model_wts) return model, val_acc_history
def train(args):
    logging.info("Create train_loader and val_loader.........")
    train_loader_kwargs = {
        'question_pt': args.train_question_pt,
        'vocab_json': args.vocab_json,
        'feature_h5': args.train_feature_h5,
        'batch_size': args.batch_size,
        'num_workers': 4,
        'shuffle': True
    }
    train_loader = CLEVRDataLoader(**train_loader_kwargs)
    if args.val:
        val_loader_kwargs = {
            'question_pt': args.val_question_pt,
            'vocab_json': args.vocab_json,
            'feature_h5': args.val_feature_h5,
            'batch_size': args.batch_size,
            'num_workers': 2,
            'shuffle': False
        }
        val_loader = CLEVRDataLoader(**val_loader_kwargs)

    logging.info("Create model.........")
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model_kwargs = {
        'vocab': train_loader.vocab,
        'dim_word': args.dim_word,
        'dim_hidden': args.hidden_size,
        'dim_vision': args.dim_vision,
        'state_size': args.state_size,
        'mid_size': args.mid_size,
        'dropout_prob': args.dropout,
        'glimpses': args.glimpses,
        'dim_edge': args.dim_edge
    }
    model_kwargs_tosave = {k: v for k, v in model_kwargs.items() if k != 'vocab'}
    model = Net(**model_kwargs)
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model).to(device)  # support multiple GPUs
    else:
        model = model.to(device)
    logging.info(model)
    ################################################################
    parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.Adamax(parameters, args.lr, weight_decay=0)

    start_epoch = 0
    if args.restore:
        print("Restore checkpoint and optimizer...")
        ckpt = os.path.join(args.save_dir, 'model.pt')
        ckpt = torch.load(ckpt, map_location={'cuda:0': 'cpu'})
        start_epoch = 4  # NOTE: resume epoch is hard-coded here
        if torch.cuda.device_count() > 1:
            model.module.load_state_dict(ckpt['state_dict'])
        else:
            model.load_state_dict(ckpt['state_dict'])
        # optimizer.load_state_dict(ckpt['optimizer'])

    # scheduler = optim.lr_scheduler.ExponentialLR(optimizer, 0.5**(1 / args.lr_halflife))
    # scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=20)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[8, 12, 15, 17, 19, 22], gamma=0.5)
    # scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5)
    gradual_warmup_steps = [0.25 * args.lr, 0.5 * args.lr, 0.75 * args.lr, 1.0 * args.lr]
    criterion = nn.CrossEntropyLoss().to(device)
    last_acc = 0.

    logging.info("Start training........")
    for epoch in range(start_epoch, args.num_epoch):
        model.train()
        if epoch < len(gradual_warmup_steps):
            utils.set_lr(optimizer, gradual_warmup_steps[epoch])
        else:
            scheduler.step()
        for p in optimizer.param_groups:
            lr_rate = p['lr']
        logging.info("Learning rate: %6f" % lr_rate)

        for i, batch in enumerate(train_loader):
            progress = epoch + i / len(train_loader)
            orig_idx, image_idx, answers, *batch_input = [todevice(x, device) for x in batch]
            batch_input = [x.detach() for x in batch_input]
            logits, loss_time = model(*batch_input)
            ##################### loss #####################
            ce_loss = criterion(logits, answers)
            loss_time = 0.01 * loss_time.mean()
            loss = ce_loss + loss_time
            ################################################
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_value_(parameters, clip_value=0.25)
            optimizer.step()
            if (i + 1) % (len(train_loader) // 20) == 0:
                logging.info("Progress %.3f  ce_loss = %.3f  time_loss = %.3f" %
                             (progress, ce_loss.item(), loss_time.item()))
            del answers, batch_input, logits
            torch.cuda.empty_cache()

        # save_checkpoint(epoch, model, optimizer, model_kwargs_tosave, os.path.join(args.save_dir, 'model.pt'))
        logging.info(' >>>>>> save to %s <<<<<<' % (args.save_dir))
        if args.val:
            if epoch % 1 == 0:
                valid_acc = validate(model, val_loader, device)
                logging.info('\n ~~~~~~ Valid Accuracy: %.4f ~~~~~~~\n' % valid_acc)
                if valid_acc >= last_acc:
                    last_acc = valid_acc
                    save_checkpoint(epoch, model, optimizer, model_kwargs_tosave,
                                    os.path.join(args.save_dir, 'model.pt'))
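# --- Hedged sketch (assumption) ---
# A checkpoint writer compatible with the save_checkpoint(...) call above and
# with the keys the restore branch reads ('state_dict', plus the commented-out
# 'optimizer'). The project's real save_checkpoint is not shown here, so the
# exact fields and name (note the _sketch suffix) are hypothetical.
import torch

def save_checkpoint_sketch(epoch, model, optimizer, model_kwargs, path):
    # unwrap nn.DataParallel so the checkpoint loads on single- or multi-GPU setups
    state_dict = model.module.state_dict() if hasattr(model, 'module') else model.state_dict()
    torch.save({
        'epoch': epoch,
        'state_dict': state_dict,
        'optimizer': optimizer.state_dict(),
        'model_kwargs': model_kwargs,
    }, path)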
infos['iter'] = 0
infos['epoch'] = 0
iteration = infos['iter']
start_epoch = infos['epoch']

for e in range(start_epoch, start_epoch + 100):
    # Training with cross-entropy
    # learning rate decay
    if e >= 3:
        current_lr = opt.learning_rate * (opt.learning_rate_decay_rate ** int(
            (e - 3) // opt.learning_rate_decay_every + 1))
    else:
        current_lr = opt.learning_rate
    if e == 20:
        break
    set_lr(optimizer, current_lr)

    running_loss = 0.
    re_sort_net.train()
    with tqdm(desc='Epoch %d - train' % e, ncols=150, unit='it',
              total=len(iter(dataloader_train))) as pbar:
        for it, (keys, values) in enumerate(iter(dataloader_train)):
            detections = keys  # b_s, 100, feat
            det_seqs_v, det_seqs_sr, control_verb, \
                gt_seqs_v, gt_seqs_sr, _, _, captions = values
            optimizer.zero_grad()
            # batch_verb, batch_det_sr, batch_gt_sr
            index = 0
            for i in range(detections.size(0)):  # batch
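# --- Hedged illustration (not in the original) ---
# The decay rule from the loop above, evaluated in isolation. Only the formula
# is taken from that loop; the numeric defaults for base_lr / decay_rate /
# decay_every are hypothetical placeholders.
def decayed_lr(e, base_lr=5e-4, decay_rate=0.8, decay_every=3):
    if e >= 3:
        return base_lr * (decay_rate ** int((e - 3) // decay_every + 1))
    return base_lr

# e.g. epochs 0-2 -> 5e-4, epochs 3-5 -> 4e-4, epochs 6-8 -> 3.2e-4, ...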
if args.resume_path is not None:
    resume_ckpt = torch.load(args.resume_path)
    model.load_state_dict(resume_ckpt['state_dict'])
    # average.load_state_dict(resume_ckpt['average'])
    classifier.load_state_dict(resume_ckpt['classifier'])
    args.begin_epoch = resume_ckpt['epoch'] + 1
    best_acc = resume_ckpt['best_acc']
    optimizer.load_state_dict(resume_ckpt['optimizer'])
    print('==> Resume training...')
    print('best acc is: {}'.format(best_acc))
    del resume_ckpt
    torch.cuda.empty_cache()
    set_lr(optimizer, 0.03)
    resume = True
else:
    print('==> Train from scratch...')
    resume = False
    best_acc = 0
    print('==> loading pre-trained model and NCE')
    ckpt = torch.load(args.pretrain_path)
    try:
        model.load_state_dict(ckpt['state_dict'])
    except:
        print('=> [Warning]: weight structure is not equal to test model; use non-equal load ==')
        model = neq_load_customized(model, ckpt['state_dict'])
    # average.load_state_dict(ckpt['average'])
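# --- Hedged sketch (assumption, not the project's neq_load_customized) ---
# A non-strict loader in the same spirit as the fallback above: copy only the
# pretrained tensors whose names and shapes match the current model, and skip
# the rest. The real implementation may differ.
def neq_load_customized_sketch(model, pretrained_dict):
    model_dict = model.state_dict()
    matched = {k: v for k, v in pretrained_dict.items()
               if k in model_dict and v.shape == model_dict[k].shape}
    print('=> loading %d/%d matching tensors' % (len(matched), len(model_dict)))
    model_dict.update(matched)
    model.load_state_dict(model_dict)
    return model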
def run(net, loader, optimizer, scheduler, tracker, train=False, prefix='', epoch=0):
    """ Run an epoch over the given loader """
    if train:
        net.train()
        # tracker_class, tracker_params = tracker.MovingMeanMonitor, {'momentum': 0.99}
    else:
        net.eval()
    # keep the assignment outside the branches so tracker_class is always defined
    tracker_class, tracker_params = tracker.MeanMonitor, {}

    # set learning rate decay policy
    if epoch < len(config.gradual_warmup_steps) and config.schedule_method == 'warm_up':
        utils.set_lr(optimizer, config.gradual_warmup_steps[epoch])
    elif (epoch in config.lr_decay_epochs) and train and config.schedule_method == 'warm_up':
        utils.decay_lr(optimizer, config.lr_decay_rate)
    utils.print_lr(optimizer, prefix, epoch)

    loader = tqdm(loader, desc='{} E{:03d}'.format(prefix, epoch), ncols=0)
    loss_tracker = tracker.track('{}_loss'.format(prefix), tracker_class(**tracker_params))
    acc_tracker = tracker.track('{}_acc'.format(prefix), tracker_class(**tracker_params))

    for v, q, a, b, idx, v_mask, q_mask, q_len in loader:
        var_params = {
            'requires_grad': False,
        }
        v = Variable(v.cuda(), **var_params)
        q = Variable(q.cuda(), **var_params)
        a = Variable(a.cuda(), **var_params)
        b = Variable(b.cuda(), **var_params)
        q_len = Variable(q_len.cuda(), **var_params)
        v_mask = Variable(v_mask.cuda(), **var_params)
        q_mask = Variable(q_mask.cuda(), **var_params)

        out = net(v, b, q, v_mask, q_mask, q_len)
        answer = utils.process_answer(a)
        loss = utils.calculate_loss(answer, out, method=config.loss_method)
        acc = utils.batch_accuracy(out, answer).data.cpu()

        if train:
            optimizer.zero_grad()
            loss.backward()
            # clip gradient
            clip_grad_norm_(net.parameters(), config.clip_value)
            optimizer.step()
            if config.schedule_method == 'batch_decay':
                scheduler.step()

        loss_tracker.append(loss.item())
        acc_tracker.append(acc.mean())
        fmt = '{:.4f}'.format
        loader.set_postfix(loss=fmt(loss_tracker.mean.value), acc=fmt(acc_tracker.mean.value))

    return acc_tracker.mean.value, loss_tracker.mean.value
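# --- Hedged sketch (assumption) ---
# Plausible definitions for the utils.decay_lr / utils.print_lr helpers used in
# run() above: decay multiplies every param group's learning rate by the given
# rate, print_lr just reports the current values. The project's real helpers
# may log differently.
def decay_lr(optimizer, decay_rate):
    for group in optimizer.param_groups:
        group['lr'] *= decay_rate

def print_lr(optimizer, prefix, epoch):
    lrs = [group['lr'] for group in optimizer.param_groups]
    print('{} E{:03d} lr: {}'.format(prefix, epoch, lrs))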
def train(epoch):
    print('\nEpoch: %d' % epoch)
    snet.train()
    decoder.train()
    train_loss = 0
    train_cls_loss = 0

    conf_mat = np.zeros((NUM_CLASSES, NUM_CLASSES))
    conf_mat_a = np.zeros((NUM_CLASSES, NUM_CLASSES))
    conf_mat_b = np.zeros((NUM_CLASSES, NUM_CLASSES))

    if epoch > learning_rate_decay_start and learning_rate_decay_start >= 0:
        frac = (epoch - learning_rate_decay_start) // learning_rate_decay_every
        decay_factor = learning_rate_decay_rate ** frac
        current_lr = args.lr * decay_factor
        utils.set_lr(optimizer, current_lr)  # set the decayed rate
    else:
        current_lr = args.lr
    print('learning_rate: %s' % str(current_lr))

    for batch_idx, (img_teacher, img_student, target) in enumerate(trainloader):
        if args.cuda:
            img_teacher = img_teacher.cuda(non_blocking=True)
            img_student = img_student.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

        optimizer.zero_grad()

        if args.augmentation:
            img_teacher, teacher_target_a, teacher_target_b, teacher_lam = mixup_data(
                img_teacher, target, 0.6)
            img_teacher, teacher_target_a, teacher_target_b = map(
                Variable, (img_teacher, teacher_target_a, teacher_target_b))
            img_student, student_target_a, student_target_b, student_lam = mixup_data(
                img_student, target, 0.6)
            img_student, student_target_a, student_target_b = map(
                Variable, (img_student, student_target_a, student_target_b))
        else:
            img_teacher, img_student, target = Variable(img_teacher), Variable(img_student), Variable(target)

        rb1_s, rb2_s, rb3_s, mimic_s, out_s = snet(img_student)
        rb1_t, rb2_t, rb3_t, mimic_t, out_t = tnet(img_teacher)

        if args.augmentation:
            cls_loss = mixup_criterion(Cls_crit, out_s, student_target_a, student_target_b, student_lam)
        else:
            cls_loss = Cls_crit(out_s, target)
        kd_loss = KD_T_crit(out_t, out_s)

        if args.distillation == 'KD':
            loss = 0.2 * cls_loss + 0.8 * kd_loss
        elif args.distillation == 'DE':
            new_rb1_s = decoder(rb1_s)
            decoder_loss = losses.styleLoss(img_teacher, new_rb1_s.cuda(), MSE_crit)
            loss = 0.2 * cls_loss + 0.8 * kd_loss + 0.1 * decoder_loss
        elif args.distillation == 'AS':
            rb2_loss = losses.Absdiff_Similarity(rb2_t, rb2_s).cuda()
            loss = 0.2 * cls_loss + 0.8 * kd_loss + 0.9 * rb2_loss
        elif args.distillation == 'DEAS':
            new_rb1_s = decoder(rb1_s)
            decoder_loss = losses.styleLoss(img_teacher, new_rb1_s.cuda(), MSE_crit)
            rb2_loss = losses.Absdiff_Similarity(rb2_t, rb2_s).cuda()
            loss = 0.2 * cls_loss + 0.8 * kd_loss + 0.1 * decoder_loss + 0.9 * rb2_loss
        elif args.distillation == 'SSDEAS':
            new_rb1_s = decoder(rb1_s)
            decoder_loss = losses.styleLoss(img_teacher, new_rb1_s.cuda(), MSE_crit)
            rb2_loss = losses.Absdiff_Similarity(rb2_t, rb2_s).cuda()
            loss = 0 * cls_loss + 0 * kd_loss + 0.1 * decoder_loss + 0.9 * rb2_loss
        else:
            raise Exception('Invalid distillation name...')

        loss.backward()
        utils.clip_gradient(optimizer, 0.1)
        optimizer.step()

        train_loss += loss.item()
        train_cls_loss += cls_loss.item()

        if args.augmentation:
            conf_mat_a += losses.confusion_matrix(out_s, student_target_a, NUM_CLASSES)
            acc_a = sum([conf_mat_a[i, i] for i in range(conf_mat_a.shape[0])]) / conf_mat_a.sum()
            precision_a = np.array([conf_mat_a[i, i] / (conf_mat_a[i].sum() + 1e-10)
                                    for i in range(conf_mat_a.shape[0])])
            recall_a = np.array([conf_mat_a[i, i] / (conf_mat_a[:, i].sum() + 1e-10)
                                 for i in range(conf_mat_a.shape[0])])
            mAP_a = sum(precision_a) / len(precision_a)
            F1_score_a = (2 * precision_a * recall_a / (precision_a + recall_a + 1e-10)).mean()

            conf_mat_b += losses.confusion_matrix(out_s, student_target_b, NUM_CLASSES)
            acc_b = sum([conf_mat_b[i, i] for i in range(conf_mat_b.shape[0])]) / conf_mat_b.sum()
            precision_b = np.array([conf_mat_b[i, i] / (conf_mat_b[i].sum() + 1e-10)
                                    for i in range(conf_mat_b.shape[0])])
            recall_b = np.array([conf_mat_b[i, i] / (conf_mat_b[:, i].sum() + 1e-10)
                                 for i in range(conf_mat_b.shape[0])])
            mAP_b = sum(precision_b) / len(precision_b)
            F1_score_b = (2 * precision_b * recall_b / (precision_b + recall_b + 1e-10)).mean()

            acc = student_lam * acc_a + (1 - student_lam) * acc_b
            mAP = student_lam * mAP_a + (1 - student_lam) * mAP_b
            F1_score = student_lam * F1_score_a + (1 - student_lam) * F1_score_b
        else:
            conf_mat += losses.confusion_matrix(out_s, target, NUM_CLASSES)
            acc = sum([conf_mat[i, i] for i in range(conf_mat.shape[0])]) / conf_mat.sum()
            precision = [conf_mat[i, i] / (conf_mat[i].sum() + 1e-10) for i in range(conf_mat.shape[0])]
            mAP = sum(precision) / len(precision)
            recall = [conf_mat[i, i] / (conf_mat[:, i].sum() + 1e-10) for i in range(conf_mat.shape[0])]
            precision = np.array(precision)
            recall = np.array(recall)
            f1 = 2 * precision * recall / (precision + recall + 1e-10)
            F1_score = f1.mean()

        # utils.progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% | mAP: %.3f%% | F1: %.3f%%'
        #                    % (train_loss/(batch_idx+1), 100.*acc, 100.*mAP, 100.*F1_score))

    return train_cls_loss / (batch_idx + 1), 100. * acc, 100. * mAP, 100 * F1_score
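# --- Hedged refactoring sketch (not in the original) ---
# The accuracy / precision / recall / macro-F1 arithmetic above, factored into
# one helper so the mixup and non-mixup branches could share it. It mirrors the
# formulas used in train(): diagonal over total for accuracy, row-normalised
# precision, column-normalised recall, macro averages for mAP and F1.
import numpy as np

def metrics_from_conf_mat(conf_mat, eps=1e-10):
    tp = np.diag(conf_mat).astype(float)
    acc = tp.sum() / conf_mat.sum()
    precision = tp / (conf_mat.sum(axis=1) + eps)  # same row convention as above
    recall = tp / (conf_mat.sum(axis=0) + eps)     # same column convention as above
    mAP = precision.mean()
    F1_score = (2 * precision * recall / (precision + recall + eps)).mean()
    return acc, mAP, F1_score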
def train(epoch):
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0

    conf_mat = np.zeros((NUM_CLASSES, NUM_CLASSES))
    conf_mat_a = np.zeros((NUM_CLASSES, NUM_CLASSES))
    conf_mat_b = np.zeros((NUM_CLASSES, NUM_CLASSES))

    if epoch > learning_rate_decay_start and learning_rate_decay_start >= 0:
        frac = (epoch - learning_rate_decay_start) // learning_rate_decay_every
        decay_factor = learning_rate_decay_rate ** frac
        current_lr = args.lr * decay_factor
        utils.set_lr(optimizer, current_lr)  # set the decayed rate
    else:
        current_lr = args.lr
    print('learning_rate: %s' % str(current_lr))

    for batch_idx, (inputs, targets) in enumerate(trainloader):
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        optimizer.zero_grad()

        if args.augmentation:
            inputs, targets_a, targets_b, lam = mixup_data(inputs, targets, 0.6)
            inputs, targets_a, targets_b = map(Variable, (inputs, targets_a, targets_b))
        else:
            inputs, targets = Variable(inputs), Variable(targets)

        _, _, _, _, outputs = net(inputs)

        if args.augmentation:
            loss = mixup_criterion(criterion, outputs, targets_a, targets_b, lam)
        else:
            loss = criterion(outputs, targets)

        loss.backward()
        utils.clip_gradient(optimizer, 0.1)
        optimizer.step()

        train_loss += loss.item()

        if args.augmentation:
            conf_mat_a += losses.confusion_matrix(outputs, targets_a, NUM_CLASSES)
            acc_a = sum([conf_mat_a[i, i] for i in range(conf_mat_a.shape[0])]) / conf_mat_a.sum()
            precision_a = np.array([conf_mat_a[i, i] / (conf_mat_a[i].sum() + 1e-10)
                                    for i in range(conf_mat_a.shape[0])])
            recall_a = np.array([conf_mat_a[i, i] / (conf_mat_a[:, i].sum() + 1e-10)
                                 for i in range(conf_mat_a.shape[0])])
            mAP_a = sum(precision_a) / len(precision_a)
            F1_score_a = (2 * precision_a * recall_a / (precision_a + recall_a + 1e-10)).mean()

            conf_mat_b += losses.confusion_matrix(outputs, targets_b, NUM_CLASSES)
            acc_b = sum([conf_mat_b[i, i] for i in range(conf_mat_b.shape[0])]) / conf_mat_b.sum()
            precision_b = np.array([conf_mat_b[i, i] / (conf_mat_b[i].sum() + 1e-10)
                                    for i in range(conf_mat_b.shape[0])])
            recall_b = np.array([conf_mat_b[i, i] / (conf_mat_b[:, i].sum() + 1e-10)
                                 for i in range(conf_mat_b.shape[0])])
            mAP_b = sum(precision_b) / len(precision_b)
            F1_score_b = (2 * precision_b * recall_b / (precision_b + recall_b + 1e-10)).mean()

            acc = lam * acc_a + (1 - lam) * acc_b
            mAP = lam * mAP_a + (1 - lam) * mAP_b
            F1_score = lam * F1_score_a + (1 - lam) * F1_score_b
        else:
            conf_mat += losses.confusion_matrix(outputs, targets, NUM_CLASSES)
            acc = sum([conf_mat[i, i] for i in range(conf_mat.shape[0])]) / conf_mat.sum()
            precision = [conf_mat[i, i] / (conf_mat[i].sum() + 1e-10) for i in range(conf_mat.shape[0])]
            mAP = sum(precision) / len(precision)
            recall = [conf_mat[i, i] / (conf_mat[:, i].sum() + 1e-10) for i in range(conf_mat.shape[0])]
            precision = np.array(precision)
            recall = np.array(recall)
            f1 = 2 * precision * recall / (precision + recall + 1e-10)
            F1_score = f1.mean()

        # utils.progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% | mAP: %.3f%% | F1: %.3f%%'
        #                    % (train_loss/(batch_idx+1), 100.*acc, 100.*mAP, 100.*F1_score))

    return train_loss / (batch_idx + 1), 100. * acc, 100. * mAP, 100 * F1_score
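# --- Hedged sketch (assumption, not the project's losses.confusion_matrix) ---
# A confusion-matrix accumulator compatible with how it is called above
# (logits, integer class targets, NUM_CLASSES) -> (NUM_CLASSES x NUM_CLASSES)
# numpy array. The real implementation's row/column convention is not shown
# here, so treat this layout (rows = targets, columns = argmax predictions)
# as an assumption.
import numpy as np

def confusion_matrix_sketch(outputs, targets, num_classes):
    preds = outputs.argmax(dim=1).detach().cpu().numpy()
    trues = targets.detach().cpu().numpy()
    mat = np.zeros((num_classes, num_classes))
    for t, p in zip(trues, preds):
        mat[t, p] += 1
    return mat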