def train(epoch):
    net.train()
    net.training = True
    train_loss = 0
    correct = 0
    total = 0
    optimizer = optim.SGD(net.parameters(), lr=cf.learning_rate(args.lr, epoch),
                          momentum=0.9, weight_decay=5e-4)
    # Freeze every parameter whose name does not contain "param" by zeroing its gradient.
    for name, param in net.named_parameters():
        print(name, param.shape)
        if "param" not in name:
            param.register_hook(lambda grad: grad * 0)
    # for name, module in net.named_modules():
    #     module.register_hook(lambda grad: grad * 0)
    # net.module.conv1.weight.register_hook(lambda grad: grad * 0)
    print('\n=> Training Epoch #%d, LR=%.4f' % (epoch, cf.learning_rate(args.lr, epoch)))
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()  # GPU settings
        optimizer.zero_grad()
        inputs, targets = Variable(inputs), Variable(targets)
        outputs = net(inputs)               # Forward Propagation
        loss = criterion(outputs, targets)  # Loss
        loss.backward()                     # Backward Propagation
        optimizer.step()                    # Optimizer update
        # print(net.module.conv1.weight[1])
        # print(net.module.layer1[1].conv1.weight[1])
        # print(net.module.layer1[1].parameter)
        # Rank the learnable importance parameters and persist the ordering.
        feature_ranks = {}
        for name, param in net.named_parameters():
            print(name, param.shape)
            if "param" in name:
                print(name)
                sorted_features = torch.argsort(param, descending=True)
                feature_ranks[name] = sorted_features
                print(sorted_features)
        np.save("ranks.npy", feature_ranks)
        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum()
        sys.stdout.write('\r')
        sys.stdout.write('| Epoch [%3d/%3d] Iter[%3d/%3d]\t\tLoss: %.4f Acc@1: %.3f%%'
                         % (epoch, num_epochs, batch_idx + 1,
                            (len(trainset) // batch_size) + 1,
                            loss.item(), 100. * correct / total))
        sys.stdout.flush()
def train(epoch):
    net.train()
    net.training = True
    train_loss = 0
    correct = 0
    total = 0
    # Core parameters keep weight decay; the multiplicative parameters are excluded from it.
    optimizer = optim.SGD([
        {'params': param_core, 'weight_decay': 5e-4},
        {'params': params_multi, 'weight_decay': 0.0}
    ], lr=cf.learning_rate(args.lr, epoch), momentum=0.9)
    print('\n=> Training Epoch #%d, LR=%.4f' % (epoch, cf.learning_rate(args.lr, epoch)))
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        # Replicate each example once per ensemble member.
        inputs = tile(inputs, 0, ensemble_size)
        targets = tile(targets, 0, ensemble_size)
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()  # GPU settings
        optimizer.zero_grad()
        inputs, targets = Variable(inputs), Variable(targets)
        outputs = net(inputs)  # Forward Propagation
        loss = criterion(outputs, targets) + mu_div * loss_latent_from_nn(net)  # Loss + diversity term
        loss.backward()   # Backward Propagation
        optimizer.step()  # Optimizer update
        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum()
        sys.stdout.write('\r')
        sys.stdout.write('| Epoch [%3d/%3d] Iter[%3d/%3d]\t\tLoss: %.4f Acc@1: %.3f%%'
                         % (epoch, num_epochs, batch_idx + 1,
                            (len(trainset) // batch_size) + 1,
                            loss.item(), 100. * correct / total))
        sys.stdout.flush()
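# NOTE: `tile` and `loss_latent_from_nn` are defined elsewhere in this codebase.
# Below is a minimal sketch of what a `tile` helper with this call signature
# usually does -- repeating each example along the batch dimension so every
# ensemble member sees the same mini-batch. This is an assumption, not the
# repo's actual definition.
def tile(a, dim, n_tile):
    # Repeat each slice of `a` n_tile times along `dim`,
    # e.g. tile([x1, x2], 0, 2) -> [x1, x1, x2, x2].
    return torch.repeat_interleave(a, n_tile, dim=dim)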
def train(epoch):
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    optimizer = optim.SGD(net.parameters(), lr=cf.learning_rate(args.lr, epoch),
                          momentum=0.9, weight_decay=5e-4)
    print('\n=> Training Epoch #%d, LR=%.4f' % (epoch, cf.learning_rate(args.lr, epoch)))
    for batch_idx, (inputs, targets, weights) in enumerate(api.train_loader):
        if use_cuda:
            inputs, targets, weights = inputs.cuda(), targets.cuda(), weights.cuda()  # GPU settings
        optimizer.zero_grad()
        # print('data shapes ', 'inputs ', inputs.shape, 'targets ', targets.shape, 'weights ', weights.shape)
        inputs, targets = Variable(inputs), Variable(targets)
        outputs = net(inputs)  # Forward Propagation
        loss = api.loss_func(outputs, targets, weights)  # Weighted loss
        loss.backward()   # Backward Propagation
        optimizer.step()  # Optimizer update
        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum()
    # Cluster the training trajectory and reweight the data.
    if epoch >= args.burn_in and ((epoch - args.burn_in) % args.interval) == 0:
        api.clusterTrajectory()         # run GMM clustering
        api.reweightData(net, 1000000)  # update train_loader weights
        weight_his.append(np.expand_dims(api.weight_tensor.detach().numpy(), axis=1).tolist())
        api.generateTrainLoader()
    train_loss = train_loss / total
    acc = 100. * correct.item() / total
    print('train loss\t\t', train_loss)
    print('correct\t\t', correct, '\t\ttotal\t\t', total)
    print('acc\t\t', acc)
    train_loss_his.append(train_loss)
    train_acc_his.append(acc)
    # record trajectory
    api.createTrajectory(net)
    print('| Epoch [%3d/%3d] \t\tLoss: %.4f Acc@1: %.3f%%'
          % (epoch, num_epochs, train_loss, 100. * correct / total))
def train(epoch):
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    optimizer = optim.SGD(net.parameters(), lr=cf.learning_rate(args.lr, epoch),
                          momentum=0.9, weight_decay=5e-4)
    print('\n=> Training Epoch #%d, LR=%.4f' % (epoch, cf.learning_rate(args.lr, epoch)))
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()  # GPU settings
        optimizer.zero_grad()
        inputs, targets = Variable(inputs), Variable(targets)
        outputs = net(inputs)  # Forward Propagation
        loss = criterion(outputs, targets)  # Loss
        loss.backward()   # Backward Propagation
        optimizer.step()  # Optimizer update
        train_loss += loss.item()  # loss.data[0] is deprecated; use .item()
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum()
        sys.stdout.write('\r')
        sys.stdout.write('| Epoch [%3d/%3d] Iter[%3d/%3d]\t\tLoss: %.4f Acc@1: %.3f%%'
                         % (epoch, num_epochs, batch_idx + 1,
                            (len(trainset) // batch_size) + 1,
                            loss.item(), 100. * correct.numpy() / total))
        sys.stdout.flush()
def train(net, dataloader, optimizer, epoch):
    criterion = nn.CrossEntropyLoss()
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    print('\n=> [%s] Training Epoch #%d, lr=%.4f'
          % (model_name, epoch, cf.learning_rate(args.lr, epoch)))
    log_file.write('\n=> [%s] Training Epoch #%d, lr=%.4f\n'
                   % (model_name, epoch, cf.learning_rate(args.lr, epoch)))
    for batch_idx, (inputs, targets) in enumerate(dataloader):
        inputs = inputs.to(device)
        targets = targets.to(device)
        # Obtain soft_target by forwarding the data in test mode (self-distillation).
        if epoch >= args.distill_from and args.distill > 0:
            with torch.no_grad():
                net.eval()
                soft_target = net(inputs)
                net.train()
        optimizer.zero_grad()
        outputs = net(inputs)  # Forward Propagation
        loss = criterion(outputs, targets)  # Loss
        # Compute the distillation loss at temperature args.temp.
        if epoch >= args.distill_from and args.distill > 0:
            heat_output = outputs / args.temp
            heat_soft_target = soft_target / args.temp
            distill_loss = F.kl_div(F.log_softmax(heat_output, 1),
                                    F.softmax(heat_soft_target, 1),
                                    size_average=False) / targets.size(0) * (args.temp * args.temp)
            loss = loss + args.distill * distill_loss
        loss.backward()   # Backward Propagation
        optimizer.step()  # Optimizer update
        train_loss += loss.item()
        _, predicted = torch.max(outputs.detach(), 1)
        total += targets.size(0)
        correct += predicted.eq(targets.detach()).long().sum().item()
        if math.isnan(loss.item()):
            print('@@@@@@@ nan @@@@@@@')
            log_file.write('@@@@@@@ nan @@@@@@@\n')
            sys.exit(0)
        sys.stdout.write('\r')
        sys.stdout.write('| Epoch [%3d/%3d] Iter[%3d/%3d]\t\tLoss: %.4f Acc@1: %.3f%%'
                         % (epoch, args.num_epochs, batch_idx + 1,
                            (len(trainset) // args.bs) + 1,
                            loss.item(), 100. * correct / total))
        sys.stdout.flush()
    log_file.write('| Epoch [%3d/%3d] \t\tLoss: %.4f Acc@1: %.3f%%'
                   % (epoch, args.num_epochs, loss.item(), 100. * correct / total))
def train(epoch):
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    if optim_type == 'SGD':
        optimizer = optim.SGD(net.parameters(), lr=cf.learning_rate(args.lr, epoch),
                              momentum=0.9, weight_decay=5e-4, nesterov=True)
    elif optim_type == 'ADAM':
        optimizer = optim.Adam(net.parameters(), lr=args.lr, weight_decay=5e-4)
    else:
        raise AssertionError("Unknown optimizer name: {}".format(optim_type))
    print('\n=> Training Epoch #%d, LR=%.4f' % (epoch, cf.learning_rate(args.lr, epoch)))
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()  # GPU settings
            net.cuda()
        optimizer.zero_grad()
        inputs, targets = Variable(inputs), Variable(targets)
        outputs = net(inputs)  # Forward Propagation
        loss1 = criterion(outputs, targets)  # Loss
        loss2, e_loss, v_loss, E_loss, V_loss = regularization(net)
        loss = loss1 + loss2
        loss.backward()   # Backward Propagation
        optimizer.step()  # Optimizer update
        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum().float()
        acc = 100.0 * correct / total
        iter = (epoch - 1) * iters_in_epoch + batch_idx
        writer.add_scalar('train/accuracy', acc, iter)
        writer.add_scalar('train/loss1', loss1.item(), iter)
        writer.add_scalar('train/loss2', loss2.item(), iter)
        writer.add_scalar('train/loss2_e', e_loss.item(), iter)
        writer.add_scalar('train/loss2_v', v_loss.item(), iter)
        writer.add_scalar('train/loss', loss.item(), iter)
        scalar_to_tensorboard(E_loss, 'train/E_loss', writer, iter)
        scalar_to_tensorboard(V_loss, 'train/V_loss', writer, iter)
        histogram_to_tensorboard(net.e_net, 'train/e_net', writer, iter)
        histogram_to_tensorboard(net.v_net, 'train/v_net', writer, iter)
        sys.stdout.write('\r')
        sys.stdout.write('| Epoch [%3d/%3d] Iter[%3d/%3d]\t\tLoss: %.4f Acc@1: %.3f%%'
                         % (epoch, num_epochs, batch_idx + 1,
                            (len(trainset) // batch_size) + 1, loss.item(), acc))
        sys.stdout.flush()
def train(epoch):
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    optimizer = optim.Adam(net.parameters(), lr=cf.learning_rate(cf.lr, epoch),
                           weight_decay=cf.weight_decay)
    print('\n=> Training Epoch #%d, LR=%.4f' % (epoch, cf.learning_rate(cf.lr, epoch)))
    m = math.ceil(len(testset) / cf.batch_size)
    for batch_idx, (inputs_value, targets) in enumerate(trainloader):
        # targets = torch.tensor(targets)
        x = inputs_value.view(-1, inputs, resize, resize)
        y = targets
        if use_cuda:
            x, y = x.cuda(), y.cuda()  # GPU settings
        # KL weight for the ELBO (string comparison with `is` was a bug; use ==).
        if cf.beta_type == "Blundell":
            beta = 2 ** (m - (batch_idx + 1)) / (2 ** m - 1)
        elif cf.beta_type == "Soenderby":
            beta = min(epoch / (cf.num_epochs // 4), 1)
        elif cf.beta_type == "Standard":
            beta = 1 / m
        else:
            beta = 0
        # Forward Propagation
        x, y = Variable(x), Variable(y)
        outputs, kl = net.probforward(x)
        loss = vi(outputs, y, kl, beta)  # Loss
        optimizer.zero_grad()
        loss.backward()   # Backward Propagation
        optimizer.step()  # Optimizer update
        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += predicted.eq(y.data).cpu().sum()
        sys.stdout.write('\r')
        sys.stdout.write('| Epoch [%3d/%3d] Iter[%3d/%3d]\t\tLoss: %.4f Acc@1: %.3f%%'
                         % (epoch, cf.num_epochs, batch_idx + 1,
                            (len(trainset) // cf.batch_size) + 1,
                            loss.item(), (100 * (correct.item() / total))))
        sys.stdout.flush()
    trainLoss.append(loss.item())
    trainAcc.append(100 * (correct.item() / total))
    diagnostics_to_write = {'Epoch': epoch, 'Loss': loss.item(),
                            'Accuracy': (100 * (correct.item() / total))}
    with open(logfile, 'a') as lf:
        lf.write(str(diagnostics_to_write))
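# NOTE: `vi` is the variational objective used above and is defined elsewhere.
# As a hedged sketch, a function with this signature typically evaluates the
# negative ELBO: a data-fit term plus the KL term scaled by the per-batch
# `beta` (Blundell et al.'s weighting computed above). The actual `vi` in this
# codebase may scale the likelihood term differently.
def vi(outputs, y, kl, beta):
    # Negative ELBO: cross-entropy data term + beta-weighted KL complexity term.
    return F.cross_entropy(outputs, y) + beta * kl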
def train(epoch):
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    temp1_accum = 0
    temp2_accum = 0
    optimizer = optim.SGD(net.parameters(), lr=cf.learning_rate(args.lr, epoch),
                          nesterov=True, momentum=0.9, weight_decay=5e-4)
    print('\n=> Training Epoch #%d, LR=%.4f' % (epoch, cf.learning_rate(args.lr, epoch)))
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()  # GPU settings
        optimizer.zero_grad()
        inputs, targets = Variable(inputs), Variable(targets)
        outputs = net(inputs)  # Forward Propagation
        loss = criterion(outputs, targets)  # Loss
        # Running mean of the CE loss over batches.
        temp1_accum = loss.detach().cpu() * (1. / (batch_idx + 1.)) \
            + temp1_accum * (batch_idx / (batch_idx + 1.))
        if args.loss == 'bce':
            loss = torch.zeros(1).cuda()
        if 'bce' in args.loss:
            bce_targets = target_transform_for_elementwise_bce(targets, num_classes).cuda()
            if num_classes > 30:
                new_outputs, new_targets = sampling_for_loss(outputs, targets)
                temp2 = criterion2(F.sigmoid(new_outputs), new_targets).cuda()
            else:
                temp2 = criterion2(F.sigmoid(outputs), bce_targets)
            # Running mean of the BCE loss over batches.
            temp2_accum = temp2.detach().cpu() * (1. / (batch_idx + 1.)) \
                + temp2_accum * (batch_idx / (batch_idx + 1.))
            loss += temp2
        loss.backward()   # Backward Propagation
        optimizer.step()  # Optimizer update
        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum()
        sys.stdout.write('\r')
        sys.stdout.write('| Epoch [%3d/%3d] Iter[%3d/%3d]\t\tLoss: %.4f CE_loss : %.4f BCE_loss : %4f Acc@1: %.3f%%'
                         % (epoch, num_epochs, batch_idx + 1,
                            (len(trainset) // batch_size) + 1,
                            loss.item(), temp1_accum, temp2_accum, 100. * correct / total))
        sys.stdout.flush()
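# NOTE: `target_transform_for_elementwise_bce` is defined elsewhere in this
# codebase. A plausible minimal version, assuming it simply one-hot encodes the
# integer class labels as float targets for element-wise BCE; this sketch is an
# assumption, not the repo's actual helper.
def target_transform_for_elementwise_bce(targets, num_classes):
    # One float target per output node: 1 for the true class, 0 elsewhere.
    return F.one_hot(targets, num_classes).float()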
def train(epoch):
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    # When resuming, fine-tune only the final linear layer.
    if args.resume:
        params = net.module.linear.parameters()
    else:
        params = net.parameters()
    optimizer = optim.SGD(params, lr=cf.learning_rate(args.lr, epoch),
                          momentum=0.9, weight_decay=5e-4)
    print('\n=> Training Epoch #%d, LR=%.4f' % (epoch, cf.learning_rate(args.lr, epoch)))
    loader = trainloader_noise if use_noise else trainloader_clean
    for batch_idx, (inputs_c, targets_c) in enumerate(loader):
        if use_cuda:
            inputs_c, targets_c = inputs_c.cuda(), targets_c.cuda()
        optimizer.zero_grad()
        if sim_learning:
            (outputs, matrices_reg) = net(inputs_c, compute_similarity=True)
            (_, matrices_rob) = robustNet(inputs_c, img_type="clean", compute_similarity=True)
            # Penalize dissimilarity between the regular and robust networks' similarity matrices.
            loss_similarity = 0.
            for i, (r, g) in enumerate(zip(matrices_reg, matrices_rob)):
                sim_loss = get_sim_loss(i, r, g, 1e-4)
                loss_similarity = loss_similarity + sim_loss
            loss = criterion(outputs, targets_c) + loss_similarity  # Loss
        else:
            outputs = net(inputs_c, compute_similarity=False)
            loss = criterion(outputs, targets_c)
        loss.backward()
        optimizer.step()  # Optimizer update
        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += targets_c.size(0)
        correct += predicted.eq(targets_c.data).cpu().sum()
        sys.stdout.write('\r')
        sys.stdout.write('| Epoch [%3d/%3d] Iter[%3d/%3d]\t\t Loss: %.4f Acc@1: %.3f%%'
                         % (epoch, num_epochs, batch_idx + 1,
                            (len(trainset_noise) // batch_size) + 1,
                            loss.item(), 100. * correct / total))
        sys.stdout.flush()
def train(epoch):
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    optimizer = optim.SGD(net.parameters(), lr=cf.learning_rate(args.lr, epoch),
                          momentum=0.9, weight_decay=5e-4)
    print('\n=> Training Epoch #%d, LR=%.4f' % (epoch, cf.learning_rate(args.lr, epoch)))
    for batch_idx, ((inputs1, targets1), (inputs2, targets2)) in enumerate(
            zip(trainloader_noise, trainloader_clean)):
        if use_cuda:
            inputs1, targets1 = inputs1.cuda(), targets1.cuda()  # GPU settings
            inputs2, targets2 = inputs2.cuda(), targets2.cuda()
        optimizer.zero_grad()
        outputs_n = net(inputs1, img_type="noise", compute_similarity=False)
        l1 = criterion(outputs_n, targets1)
        # l1.backward(retain_graph=False)
        # optimizer.step()
        # optimizer.zero_grad()
        outputs_c = net(inputs2, img_type="clean", compute_similarity=False)
        l2 = criterion(outputs_c, targets2)
        # l2.backward(retain_graph=False)
        # optimizer.step()
        # optimizer.zero_grad()
        l3 = w_loss(outputs_n, outputs_c)
        readout_losses.append(l3.item())
        # l3.backward(retain_graph=False)
        # optimizer.step()  # Optimizer update
        # Single combined update over all three losses.
        loss = l1 + l2 + l3
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, predicted = torch.max(outputs_c.data, 1)
        total += targets2.size(0)
        correct += predicted.eq(targets2.data).cpu().sum()
        sys.stdout.write('\r')
        sys.stdout.write('| Epoch [%3d/%3d] Iter[%3d/%3d]\t\t Total Loss: %.4f Acc@1: %.3f%%'
                         % (epoch, num_epochs, batch_idx + 1,
                            (len(trainset_noise) // batch_size) + 1,
                            loss.item(), 100. * correct / total))
        sys.stdout.flush()
def train(epoch):
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    optimizer = optim.Adam(net.parameters(), lr=args.lr, weight_decay=5e-4)
    print('\n=> Training Epoch #%d, LR=%.4f' % (epoch, cf.learning_rate(args.lr, epoch)))
    for batch_idx, (inputs_value, targets) in enumerate(trainloader):
        if use_cuda:
            inputs_value, targets = inputs_value.cuda(), targets.cuda()  # GPU settings
        optimizer.zero_grad()
        inputs_value, targets = Variable(inputs_value), Variable(targets)
        outputs = net.forward(inputs_value)  # Forward Propagation
        loss = criterion(outputs, targets)   # Loss
        loss.backward()   # Backward Propagation
        optimizer.step()  # Optimizer update
        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum()
        sys.stdout.write('\r')
        sys.stdout.write('| Epoch [%3d/%3d] Iter[%3d/%3d]\t\tLoss: %.4f Cor@1: %.3f%% \tTotal %.3f%%'
                         % (epoch, num_epochs, batch_idx + 1,
                            (len(trainset) // batch_size) + 1,
                            loss.item(), correct, total))
        sys.stdout.flush()
    diagnostics_to_write = {'Epoch': epoch, 'Loss': loss.item(),
                            'Accuracy': 100. * correct.item() / total}
    with open(logfile, 'a') as lf:
        lf.write(str(diagnostics_to_write))
def train(epoch):
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    optimizer = optim.SGD(net.parameters(), lr=cf.learning_rate(args.lr, epoch),
                          momentum=0.9, weight_decay=5e-4)
    print('\n=> Training Epoch #%d, LR=%.4f' % (epoch, cf.learning_rate(args.lr, epoch)))
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()  # GPU settings
        optimizer.zero_grad()
        # print('data shapes ', 'inputs ', inputs.shape, 'targets ', targets.shape)
        inputs, targets = Variable(inputs), Variable(targets)
        outputs = net(inputs)  # Forward Propagation
        loss = criterion(outputs, targets)  # Loss
        loss.backward()   # Backward Propagation
        optimizer.step()  # Optimizer update
        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum()
    train_loss = train_loss / total
    acc = 100. * correct.item() / total
    print('train loss\t\t', train_loss)
    print('correct\t\t', correct, '\t\ttotal\t\t', total)
    print('acc\t\t', acc)
    train_loss_his.append(train_loss)
    train_acc_his.append(acc)
    # sys.stdout.write('\r')
    # sys.stdout.write('| Epoch [%3d/%3d] Iter[%3d/%3d]'
    #                  % (epoch, num_epochs, batch_idx + 1,
    #                     (len(trainset) // batch_size) + 1))
    # sys.stdout.flush()
    print('| Epoch [%3d/%3d] \t\tLoss: %.4f Acc@1: %.3f%%'
          % (epoch, num_epochs, train_loss, acc))
def train(epoch):
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    print('\n=> Training Epoch #%d, LR=%.4f'
          % (epoch, cf.learning_rate(args.lr * batch_size, epoch, args.warmup_epoch,
                                     0, len(trainloader), hvd.size())))
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        # Per-iteration LR with warmup, scaled by batch size and worker count.
        lr = cf.learning_rate(args.lr * batch_size, epoch, args.warmup_epoch,
                              batch_idx, len(trainloader), hvd.size())
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        inputs, targets = inputs.cuda(), targets.cuda()  # GPU settings
        optimizer.zero_grad()
        inputs, targets = Variable(inputs), Variable(targets)
        outputs = net(inputs)  # Forward Propagation
        loss = criterion(outputs, targets)  # Loss
        loss.backward()   # Backward Propagation
        optimizer.step()  # Optimizer update
        train_loss += loss.data.item()
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum()
        print('| Epoch [%3d/%3d] Iter[%3d/%3d]\t\tLoss: %.4f Acc@1: %.3f%% LR: %.8f'
              % (epoch, num_epochs, batch_idx + 1,
                 (len(trainset) // batch_size) + 1,
                 loss.data.item(), 100. * correct / total, lr))
    # Only rank 0 writes the checkpoint.
    if hvd.rank() == 0:
        save_dict = {
            "epoch": epoch,
            "optimizer": optimizer.state_dict(),
            "state_dict": net.state_dict()
        }
        torch.save(save_dict,
                   os.path.join('/home/lunit/Pytorch-Horovod-Examples/examples/cifar100/checkpoints/',
                                'cifar100_last.pth.tar'))
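# NOTE: the `cf.learning_rate` call above takes warmup and worker-count
# arguments. A hedged sketch of the linear-warmup, linearly-scaled schedule
# this Horovod setup suggests; the real `cf` implementation may differ
# (e.g. adding step decay after warmup).
def learning_rate(base_lr, epoch, warmup_epochs, batch_idx, batches_per_epoch, num_workers):
    # Ramp from base_lr to base_lr * num_workers over the warmup epochs,
    # measured in fractional epochs; afterwards use the fully scaled rate.
    progress = epoch - 1 + float(batch_idx) / batches_per_epoch
    if progress < warmup_epochs:
        return base_lr * (1.0 + progress / warmup_epochs * (num_workers - 1))
    return base_lr * num_workers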
def train(epoch):
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    num_classes = args.num_classes
    optimizer = optim.SGD(net.parameters(), lr=cf.learning_rate(args.lr, epoch),
                          nesterov=True, momentum=0.9, weight_decay=5e-4)
    # optimizer = optim.Adam(net.parameters(), lr=cf.learning_rate(args.lr, epoch),
    #                        betas=(0.5, 0.999), weight_decay=5e-4)
    print('\n=> Training Epoch #%d, LR=%.4f' % (epoch, cf.learning_rate(args.lr, epoch)))
    # Train the extra "unknown" output node on out-of-distribution data.
    for batch_idx, (inputs, _) in enumerate(out_testloader):
        if use_cuda:
            inputs = inputs.cuda()  # GPU settings
        optimizer.zero_grad()
        inputs = Variable(inputs)
        outputs = net(inputs)  # Forward Propagation
        # targets = torch.ones_like(outputs[:, -1]).cuda()
        targets = torch.zeros_like(outputs).cuda()
        targets[:, -1] = 1  # the last node is the out-of-distribution class
        loss = F.binary_cross_entropy_with_logits(outputs, targets)
        loss.backward()
        optimizer.step()
        max_logit, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += predicted.eq(10).cpu().sum()  # hard-coded OOD class index
        sys.stdout.write('\r')
        sys.stdout.write('| Epoch [%3d/%3d] Iter[%3d/%3d]\t\tLoss: %.4f Acc@1: %.3f%%'
                         % (epoch, num_epochs, batch_idx + 1,
                            (len(out_testset) // batch_size) + 1,
                            loss.item(), float(100.00 * float(correct) / float(total))))
        sys.stdout.flush()
def train(epoch):
    net.train()
    net.training = True
    train_loss = 0
    correct = 0
    total = 0
    optimizer = optim.SGD(
        net.parameters(),
        lr=cf.learning_rate(args.lr, epoch),
        momentum=0.9,
        weight_decay=5e-4,
    )
    logbook.write_message(
        f"Training Epoch {epoch}, LR {cf.learning_rate(args.lr, epoch)}"
    )
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()  # GPU settings
        optimizer.zero_grad()
        outputs = net(inputs)  # Forward Propagation
        loss = criterion(outputs, targets)  # Loss
        loss.backward()   # Backward Propagation
        optimizer.step()  # Optimizer update
        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum()
        # sys.stdout.write("\r")
        # sys.stdout.write(
        #     "| Epoch [%3d/%3d] Iter[%3d/%3d]\t\tLoss: %.4f Acc@1: %.3f%%"
        #     % (epoch, num_epochs, batch_idx + 1,
        #        (len(trainset) // batch_size) + 1,
        #        loss.item(), 100.0 * correct / total)
        # )
        logbook.write_metric(
            {
                "epoch": epoch,
                "iter": batch_idx + 1,
                "loss": loss.item(),
                "acc@1": 100.0 * correct.item() / total,
                "mode": "train",
            }
        )
def train(epoch):
    model.train()
    train_loss = 0
    correct = 0
    optimizer = optim.SGD(model.parameters(), lr=config.learning_rate(0.1, epoch),
                          momentum=0.9, weight_decay=5e-4)
    # optimizer = optim.SGD(model.parameters(), lr=0.1 * 0.0008, momentum=0.9, weight_decay=5e-4)
    for batch_idx, (data, target) in enumerate(train_loader):
        if use_cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)
        if batch_idx == 0:
            torch.save(data, './data.pkl')
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 10 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            log_value('loss', loss, 391 * (epoch - 1) + batch_idx)
        # sum up batch loss
        train_loss += criterion(output, target).item()
        # get the index of the max log-probability
        pred = output.data.max(1, keepdim=True)[1]
        correct += pred.eq(target.data.view_as(pred)).cpu().sum()
    if epoch % 20 == 0:
        if torch.cuda.device_count() > 1:
            torch.save(model.module.state_dict(), OUTPATH + str(epoch))
    train_loss = train_loss / (len(train_loader.dataset) // BATCH)
    print('\nTrain set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        train_loss, correct, len(train_loader.dataset),
        100. * correct / len(train_loader.dataset)))
    log_value('train_acc', 100. * correct / len(train_loader.dataset), epoch)
def train(epoch, global_step):
    print('Epoch {:3d} {:3.2f}'.format(epoch, 0), end='')
    for batch in range(len(labels) // config.batch_size()):
        L = batch * config.batch_size()
        R = L + config.batch_size()
        mini_batch_images = images[:, L:R, :]
        mini_batch_labels = labels[L:R]
        feed_dict = {
            input_images: mini_batch_images,
            input_labels: mini_batch_labels,
            learning_rate: config.learning_rate(epoch=epoch, steps=global_step)
        }
        session.run(optimizer, feed_dict=feed_dict)
        # grads = tape.gradient(loss, model.trainable_variables)
        # optimizer.apply_gradients(zip(grads, model.trainable_variables),
        #                           global_step=tf.train.get_or_create_global_step())
        global_step += 1
        print('\rEpoch {:3d} {:3.2f}'.format(epoch, L * 100.0 / len(labels)), end='')
    print('\rEpoch {:3d} {:3.2f}'.format(epoch, 100.0), end='')
    # The increment above only changes the local name; return it so the caller
    # can keep counting steps across epochs.
    return global_step
def mlp_run(experiment_name, operand_bits, operator, hidden_units,
            str_device_num, nn_model_type, tlu_on):

    def train(sess, batch_input, batch_target, float_epoch, all_correct_val):
        _, _, _ = sess.run(
            [loss, op_accuracy, train_op],
            feed_dict={
                inputs: batch_input,
                targets: batch_target,
                condition_tlu: False,
                training_epoch: float_epoch,
                big_batch_training: big_batch_training_val,
                all_correct_epoch: (all_correct_val * float_epoch),
                all_correct: all_correct_val
            })

    def write_train_summary(sess, compute_nodes, batch_input, batch_target,
                            float_epoch, all_correct_val, step):
        # Run computing train loss, accuracy
        train_loss, train_accuracy, merged_summary_op_val = sess.run(
            compute_nodes,
            feed_dict={
                inputs: batch_input,
                targets: batch_target,
                condition_tlu: False,
                training_epoch: float_epoch,
                big_batch_training: big_batch_training_val,
                all_correct_epoch: (all_correct_val * float_epoch),
                all_correct: all_correct_val
            })
        ## print("epoch: {}, step: {}, train_loss: {}, train_accuracy: {}".format(epoch, step, train_loss, train_accuracy))
        # train_summary_writer.add_summary(merged_summary_op_val, step)
        return (train_loss, train_accuracy)

    def write_dev_summary(sess, compute_nodes, float_epoch, all_correct_val, step):
        (dev_loss, dev_accuracy, merged_summary_op_val, dev_op_wrong_val,
         per_digit_accuracy_val, per_digit_wrong_val) = sess.run(
            compute_nodes,
            feed_dict={
                inputs: input_dev,
                targets: target_dev,
                condition_tlu: False,
                training_epoch: float_epoch,
                big_batch_training: big_batch_training_val,
                all_correct_epoch: (all_correct_val * float_epoch),
                all_correct: all_correct_val
            })
        ## print("└ epoch: {}, step: {}, dev_loss: {}, dev_accuracy: {}, op_wrong: {}".format(epoch, step, dev_loss, dev_accuracy, op_wrong_val))
        # dev_summary_writer.add_summary(merged_summary_op_val, step)
        return (dev_loss, dev_accuracy, dev_op_wrong_val,
                per_digit_accuracy_val, per_digit_wrong_val)

    def write_tlu_dev_summary(sess, compute_nodes, float_epoch, all_correct_val, step):
        (dev_loss_tlu, dev_accuracy_tlu, merged_summary_op_val,
         dev_op_wrong_val_tlu, _, _) = sess.run(
            compute_nodes,
            feed_dict={
                inputs: input_dev,
                targets: target_dev,
                condition_tlu: True,
                training_epoch: float_epoch,
                big_batch_training: big_batch_training_val,
                all_correct_epoch: (all_correct_val * float_epoch),
                all_correct: all_correct_val
            })
        ## print("└ [TLU] epoch: {}, step: {}, dev_loss: {}, dev_accuracy: {}, op_wrong: {}".format(epoch, step, dev_loss_tlu, dev_accuracy_tlu, op_wrong_val_tlu))
        # tlu_summary_writer.add_summary(merged_summary_op_val, step)
        return (dev_loss_tlu, dev_accuracy_tlu, dev_op_wrong_val_tlu)

    def write_test_summary(sess, compute_nodes, float_epoch, all_correct_val, step):
        test_loss, test_accuracy, merged_summary_op_val, op_wrong_val = sess.run(
            compute_nodes,
            feed_dict={
                inputs: input_test,
                targets: target_test,
                condition_tlu: False,
                training_epoch: float_epoch,
                big_batch_training: big_batch_training_val,
                all_correct_epoch: (all_correct_val * float_epoch),
                all_correct: all_correct_val
            })
        print("└ epoch: {}, step: {}, test_loss: {}, test_accuracy: {}, op_wrong: {}"
              .format(epoch, step, test_loss, test_accuracy, op_wrong_val))
        # test_summary_writer.add_summary(merged_summary_op_val, step)
        return (test_loss, test_accuracy, op_wrong_val)

    def write_carry_datasets_summary(sess, compute_nodes, float_epoch,
                                     all_correct_val, step):
        value_dict = dict()
        for n_carries in carry_datasets.keys():
            carry_dataset_input = carry_datasets[n_carries]['input']
            carry_dataset_output = carry_datasets[n_carries]['output']
            (carry_loss_val, carry_accuracy_val, merged_summary_op_val,
             carry_op_wrong_val, carry_per_digit_accuracy_val,
             carry_per_digit_wrong_val) = sess.run(
                compute_nodes,
                feed_dict={
                    inputs: carry_dataset_input,
                    targets: carry_dataset_output,
                    condition_tlu: False,
                    training_epoch: float_epoch,
                    big_batch_training: big_batch_training_val,
                    all_correct_epoch: (all_correct_val * float_epoch),
                    all_correct: all_correct_val
                })
            value_dict[n_carries] = (carry_loss_val, carry_accuracy_val,
                                     carry_op_wrong_val,
                                     carry_per_digit_accuracy_val,
                                     carry_per_digit_wrong_val)
            # carry_datasets_summary_writers[n_carries].add_summary(merged_summary_op_val, step)
        return value_dict

    def write_embeddings_summary(sess, h1):
        # Reference: https://stackoverflow.com/questions/40849116/how-to-use-tensorboard-embedding-projector
        dir_logs = os.path.join(config.dir_saved_models(), experiment_name)
        metadata = os.path.join(dir_logs, 'metadata.tsv')
        carry_datasets = data_utils.import_carry_datasets(operand_bits, operator)
        input_arrays = list()
        with open(metadata, 'w') as f:
            for carries in carry_datasets.keys():
                input_arrays.append(carry_datasets[carries]['input'])
                f.write('{}\n'.format(carries))
        carry_inputs = np.concatenate(input_arrays, axis=0)
        [h1_val] = sess.run([h1], feed_dict={inputs: carry_inputs,
                                             condition_tlu: False})
        h1_var = tf.Variable(h1_val, name='h1_var')
        saver = tf.train.Saver([h1_var])
        sess.run(h1_var.initializer)
        saver.save(sess, os.path.join(dir_logs, 'h1_var.ckpt'))
        pconfig = projector.ProjectorConfig()
        pconfig.model_checkpoint_path = os.path.join(dir_logs, 'h1_var.ckpt')
        embedding = pconfig.embeddings.add()
        embedding.tensor_name = h1_var.name
        embedding.metadata_path = metadata
        projector.visualize_embeddings(tf.summary.FileWriter(dir_logs), pconfig)

    def create_carry_datasets_summary_writers(logdir, carry_datasets):
        carry_datasets_summary_writers = dict()
        for n_carries in carry_datasets.keys():
            carry_datasets_summary_writers[n_carries] = tf.summary.FileWriter(
                logdir + '/carry-{}'.format(n_carries))
        return carry_datasets_summary_writers

    def close_carry_datasets_summary_writers(carry_datasets_summary_writers):
        for n_carries in carry_datasets_summary_writers.keys():
            carry_datasets_summary_writers[n_carries].close()

    def get_all_correct_val(op_wrong_val):
        return op_wrong_val == 0

    def is_last_batch(i_batch):
        return i_batch == (n_batch - 1)

    def decrease_dev_summary_period(dev_accuracy_val, op_wrong_val):
        # Update the enclosing period; the original assigned a local name here,
        # which was a no-op.
        nonlocal dev_summary_period
        # Preconditions
        if not decreasing_dev_summary_period:
            return
        if dev_accuracy_val < 0.999:
            return
        # If the preconditions are satisfied, shrink the period as fewer
        # operations remain wrong.
        if op_wrong_val <= 8:
            dev_summary_period = int(init_dev_summary_period // 128)
        elif op_wrong_val <= 16:
            dev_summary_period = int(init_dev_summary_period // 64)
        if op_wrong_val <= 32:
            dev_summary_period = int(init_dev_summary_period // 32)
        elif op_wrong_val <= 64:
            dev_summary_period = int(init_dev_summary_period // 16)
        elif op_wrong_val <= 128:
            dev_summary_period = int(init_dev_summary_period // 8)
        if op_wrong_val > 512:
            dev_summary_period = init_dev_summary_period

    ############################################################################
    # Running point.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # see issue #152 os.environ["CUDA_VISIBLE_DEVICES"] = str_device_num # 0, 1 os.environ[ 'TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable all debugging logs: Unable to display GPU info when running on the bash # Import datasets (train_ratio, dev_ratio, test_ratio) = config.dataset_ratio() (input_train, input_dev, input_test, target_train, target_dev, target_test) = data_utils.import_op_dataset(operator, operand_bits, train_ratio=train_ratio, dev_ratio=dev_ratio, test_ratio=test_ratio) if operator in config.operators_list(): carry_datasets = data_utils.import_carry_datasets( operand_bits, operator) # If the training dataset takes all examples, then the dev and test datasets are the same as the training one. if dev_ratio == 0.0 and test_ratio == 0.0: input_dev = input_train target_dev = target_train input_test = input_train target_test = target_train if dev_ratio == 0.0 and test_ratio != 0.0: input_dev = input_test target_dev = target_test # Contants NN_INPUT_DIM = input_train.shape[1] NN_OUTPUT_DIM = target_train.shape[1] # Hyperparameters - training batch_size = config.batch_size() big_batch_size = config.big_batch_size() n_epoch = config.n_epoch() learning_rate = config.learning_rate() all_correct_stop = config.all_correct_stop() big_batch_saturation = config.big_batch_saturation() if big_batch_saturation: all_correct_stop = False # Hyperparameters - model activation = config.activation() # tf.nn.sigmoid, tf.nn.tanh, tf.nn.relu str_activation = utils.get_str_activation(activation) h_layer_dims = [hidden_units] # h_layer_dims[0]: dim of h1 layer last_size = NN_OUTPUT_DIM # Variables determined by other variables train_size = input_train.shape[0] n_batch = train_size // batch_size # Print periods train_summary_period = n_batch // 4 # 4 times per epoch init_dev_summary_period = n_batch # n_batch: print at every epoch dev_summary_period = init_dev_summary_period decreasing_dev_summary_period = config.decreasing_dev_summary_period() # Weight initialization ## https://www.tensorflow.org/api_docs/python/tf/contrib/layers/variance_scaling_initializer if activation == tf.nn.relu: init_factor = 2.0 if activation == tf.nn.sigmoid: init_factor = 1.0 if activation == tf.nn.tanh: init_factor = 1.0 fan_in_1 = NN_INPUT_DIM fan_in_2 = h_layer_dims[0] ############################################################################ # Creating a computational graph. # Initializing paraters to learn. with tf.name_scope('parameter'): W1 = tf.Variable(tf.truncated_normal( (NN_INPUT_DIM, h_layer_dims[0]), stddev=np.sqrt(init_factor / fan_in_1)), name="W1") b1 = tf.Variable(tf.zeros((h_layer_dims[0])), name="b1") W2 = tf.Variable(tf.truncated_normal( (h_layer_dims[0], NN_OUTPUT_DIM), stddev=np.sqrt(init_factor / fan_in_2)), name="W2") b2 = tf.Variable(tf.zeros((NN_OUTPUT_DIM)), name="b2") # Setting the input and target output. 
    inputs = tf.placeholder(tf.float32, shape=(None, input_train.shape[1]),
                            name='inputs')  # None for mini-batch size
    targets = tf.placeholder(tf.float32, shape=(None, target_train.shape[1]),
                             name='targets')
    condition_tlu = tf.placeholder(tf.int32, shape=(), name="tlu_condition")
    is_tlu_hidden = tf.greater(condition_tlu, tf.constant(0, tf.int32))
    # is_tlu_hidden = tf.constant(condition_tlu == True, dtype=tf.bool)
    # https://github.com/pkmital/tensorflow_tutorials/issues/36

    # NN structure
    with tf.name_scope('layer1'):
        h1_logits = tf.add(tf.matmul(inputs, W1), b1)
        h1 = tf.cond(is_tlu_hidden,
                     lambda: utils.tf_tlu(h1_logits, name='h1_tlu'),
                     lambda: activation(h1_logits, name='h1'))
        # https://stackoverflow.com/questions/35833011/how-to-add-if-condition-in-a-tensorflow-graph
        # https://www.tensorflow.org/versions/r1.7/api_docs/python/tf/cond
    with tf.name_scope('layer2'):
        last_logits = tf.add(tf.matmul(h1, W2), b2)
        sigmoid_outputs = tf.sigmoid(last_logits)
        predictions = utils.tf_tlu(sigmoid_outputs, name='predictions')

    # Loss: objective function
    with tf.name_scope('loss'):
        loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=targets,
                                                       logits=last_logits)
        # https://www.tensorflow.org/api_docs/python/tf/nn/sigmoid_cross_entropy_with_logits
        loss = tf.reduce_mean(loss)
        if config.l1_coef() != 0:
            loss = loss \
                + config.l1_coef() / (2 * batch_size) * (tf.reduce_sum(tf.abs(W1)) + tf.reduce_sum(tf.abs(W2)))
            # + config.l1_coef() / (2 * batch_size) * (tf.reduce_sum(tf.abs(tf.abs(W1) - 1)) + tf.reduce_sum(tf.abs(tf.abs(W2) - 1)))
        if config.l2_coef() != 0:
            loss = loss \
                + config.l2_coef() / (2 * batch_size) * (tf.reduce_sum(tf.square(W1)) + tf.reduce_sum(tf.square(W2)))

    # Get measures:
    # [1] operation measures (accuracy, n_wrong, n_correct)
    # [2] mean digits accuracy (mean_digits_accuracy)
    # [3] per digit accuracy (per_digit_accuracy)
    (op_accuracy, op_wrong, op_correct,
     digits_mean_accuracy, digits_mean_wrong, digits_mean_correct,
     per_digit_accuracy, per_digit_wrong, per_digit_correct) = utils.get_measures(targets, predictions)

    # Training, optimization
    train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
    init = tf.global_variables_initializer()
    training_epoch = tf.placeholder(tf.float32, shape=None)
    all_correct_epoch = tf.placeholder(tf.float32, shape=None)
    big_batch_training = tf.placeholder(tf.int32, shape=None)
    all_correct = tf.placeholder(tf.int32, shape=None)

    # Summary: Scalar
    ## Measures
    tf.summary.scalar('loss', loss)
    with tf.name_scope('operation'):
        tf.summary.scalar('accuracy', op_accuracy)
        tf.summary.scalar('wrong', op_wrong)
    with tf.name_scope('digits'):
        tf.summary.scalar('mean_accuracy', digits_mean_accuracy)
        tf.summary.scalar('mean_wrong', digits_mean_wrong)
    with tf.name_scope('per_digit'):
        for i in range(NN_OUTPUT_DIM):
            tf.summary.scalar('digit-{}/accuracy'.format(i + 1),
                              per_digit_accuracy[-(i + 1)])
            tf.summary.scalar('digit-{}/wrong'.format(i + 1),
                              per_digit_wrong[-(i + 1)])
        # add per_digit_correct
    tf.summary.scalar('epoch', training_epoch)
    tf.summary.scalar('all_correct_epoch', all_correct_epoch)
    tf.summary.scalar('big_batch_training', big_batch_training)
    tf.summary.scalar('all_correct', all_correct)
    tf.summary.scalar('condition_tlu', condition_tlu)

    # Summary: Histogram
    with tf.name_scope('layer1'):
        tf.summary.histogram('weight', W1)
        tf.summary.histogram('bias', b1)
        tf.summary.histogram('activation', h1)
    with tf.name_scope('layer2'):
        tf.summary.histogram('weight', W2)
        tf.summary.histogram('bias', b2)
        tf.summary.histogram('activation', sigmoid_outputs)

    # Merge summary operations.
    merged_summary_op = tf.summary.merge_all()

    run_info = utils.init_run_info(NN_OUTPUT_DIM)
    # Experiment info
    run_info['experiment_name'] = experiment_name
    # Problem info
    run_info['operator'] = operator
    run_info['operand_bits'] = operand_bits
    run_info['result_bits'] = target_train.shape[1]
    # Network info
    run_info['network_input_dimension'] = input_train.shape[1]
    run_info['network_output_dimension'] = target_train.shape[1]
    run_info['hidden_activation'] = str_activation
    run_info['hidden_dimensions'] = h_layer_dims
    # Dataset info
    run_info['train_set_size'] = input_train.shape[0]
    run_info['dev_set_size'] = input_dev.shape[0]
    run_info['test_set_size'] = input_test.shape[0]
    # Optimizer info
    run_info['batch_size'] = batch_size
    run_info['optimizer'] = train_op.name
    run_info['learning_rate'] = learning_rate
    run_info['all_correct_stop'] = all_correct_stop

    run_id = datetime.now().strftime('%Y%m%d%H%M%S')
    run_info['run_id'] = run_id

    # Train logging
    logdir = '{}/{}/{}_{}bit_{}_{}_h{}_run-{}/'.format(
        config.dir_logs(), experiment_name, operator, operand_bits,
        nn_model_type, str_activation, h_layer_dims, run_id)
    # train_summary_writer = tf.summary.FileWriter(logdir + '/train', graph=tf.get_default_graph())
    # dev_summary_writer = tf.summary.FileWriter(logdir + '/dev')
    # if tlu_on:
    #     tlu_summary_writer = tf.summary.FileWriter(logdir + '/tlu')
    # test_summary_writer = tf.summary.FileWriter(logdir + '/test')
    # if operator in config.operators_list():
    #     carry_datasets_summary_writers = create_carry_datasets_summary_writers(logdir, carry_datasets)

    # Model saving
    # dir_saved_model = '{}/{}/{}_{}bit_{}_{}_h{}/run-{}/'.format(
    #     config.dir_saved_models(), experiment_name, operator, operand_bits,
    #     nn_model_type, str_activation, h_layer_dims, run_id)
    # utils.create_dir(dir_saved_model)
    # model_saver = tf.train.Saver()
    # init_all_correct_model_saver = tf.train.Saver()

    # Compute nodes
    train_compute_nodes = [loss, op_accuracy, merged_summary_op]
    dev_compute_nodes = [loss, op_accuracy, merged_summary_op, op_wrong,
                         per_digit_accuracy, per_digit_wrong]
    test_compute_nodes = [loss, op_accuracy, merged_summary_op, op_wrong]

    # Session configuration
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True

    print("Run ID: {}".format(run_id))
    print(logdir)
    # print(dir_saved_model)

    with tf.Session(config=tf_config) as sess:
        sess.run(init)
        float_epoch = 0.0
        all_correct_val = False
        big_batch_training_val = False
        init_all_correct_model_saved = False
        for epoch in range(n_epoch):
            input_train, target_train = utils.shuffle_np_arrays(input_train, target_train)
            if big_batch_saturation and all_correct_val:
                big_batch_training_val = True
                batch_size = big_batch_size
            for i_batch in range(n_batch):
                # Get mini-batch
                batch_input, batch_target = utils.get_batch(
                    i_batch, batch_size, input_train, target_train)
                # Initial state evaluation: no training yet.
                if epoch == 0 and i_batch == 0:
                    step = 0
                    float_epoch = 0.0
                    write_train_summary(sess, train_compute_nodes, batch_input,
                                        batch_target, float_epoch, all_correct_val, step)
                    write_dev_summary(sess, dev_compute_nodes, float_epoch,
                                      all_correct_val, step)
                    if tlu_on:
                        write_tlu_dev_summary(sess, dev_compute_nodes, float_epoch,
                                              all_correct_val, step)
                # Set step, float_epoch
                ## 1 <= (i_batch + 1) <= n_batch
                step = n_batch * epoch + (i_batch + 1)
                float_epoch = epoch + float(i_batch + 1) / n_batch

                # Training operation ##################################################
                train(sess, batch_input, batch_target, float_epoch, all_correct_val)

                # Training set summary writer #########################################
                if step % train_summary_period == 0:
                    (train_loss, train_accuracy) = write_train_summary(
                        sess, train_compute_nodes, batch_input, batch_target,
                        float_epoch, all_correct_val, step)

                # Development loss evaluation,
                # after dev_summary_period batches are trained.
                if (step % dev_summary_period == 0) or is_last_batch(i_batch):
                    # Dev set summary writer ##########################################
                    dev_run_outputs = (
                        dev_loss_val, dev_accuracy_val, dev_op_wrong_val,
                        per_digit_accuracy_val, per_digit_wrong_val) = write_dev_summary(
                            sess, dev_compute_nodes, float_epoch, all_correct_val, step)
                    # Carry datasets summary writer ###################################
                    if operator in config.operators_list():
                        carry_run_outputs = write_carry_datasets_summary(
                            sess, dev_compute_nodes, float_epoch, all_correct_val, step)
                    # TLU-dev summary writer ##########################################
                    if tlu_on:
                        dev_tlu_run_outputs = (
                            dev_loss_tlu_val, dev_accuracy_tlu_val,
                            dev_op_wrong_tlu_val) = write_tlu_dev_summary(
                                sess, dev_compute_nodes, float_epoch, all_correct_val, step)
                    else:
                        dev_tlu_run_outputs = None
                    # Write running information #######################################
                    if operator in config.operators_list():
                        run_info = utils.write_run_info(
                            run_info, float_epoch, dev_run_outputs,
                            dev_tlu_run_outputs, carry_run_outputs)
                    else:
                        run_info = utils.write_run_info(
                            run_info, float_epoch, dev_run_outputs, dev_tlu_run_outputs)
                    # Write the logs of measures ######################################
                    # utils.write_measures(run_info, float_epoch,
                    #                      dev_run_outputs, dev_tlu_run_outputs)
                    # if is_last_batch(i_batch):  # After one epoch is trained
                    # Save the trained model ##########################################
                    # model_saver.save(sess, '{}/dev-{}.ckpt'.format(dir_saved_model, run_id))
                    ## print("Model saved.")

                    decrease_dev_summary_period(dev_accuracy_val, dev_op_wrong_val)
                    # If there is no wrong operation, then ...
                    all_correct_val = get_all_correct_val(dev_op_wrong_val)
                    # If the model is trained with 100% accuracy, save the model.
                    if all_correct_val and (not init_all_correct_model_saved):
                        model_name = 'epoch{}-batch{}'.format(float_epoch, i_batch)
                        # init_all_correct_model_saver.save(
                        #     sess, '{}/{}-init-all-correct.ckpt'.format(dir_saved_model, model_name))
                        # write_embeddings_summary(sess, h1)
                        init_all_correct_model_saved = True
                if all_correct_val and all_correct_stop:
                    break  # Break the batch for-loop
            # End of one epoch
            if all_correct_val and all_correct_stop:
                break  # Break the epoch for-loop
        # End of all epochs

        # Test loss evaluation: run computing test loss, accuracy.
        # Test set summary writer #####################################################
        (test_loss, test_accuracy, test_op_wrong_val) = write_test_summary(
            sess, test_compute_nodes, float_epoch, all_correct_val, step)
        # model_saver.save(sess, '{}/{}.ckpt'.format(dir_saved_model, run_id))
        print("Model saved.")
        # Write running information ###################################################
        if operator in config.operators_list():
            run_info = utils.write_run_info(run_info, float_epoch, dev_run_outputs,
                                            dev_tlu_run_outputs, carry_run_outputs,
                                            final=True)
        else:
            run_info = utils.write_run_info(run_info, float_epoch, dev_run_outputs,
                                            dev_tlu_run_outputs, final=True)
        # train_summary_writer.close()
        # dev_summary_writer.close()
        # if tlu_on:
        #     tlu_summary_writer.close()
        # test_summary_writer.close()
        # if operator in config.operators_list():
        #     close_carry_datasets_summary_writers(carry_datasets_summary_writers)
        print("The training is over.")
def train(self, epoch):
    self.net.train()
    total_train_loss = 0
    total_num = 0
    total_train_correct = 0
    cur_lr = cf.learning_rate(self.lr, epoch)
    self.optimizer = optim.SGD(self.net.parameters(), lr=cur_lr,
                               momentum=0.9, weight_decay=5e-4)
    if self.show_log:
        print('\n=> Training Epoch #%d, LR=%.4f' % (epoch, cur_lr))
    self.train_batch_loss_list = []
    self.train_batch_acc_list = []
    for batch_idx, (inputs, targets) in enumerate(self.train_loader):
        # inputs, targets = inputs.cuda(), targets.cuda()
        inputs, targets = inputs.half().cuda(), targets.cuda()
        self.optimizer.zero_grad()
        outputs = self.net(inputs)  # Forward Propagation
        loss = self.criterion(outputs, targets)  # Loss
        loss.backward()        # Backward Propagation
        self.optimizer.step()  # Optimizer update
        # loss
        train_loss = loss.item()
        total_train_loss += train_loss
        self.train_batch_loss_list.append(train_loss)
        # accuracy
        _, predicted = torch.max(outputs.data, 1)
        total_num += targets.size(0)
        train_correct = predicted.eq(targets.data).cpu().sum().item()
        total_train_correct += train_correct
        train_acc = train_correct / targets.size(0)
        self.train_batch_acc_list.append(train_acc)
        # update visdom
        if self.show_vis:
            self.vis.line(np.array(self.train_batch_loss_list),
                          X=np.arange(len(self.train_batch_loss_list)),
                          win='train_batch_loss',
                          opts={'title': 'train_batch_loss'})
            self.vis.line(np.array(self.train_batch_acc_list),
                          X=np.arange(len(self.train_batch_acc_list)),
                          win='train_batch_acc',
                          opts={'title': 'train_batch_acc'})
        # update output
        if self.show_log:
            sys.stdout.write('\r')
            sys.stdout.write('| Epoch [%3d/%3d] Iter[%3d/%3d]\t\tLoss: %.4f Acc@1: %.3f%%'
                             % (epoch, self.num_epochs, batch_idx + 1,
                                (len(self.train_set) // self.batch_size) + 1,
                                train_loss, 100 * train_acc))
            sys.stdout.flush()
    # total loss and accuracy
    epoch_average_train_loss = total_train_loss / len(self.train_loader)
    epoch_average_train_acc = total_train_correct / total_num
    self.train_epoch_loss_list.append(epoch_average_train_loss)
    self.train_epoch_acc_list.append(epoch_average_train_acc)
    # l2 norm of all parameters
    square_params_sum = 0
    for param in self.net.parameters():
        square_params_sum += (param ** 2).sum()
    params_l2_norm = (square_params_sum ** 0.5).item()
    self.params_l2_norm_list.append(params_l2_norm)
    # update visdom
    if self.show_vis:
        self.vis.line(np.array(self.train_epoch_loss_list),
                      X=np.arange(len(self.train_epoch_loss_list)),
                      win='train_epoch_loss',
                      opts={'title': 'train_epoch_loss'})
        self.vis.line(np.array(self.train_epoch_acc_list),
                      X=np.arange(len(self.train_epoch_acc_list)),
                      win='train_epoch_acc',
                      opts={'title': 'train_epoch_acc'})
        self.vis.line(np.array(self.params_l2_norm_list),
                      X=np.arange(len(self.params_l2_norm_list)),
                      win='params_l2_norm',
                      opts={'title': 'params_l2_norm'})
def train(epoch):
    # Gradient hooks that zero the gradients of channels marked unimportant,
    # so those channels stay frozen during fine-tuning.
    def gradi10(module):
        module[unimportant_channels["layer1.0"]] = 0

    def gradi11(module):
        module[unimportant_channels["layer1.1"]] = 0

    def gradi12(module):
        module[unimportant_channels["layer1.2"]] = 0

    def gradi13(module):
        module[unimportant_channels["layer1.3"]] = 0

    def gradi20(module):
        module[unimportant_channels["layer2.0"]] = 0

    def gradi21(module):
        module[unimportant_channels["layer2.1"]] = 0

    def gradi22(module):
        module[unimportant_channels["layer2.2"]] = 0

    def gradi23(module):
        # print("23", module.shape)
        module[unimportant_channels["layer2.3"]] = 0

    def gradi30(module):
        # print("30", module.shape)
        module[unimportant_channels["layer3.0"]] = 0

    def gradi31(module):
        # print("31", module.shape)
        module[unimportant_channels["layer3.1"]] = 0

    def gradi32(module):
        module[unimportant_channels["layer3.2"]] = 0

    def gradi33(module):
        module[unimportant_channels["layer3.3"]] = 0

    if use_cuda:
        net.module.layer1[0].conv1.weight.register_hook(gradi10)
        net.module.layer1[0].conv1.bias.register_hook(gradi10)
        net.module.layer1[0].bn2.weight.register_hook(gradi10)
        net.module.layer1[0].bn2.bias.register_hook(gradi10)
        net.module.layer1[1].conv1.weight.register_hook(gradi11)
        net.module.layer1[1].conv1.bias.register_hook(gradi11)
        net.module.layer1[1].bn2.weight.register_hook(gradi11)
        net.module.layer1[1].bn2.bias.register_hook(gradi11)
        net.module.layer1[2].conv1.weight.register_hook(gradi12)
        net.module.layer1[2].conv1.bias.register_hook(gradi12)
        net.module.layer1[2].bn2.weight.register_hook(gradi12)
        net.module.layer1[2].bn2.bias.register_hook(gradi12)
        net.module.layer1[3].conv1.weight.register_hook(gradi13)
        net.module.layer1[3].conv1.bias.register_hook(gradi13)
        net.module.layer1[3].bn2.weight.register_hook(gradi13)
        net.module.layer1[3].bn2.bias.register_hook(gradi13)
        net.module.layer2[0].conv1.weight.register_hook(gradi20)
        net.module.layer2[0].conv1.bias.register_hook(gradi20)
        net.module.layer2[0].bn2.weight.register_hook(gradi20)
        net.module.layer2[0].bn2.bias.register_hook(gradi20)
        net.module.layer2[1].conv1.weight.register_hook(gradi21)
        net.module.layer2[1].conv1.bias.register_hook(gradi21)
        net.module.layer2[1].bn2.weight.register_hook(gradi21)
        net.module.layer2[1].bn2.bias.register_hook(gradi21)
        net.module.layer2[2].conv1.weight.register_hook(gradi22)
        net.module.layer2[2].conv1.bias.register_hook(gradi22)
        net.module.layer2[2].bn2.weight.register_hook(gradi22)
        net.module.layer2[2].bn2.bias.register_hook(gradi22)
        net.module.layer2[3].conv1.weight.register_hook(gradi23)
        net.module.layer2[3].conv1.bias.register_hook(gradi23)
        net.module.layer2[3].bn2.weight.register_hook(gradi23)
        net.module.layer2[3].bn2.bias.register_hook(gradi23)
        net.module.layer3[0].conv1.weight.register_hook(gradi30)
        net.module.layer3[0].conv1.bias.register_hook(gradi30)
        net.module.layer3[0].bn2.weight.register_hook(gradi30)
        net.module.layer3[0].bn2.bias.register_hook(gradi30)
        net.module.layer3[1].conv1.weight.register_hook(gradi31)
        net.module.layer3[1].conv1.bias.register_hook(gradi31)
        net.module.layer3[1].bn2.weight.register_hook(gradi31)
        net.module.layer3[1].bn2.bias.register_hook(gradi31)
        net.module.layer3[2].conv1.weight.register_hook(gradi32)
        net.module.layer3[2].conv1.bias.register_hook(gradi32)
        net.module.layer3[2].bn2.weight.register_hook(gradi32)
        net.module.layer3[2].bn2.bias.register_hook(gradi32)
        net.module.layer3[3].conv1.weight.register_hook(gradi33)
        net.module.layer3[3].conv1.bias.register_hook(gradi33)
        net.module.layer3[3].bn2.weight.register_hook(gradi33)
        net.module.layer3[3].bn2.bias.register_hook(gradi33)
    else:
        net.layer1[0].conv1.weight.register_hook(gradi10)
        net.layer1[0].conv1.bias.register_hook(gradi10)
        net.layer1[0].bn2.weight.register_hook(gradi10)
        net.layer1[0].bn2.bias.register_hook(gradi10)
        net.layer1[1].conv1.weight.register_hook(gradi11)
        net.layer1[1].conv1.bias.register_hook(gradi11)
        net.layer1[1].bn2.weight.register_hook(gradi11)
        net.layer1[1].bn2.bias.register_hook(gradi11)
        net.layer1[2].conv1.weight.register_hook(gradi12)
        net.layer1[2].conv1.bias.register_hook(gradi12)
        net.layer1[2].bn2.weight.register_hook(gradi12)
        net.layer1[2].bn2.bias.register_hook(gradi12)
        net.layer1[3].conv1.weight.register_hook(gradi13)
        net.layer1[3].conv1.bias.register_hook(gradi13)
        net.layer1[3].bn2.weight.register_hook(gradi13)
        net.layer1[3].bn2.bias.register_hook(gradi13)
        net.layer2[0].conv1.weight.register_hook(gradi20)
        net.layer2[0].conv1.bias.register_hook(gradi20)
        net.layer2[0].bn2.weight.register_hook(gradi20)
        net.layer2[0].bn2.bias.register_hook(gradi20)
        net.layer2[1].conv1.weight.register_hook(gradi21)
        net.layer2[1].conv1.bias.register_hook(gradi21)
        net.layer2[1].bn2.weight.register_hook(gradi21)
        net.layer2[1].bn2.bias.register_hook(gradi21)
        net.layer2[2].conv1.weight.register_hook(gradi22)
        net.layer2[2].conv1.bias.register_hook(gradi22)
        net.layer2[2].bn2.weight.register_hook(gradi22)
        net.layer2[2].bn2.bias.register_hook(gradi22)
        net.layer2[3].conv1.weight.register_hook(gradi23)
        net.layer2[3].conv1.bias.register_hook(gradi23)
        net.layer2[3].bn2.weight.register_hook(gradi23)
        net.layer2[3].bn2.bias.register_hook(gradi23)
        net.layer3[0].conv1.weight.register_hook(gradi30)
        net.layer3[0].conv1.bias.register_hook(gradi30)
        net.layer3[0].bn2.weight.register_hook(gradi30)
        net.layer3[0].bn2.bias.register_hook(gradi30)
        net.layer3[1].conv1.weight.register_hook(gradi31)
        net.layer3[1].conv1.bias.register_hook(gradi31)
        net.layer3[1].bn2.weight.register_hook(gradi31)
        net.layer3[1].bn2.bias.register_hook(gradi31)
        net.layer3[2].conv1.weight.register_hook(gradi32)
        net.layer3[2].conv1.bias.register_hook(gradi32)
        net.layer3[2].bn2.weight.register_hook(gradi32)
        net.layer3[2].bn2.bias.register_hook(gradi32)
        net.layer3[3].conv1.weight.register_hook(gradi33)
        net.layer3[3].conv1.bias.register_hook(gradi33)
        net.layer3[3].bn2.weight.register_hook(gradi33)
        net.layer3[3].bn2.bias.register_hook(gradi33)

    net.train()
    net.training = True
    train_loss = 0
    correct = 0
    total = 0
    optimizer = optim.SGD(net.parameters(), lr=cf.learning_rate(args.lr, epoch),
                          momentum=0.9, weight_decay=5e-4)
    print('\n=> Training Epoch #%d, LR=%.4f' % (epoch, cf.learning_rate(args.lr, epoch)))
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()  # GPU settings
        optimizer.zero_grad()
        inputs, targets = Variable(inputs), Variable(targets)
        outputs = net(inputs)  # Forward Propagation
        loss = criterion(outputs, targets)  # Loss
        loss.backward()   # Backward Propagation
        optimizer.step()  # Optimizer update
        # print(net.layer1[0].conv1.weight)
        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum()
        sys.stdout.write('\r')
        sys.stdout.write('| Epoch [%3d/%3d] Iter[%3d/%3d]\t\tLoss: %.4f Acc@1: %.3f%%'
                         % (epoch, num_epochs, batch_idx + 1,
                            (len(trainset) // batch_size) + 1,
                            loss.item(), 100. * correct / total))
        sys.stdout.flush()
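# NOTE: the two 48-line registration blocks above differ only in the
# `net.module` prefix. As a design note, the same wiring can be generated in a
# loop; this sketch assumes the same `unimportant_channels` dict and the 3x4
# block layout used above (the `key=key` default argument captures each block's
# key at definition time instead of the loop's final value).
def register_pruning_hooks(net, unimportant_channels, use_cuda):
    base = net.module if use_cuda else net
    for layer_idx in (1, 2, 3):
        layer = getattr(base, 'layer%d' % layer_idx)
        for block_idx in range(4):
            key = 'layer%d.%d' % (layer_idx, block_idx)

            def hook(grad, key=key):
                # Zero the gradient rows of the unimportant channels.
                g = grad.clone()
                g[unimportant_channels[key]] = 0
                return g

            block = layer[block_idx]
            for p in (block.conv1.weight, block.conv1.bias,
                      block.bn2.weight, block.bn2.bias):
                p.register_hook(hook)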
def train(net, dataloader, optimizer, epoch, num_classes):
    criterion = nn.CrossEntropyLoss()
    net.train()
    hard_loss_sum = 0
    soft_loss_sum = 0
    loss_sum = 0
    correct = 0
    total = 0
    print('\n=> [%s] Training Epoch #%d, lr=%.4f'
          % (model_name, epoch, cf.learning_rate(args.lr, epoch)))
    log_file.write('\n=> [%s] Training Epoch #%d, lr=%.4f\n'
                   % (model_name, epoch, cf.learning_rate(args.lr, epoch)))
    for batch_idx, (inputs, targets) in enumerate(dataloader):
        inputs = inputs.to(device)
        targets = targets.to(device)
        # Obtain soft_target: here a random distribution (distillation from noise).
        if epoch >= args.distill_from and args.distill > 0:
            soft_target = F.softmax(
                torch.randn(inputs.size(0), num_classes, device=device) * args.rand_std,
                dim=1)
        net.train()
        optimizer.zero_grad()
        outputs = net(inputs)  # Forward Propagation
        loss = criterion(outputs, targets)  # Loss
        hard_loss_sum = hard_loss_sum + loss.item() * targets.size(0)
        # Compute the distillation loss.
        if epoch >= args.distill_from and args.distill > 0:
            heat_output = outputs
            heat_soft_target = soft_target
            distill_loss = F.kl_div(F.log_softmax(heat_output, 1),
                                    F.softmax(heat_soft_target, 1),
                                    size_average=False) / targets.size(0)
            soft_loss_sum = soft_loss_sum + distill_loss.item() * targets.size(0)
            loss = loss + args.distill * distill_loss
        loss.backward()   # Backward Propagation
        optimizer.step()  # Optimizer update
        loss_sum = loss_sum + loss.item() * targets.size(0)
        _, predicted = torch.max(outputs.detach(), 1)
        total += targets.size(0)
        correct += predicted.eq(targets.detach()).long().sum().item()
        if math.isnan(loss.item()):
            print('@@@@@@@ nan @@@@@@@')
            log_file.write('@@@@@@@ nan @@@@@@@\n')
            sys.exit(0)
        sys.stdout.write('\r')
        sys.stdout.write('| Epoch [%3d/%3d] Iter[%3d/%3d]\tLoss: %.4g Acc@1: %.2f%% Hard: %.4g Soft: %.4g'
                         % (epoch, args.num_epochs, batch_idx + 1,
                            (len(trainset) // args.bs) + 1,
                            loss_sum / total, 100. * correct / total,
                            hard_loss_sum / total, soft_loss_sum / total))
        sys.stdout.flush()
    log_file.write('| Epoch [%3d/%3d] \tLoss: %.4f Acc@1: %.2f%% Hard: %.4f Soft: %.4f'
                   % (epoch, args.num_epochs, loss_sum / total, 100. * correct / total,
                      hard_loss_sum / total, soft_loss_sum / total))
    net = VGGNetDropConnect(num_classes, args.drop_p, args.drop_last_only, args.feat_dim)
else:
    print('Error : Network should be either [ResNet34 / VGGNetDropConnect]')
    sys.exit(0)
net.init_weights()
net.to(device)

# Training
print('\n[Phase 3] : Training model')
print('| Training Epochs = ' + str(args.num_epochs))
print('| Initial Learning Rate = ' + str(args.lr))

optimizer = optim.SGD(net.parameters(), lr=cf.learning_rate(args.lr, 1),
                      momentum=0.9, weight_decay=args.wd)
elapsed_time = 0
for epoch in range(1, args.num_epochs + 1):
    start_time = time.time()
    set_learning_rate(optimizer, cf.learning_rate(args.lr, epoch))
    train(net, trainloader, optimizer, epoch)
    test(net, testloader, epoch)
    epoch_time = time.time() - start_time
    elapsed_time += epoch_time
    print('| Elapsed time : %d:%02d:%02d' % (cf.get_hms(elapsed_time)))
    log_file.write('| Elapsed time : %d:%02d:%02d\n' % (cf.get_hms(elapsed_time)))
def train(epoch):
    net.train()
    fake = netG(fixed_noise)
    torchvision.utils.save_image(fake.data,
                                 '%s/gan_samples_epoch_%03d.png' % (args.out_folder, epoch),
                                 normalize=True)
    train_loss = 0
    entropy_loss = 0
    correct = 0
    total = 0
    temp1_accum = 0
    temp2_accum = 0
    sigmoid_sum_loss = 0
    sharing_node_loss = 0
    num_classes = args.num_classes

    # optimizer = optim.SGD(net.parameters(), lr=cf.learning_rate(args.lr, epoch), nesterov=True, momentum=0.9, weight_decay=5e-4)
    optimizer = optim.Adam(net.parameters(), lr=cf.learning_rate(args.lr, epoch),
                           betas=(0.5, 0.999), weight_decay=5e-4)
    if args.gan:
        # optimizerD = optim.SGD(netD.parameters(), lr=cf.learning_rate(1e-5, epoch), nesterov=True, momentum=0.9, weight_decay=5e-4)
        # optimizerG = optim.SGD(netG.parameters(), lr=cf.learning_rate(1e-3, epoch), nesterov=True, momentum=0.9, weight_decay=5e-4)
        optimizerD = optim.Adam(netD.parameters(), lr=cf.learning_rate(1e-6, epoch), betas=(0.5, 0.999))
        optimizerG = optim.Adam(netG.parameters(), lr=cf.learning_rate(2e-5, epoch), betas=(0.5, 0.999))

    print('\n=> Training Epoch #%d, LR=%.4f' % (epoch, cf.learning_rate(args.lr, epoch)))
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()  # GPU settings
        inputs, targets = Variable(inputs), Variable(targets)
        optimizer.zero_grad()
        outputs = net(inputs)  # Forward Propagation

        if args.gan:
            gan_target = torch.FloatTensor(targets.size()).fill_(0)
            uniform_dist = torch.Tensor(inputs.size(0), args.num_classes).fill_(1. / args.num_classes)
            uniform_dist = Variable(uniform_dist)
            if use_cuda:
                gan_target, uniform_dist = gan_target.cuda(), uniform_dist.cuda()

            ###########################
            # (1) Update D network    #
            ###########################
            # train with real
            gan_target.fill_(real_label)
            targetv = Variable(gan_target)
            optimizerD.zero_grad()
            output = netD(inputs)
            errD_real = gan_criterion(output, targetv)
            errD_real.backward()
            D_x = output.data.mean()

            # train with fake
            noise = torch.FloatTensor(inputs.size(0), nz, 1, 1).normal_(0, 1).cuda()
            if use_cuda:
                noise = noise.cuda()
            noise = Variable(noise)
            fake = netG(noise)
            targetv = Variable(gan_target.fill_(fake_label))
            output = netD(fake.detach())
            errD_fake = 1.0 * gan_criterion(output, targetv)
            errD_fake.backward()
            D_G_z1 = output.data.mean()
            errD = errD_real + errD_fake
            optimizerD.step()

            ###########################
            # (2) Update G network    #
            ###########################
            optimizerG.zero_grad()
            # Original GAN loss
            targetv = Variable(gan_target.fill_(real_label))
            output = netD(fake)
            errG = 1.0 * gan_criterion(output, targetv)
            D_G_z2 = output.data.mean()
            # pull the classifier's prediction on fakes toward the uniform distribution
            KL_fake_output = F.log_softmax(net(fake)[:, :num_classes], dim=1)
            errG_KL = F.kl_div(KL_fake_output, uniform_dist) * args.num_classes
            # targetv = Variable(gan_target.fill_(fake_label))
            # KL_fake_output = F.sigmoid(net(fake)[:, :num_classes])
            # errG_KL = criterion2(KL_fake_output, targetv)
            generator_loss = errG + 1.0 * errG_KL
            generator_loss.backward()
            optimizerG.step()

            # KL divergence term for the classifier on freshly generated fakes
            noise = torch.FloatTensor(inputs.size(0), nz, 1, 1).normal_(0, 1).cuda()
            if use_cuda:
                noise = noise.cuda()
            noise = Variable(noise)
            fake = netG(noise)
            # KL_fake_output = F.sigmoid(net(fake)[:, :num_classes])
            # KL_loss_fake = 1.0 * criterion2(KL_fake_output, ...) * args.num_classes
            KL_fake_output = F.log_softmax(net(fake)[:, :num_classes], dim=1)
            KL_loss_fake = 1.0 * F.kl_div(KL_fake_output, uniform_dist) * args.num_classes

            background_node = outputs[:, -1]
            fake_node_bce_loss = args.fake_node_bce_beta * criterion2(F.sigmoid(background_node), targetv)

        num_sampling = num_classes
        if args.loss == 'ce' and args.sampling_rate != 1.:
            num_sampling = int(num_classes * args.sampling_rate)
            full_output = outputs
            outputs, targets = sampling_for_loss(outputs, targets, num_sampling,
                                                 num_classes=num_classes, sharing=False)
        loss = criterion(outputs[:, :num_sampling], targets)  # Loss
        ce_loss = torch.zeros((1)).cuda()
        unknown_node_loss = torch.zeros((1)).cuda()
        # running average of the CE term
        temp1_accum = loss.detach().cpu() * (1. / (batch_idx + 1.)) \
            + temp1_accum * (batch_idx / (batch_idx + 1.))

        if args.loss == 'bce':
            loss = torch.zeros(1).cuda()
        if 'bce' in args.loss:
            # if num_classes > 30:
            #     num_sampling = int(num_classes * args.sampling_rate)
            #     outputs, targets = sampling_for_loss(outputs, targets, num_sampling)
            #     new_bce_targets = target_transform_for_elementwise_bce(targets, num_sampling)
            #     temp2 = args.bce_scale * criterion2(F.sigmoid(outputs[:, :num_sampling]), new_bce_targets).cuda()
            # else:
            num_sampling = int(num_classes * args.sampling_rate)
            if args.sampling_rate != 1.:
                full_output = outputs
                outputs, targets = sampling_for_loss(outputs, targets, num_sampling,
                                                     num_classes=num_classes)
                new_bce_targets = target_transform_for_elementwise_bce(targets, num_sampling).cuda()
                temp2 = args.bce_scale * criterion2(F.sigmoid(outputs[:, :num_sampling]), new_bce_targets)
            else:
                if args.gan:
                    bce_targets = target_transform_for_elementwise_bce(targets, num_classes + 1).cuda()
                    temp2 = args.bce_scale * criterion2(F.sigmoid(outputs), bce_targets).cuda()
                else:
                    bce_targets = target_transform_for_elementwise_bce(targets, num_classes).cuda()
                    temp2 = args.bce_scale * criterion2(F.sigmoid(outputs[:, :num_classes]), bce_targets).cuda()
            # running average of the BCE term
            temp2_accum = temp2.detach().cpu() * (1. / (batch_idx + 1.)) \
                + temp2_accum * (batch_idx / (batch_idx + 1.))

            if args.sigmoid_sum is not None:
                # sigmoid_sum = torch.sum(F.sigmoid(full_output[:, :num_classes]), dim=1)
                sigmoid_sum = torch.sum(F.sigmoid(full_output[:, :num_sampling]), dim=1)
                sigmoid_sum_loss = F.mse_loss(sigmoid_sum, args.sigmoid_sum * torch.ones_like(sigmoid_sum))
                loss += 0.5 * sigmoid_sum_loss
            loss += temp2

        if args.gan:
            loss += (KL_loss_fake + fake_node_bce_loss)

        entropy_loss = args.ent * entropy(outputs)
        if args.sharing is not None:
            classification_target = Variable(torch.zeros(targets.size(0)).long()).cuda()
            output_target_select = outputs[:, :num_sampling].gather(dim=1, index=targets.unsqueeze(1))
            output_target_sharing_concat = torch.cat(
                (output_target_select.view(-1, 1), outputs[:, num_sampling].view(-1, 1)), 1)
            ce_loss = args.sharing * F.cross_entropy(
                F.softmax(output_target_sharing_concat, dim=1), classification_target)
            mask = torch.ones(outputs[:, :num_sampling].size()).bool().cuda()
            for i in range(mask.size(0)):
                mask[i, targets[i]] = False
            output_for_entropy_except_target_node = torch.masked_select(outputs[:, :num_sampling], mask)
            output_for_entropy_except_target_node = output_for_entropy_except_target_node.view(
                outputs[:, :num_sampling].size(0), -1)
            output_for_entropy_except_target_node = torch.cat(
                (output_for_entropy_except_target_node.view(targets.size(0), -1, 1),
                 outputs[:, num_sampling].view(-1, 1, 1).expand(targets.size(0), num_sampling - 1, 1)), 2)
            # entropy_loss = args.ent * entropy(output_for_entropy_except_target_node)
            entropy_loss = args.ent * sharing_entropy(output_for_entropy_except_target_node)
            loss += ce_loss + entropy_loss
        # if args.ent != 0 and args.sharing is None:
        #     loss += entropy_loss

        train_loss += loss.item()
        max_logit, predicted = torch.max(outputs[:, :num_sampling].data, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum()

        if args.unknown_is_True:
            # weight = 0.01 * epoch
            weight = 0.1
            select1 = (F.sigmoid(max_logit) < 0.5)
            select2 = (F.sigmoid(outputs[:, targets]) < 0.1)
            select = (select1 + select2) >= 1
            if args.sepa_unknown_sharing:
                output_gather = F.sigmoid(outputs[:, num_classes + 1].masked_select(select))
            else:
                output_gather = F.sigmoid(outputs[:, num_classes].masked_select(select))
            node_target = torch.ones(output_gather.size()).cuda()
            if output_gather.size(0) > 0:
                sharing_node_loss = weight * criterion2(output_gather, node_target)
                loss += sharing_node_loss
            if args.sepa_unknown_sharing:
                select_unknown = (select1 + select2) < 1
                unknown_output_gather = F.sigmoid(outputs[:, num_classes + 1].masked_select(select_unknown))
                if unknown_output_gather.size(0) > 0:
                    unknown_node_target = torch.zeros(unknown_output_gather.size()).cuda()
                    unknown_node_loss = weight * criterion2(unknown_output_gather, unknown_node_target)
                    loss += unknown_node_loss

        loss.backward()   # Backward Propagation
        optimizer.step()  # Optimizer update

        sys.stdout.write('\r')
        if args.gan:
            sys.stdout.write('| Epoch [%3d/%3d] Iter[%3d/%3d]\t\tD_x:%.2f D_G_z1:%.2f D_G_z2:%.2f BCE : %.4f Ent_los : %.4f Sha_los : %.4f node_loss : %.4f kl_fake : %.4f Acc@1: %.3f%%'
                             % (epoch, num_epochs, batch_idx + 1, (len(trainset) // batch_size) + 1,
                                D_x, D_G_z1, D_G_z2, temp2_accum, entropy_loss, ce_loss,
                                fake_node_bce_loss.item(), KL_loss_fake.item(),
                                float(100.00 * float(correct) / float(total))))
        # (the commented-out non-GAN logging variants duplicated the live
        #  writes in the CE/BCE train() further below)
        sys.stdout.flush()
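# The generator objective above pulls the classifier's output on fake samples
# toward the uniform distribution. A sketch of that term in isolation, under
# the assumption of [B, C]-shaped logits; 'batchmean' scaled by C matches the
# default-reduction F.kl_div(...) * num_classes used above.
import torch
import torch.nn.functional as F

def uniform_kl(logits, num_classes):
    log_p = F.log_softmax(logits, dim=1)
    uniform = torch.full_like(log_p, 1.0 / num_classes)
    return F.kl_div(log_p, uniform, reduction='batchmean') * num_classes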
def train(epoch):
    compare_grad_vs_approx = False
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    optimizer = optim.SGD(net.parameters(), lr=cf.learning_rate(args.lr, epoch),
                          momentum=0.9, weight_decay=5e-4)

    if compare_grad_vs_approx:
        num_approx_layers = len(get_approx_layers(net))
        avg_mean = torch.zeros(num_approx_layers, len(trainloader))
        avg_mse = torch.zeros(num_approx_layers, len(trainloader))
        avg_std = torch.zeros(num_approx_layers, len(trainloader))
        max_diff = torch.zeros(num_approx_layers, len(trainloader))

    print('\n=> Training Epoch #%d, LR=%.4f' % (epoch, cf.learning_rate(args.lr, epoch)))
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        # with torch.autograd.profiler.profile(use_cuda=True) as prof:
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()  # GPU settings

        if compare_grad_vs_approx:
            # get gradients with non-approximate calculations:
            acc_grads = []
            for layer in get_approx_layers(net):
                layer.eval()
            optimizer.zero_grad()
            inputs, targets = Variable(inputs), Variable(targets)
            outputs = net(inputs)               # Forward Propagation
            loss = criterion(outputs, targets)  # Loss
            loss.backward()                     # Backward Propagation
            for layer in get_approx_layers(net):
                for n, p in layer.named_parameters():
                    if 'weight' in n:
                        acc_grads.append(p.grad.clone())

            # redo the pass with the approximate layers back in train mode:
            approx_grads = []
            net.train()
            optimizer.zero_grad()
            inputs, targets = Variable(inputs), Variable(targets)
            outputs = net(inputs)               # Forward Propagation
            loss = criterion(outputs, targets)  # Loss
            loss.backward()                     # Backward Propagation
            for layer in get_approx_layers(net):
                for n, p in layer.named_parameters():
                    if 'weight' in n:
                        approx_grads.append(p.grad.clone())
            optimizer.step()                    # Optimizer update

            train_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += targets.size(0)
            correct += predicted.eq(targets.data).cpu().sum()

            sys.stdout.write('\n')
            sys.stdout.write('| Epoch [%3d/%3d] Iter[%3d/%3d]\t\tLoss: %.4f Acc@1: %.2f%%'
                             % (epoch, num_epochs, batch_idx + 1,
                                (len(trainset) // batch_size) + 1,
                                loss.item(), 100. * correct.float() / total))
            # per-layer, per-batch statistics of (approximate - exact) gradients
            for i, (approx_grad, acc_grad) in enumerate(zip(approx_grads, acc_grads)):
                avg_mean[i, batch_idx] = (approx_grad - acc_grad).flatten().mean()
                avg_mse[i, batch_idx] = (approx_grad - acc_grad).norm() / acc_grad.norm()
                avg_std[i, batch_idx] = (approx_grad - acc_grad).flatten().std()
                max_diff[i, batch_idx] = (approx_grad - acc_grad).norm(p=float('inf'))
            sys.stdout.flush()
        else:
            optimizer.zero_grad()
            inputs, targets = Variable(inputs), Variable(targets)
            outputs = net(inputs)               # Forward Propagation
            loss = criterion(outputs, targets)  # Loss
            loss.backward()                     # Backward Propagation
            optimizer.step()                    # Optimizer update

            train_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += targets.size(0)
            correct += predicted.eq(targets.data).cpu().sum()

            sys.stdout.write('\n')
            sys.stdout.write('| Epoch [%3d/%3d] Iter[%3d/%3d]\t\tLoss: %.4f Acc@1: %.2f%%'
                             % (epoch, num_epochs, batch_idx + 1,
                                (len(trainset) // batch_size) + 1,
                                loss.item(), 100. * correct.float() / total))
            sys.stdout.flush()

    if compare_grad_vs_approx:
        torch.set_printoptions(linewidth=100000)
        print()
        print('mean {}'.format(torch.mean(avg_mean, dim=1)))
        print('relative MSE {}'.format(torch.mean(avg_mse, dim=1)))
        print('std {}'.format(torch.mean(avg_std, dim=1)))
        print('max diff {}'.format(torch.norm(max_diff, p=float('inf'), dim=1)))
        print('avg max diff {}'.format(torch.mean(max_diff, dim=1)))
        torch.set_printoptions(profile="default")
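# The statistics gathered above reduce to a few elementwise comparisons
# between an approximate and an exact gradient tensor; a compact sketch:
def grad_diff_stats(approx, exact):
    d = (approx - exact).flatten()
    return {
        'mean': d.mean().item(),
        'relative_mse': (d.norm() / exact.norm()).item(),
        'std': d.std().item(),
        'max_abs': d.abs().max().item(),  # equivalent to norm(p=inf)
    }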
print('| Building net type [' + args.net + ']...')
if args.net == 'vgg16':
    net = VGGNet(num_classes, args.drop_p, False, args.feat_dim, args.conv == 5)
else:
    print('Error : unsupported network type [' + args.net + ']')
    sys.exit(0)

net.init_weights()
net.to(device)

# Training
print('\n[Phase 3] : Training model')
print('| Training Epochs = ' + str(args.num_epochs))
print('| Initial Learning Rate = ' + str(args.lr))

optimizer = optim.SGD(net.parameters(), lr=cf.learning_rate(args.lr, 1),
                      momentum=0.9, weight_decay=args.wd)
elapsed_time = 0
for epoch in range(1, args.num_epochs + 1):
    start_time = time.time()
    set_learning_rate(optimizer, cf.learning_rate(args.lr, epoch))
    train(net, trainloader, optimizer, epoch)
    test(net, testloader, epoch)
    epoch_time = time.time() - start_time
    elapsed_time += epoch_time
    print('| Elapsed time : %d:%02d:%02d' % (cf.get_hms(elapsed_time)))
    log_file.write('| Elapsed time : %d:%02d:%02d\n' % (cf.get_hms(elapsed_time)))
    log_file.flush()

print('\n[Phase 4] : Testing model')
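# cf.get_hms is consumed by the '%d:%02d:%02d' format above, so it presumably
# converts an elapsed-seconds float into an (hours, minutes, seconds) tuple;
# a likely sketch of that helper:
def get_hms(seconds):
    h, rem = divmod(int(seconds), 3600)
    m, s = divmod(rem, 60)
    return h, m, s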
def train(self):
    elapsed_time = 0
    for self.curr_epoch in range(1, self.num_epochs + 1):
        self.model.train()
        self.model.training = True
        self.optimizer = optim.SGD(self.model.parameters(),
                                   lr=cf.learning_rate(self.learning_rate, self.curr_epoch),
                                   momentum=0.9, weight_decay=5e-4)
        train_loss = 0
        train_correct = 0
        total = 0
        time_start = time.time()
        print('\n=> Training Epoch #%d, LR=%.4f'
              % (self.curr_epoch, cf.learning_rate(self.learning_rate, self.curr_epoch)))
        for self.curr_batch, (x, y) in enumerate(self.train_loader):
            x, y = x.to(self.device), y.to(self.device)
            # perturb data during noisy training
            if self.training_type == 'noisy':
                x = self.adversary.perturb(x, self.device, self.variance)
            x, y = Variable(x), Variable(y)
            self.optimizer.zero_grad()
            outputs = self.model(x)
            total += y.size(0)
            loss = self.criterion(outputs, y)
            train_loss += loss.item()  # accumulate a float, not the graph-bearing tensor
            _, pred = torch.max(outputs.data, 1)
            train_correct += pred.eq(y.data).cpu().sum()
            loss.backward()
            self.optimizer.step()

            # add training on adversarial perturbation during adv training
            if self.training_type == 'adversarial':
                delta = self.adversary.get_adversarial_examples(self.model, x, y).to(self.device)
                x, y = x.to(self.device), y.to(self.device)
                x, y = Variable(x), Variable(y)
                outcome = self.model(x + delta)
                _, pred = torch.max(outcome.data, 1)
                train_correct += pred.eq(y.data).cpu().sum()
                total += y.size(0)
                loss = self.criterion(outcome, y)
                train_loss += loss.item()
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

            sys.stdout.write('\r')
            sys.stdout.write('| Epoch [%3d/%3d] Iter[%3d/%3d]\t\tLoss: %.4f Acc@1: %.3f%%'
                             % (self.curr_epoch, self.num_epochs, self.curr_batch,
                                (len(self.train_dataset) // self.train_batch_size) + 1,
                                train_loss, 100. * train_correct / total))
            sys.stdout.flush()
        train_acc = 100. * train_correct / total

        with torch.no_grad():
            # testing
            self.model.eval()
            self.model.training = False
            test_loss = 0.
            test_correct = 0
            total = 0
            for self.curr_batch, (x, y) in enumerate(self.test_loader):
                x_var, y_var = Variable(x), Variable(y)
                x_var, y_var = x_var.to(self.device), y_var.to(self.device)
                outcome = self.model(x_var)
                loss = self.criterion(outcome, y_var)
                test_loss += loss.item()
                _, pred = torch.max(outcome.data, 1)
                test_correct += pred.eq(y_var.data).cpu().sum()
                total += y_var.size(0)
            test_acc = 100. * test_correct / total
            print('\n| Validation Epoch #%d\t\t\tLoss: %.4f Acc@1: %.2f%%'
                  % (self.curr_epoch, test_loss, test_acc))

        time_epoch = time.time() - time_start
        elapsed_time += time_epoch
        print('| Elapsed time : %d:%02d:%02d' % (cf.get_hms(elapsed_time)))
        self.write_tb(train_loss, train_acc, test_loss, test_acc)
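# The Adversary API (perturb / get_adversarial_examples) is external to this
# excerpt. One plausible FGSM-style shape for the delta consumed above, purely
# illustrative; epsilon and the use of a criterion argument are assumptions:
def fgsm_delta(model, x, y, criterion, epsilon=8.0 / 255):
    x = x.clone().detach().requires_grad_(True)
    loss = criterion(model(x), y)
    loss.backward()
    # perturbation in the direction that increases the loss
    return epsilon * x.grad.sign()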
def main():
    # eager execution is the default in TF 2.x
    global optimizer
    optimizer = tf.keras.optimizers.SGD(config.learning_rate())

    print('Loading and preparing data', end='')
    (images, labels), (t_images, t_labels) = tf.keras.datasets.mnist.load_data()
    images = np.asarray(images[:config.train_n()]).astype(np.float32) * 2.0 / 255.0 - 1.0
    t_images = np.asarray(t_images[:config.test_n()]).astype(np.float32) * 2.0 / 255.0 - 1.0
    labels = np.asarray(labels[:config.train_n()])
    t_labels = np.asarray(t_labels[:config.test_n()])
    # images = np.concatenate((images, np.zeros(list(np.shape(images)[:-1]) + [2], np.float32)), axis=-1)
    # t_images = np.concatenate((t_images, np.zeros(list(np.shape(t_images)[:-1]) + [2], np.float32)), axis=-1)

    print('\rGenerating random walks for creating time-series data', end='')
    print('\rData Loaded' + ' ')

    model = network.network(config.window_size() ** 2 + 2,
                            config.lstm_layers(), config.fc_layers())

    positions = []
    moves = []
    for _ in range(config.no_walks()):
        temp = get_random_walk(28, 28, max_length=config.time_steps(),
                               window_size=config.window_size())
        positions.append(temp[0])
        moves.append(temp[1])
    moves = np.asarray(moves)
    positions = np.asarray(positions)

    for epoch in range(config.epochs()):
        test_results = {'loss': 0, 'success': 0}
        train_results = {'loss': 0, 'success': 0}

        # train
        for L in tqdm(range(0, len(images) - len(images) % config.batch_size(),
                            config.batch_size()),
                      desc='Epoch {}: Training'.format(epoch)):
            mini_batch_images = images[L:L + config.batch_size()]
            mini_batch_labels = labels[L:L + config.batch_size()]
            random_indices = np.random.randint(0, config.no_walks(), [config.batch_size()])
            mini_batch_moves = moves[random_indices]
            mini_batch_positions = positions[random_indices]
            train(mini_batch_images, mini_batch_moves, mini_batch_positions,
                  mini_batch_labels, model)

        # test over training set
        sys.stdout.write("\033[F")
        sys.stdout.write("\033[K")
        sys.stdout.flush()
        for L in tqdm(range(0, len(images) - len(images) % config.batch_size(),
                            config.batch_size()),
                      desc='Epoch {}: Testing train_set'.format(epoch)):
            mini_batch_images = images[L:L + config.batch_size()]
            mini_batch_labels = labels[L:L + config.batch_size()]
            random_indices = np.random.randint(0, config.train_random_walks(),
                                               [config.batch_size()])
            mini_batch_moves = moves[random_indices]
            mini_batch_positions = positions[random_indices]
            training_set_results = forward_prop(mini_batch_images, mini_batch_moves,
                                                mini_batch_positions, mini_batch_labels, model)
            for key in train_results:
                train_results[key] += training_set_results[key]
        train_results['success'] = (train_results['success'] * 100.0 / len(images)).numpy().round(2)
        train_results['loss'] = (train_results['loss'] / (len(images) // config.batch_size())).numpy().round(6)

        # test over test set
        sys.stdout.write("\033[F")
        sys.stdout.write("\033[K")
        sys.stdout.flush()
        for L in tqdm(range(0, len(t_images) - len(t_images) % config.batch_size(),
                            config.batch_size()),
                      desc='Epoch {}: Testing test_set'.format(epoch)):
            mini_batch_images = t_images[L:L + config.batch_size()]
            mini_batch_labels = t_labels[L:L + config.batch_size()]
            random_indices = np.random.randint(config.train_random_walks(), config.no_walks(),
                                               [config.batch_size()])
            mini_batch_moves = moves[random_indices]
            mini_batch_positions = positions[random_indices]
            test_set_results = forward_prop(mini_batch_images, mini_batch_moves,
                                            mini_batch_positions, mini_batch_labels, model)
            for key in test_results:
                test_results[key] += test_set_results[key]
        test_results['success'] = (test_results['success'] * 100.0 / len(t_images)).numpy().round(2)
        test_results['loss'] = (test_results['loss'] / (len(t_images) // config.batch_size())).numpy().round(6)

        sys.stdout.write("\033[F")
        sys.stdout.write("\033[K")
        print('Epoch {:5d}: '.format(epoch), train_results, test_results)
        save_weights(model.trainable_variables, epoch, test_results['success'], './weights')
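# get_random_walk is defined elsewhere; given how positions/moves are consumed
# above, it presumably returns a window-center trajectory over an H x W image
# plus the step taken at each point. A minimal sketch under that assumption
# (the 4-connected step set and clipping policy are guesses):
import numpy as np

def random_walk(h, w, max_length, window_size):
    half = window_size // 2
    pos = [np.random.randint(half, h - half), np.random.randint(half, w - half)]
    steps = [(-1, 0), (1, 0), (0, -1), (0, 1)]
    positions, moves = [], []
    for _ in range(max_length):
        dy, dx = steps[np.random.randint(4)]
        pos[0] = int(np.clip(pos[0] + dy, half, h - half - 1))  # keep window inside image
        pos[1] = int(np.clip(pos[1] + dx, half, w - half - 1))
        positions.append(tuple(pos))
        moves.append((dy, dx))
    return positions, moves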
def train(epoch):
    global quan_cor
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    optimizer = optim.Adam(net.parameters(), lr=args.lr, weight_decay=5e-4)
    if epoch > 100:
        optimizer.param_groups[0]['lr'] = optimizer.param_groups[0]['lr'] * 0.1

    print('\n=> Training Epoch #%d, LR=%.4f' % (epoch, optimizer.param_groups[0]['lr']))
    for batch_idx, (inputs_value, targets) in enumerate(trainloader):
        if use_cuda:
            inputs_value, targets = inputs_value.cuda(), targets.cuda()  # GPU settings
        optimizer.zero_grad()
        inputs_value, targets = Variable(inputs_value), Variable(targets)
        outputs = net.forward(inputs_value)  # Forward Propagation
        loss = criterion(outputs, targets)   # Loss
        loss.backward()                      # Backward Propagation

        '''lenet variant: the same per-level STE gradient accumulation below,
        applied to conv1, conv2, fc1, fc2 and fc3 (writing into
        levels1..levels5.grad via index1..index5).'''

        # layer 1: accumulate the straight-through gradient for each level
        flag = net.index1
        grad_value = torch.zeros((M,)).cuda()
        for m in range(M):
            inx = torch.zeros(flag.size()).cuda()
            inx = torch.where((flag != (m + 1)), inx, torch.Tensor([1]).cuda())
            grad_weight_ste = (net.fc1.weight.grad * inx).data.clamp_(-1, 1)
            grad_weight_ste = torch.where((abs(grad_weight_ste) != 1),
                                          grad_weight_ste, torch.Tensor([0]).cuda())
            grad_value[m] = torch.sum(grad_weight_ste)
        net.levels1.grad = grad_value.cuda()
        # layer 2
        flag = net.index2
        grad_value = torch.zeros((M,)).cuda()
        for m in range(M):
            inx = torch.zeros(flag.size()).cuda()
            inx = torch.where((flag != (m + 1)), inx, torch.Tensor([1]).cuda())
            grad_weight_ste = (net.fc2.weight.grad * inx).data.clamp_(-1, 1)
            grad_weight_ste = torch.where((abs(grad_weight_ste) != 1),
                                          grad_weight_ste, torch.Tensor([0]).cuda())
            grad_value[m] = torch.sum(grad_weight_ste)
        net.levels2.grad = grad_value.cuda()
        # layer 3
        flag = net.index3
        grad_value = torch.zeros((M,)).cuda()
        for m in range(M):
            inx = torch.zeros(flag.size()).cuda()
            inx = torch.where((flag != (m + 1)), inx, torch.Tensor([1]).cuda())
            grad_weight_ste = (net.fc3.weight.grad * inx).data.clamp_(-1, 1)
            grad_weight_ste = torch.where((abs(grad_weight_ste) != 1),
                                          grad_weight_ste, torch.Tensor([0]).cuda())
            grad_value[m] = torch.sum(grad_weight_ste)
        net.levels3.grad = grad_value.cuda()

        # for p in list(net.parameters()):
        #     if hasattr(p, 'org'):
        #         p.data.copy_(p.org)
        # net.fc1.weight.requires_grad = False
        # net.fc2.weight.requires_grad = False
        # net.fc3.weight.requires_grad = False
        optimizer.step()  # Optimizer update

        # keep each layer's levels sorted and set the partition to half their sum
        sort, _ = torch.sort(net.levels1.data)
        net.levels1.data = sort.cuda()
        lev = torch.sum(sort) * 0.5
        net.partitions1.data[0] = lev
        sort, _ = torch.sort(net.levels2.data)
        net.levels2.data = sort.cuda()
        lev = torch.sum(sort) * 0.5
        net.partitions2.data[0] = lev
        sort, _ = torch.sort(net.levels3.data)
        net.levels3.data = sort.cuda()
        lev = torch.sum(sort) * 0.5
        net.partitions3.data[0] = lev
        # sort, _ = torch.sort(net.levels4.data)
        # net.levels4.data = sort.cuda()
        # sort, _ = torch.sort(net.levels5.data)
        # net.levels5.data = sort.cuda()

        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum()

        sys.stdout.write('\r')
        sys.stdout.write('| Epoch [%3d/%3d] Iter[%3d/%3d]\t\tLoss: %.4f Acc@1: %.3f%%'
                         % (epoch, num_epochs, batch_idx + 1,
                            (len(trainset) // batch_size) + 1,
                            loss.item(), 100. * correct / total))
        sys.stdout.flush()

    acc = correct
    diagnostics_to_write = {'Epoch': epoch, 'Loss': loss.item(),
                            'Accuracy': 100 * correct / total}
    with open(logfile, 'a') as lf:
        lf.write(str(diagnostics_to_write))

    if correct >= quan_cor:
        quan_cor = correct
        print('Top1 correct:', quan_cor)
        np.savez('nonbayes_param' + str(M) + '.npz',
                 par0=net.partitions1.data.cpu(), lev0=net.levels1.data.cpu(),
                 par1=net.partitions2.data.cpu(), lev1=net.levels2.data.cpu(),
                 par2=net.partitions3.data.cpu(), lev2=net.levels3.data.cpu(),
                 par3=net.partitions4.data.cpu(), lev3=net.levels4.data.cpu(),
                 par4=net.partitions5.data.cpu(), lev4=net.levels5.data.cpu())
        state = {
            'net': net,
            'correct': correct,
            'epoch': epoch,
        }
        if not os.path.isdir('checkpoint'):
            os.mkdir('checkpoint')
        save_point = './checkpoint/nonBayes_quan' + args.dataset + os.sep
        if not os.path.isdir(save_point):
            os.mkdir(save_point)
        torch.save(state, save_point + file_name + str(M) + '.t7')

    print('\n')
    print('layer1\t', net.levels1.data)
    print('layer2\t', net.levels2.data)
    print('layer3\t', net.levels3.data)
    print('layer4\t', net.levels4.data)
    print('layer5\t', net.levels5.data)
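# The three per-layer blocks above repeat the same computation; a sketch of
# the shared step, assuming `flag` holds 1-based level indices per weight:
def level_gradients(weight_grad, flag, M):
    grad_value = torch.zeros(M, device=weight_grad.device)
    for m in range(M):
        inx = (flag == (m + 1)).float()                         # weights on level m
        g = (weight_grad * inx).clamp(-1, 1)                    # straight-through clipping
        g = torch.where(g.abs() != 1, g, torch.zeros_like(g))   # drop saturated entries
        grad_value[m] = g.sum()
    return grad_value
# usage (hypothetical): net.levels1.grad = level_gradients(net.fc1.weight.grad, net.index1, M)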
def train(epoch):
    net.train()
    train_loss = 0
    entropy_loss = 0
    correct = 0
    total = 0
    temp1_accum = 0
    temp2_accum = 0
    sigmoid_sum_loss = 0
    sharing_node_loss = 0
    ce_loss = torch.zeros((1)).cuda()
    entropy_loss = torch.zeros((1)).cuda()
    num_classes = args.num_classes
    optimizer = optim.SGD(net.parameters(), lr=cf.learning_rate(args.lr, epoch),
                          nesterov=True, momentum=0.9, weight_decay=5e-4)
    # optimizer = optim.Adam(net.parameters(), lr=cf.learning_rate(args.lr, epoch), betas=(0.5, 0.999), weight_decay=5e-4)

    print('\n=> Training Epoch #%d, LR=%.4f' % (epoch, cf.learning_rate(args.lr, epoch)))
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()  # GPU settings
        optimizer.zero_grad()
        inputs, targets = Variable(inputs), Variable(targets)
        outputs = net(inputs)  # Forward Propagation

        num_sampling = num_classes
        if args.loss == 'ce' and args.sampling_rate != 1.:
            num_sampling = int(num_classes * args.sampling_rate)
            full_output = outputs
            outputs, targets = sampling_for_loss(outputs, targets, num_sampling,
                                                 num_classes=num_classes, sharing=False)
        loss = criterion_CE(outputs[:, :num_sampling], targets)  # Loss
        if args.loss == 'ce' and args.ce_entropy != 0.:
            entropy_loss = args.ce_entropy * entropy(outputs)
            loss += entropy_loss
        unknown_node_loss = torch.zeros((1)).cuda()
        # running average of the CE term
        temp1_accum = loss.detach().cpu() * (1. / (batch_idx + 1.)) \
            + temp1_accum * (batch_idx / (batch_idx + 1.))

        if args.loss == 'bce':
            loss = torch.zeros(1).cuda()
        if 'bce' in args.loss:
            # if num_classes > 30:
            #     num_sampling = int(num_classes * args.sampling_rate)
            #     outputs, targets = sampling_for_loss(outputs, targets, num_sampling)
            #     new_bce_targets = target_transform_for_elementwise_bce(targets, num_sampling)
            #     temp2 = args.bce_scale * criterion_BCE(F.sigmoid(outputs[:, :num_sampling]), new_bce_targets).cuda()
            # else:
            num_sampling = int(num_classes * args.sampling_rate)
            if args.sampling_rate != 1.:
                full_output = outputs
                outputs, targets = sampling_for_loss(outputs, targets, num_sampling,
                                                     num_classes=num_classes)
                new_bce_targets = target_transform_for_elementwise_bce(targets, num_sampling).cuda()
                temp2 = args.bce_scale * criterion_BCE(F.sigmoid(outputs[:, :num_sampling]), new_bce_targets)
            else:
                bce_targets = target_transform_for_elementwise_bce(targets, num_classes).cuda()
                temp2 = args.bce_scale * criterion_BCE(F.sigmoid(outputs[:, :num_classes]), bce_targets).cuda()
            # running average of the BCE term
            temp2_accum = temp2.detach().cpu() * (1. / (batch_idx + 1.)) \
                + temp2_accum * (batch_idx / (batch_idx + 1.))

            if args.sigmoid_sum is not None:
                # sigmoid_sum = torch.sum(F.sigmoid(full_output[:, :num_classes]), dim=1)
                sigmoid_sum = torch.sum(F.sigmoid(full_output[:, :num_sampling]), dim=1)
                sigmoid_sum_loss = F.mse_loss(sigmoid_sum, args.sigmoid_sum * torch.ones_like(sigmoid_sum))
                loss += 0.5 * sigmoid_sum_loss
            loss += temp2

        if args.sharing is not None:
            classification_target = Variable(torch.zeros(targets.size(0)).long()).cuda()
            output_target_select = outputs[:, :num_sampling].gather(dim=1, index=targets.unsqueeze(1))
            output_target_sharing_concat = torch.cat(
                (output_target_select.view(-1, 1), outputs[:, num_sampling].view(-1, 1)), 1)
            ce_loss = args.sharing * F.cross_entropy(
                F.softmax(output_target_sharing_concat, dim=1), classification_target)
            mask = torch.ones(outputs[:, :num_sampling].size()).bool().cuda()
            for i in range(mask.size(0)):
                mask[i, targets[i]] = False
            output_for_entropy_except_target_node = torch.masked_select(outputs[:, :num_sampling], mask)
            output_for_entropy_except_target_node = output_for_entropy_except_target_node.view(
                outputs[:, :num_sampling].size(0), -1)
            output_for_entropy_except_target_node = torch.cat(
                (output_for_entropy_except_target_node.view(targets.size(0), -1, 1),
                 outputs[:, num_sampling].view(-1, 1, 1).expand(targets.size(0), num_sampling - 1, 1)), 2)
            # entropy_loss = args.ent * entropy(output_for_entropy_except_target_node)
            entropy_loss = args.ent * sharing_entropy(output_for_entropy_except_target_node)
            loss += ce_loss + entropy_loss
        # if args.ent != 0 and args.sharing is None:
        #     loss += entropy_loss

        train_loss += loss.item()
        max_logit, predicted = torch.max(outputs[:, :num_sampling].data, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum()

        if args.unknown_is_True:
            # ramp the unknown-node weight up to at most 0.1
            weight = min(0.01 * epoch, 0.1)
            select1 = (F.sigmoid(max_logit) < 0.5)
            select2 = (F.sigmoid(outputs[:, targets]) < 0.5)
            select = (select1 + select2) >= 1
            if args.sepa_unknown_sharing:
                output_gather = F.sigmoid(outputs[:, num_classes + 1].masked_select(select))
            else:
                output_gather = F.sigmoid(outputs[:, num_classes].masked_select(select))
            node_target = torch.ones(output_gather.size()).cuda()
            if output_gather.size(0) > 0:
                sharing_node_loss = weight * criterion_BCE(output_gather, node_target)
                loss += sharing_node_loss
            if args.sepa_unknown_sharing:
                select_unknown = (select1 + select2) < 1
                unknown_output_gather = F.sigmoid(outputs[:, num_classes + 1].masked_select(select_unknown))
                if unknown_output_gather.size(0) > 0:
                    unknown_node_target = torch.zeros(unknown_output_gather.size()).cuda()
                    unknown_node_loss = weight * criterion_BCE(unknown_output_gather, unknown_node_target)
                    loss += unknown_node_loss

        loss.backward()   # Backward Propagation
        optimizer.step()  # Optimizer update

        sys.stdout.write('\r')
        if not args.sepa_unknown_sharing:
            if args.sigmoid_sum is not None:
                sys.stdout.write('| Epoch [%3d/%3d] Iter[%3d/%3d]\t\tLoss: %.4f CE_loss : %.4f BCE_loss : %.4f Ent_loss : %.4f Sha_loss : %.4f node_loss : %.4f sum_loss : %.4f Acc@1: %.3f%%'
                                 % (epoch, num_epochs, batch_idx + 1,
                                    (len(trainset) // batch_size) + 1,
                                    loss.item(), temp1_accum, temp2_accum, entropy_loss,
                                    ce_loss, sharing_node_loss, sigmoid_sum_loss,
                                    float(100.00 * float(correct) / float(total))))
            else:
                sys.stdout.write('| Epoch [%3d/%3d] Iter[%3d/%3d]\t\tLoss: %.4f CE_loss : %.4f BCE_loss : %.4f Ent_loss : %.4f Sha_loss : %.4f node_loss : %.4f Acc@1: %.3f%%'
                                 % (epoch, num_epochs, batch_idx + 1,
                                    (len(trainset) // batch_size) + 1,
                                    loss.item(), temp1_accum, temp2_accum, entropy_loss,
                                    ce_loss, sharing_node_loss,
                                    float(100.00 * float(correct) / float(total))))
        else:
            sys.stdout.write('| Epoch [%3d/%3d] Iter[%3d/%3d]\t\tLoss: %.4f CE_loss : %.4f BCE_loss : %.4f Ent_loss : %.4f Sha_loss : %.4f node_loss : %.4f unknown_loss : %.4f Acc@1: %.3f%%'
                             % (epoch, num_epochs, batch_idx + 1,
                                (len(trainset) // batch_size) + 1,
                                loss.item(), temp1_accum, temp2_accum, entropy_loss,
                                ce_loss, sharing_node_loss, unknown_node_loss,
                                float(100.00 * float(correct) / float(total))))
        sys.stdout.flush()
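# entropy() and sharing_entropy() are defined elsewhere in this script; the
# standard form of an entropy regularizer applied to logits would be (an
# assumption, shown for reference):
import torch.nn.functional as F

def entropy_regularizer(logits):
    # mean Shannon entropy of the softmax distribution over the class axis
    p = F.softmax(logits, dim=1)
    return -(p * F.log_softmax(logits, dim=1)).sum(dim=1).mean()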