def initsol(self):
    # generate data (by adding noise to noise-free data)
    torch.manual_seed(time.time())
    # one observation
    n = torch.randn(self.N, dtype=torch.float32, requires_grad=False, device=mydevice)
    self.y = self.y0 + self.SNR * torch.norm(self.y0) / torch.norm(n) * n
    # parameters, initialized to zero
    x = torch.zeros(self.M, requires_grad=True, device=mydevice)

    def lossfunction(A, y, x, alpha=self.rho[0], beta=self.rho[1]):
        Ax = torch.matmul(A, x)
        err = y - Ax
        return torch.norm(err, 2)**2 + alpha * torch.norm(x, 2)**2 + beta * torch.norm(x, 1)

    opt = LBFGSNew([x], history_size=7, max_iter=10, line_search_fn=True, batch_mode=False)

    # find solution x
    for nepoch in range(0, 20):
        def closure():
            if torch.is_grad_enabled():
                opt.zero_grad()
            loss = lossfunction(self.A, self.y, x, self.rho[0], self.rho[1])
            if loss.requires_grad:
                loss.backward()
                #print(loss.data.item())
            return loss

        opt.step(closure)

    self.x = x
# set up primal, dual variables
y1 = torch.empty(N, dtype=torch.float, requires_grad=False)
y2 = torch.empty(N, dtype=torch.float, requires_grad=False)
y3 = torch.empty(N, dtype=torch.float, requires_grad=False)
y1.fill_(0.0)
y2.fill_(0.0)
y3.fill_(0.0)
z = torch.empty(N, dtype=torch.float, requires_grad=False)
z.fill_(0.0)

#opt1=optim.Adam(filter(lambda p: p.requires_grad, net1.parameters()),lr=0.001)
#opt2=optim.Adam(filter(lambda p: p.requires_grad, net2.parameters()),lr=0.001)
#opt3=optim.Adam(filter(lambda p: p.requires_grad, net3.parameters()),lr=0.001)
opt1 = LBFGSNew(filter(lambda p: p.requires_grad, net1.parameters()),
                history_size=10, max_iter=4, line_search_fn=True, batch_mode=True)
opt2 = LBFGSNew(filter(lambda p: p.requires_grad, net2.parameters()),
                history_size=10, max_iter=4, line_search_fn=True, batch_mode=True)
opt3 = LBFGSNew(filter(lambda p: p.requires_grad, net3.parameters()),
                history_size=10, max_iter=4, line_search_fn=True, batch_mode=True)

# need to scale rho down when starting from scratch
rho = 0.001
def hyperparameters_tuning_LBFGS_new_minibatch2(trainset, valset, batchsize_grid,
                                                max_iter_grid, epochs, model_NN):
    training_loss = []
    test_loss = []
    training_accuracy = []
    test_accuracy = []
    times = []
    parameters = []
    results = []
    Names = ["training_loss", "training_accuracy", "test_loss", "test_accuracy",
             "times", "parameters: batch iter"]
    results.append(Names)

    for i in range(len(batchsize_grid)):
        bs = batchsize_grid[i]
        max_iter_ = max_iter_grid[i]
        trainloader = torch.utils.data.DataLoader(trainset, batch_size=bs, shuffle=True)
        valloader = torch.utils.data.DataLoader(valset, batch_size=bs, shuffle=True)
        dataiter = iter(trainloader)
        images, _ = next(dataiter)
        image_size = images[0].shape[1]
        input_size = int(image_size**2)
        output_size = 10
        print("Minibatch size: ", bs)
        print("max_iter / history size: ", max_iter_)

        if model_NN == "FCNN":
            sizes = [input_size, 128, 64, output_size]
            model = fully_connected_NN(sizes)
            criterion = nn.NLLLoss()
            optimizer = LBFGSNew(model.parameters(), max_iter=max_iter_,
                                 history_size=max_iter_, line_search_fn=True,
                                 batch_mode=True)
        elif model_NN == "CNN":
            model = ConvNet(image_size)
            criterion = nn.CrossEntropyLoss()
            optimizer = LBFGSNew(model.parameters(), max_iter=max_iter_,
                                 history_size=max_iter_, line_search_fn=True,
                                 batch_mode=True)

        if model_NN == "FCNN":
            train_losses, test_losses, train_accuracies, test_accuracies, train_time = optimize(
                optimizer, epochs, trainloader, valloader, model, criterion, method="LBFGS")
        elif model_NN == "CNN":
            train_losses, test_losses, train_accuracies, test_accuracies, train_time = optimize_CNN(
                optimizer, epochs, trainloader, valloader, model, criterion, method="LBFGS")

        # save the parameters and results for this trial
        parameter = []
        parameter.append(bs)
        parameter.append(max_iter_)
        parameters.append(parameter)
        training_loss.append(train_losses)
        test_loss.append(test_losses)
        training_accuracy.append(train_accuracies)
        test_accuracy.append(test_accuracies)
        times.append(train_time)

    results.append(training_loss)
    results.append(training_accuracy)
    results.append(test_loss)
    results.append(test_accuracy)
    results.append(times)
    results.append(parameters)
    return results
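# A minimal, hypothetical usage sketch for the tuning routine above (not part of the
# original script): the MNIST datasets, the grids and the epoch count are assumptions
# chosen only to illustrate the expected call signature.
from torchvision import datasets, transforms

transform = transforms.ToTensor()
trainset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
valset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

batchsize_grid = [128, 256, 512]   # one minibatch size per trial
max_iter_grid = [4, 6, 8]          # matching max_iter / history_size per trial
results = hyperparameters_tuning_LBFGS_new_minibatch2(trainset, valset,
                                                      batchsize_grid, max_iter_grid,
                                                      epochs=5, model_NN="FCNN")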
# (continuation of the loop copying the flat vector X back into each network's parameters)
param1.data.copy_(X[cnt:cnt + numel].view_as(param1.data))
param2.data.copy_(X[cnt:cnt + numel].view_as(param2.data))
param3.data.copy_(X[cnt:cnt + numel].view_as(param3.data))
cnt += numel

from lbfgsnew import LBFGSNew  # custom optimizer
import torch.optim as optim

criterion1 = nn.CrossEntropyLoss()
criterion2 = nn.CrossEntropyLoss()
criterion3 = nn.CrossEntropyLoss()

#optimizer1=optim.Adam(net1.parameters(), lr=0.001)
#optimizer2=optim.Adam(net2.parameters(), lr=0.001)
#optimizer3=optim.Adam(net3.parameters(), lr=0.001)
optimizer1 = LBFGSNew(net1.parameters(), history_size=10, max_iter=4,
                      line_search_fn=True, batch_mode=True)
optimizer2 = LBFGSNew(net2.parameters(), history_size=10, max_iter=4,
                      line_search_fn=True, batch_mode=True)
optimizer3 = LBFGSNew(net3.parameters(), history_size=10, max_iter=4,
                      line_search_fn=True, batch_mode=True)

start_time = time.time()
# train network: 12 epochs for LBFGS, 60 for the other optimizers
    unfreeze_one_layer(net_dict[ck], ci)
else:
    unfreeze_one_block(net_dict[ck], ci)

trainable = filter(lambda p: p.requires_grad, net_dict[0].parameters())
params_vec1 = torch.cat([x.view(-1) for x in list(trainable)])
# number of parameters trained
N = params_vec1.numel()
z = torch.empty(N, dtype=torch.float, requires_grad=False)
z.fill_(0.0)

opt_dict = {}
for ck in range(K):
    opt_dict[ck] = LBFGSNew(filter(lambda p: p.requires_grad, net_dict[ck].parameters()),
                            history_size=10, max_iter=4, line_search_fn=True, batch_mode=True)
    #opt_dict[ck]=optim.Adam(filter(lambda p: p.requires_grad, net_dict[ck].parameters()),lr=0.001)

############# loop 1 (federated averaging for a subset of the model)
for nadmm in range(Nadmm):
    ##### loop 2 (data): all network updates are done per epoch, because K is large
    ##### and the data per host is assumed to be small
    for epoch in range(Nepoch):
        #### loop 3 (models)
        for ck in range(K):
            running_loss = 0.0
            for i, data1 in enumerate(trainloader_dict[ck], 0):
def train(model_name, model, trainloader, testloader, device, opt, nb_epochs, lr=0.001):
    history_loss = []
    history_acc = []
    criterion = nn.CrossEntropyLoss()
    print("Using optimizer: ", opt)

    #TODO adjust optimizer hyperparameters
    if opt == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    elif opt == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=lr)
    elif opt == 'lbfgs':
        optimizer = LBFGSNew(model.parameters(), history_size=7, max_iter=2,
                             line_search_fn=True, batch_mode=True)
        #optimizer = optim.LBFGS(model.parameters())
    else:
        raise NotImplementedError

    for epoch in range(nb_epochs):
        # Train for each epoch
        model.train()
        running_loss = 0.0
        for batch_idx, data in enumerate(trainloader, 0):
            inputs, labels = data[0].to(device), data[1].to(device)

            if opt == 'lbfgs':
                # Define closure
                def closure():
                    if torch.is_grad_enabled():
                        optimizer.zero_grad()
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    if loss.requires_grad:
                        loss.backward()
                    return loss

                optimizer.step(closure)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
            else:
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

            running_loss += loss.item()
            #if batch_idx % 100 == 99:  # print every 100 mini-batches
            #    print('[{}, {}] loss: {}'.format(epoch + 1, batch_idx + 1, running_loss / 100))
            #    running_loss = 0.0

        # Test for each epoch
        epoch_loss = running_loss / (batch_idx + 1)
        epoch_acc = test(model, testloader, device)
        print("Epoch {} train loss: {}, test acc: {}".format(epoch + 1, epoch_loss, epoch_acc))
        history_loss.append(epoch_loss)
        history_acc.append(epoch_acc)

    print('Finished Training')
    with open('history_loss_mnist' + '_' + model_name + '_' + opt + '.json', 'w') as f:
        json.dump(history_loss, f)
    with open('history_acc_mnist' + '_' + model_name + '_' + opt + '.json', 'w') as f:
        json.dump(history_acc, f)
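# A hypothetical call to the train() routine above, sketched only to show the expected
# arguments; the small MLP, the MNIST loaders and the batch size are assumptions, and the
# call still relies on the test() helper defined elsewhere in this script.
from torchvision import datasets, transforms

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
mnist_train = datasets.MNIST('./data', train=True, download=True, transform=transforms.ToTensor())
mnist_test = datasets.MNIST('./data', train=False, download=True, transform=transforms.ToTensor())
trainloader = torch.utils.data.DataLoader(mnist_train, batch_size=256, shuffle=True)
testloader = torch.utils.data.DataLoader(mnist_test, batch_size=256, shuffle=False)

model = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 128), nn.ReLU(),
                      nn.Linear(128, 10)).to(device)
train('mlp', model, trainloader, testloader, device, opt='lbfgs', nb_epochs=5)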
def step(self, action, keepnoise=False):
    done = False  # make sure to return True at some point
    # update state based on the action: rho = scale*(action), mapped into [LOW, HIGH]
    self.rho = action * (HIGH - LOW) / 2 + (HIGH + LOW) / 2
    penalty = 0
    # make sure rho stays within limits; if this happens, add a penalty
    for ci in range(self.K):
        if self.rho[ci] < LOW:
            self.rho[ci] = LOW
            penalty += -0.1
        if self.rho[ci] > HIGH:
            self.rho[ci] = HIGH
            penalty += -0.1

    # generate data (by adding noise to noise-free data)
    if not keepnoise:
        torch.manual_seed(time.time())
        n = torch.randn(self.N, dtype=torch.float32, requires_grad=False, device=mydevice)
        y = self.y0 + self.SNR * torch.norm(self.y0) / torch.norm(n) * n
    else:
        y = self.y

    # parameters, initialized to zero
    x = torch.zeros(self.M, requires_grad=True, device=mydevice)

    def lossfunction(A, y, x, alpha=self.rho[0], beta=self.rho[1]):
        Ax = torch.matmul(A, x)
        err = y - Ax
        return torch.norm(err, 2)**2 + alpha * torch.norm(x, 2)**2 + beta * torch.norm(x, 1)

    opt = LBFGSNew([x], history_size=7, max_iter=10, line_search_fn=True, batch_mode=False)

    # find solution x
    for nepoch in range(0, 20):
        def closure():
            if torch.is_grad_enabled():
                opt.zero_grad()
            loss = lossfunction(self.A, y, x, self.rho[0], self.rho[1])
            if loss.requires_grad:
                loss.backward()
                #print(loss.data.item())
            return loss

        opt.step(closure)

    # Jacobian of model = A
    jac = jacobian(torch.matmul(self.A, x), x).to(mydevice)
    # right hand term = -2 A^T
    df_dx = (lambda yi: gradient(lossfunction(self.A, yi, x, self.rho[0], self.rho[1]), x))
    # no need to pass one-hot vectors, because we calculate d( )/dy^T in one go
    e = torch.ones_like(y)  # all ones
    ll = torch.autograd.functional.jacobian(df_dx, e)
    mm = torch.zeros_like(ll).to(mydevice)  # copy ll because it is modified
    for i in range(self.N):
        ll2 = ll[:, i].clone().detach()
        mm[:, i] = inv_hessian_mult(opt, ll2)
    # multiply by Jacobian of model
    B = torch.matmul(jac, mm).to('cpu')
    #print(B)
    # eigenvalues
    E, _ = torch.linalg.eig(B)
    # 1 + eigenvalues (real part only), sorted in ascending order
    EE = E.real + 1

    # remember this for rendering later
    self.x = x
    observation = {'A': self.A.view(-1).cpu(), 'eig': EE}
    # final error ||Ax - y||
    final_err = torch.norm(torch.matmul(self.A, x) - y, 2).detach()
    # reward: penalize by adding penalty
    # residual: normalize by data power; eigenvalues: normalize by min/max
    reward = torch.norm(y, 2) / final_err + torch.min(EE) / torch.max(EE) + penalty
    #reward.clamp_(-1,1)  # clip to [-1,1] - only useful for multiple environments, not here
    # info: meta details {}
    info = {}
    return observation, reward, done, info
for i in range(Niter):
    # get the inputs
    patchx, patchy, inputs, uvcoords = get_data_minibatch(
        file_list, sap_list, batch_size=default_batch, patch_size=patch_size,
        normalize_data=True, num_channels=num_in_channels, uvdist=True)
    # wrap them in variables
    x = Variable(inputs).to(mydevice)
    uv = Variable(uvcoords).to(mydevice)
    (nbatch, nchan, nx, ny) = inputs.shape
    # nbatch = patchx x patchy x default_batch
    # i.e., one baseline (per polarization, real/imag) will create patchx x patchy batches
    batch_per_bline = patchx * patchy
    X = torch.transpose(x.view(-1, L), 0, 1)
    # set up S for this data batch
    S = torch.rand((M, nbatch), requires_grad=True, dtype=torch.float32, device=mydevice)
    # set up optimizer
    optimizer = LBFGSNew([S], history_size=7, max_iter=10, line_search_fn=True, batch_mode=True)

    def closure():
        if torch.is_grad_enabled():
            optimizer.zero_grad()
        # loss
        loss = criterion(X, torch.matmul(A, S)) / (nbatch * L) \
               + lambda1 * torch.linalg.norm(S, 1) / S.numel()
        if loss.requires_grad:
            #print('%d %d %e'%(epoch,i,loss.data.item()))
            loss.backward()
        return loss

    optimizer.step(closure)

    with torch.no_grad():
        # now update A
        E = X - torch.matmul(A, S)
def main():
    best_prec1 = 0

    # check whether the save_dir exists
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    trainable = args.trainable
    if trainable == "none":
        trainable = "freeze"
    if args.use_lbfgs:
        opt = "LBFGS"
    else:
        opt = "SGD"
    if args.wide_resnet:
        wide = "wide"
    else:
        wide = ""
    datetime = date.today().strftime("%b-%d-%Y")
    exp_name = "{}{}{}{}_{}".format(args.arch, wide, opt, trainable, datetime)
    logfile = open(os.path.join(args.save_dir, "{}.txt".format(exp_name)), "a")

    if args.wide_resnet:
        # use wide residual net https://arxiv.org/abs/1605.07146
        model = torchvision.models.resnet.wide_resnet50_2()
    else:
        model = resnet.__dict__[args.arch]()

    if trainable == "freeze":
        freeze_model(model)
    elif trainable == "bn":
        freeze_model(model)
        unfreeze_model(model, ["gamma", "beta"])
    print(test(model), file=logfile)

    model.cuda()

    if args.use_lbfgs:
        optimizer = LBFGSNew(model.parameters(), history_size=7, max_iter=2,
                             line_search_fn=True, batch_mode=True)
    else:
        optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)

    if args.arch in ['resnet1202', 'resnet110']:
        # for resnet1202 the original paper uses lr=0.01 for the first 400 minibatches
        # as warm-up, then switches back; in this setup that corresponds to the first epoch
        for param_group in optimizer.param_groups:
            param_group['lr'] = args.lr * 0.1

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                        milestones=[100, 150],
                                                        last_epoch=args.start_epoch - 1)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    torch.manual_seed(0)
    train_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10(root='./data', train=True,
                         transform=transforms.Compose([
                             transforms.RandomHorizontalFlip(),
                             transforms.RandomCrop(32, 4),
                             transforms.ToTensor(),
                             normalize,
                         ]),
                         download=True),
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=True)

    torch.manual_seed(0)
    val_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10(root='./data', train=False,
                         transform=transforms.Compose([
                             transforms.ToTensor(),
                             normalize,
                         ])),
        batch_size=128, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    if args.half:
        model.half()
        criterion.half()

    if args.evaluate:
        validate(val_loader, model, criterion, logfile)
        return

    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        print('current lr {:.5e}'.format(optimizer.param_groups[0]['lr']))
        train(train_loader, model, criterion, optimizer, epoch, logfile)
        lr_scheduler.step()

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion, logfile)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)

        if epoch > 0 and epoch % args.save_every == 0:
            save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
            }, is_best, filename=os.path.join(args.save_dir, 'checkpoint.th'))

        save_checkpoint({
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
        }, is_best, filename=os.path.join(args.save_dir, 'model.th'))

    logfile.close()
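# train() and validate() are defined elsewhere in this script. Below is a minimal,
# hypothetical sketch of the per-batch update train() would need when args.use_lbfgs is
# set, assuming the standard closure pattern LBFGSNew expects; the helper name and its
# arguments are illustrative only, not taken from the original code.
def lbfgs_train_step(model, criterion, optimizer, inputs, targets):
    # LBFGSNew (like torch.optim.LBFGS) may re-evaluate the closure during its line
    # search, so the forward/backward pass lives inside the closure, not the outer loop
    def closure():
        if torch.is_grad_enabled():
            optimizer.zero_grad()
        loss = criterion(model(inputs), targets)
        if loss.requires_grad:
            loss.backward()
        return loss

    optimizer.step(closure)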
        correct += (predicted == labels.to(mydevice)).sum()
        total += labels.size(0)
    return 100 * correct // total

#####################################################
lambda1 = 0.000001
lambda2 = 0.001

# loss function and optimizer
import torch.optim as optim
from lbfgsnew import LBFGSNew  # custom optimizer

criterion = nn.CrossEntropyLoss()
#optimizer=optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
#optimizer=optim.Adam(net.parameters(), lr=0.001)
optimizer = LBFGSNew(net.parameters(), history_size=7, max_iter=2,
                     line_search_fn=True, batch_mode=True)

load_model = False
# update from a saved model
if load_model:
    checkpoint = torch.load('./res18.model', map_location=mydevice)
    net.load_state_dict(checkpoint['model_state_dict'])

net.train()  # initialize for training (BN, dropout)
start_time = time.time()
use_lbfgs = True

# train network
for epoch in range(20):
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
params_vec1 = torch.cat([x.view(-1) for x in list(trainable)])
N = params_vec1.numel()
del trainable, params_vec1

z = torch.zeros(N, dtype=torch.float, requires_grad=False).to(mydevice, non_blocking=True)

opt_dict = {}
for ck in range(K):
    if mdl == 0:
        #opt_dict[ck]=optim.Adam(filter(lambda p: p.requires_grad, encoder_dict[ck].parameters()),lr=0.0001)
        opt_dict[ck] = LBFGSNew(filter(lambda p: p.requires_grad, encoder_dict[ck].parameters()),
                                history_size=7, max_iter=2, line_search_fn=True, batch_mode=True)
    elif mdl == 1:
        #opt_dict[ck]=optim.Adam(filter(lambda p: p.requires_grad, contextgen_dict[ck].parameters()),lr=0.0001)
        opt_dict[ck] = LBFGSNew(filter(lambda p: p.requires_grad, contextgen_dict[ck].parameters()),
                                history_size=7, max_iter=2, line_search_fn=True, batch_mode=True)
    else:
        #opt_dict[ck]=optim.Adam(filter(lambda p: p.requires_grad, predictor_dict[ck].parameters()),lr=0.0001)
        opt_dict[ck] = LBFGSNew(filter(