# Training scripts for two-task MultiMNIST-style benchmarks. Each train()
# below comes from a separate script in the repo; they share these imports,
# the model classes (RegressionTrain, RegressionModel, RegressionTrainResNet,
# MnistResNet), and the method-specific solver helpers, which are assumed to
# be imported from the repo's own modules.
import pickle

import numpy as np
import torch
import torch.utils.data
from torch.autograd import Variable


def train(dataset, base_model, niter, j):
    # Train on task j only, via a one-hot preference vector.
    preference = np.array([1. - j, j])
    n_tasks = 2
    print("Preference Vector = {}".format(preference))

    # LOAD DATASET
    # ------------
    # MultiMNIST: multi_mnist.pickle
    if dataset == 'mnist':
        with open('data/multi_mnist.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    # MultiFashionMNIST: multi_fashion.pickle
    if dataset == 'fashion':
        with open('data/multi_fashion.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    # Multi-(Fashion+MNIST): multi_fashion_and_mnist.pickle
    if dataset == 'fashion_and_mnist':
        with open('data/multi_fashion_and_mnist.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    trainX = torch.from_numpy(trainX.reshape(120000, 1, 36, 36)).float()
    trainLabel = torch.from_numpy(trainLabel).long()
    testX = torch.from_numpy(testX.reshape(20000, 1, 36, 36)).float()
    testLabel = torch.from_numpy(testLabel).long()

    train_set = torch.utils.data.TensorDataset(trainX, trainLabel)
    test_set = torch.utils.data.TensorDataset(testX, testLabel)

    batch_size = 256
    train_loader = torch.utils.data.DataLoader(dataset=train_set,
                                               batch_size=batch_size,
                                               shuffle=True)
    test_loader = torch.utils.data.DataLoader(dataset=test_set,
                                              batch_size=batch_size,
                                              shuffle=False)

    print('==>>> total training batch number: {}'.format(len(train_loader)))
    print('==>>> total testing batch number: {}'.format(len(test_loader)))

    # ---------***---------
    # DEFINE MODEL
    # ---------------------
    if base_model == 'lenet':
        model = RegressionTrain(RegressionModel(n_tasks), preference)
    if base_model == 'resnet18':
        model = RegressionTrainResNet(MnistResNet(n_tasks), preference)
    if torch.cuda.is_available():
        model.cuda()

    # ---------***---------
    # DEFINE OPTIMIZERS
    # -----------------
    # Choose a different optimizer for each base model
    if base_model == 'lenet':
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.)
        # scheduler = torch.optim.lr_scheduler.MultiStepLR(
        #     optimizer, milestones=[15, 30, 45, 60, 75, 90], gamma=0.5)
    if base_model == 'resnet18':
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                         milestones=[10, 20],
                                                         gamma=0.1)

    # ---------***---------
    # CONTAINERS FOR KEEPING TRACK OF PROGRESS
    # ----------------------------------------
    task_train_losses = []
    train_accs = []

    # ---------***---------
    # TRAIN
    # -----
    for t in range(niter):
        # the scheduler is only defined for resnet18
        if base_model == 'resnet18':
            scheduler.step()

        model.train()
        for (it, batch) in enumerate(train_loader):
            X = batch[0]
            ts = batch[1]
            if torch.cuda.is_available():
                X = X.cuda()
                ts = ts.cuda()

            # Update using only the j-th task
            optimizer.zero_grad()
            task_j_loss = model(X, ts, j)
            task_j_loss.backward()
            optimizer.step()

        # Calculate and record performance
        if t == 0 or (t + 1) % 2 == 0:
            model.eval()
            with torch.no_grad():
                total_train_loss = []
                train_acc = []

                correct1_train = 0
                correct2_train = 0

                # evaluate on the held-out loader
                for (it, batch) in enumerate(test_loader):
                    X = batch[0]
                    ts = batch[1]
                    if torch.cuda.is_available():
                        X = X.cuda()
                        ts = ts.cuda()

                    valid_train_loss = model(X, ts)
                    total_train_loss.append(valid_train_loss)
                    output1 = model.model(X).max(2, keepdim=True)[1][:, 0]
                    output2 = model.model(X).max(2, keepdim=True)[1][:, 1]
                    correct1_train += output1.eq(
                        ts[:, 0].view_as(output1)).sum().item()
                    correct2_train += output2.eq(
                        ts[:, 1].view_as(output2)).sum().item()

                # normalize by the size of the loader actually iterated
                train_acc = np.stack([
                    1.0 * correct1_train / len(test_loader.dataset),
                    1.0 * correct2_train / len(test_loader.dataset)
                ])

                total_train_loss = torch.stack(total_train_loss)
                average_train_loss = torch.mean(total_train_loss, dim=0)

            # record and print
            if torch.cuda.is_available():
                task_train_losses.append(average_train_loss.data.cpu().numpy())
                train_accs.append(train_acc)
                print('{}/{}: train_loss={}, train_acc={}'.format(
                    t + 1, niter, task_train_losses[-1], train_accs[-1]))

    torch.save(model.model.state_dict(),
               f'./saved_model/{dataset}_{base_model}_niter_{niter}.pickle')

    result = {
        "training_losses": task_train_losses,
        "training_accuracies": train_accs
    }
    return result
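# Usage sketch (an assumption, not part of the original script): the
# single-task variant above is typically run once per task, with j = 0 and
# j = 1 selecting a one-hot preference, to obtain per-task baselines.
def run_single_task_baselines(dataset='mnist', base_model='lenet', niter=100):
    results = {}
    for j in (0, 1):
        results[j] = train(dataset, base_model, niter, j)
    return results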
def train(dataset, base_model, niter, npref, init_weight, pref_idx, leak):
    n_tasks = 2

    # load dataset

    # MultiMNIST: multi_mnist.pickle
    if dataset == 'mnist':
        with open('data/multi_mnist.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    # MultiFashionMNIST: multi_fashion.pickle
    if dataset == 'fashion':
        with open('data/multi_fashion.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    # Multi-(Fashion+MNIST): multi_fashion_and_mnist.pickle
    if dataset == 'fashion_and_mnist':
        with open('data/multi_fashion_and_mnist.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    trainX = torch.from_numpy(trainX.reshape(120000, 1, 36, 36)).float()
    trainLabel = torch.from_numpy(trainLabel).long()
    testX = torch.from_numpy(testX.reshape(20000, 1, 36, 36)).float()
    testLabel = torch.from_numpy(testLabel).long()

    train_set = torch.utils.data.TensorDataset(trainX, trainLabel)
    test_set = torch.utils.data.TensorDataset(testX, testLabel)

    batch_size = 256
    train_loader = torch.utils.data.DataLoader(dataset=train_set,
                                               batch_size=batch_size,
                                               shuffle=True)
    test_loader = torch.utils.data.DataLoader(dataset=test_set,
                                              batch_size=batch_size,
                                              shuffle=False)

    print('==>>> total training batch number: {}'.format(len(train_loader)))
    print('==>>> total testing batch number: {}'.format(len(test_loader)))

    # define the base model
    if base_model == 'lenet':
        model = RegressionTrain(RegressionModel(n_tasks), init_weight)
    if base_model == 'resnet18':
        model = RegressionTrainResNet(MnistResNet(n_tasks), init_weight)
    if torch.cuda.is_available():
        model.cuda()

    # choose a different optimizer for each base model
    if base_model == 'lenet':
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.)
        # scheduler = torch.optim.lr_scheduler.MultiStepLR(
        #     optimizer, milestones=[15, 30, 45, 60, 75, 90], gamma=0.5)
    if base_model == 'resnet18':
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                         milestones=[10, 20],
                                                         gamma=0.1)

    # store information during optimization
    task_train_losses = []
    train_accs = []

    # run niter epochs of GradDrop
    for t in range(niter):
        # scheduler.step()
        model.train()
        for (it, batch) in enumerate(train_loader):
            X = batch[0]
            ts = batch[1]
            if torch.cuda.is_available():
                X = X.cuda()
                ts = ts.cuda()

            # obtain and store the gradient of each task loss
            flat_grads = {}
            for i in range(n_tasks):
                optimizer.zero_grad()
                task_loss = model(X, ts)
                task_loss[i].backward()
                flat_grads[i] = flatten_grad(model.parameters())

            grads = [flat_grads[i]["grad"] for i in range(len(flat_grads))]
            grads = torch.stack(grads)

            # combine the gradients with (leaky) GradDrop
            grads = get_d_graddrop(grads, leak)
            grads = recover_flattened(grads, flat_grads[0]["indices"],
                                      flat_grads[0]["shapes"])

            # optimization step
            optimizer.zero_grad()
            for i, params in enumerate(model.parameters()):
                if grads[i] is not None:
                    params.grad = grads[i]
            optimizer.step()

        # calculate and record performance
        if t == 0 or (t + 1) % 2 == 0:
            model.eval()
            with torch.no_grad():
                total_train_loss = []
                train_acc = []

                correct1_train = 0
                correct2_train = 0

                for (it, batch) in enumerate(test_loader):
                    X = batch[0]
                    ts = batch[1]
                    if torch.cuda.is_available():
                        X = X.cuda()
                        ts = ts.cuda()

                    valid_train_loss = model(X, ts)
                    total_train_loss.append(valid_train_loss)
                    output1 = model.model(X).max(2, keepdim=True)[1][:, 0]
                    output2 = model.model(X).max(2, keepdim=True)[1][:, 1]
                    correct1_train += output1.eq(
                        ts[:, 0].view_as(output1)).sum().item()
                    correct2_train += output2.eq(
                        ts[:, 1].view_as(output2)).sum().item()

                train_acc = np.stack([
                    1.0 * correct1_train / len(test_loader.dataset),
                    1.0 * correct2_train / len(test_loader.dataset)
                ])

                total_train_loss = torch.stack(total_train_loss)
                average_train_loss = torch.mean(total_train_loss, dim=0)

            # record and print
            if torch.cuda.is_available():
                task_train_losses.append(average_train_loss.data.cpu().numpy())
                train_accs.append(train_acc)
                print('{}/{}: train_loss={}, train_acc={}'.format(
                    t + 1, niter, task_train_losses[-1], train_accs[-1]))

    result = {
        "training_losses": task_train_losses,
        "training_accuracies": train_accs
    }
    return result, model
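# The GradDrop variant above calls three repo helpers that are not shown in
# this excerpt: flatten_grad(), recover_flattened(), and get_d_graddrop().
# The sketches below are assumptions reconstructed from the call sites and,
# for get_d_graddrop, from the sign-purity rule of the GradDrop paper
# (Chen et al., 2020); they are illustrative, not the repo's implementation.
def flatten_grad(params_or_grads):
    # Pack per-parameter gradients into one flat vector, remembering the
    # slice indices and shapes needed to unpack it again. Accepts either
    # parameters (after backward) or raw gradient tensors.
    flat, shapes, indices = [], [], []
    start = 0
    for p in params_or_grads:
        g = p.grad if getattr(p, 'grad', None) is not None else p
        flat.append(g.detach().clone().flatten())
        shapes.append(g.shape)
        indices.append((start, start + g.numel()))
        start += g.numel()
    return {"grad": torch.cat(flat), "shapes": shapes, "indices": indices}


def recover_flattened(flat_grad, indices, shapes):
    # Invert flatten_grad(): split a flat vector into per-parameter tensors.
    return [flat_grad[s:e].view(shape)
            for (s, e), shape in zip(indices, shapes)]


def get_d_graddrop(grads, leak=0.0):
    # grads: (n_tasks, n_params). Per coordinate, compute the sign-purity
    # score P, sample a threshold u, keep positive gradients where P > u
    # and negative ones where P < u; `leak` blends the unmasked gradients
    # back in.
    P = 0.5 * (1. + grads.sum(dim=0) / (grads.abs().sum(dim=0) + 1e-7))
    u = torch.rand_like(P)
    mask = ((grads > 0) & (P > u)) | ((grads < 0) & (P < u))
    masked = grads * mask
    return (leak * grads + (1. - leak) * masked).sum(dim=0)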
def train(dataset, base_model, niter, npref, init_weight, pref_idx):
    # generate npref preference (reference) vectors
    n_tasks = 2
    # ref_vec = torch.tensor(circle_points([1], [npref])[0]).cuda().float()
    rvecs = circle_points(npref,
                          min_angle=0.0001 * np.pi / 2,
                          max_angle=0.9999 * np.pi / 2)
    ref_vec = torch.tensor(rvecs).float()
    if torch.cuda.is_available():
        ref_vec = ref_vec.cuda()

    # load dataset

    # MultiMNIST: multi_mnist.pickle
    if dataset == 'mnist':
        with open('data/multi_mnist.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    # MultiFashionMNIST: multi_fashion.pickle
    if dataset == 'fashion':
        with open('data/multi_fashion.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    # Multi-(Fashion+MNIST): multi_fashion_and_mnist.pickle
    if dataset == 'fashion_and_mnist':
        with open('data/multi_fashion_and_mnist.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    trainX = torch.from_numpy(trainX.reshape(120000, 1, 36, 36)).float()
    trainLabel = torch.from_numpy(trainLabel).long()
    testX = torch.from_numpy(testX.reshape(20000, 1, 36, 36)).float()
    testLabel = torch.from_numpy(testLabel).long()

    train_set = torch.utils.data.TensorDataset(trainX, trainLabel)
    test_set = torch.utils.data.TensorDataset(testX, testLabel)

    batch_size = 256
    train_loader = torch.utils.data.DataLoader(dataset=train_set,
                                               batch_size=batch_size,
                                               shuffle=True)
    test_loader = torch.utils.data.DataLoader(dataset=test_set,
                                              batch_size=batch_size,
                                              shuffle=False)

    print('==>>> total training batch number: {}'.format(len(train_loader)))
    print('==>>> total testing batch number: {}'.format(len(test_loader)))

    # define the base model for ParetoMTL
    model = RegressionTrain(RegressionModel(n_tasks), init_weight)
    if torch.cuda.is_available():
        model.cuda()

    # choose a different optimizer for each base model
    if base_model == 'lenet':
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.)
        # scheduler = torch.optim.lr_scheduler.MultiStepLR(
        #     optimizer, milestones=[15, 30, 45, 60, 75, 90], gamma=0.5)
    if base_model == 'resnet18':
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                         milestones=[10, 20],
                                                         gamma=0.1)

    # store information during optimization
    weights = []
    task_train_losses = []
    train_accs = []

    # print the current preference vector
    print('Preference Vector ({}/{}):'.format(pref_idx + 1, npref))
    print(ref_vec[pref_idx].cpu().numpy())

    # run at most 2 epochs to find the initial solution;
    # stop early once a feasible solution is found
    # (usually it can be found within a few steps)
    for t in range(2):
        model.train()
        for (it, batch) in enumerate(train_loader):
            X = batch[0]
            ts = batch[1]
            if torch.cuda.is_available():
                X = X.cuda()
                ts = ts.cuda()

            grads = {}
            losses_vec = []

            # obtain and store the gradient value
            for i in range(n_tasks):
                optimizer.zero_grad()
                task_loss = model(X, ts)
                losses_vec.append(task_loss[i].data)
                task_loss[i].backward()

                grads[i] = []
                # one can use the scalable method proposed in the MOO-MTL
                # paper for large-scale problems, but we keep using the
                # gradients of all parameters in this experiment
                for param in model.parameters():
                    if param.grad is not None:
                        grads[i].append(
                            Variable(param.grad.data.clone().flatten(),
                                     requires_grad=False))

            grads_list = [torch.cat(grads[i]) for i in range(len(grads))]
            grads = torch.stack(grads_list)

            # calculate the weights
            losses_vec = torch.stack(losses_vec)
            flag, weight_vec = get_d_paretomtl_init(grads, losses_vec,
                                                    ref_vec, pref_idx)

            # early stop once a feasible solution is obtained
            if flag:
                print("feasible solution is obtained.")
                break

            # optimization step; a single forward pass suffices for the
            # weighted sum of the task losses
            optimizer.zero_grad()
            task_loss = model(X, ts)
            loss_total = torch.sum(weight_vec * task_loss)
            loss_total.backward()
            optimizer.step()

        else:
            # continue if no feasible solution is found
            continue
        # break the loop once a feasible solution is found
        break

    # run niter epochs of ParetoMTL
    for t in range(niter):
        # scheduler.step()
        model.train()
        for (it, batch) in enumerate(train_loader):
            X = batch[0]
            ts = batch[1]
            if torch.cuda.is_available():
                X = X.cuda()
                ts = ts.cuda()

            # obtain and store the gradient
            grads = {}
            losses_vec = []
            for i in range(n_tasks):
                optimizer.zero_grad()
                task_loss = model(X, ts)
                losses_vec.append(task_loss[i].data)
                task_loss[i].backward()

                # one can use the scalable method proposed in the MOO-MTL
                # paper for large-scale problems, but we keep using the
                # gradients of all parameters in this experiment
                grads[i] = []
                for param in model.parameters():
                    if param.grad is not None:
                        grads[i].append(
                            Variable(param.grad.data.clone().flatten(),
                                     requires_grad=False))

            grads_list = [torch.cat(grads[i]) for i in range(len(grads))]
            grads = torch.stack(grads_list)

            # calculate the weights
            losses_vec = torch.stack(losses_vec)
            weight_vec = get_d_paretomtl(grads, losses_vec, ref_vec, pref_idx)

            # normalize_coeff = n_tasks / torch.sum(torch.abs(weight_vec))
            normalize_coeff = 1. / torch.sum(torch.abs(weight_vec))
            weight_vec = weight_vec * normalize_coeff

            # optimization step
            optimizer.zero_grad()
            task_loss = model(X, ts)
            loss_total = torch.sum(weight_vec * task_loss)
            loss_total.backward()
            optimizer.step()

        # calculate and record performance
        if t == 0 or (t + 1) % 2 == 0:
            model.eval()
            with torch.no_grad():
                total_train_loss = []
                train_acc = []

                correct1_train = 0
                correct2_train = 0

                for (it, batch) in enumerate(test_loader):
                    X = batch[0]
                    ts = batch[1]
                    if torch.cuda.is_available():
                        X = X.cuda()
                        ts = ts.cuda()

                    valid_train_loss = model(X, ts)
                    total_train_loss.append(valid_train_loss)
                    output1 = model.model(X).max(2, keepdim=True)[1][:, 0]
                    output2 = model.model(X).max(2, keepdim=True)[1][:, 1]
                    correct1_train += output1.eq(
                        ts[:, 0].view_as(output1)).sum().item()
                    correct2_train += output2.eq(
                        ts[:, 1].view_as(output2)).sum().item()

                train_acc = np.stack([
                    1.0 * correct1_train / len(test_loader.dataset),
                    1.0 * correct2_train / len(test_loader.dataset)
                ])

                total_train_loss = torch.stack(total_train_loss)
                average_train_loss = torch.mean(total_train_loss, dim=0)

            # record and print
            if torch.cuda.is_available():
                task_train_losses.append(average_train_loss.data.cpu().numpy())
                train_accs.append(train_acc)
                weights.append(weight_vec.cpu().numpy())
                print('{}/{}: weights={}, train_loss={}, train_acc={}'.format(
                    t + 1, niter, weights[-1], task_train_losses[-1],
                    train_accs[-1]))

    result = {
        "training_losses": task_train_losses,
        "training_accuracies": train_accs
    }
    return result, model
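# circle_points() is imported from the repo's utilities. A minimal sketch of
# the behavior its call site assumes (an assumption, not the repo's code):
# spread n preference vectors evenly along the positive quadrant of the unit
# circle between min_angle and max_angle.
def circle_points(n, min_angle=0.1, max_angle=np.pi / 2 - 0.1):
    angles = np.linspace(min_angle, max_angle, n)
    return np.stack([np.cos(angles), np.sin(angles)], axis=1)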
def train(dataset, base_model, niter, preference):
    n_tasks = 2
    print("Preference Vector = {}".format(preference))

    # LOAD DATASET
    # ------------
    # MultiMNIST: multi_mnist.pickle
    if dataset == 'mnist':
        with open('data/multi_mnist.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    # MultiFashionMNIST: multi_fashion.pickle
    if dataset == 'fashion':
        with open('data/multi_fashion.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    # Multi-(Fashion+MNIST): multi_fashion_and_mnist.pickle
    if dataset == 'fashion_and_mnist':
        with open('data/multi_fashion_and_mnist.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    trainX = torch.from_numpy(trainX.reshape(120000, 1, 36, 36)).float()
    trainLabel = torch.from_numpy(trainLabel).long()
    testX = torch.from_numpy(testX.reshape(20000, 1, 36, 36)).float()
    testLabel = torch.from_numpy(testLabel).long()

    train_set = torch.utils.data.TensorDataset(trainX, trainLabel)
    test_set = torch.utils.data.TensorDataset(testX, testLabel)

    batch_size = 256
    train_loader = torch.utils.data.DataLoader(dataset=train_set,
                                               batch_size=batch_size,
                                               shuffle=True)
    test_loader = torch.utils.data.DataLoader(dataset=test_set,
                                              batch_size=batch_size,
                                              shuffle=False)

    print('==>>> total training batch number: {}'.format(len(train_loader)))
    print('==>>> total testing batch number: {}'.format(len(test_loader)))

    # ---------***---------
    # DEFINE MODEL
    # ---------------------
    if base_model == 'lenet':
        model = RegressionTrain(RegressionModel(n_tasks), preference)
    if base_model == 'resnet18':
        model = RegressionTrainResNet(MnistResNet(n_tasks), preference)
    # model.randomize()
    if torch.cuda.is_available():
        model.cuda()

    # ---------***---------
    # DEFINE OPTIMIZERS
    # -----------------
    # Choose a different optimizer for each base model
    if base_model == 'lenet':
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.)
        # scheduler = torch.optim.lr_scheduler.MultiStepLR(
        #     optimizer, milestones=[15, 30, 45, 60, 75, 90], gamma=0.8)
    if base_model == 'resnet18':
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                         milestones=[10, 20],
                                                         gamma=0.1)

    # Instantiate the EPO linear program solver
    _, n_params = getNumParams(model.parameters())
    print(f"# params={n_params}")
    epo_lp = EPO_LP(m=n_tasks, n=n_params, r=preference)

    # ---------***---------
    # CONTAINERS FOR KEEPING TRACK OF PROGRESS
    # ----------------------------------------
    task_train_losses = []
    train_accs = []

    # ---------***---------
    # TRAIN
    # -----
    for t in range(niter):
        # scheduler.step()
        n_manual_adjusts = 0
        descent = 0.
        model.train()
        for (it, batch) in enumerate(train_loader):
            X = batch[0]
            ts = batch[1]
            if torch.cuda.is_available():
                X = X.cuda()
                ts = ts.cuda()

            # Obtain losses and gradients
            grads = {}
            losses = []
            for i in range(n_tasks):
                optimizer.zero_grad()
                task_loss = model(X, ts)
                losses.append(task_loss[i].data.cpu().numpy())
                task_loss[i].backward()

                # One can use the scalable method proposed in the MOO-MTL
                # paper for large-scale problems, but we use the gradients
                # of all parameters in this experiment.
                grads[i] = []
                for param in model.parameters():
                    if param.grad is not None:
                        grads[i].append(
                            Variable(param.grad.data.clone().flatten(),
                                     requires_grad=False))

            grads_list = [torch.cat(grads[i]) for i in range(len(grads))]
            G = torch.stack(grads_list)
            GG = G @ G.T
            losses = np.stack(losses)

            try:
                # Calculate the alphas from the LP solver
                alpha = epo_lp.get_alpha(losses, G=GG.cpu().numpy(), C=True)
                if epo_lp.last_move == "dom":
                    descent += 1
            except Exception as e:
                print(e)
                alpha = None
            if alpha is None:  # A patch for the issue in cvxpy
                alpha = preference / preference.sum()
                n_manual_adjusts += 1

            alpha = n_tasks * torch.from_numpy(alpha)
            if torch.cuda.is_available():
                alpha = alpha.cuda()

            # Optimization step
            optimizer.zero_grad()
            task_losses = model(X, ts)
            weighted_loss = torch.sum(task_losses * alpha)  # * 5. * max(epo_lp.mu_rl, 0.2)
            weighted_loss.backward()
            optimizer.step()

        print(f"\tdescent={descent / len(train_loader)}")
        if n_manual_adjusts > 0:
            print(f"\t# manual tweaks={n_manual_adjusts}")

        # Calculate and record performance
        if t == 0 or (t + 1) % 2 == 0:
            model.eval()
            with torch.no_grad():
                total_train_loss = []
                train_acc = []

                correct1_train = 0
                correct2_train = 0

                for (it, batch) in enumerate(test_loader):
                    X = batch[0]
                    ts = batch[1]
                    if torch.cuda.is_available():
                        X = X.cuda()
                        ts = ts.cuda()

                    valid_train_loss = model(X, ts)
                    total_train_loss.append(valid_train_loss)
                    output1 = model.model(X).max(2, keepdim=True)[1][:, 0]
                    output2 = model.model(X).max(2, keepdim=True)[1][:, 1]
                    correct1_train += output1.eq(
                        ts[:, 0].view_as(output1)).sum().item()
                    correct2_train += output2.eq(
                        ts[:, 1].view_as(output2)).sum().item()

                # normalize by the size of the loader actually iterated
                train_acc = np.stack([
                    1.0 * correct1_train / len(test_loader.dataset),
                    1.0 * correct2_train / len(test_loader.dataset)
                ])

                total_train_loss = torch.stack(total_train_loss)
                average_train_loss = torch.mean(total_train_loss, dim=0)

            # record and print
            if torch.cuda.is_available():
                task_train_losses.append(average_train_loss.data.cpu().numpy())
                train_accs.append(train_acc)
                print('{}/{}: train_loss={}, train_acc={}'.format(
                    t + 1, niter, task_train_losses[-1], train_accs[-1]))

    torch.save(model.model.state_dict(),
               f'./saved_model/{dataset}_{base_model}_niter_{niter}.pickle')

    result = {
        "training_losses": task_train_losses,
        "training_accuracies": train_accs
    }
    return result
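# Usage sketch (an assumption): EPO trains one network per preference ray,
# so an approximation of the Pareto front is obtained by sweeping several
# preference vectors and collecting the per-run results.
def run_epo_front(dataset='mnist', base_model='lenet', niter=100, npref=5):
    results = []
    for a in np.linspace(0.1, np.pi / 2 - 0.1, npref):
        preference = np.array([np.cos(a), np.sin(a)])
        results.append(train(dataset, base_model, niter, preference))
    return results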
def train(dataset, base_model, niter, npref, init_weight, pref_idx):
    n_tasks = 2

    # load dataset
    print(f"loading dataset {dataset}")

    # MultiMNIST: multi_mnist.pickle
    if dataset == 'mnist':
        with open('data/multi_mnist.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    # MultiFashionMNIST: multi_fashion.pickle
    if dataset == 'fashion':
        with open('data/multi_fashion.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    # Multi-(Fashion+MNIST): multi_fashion_and_mnist.pickle
    if dataset == 'fashion_and_mnist':
        with open('data/multi_fashion_and_mnist.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    trainX = torch.from_numpy(trainX.reshape(120000, 1, 36, 36)).float()
    trainLabel = torch.from_numpy(trainLabel).long()
    testX = torch.from_numpy(testX.reshape(20000, 1, 36, 36)).float()
    testLabel = torch.from_numpy(testLabel).long()

    train_set = torch.utils.data.TensorDataset(trainX, trainLabel)
    test_set = torch.utils.data.TensorDataset(testX, testLabel)

    batch_size = 256
    train_loader = torch.utils.data.DataLoader(dataset=train_set,
                                               batch_size=batch_size,
                                               shuffle=True)
    test_loader = torch.utils.data.DataLoader(dataset=test_set,
                                              batch_size=batch_size,
                                              shuffle=False)

    print('==>>> total training batch number: {}'.format(len(train_loader)))
    print('==>>> total testing batch number: {}'.format(len(test_loader)))

    # define the base model
    if base_model == "lenet":
        model = RegressionTrain(RegressionModel(n_tasks), init_weight)
    if base_model == "resnet18":
        model = RegressionTrainResNet(MnistResNet(n_tasks), init_weight)
    if torch.cuda.is_available():
        model.cuda()

    # the manual updates below use a fixed learning rate instead of an
    # optimizer object
    lr = 1e-3

    # store information during optimization
    task_train_losses = []
    train_accs = []

    # run niter epochs, choosing between PCGrad and the averaged gradient
    # by their transference at every step
    for t in range(niter):
        model.train()
        for (it, batch) in enumerate(train_loader):
            X = batch[0]
            ts = batch[1]
            if torch.cuda.is_available():
                X = X.cuda()
                ts = ts.cuda()

            # compute gradients of the shared parameters
            flat_grads = {}
            shared_params = [
                v for k, v in model.model.get_shared_parameters().items()
            ]
            task_loss = model(X, ts)
            for i in range(n_tasks):
                shared_grads = torch.autograd.grad(task_loss[i],
                                                   shared_params,
                                                   retain_graph=True)
                flat_grads[i] = flatten_grad(shared_grads)

            # update the task-specific parameters
            for i in range(n_tasks):
                task_params = [
                    v for k, v in model.model.get_task_parameters(i).items()
                ]
                task_grads = torch.autograd.grad(task_loss[i],
                                                 task_params,
                                                 retain_graph=True)
                for index, params in enumerate(task_params):
                    params.data = params.data - lr * task_grads[index]

            # compute the PCGrad direction
            pcgrads = [flat_grads[i]["grad"] for i in range(len(flat_grads))]
            pcgrads = get_d_pcgrad(torch.stack(pcgrads))
            pcgrads = recover_flattened(pcgrads, flat_grads[0]["indices"],
                                        flat_grads[0]["shapes"])

            # compute the plain averaged gradient
            oggrads = [flat_grads[i]["grad"] for i in range(len(flat_grads))]
            oggrads = torch.mean(torch.stack(oggrads), dim=0)
            oggrads = recover_flattened(oggrads, flat_grads[0]["indices"],
                                        flat_grads[0]["shapes"])

            # pick the candidate with the higher transference
            gradient_candidates = [pcgrads, oggrads]
            transferences = get_transferences(model, (X, ts),
                                              gradient_candidates,
                                              task_loss, lr)
            gradients = gradient_candidates[torch.argmax(transferences).item()]

            # update the shared parameters
            shared_params = [
                v for k, v in model.model.get_shared_parameters().items()
            ]
            for index, params in enumerate(shared_params):
                params.data = params.data - lr * gradients[index]

            # clear the graph
            model.zero_grad()

        # calculate and record performance
        if t == 0 or (t + 1) % 2 == 0:
            model.eval()
            with torch.no_grad():
                total_train_loss = []
                train_acc = []

                correct1_train = 0
                correct2_train = 0

                for (it, batch) in enumerate(test_loader):
                    X = batch[0]
                    ts = batch[1]
                    if torch.cuda.is_available():
                        X = X.cuda()
                        ts = ts.cuda()

                    valid_train_loss = model(X, ts)
                    total_train_loss.append(valid_train_loss)
                    output1 = model.model(X).max(2, keepdim=True)[1][:, 0]
                    output2 = model.model(X).max(2, keepdim=True)[1][:, 1]
                    correct1_train += output1.eq(
                        ts[:, 0].view_as(output1)).sum().item()
                    correct2_train += output2.eq(
                        ts[:, 1].view_as(output2)).sum().item()

                train_acc = np.stack([
                    1.0 * correct1_train / len(test_loader.dataset),
                    1.0 * correct2_train / len(test_loader.dataset)
                ])

                total_train_loss = torch.stack(total_train_loss)
                average_train_loss = torch.mean(total_train_loss, dim=0)

            # record and print
            if torch.cuda.is_available():
                task_train_losses.append(average_train_loss.data.cpu().numpy())
                train_accs.append(train_acc)
                print('{}/{}: train_loss={}, train_acc={}'.format(
                    t + 1, niter, task_train_losses[-1], train_accs[-1]))

    result = {
        "training_losses": task_train_losses,
        "training_accuracies": train_accs
    }
    return result, model
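# get_d_pcgrad() comes from the repo's solvers. A minimal sketch of the
# PCGrad rule it is assumed to implement (Yu et al., 2020): whenever two
# task gradients conflict (negative inner product), project one onto the
# normal plane of the other, then average the surgered gradients into a
# single flat update direction, matching the call site above.
def get_d_pcgrad(grads):
    # grads: (n_tasks, n_params)
    n = grads.shape[0]
    pc = grads.clone()
    for i in range(n):
        for j in np.random.permutation(n):
            if i == j:
                continue
            dot = torch.dot(pc[i], grads[j])
            if dot < 0:
                # remove the conflicting component of task i's gradient
                pc[i] = pc[i] - dot / (grads[j].norm() ** 2) * grads[j]
    return pc.mean(dim=0)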
def train(dataset, base_model, niter, npref, init_weight, pref_idx, alpha=0.0):
    n_tasks = 2

    # load dataset
    print(f"loading dataset {dataset}")

    # MultiMNIST: multi_mnist.pickle
    if dataset == 'mnist':
        with open('data/multi_mnist.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    # MultiFashionMNIST: multi_fashion.pickle
    if dataset == 'fashion':
        with open('data/multi_fashion.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    # Multi-(Fashion+MNIST): multi_fashion_and_mnist.pickle
    if dataset == 'fashion_and_mnist':
        with open('data/multi_fashion_and_mnist.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    trainX = torch.from_numpy(trainX.reshape(120000, 1, 36, 36)).float()
    trainLabel = torch.from_numpy(trainLabel).long()
    testX = torch.from_numpy(testX.reshape(20000, 1, 36, 36)).float()
    testLabel = torch.from_numpy(testLabel).long()

    train_set = torch.utils.data.TensorDataset(trainX, trainLabel)
    test_set = torch.utils.data.TensorDataset(testX, testLabel)

    batch_size = 256
    train_loader = torch.utils.data.DataLoader(dataset=train_set,
                                               batch_size=batch_size,
                                               shuffle=True)
    test_loader = torch.utils.data.DataLoader(dataset=test_set,
                                              batch_size=batch_size,
                                              shuffle=False)

    print('==>>> total training batch number: {}'.format(len(train_loader)))
    print('==>>> total testing batch number: {}'.format(len(test_loader)))

    # define the base model
    if base_model == "lenet":
        model = RegressionTrain(RegressionModel(n_tasks), init_weight)
    if base_model == "resnet18":
        model = RegressionTrainResNet(MnistResNet(n_tasks), init_weight)
    if torch.cuda.is_available():
        model.cuda()

    # initialize the learnable task weights
    weights = torch.ones(n_tasks)
    if torch.cuda.is_available():
        weights = weights.cuda()
    weights.requires_grad_()

    # choose a different optimizer for each base model
    if base_model == 'lenet':
        optimizer = torch.optim.SGD(list(model.parameters()) + [weights],
                                    lr=1e-3, momentum=0.)
        # scheduler = torch.optim.lr_scheduler.MultiStepLR(
        #     optimizer, milestones=[15, 30, 45, 60, 75, 90], gamma=0.5)
    if base_model == 'resnet18':
        optimizer = torch.optim.Adam(list(model.parameters()) + [weights],
                                     lr=1e-3)
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                         milestones=[10, 20],
                                                         gamma=0.1)

    # store information during optimization
    task_train_losses = []
    train_accs = []

    # loss values at the first step, used for the GradNorm loss ratios
    initial_task_loss = None

    # run niter epochs of GradNorm
    for t in range(niter):
        # scheduler.step()
        model.train()
        for (it, batch) in enumerate(train_loader):
            X = batch[0]
            ts = batch[1]
            if torch.cuda.is_available():
                X = X.cuda()
                ts = ts.cuda()

            # compute loss
            optimizer.zero_grad()
            task_loss = model(X, ts)
            if initial_task_loss is None:
                initial_task_loss = task_loss.detach()

            # compute parameter gradients
            weighted_loss = torch.sum(task_loss * weights)
            weighted_loss.backward(retain_graph=True)
            weights.grad.data = weights.grad.data * 0.

            # compute the per-task gradient norms
            grad_norms = []
            for i in range(len(task_loss)):
                grad = torch.autograd.grad(task_loss[i],
                                           model.model.parameters(),
                                           retain_graph=True)
                grad = torch.cat([torch.flatten(x) for x in grad])
                grad_norms.append(torch.linalg.norm(weights[i] * grad))
            grad_norms = torch.stack(grad_norms)

            # gradient of the GradNorm balancing loss w.r.t. the weights
            mean_grad_norm = torch.mean(grad_norms.detach())
            loss_ratio = task_loss.detach() / initial_task_loss
            inverse_loss_ratio = loss_ratio / torch.mean(loss_ratio)
            weight_loss = torch.sum(
                torch.abs(grad_norms -
                          mean_grad_norm * (inverse_loss_ratio ** alpha)))
            weights.grad.data = torch.autograd.grad(weight_loss, weights)[0]

            # SGD step
            optimizer.step()

            # renormalize the weights
            weights.data = weights.data / torch.norm(weights.data)

        # calculate and record performance
        if t == 0 or (t + 1) % 2 == 0:
            model.eval()
            with torch.no_grad():
                total_train_loss = []
                train_acc = []

                correct1_train = 0
                correct2_train = 0

                for (it, batch) in enumerate(test_loader):
                    X = batch[0]
                    ts = batch[1]
                    if torch.cuda.is_available():
                        X = X.cuda()
                        ts = ts.cuda()

                    valid_train_loss = model(X, ts)
                    total_train_loss.append(valid_train_loss)
                    output1 = model.model(X).max(2, keepdim=True)[1][:, 0]
                    output2 = model.model(X).max(2, keepdim=True)[1][:, 1]
                    correct1_train += output1.eq(
                        ts[:, 0].view_as(output1)).sum().item()
                    correct2_train += output2.eq(
                        ts[:, 1].view_as(output2)).sum().item()

                train_acc = np.stack([
                    1.0 * correct1_train / len(test_loader.dataset),
                    1.0 * correct2_train / len(test_loader.dataset)
                ])

                total_train_loss = torch.stack(total_train_loss)
                average_train_loss = torch.mean(total_train_loss, dim=0)

            # record and print
            if torch.cuda.is_available():
                task_train_losses.append(average_train_loss.data.cpu().numpy())
                train_accs.append(train_acc)
                print('{}/{}: weights={}, train_loss={}, train_acc={}'.format(
                    t + 1, niter, weights, task_train_losses[-1],
                    train_accs[-1]))

    result = {
        "training_losses": task_train_losses,
        "training_accuracies": train_accs
    }
    return result, model
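# A worked sketch of the GradNorm target used in weight_loss above: tasks
# whose loss has decayed less (a larger loss ratio) receive a larger target
# gradient norm, which pulls their weight up on the next step. The numbers
# below are illustrative assumptions.
def gradnorm_targets(grad_norms, task_loss, initial_task_loss, alpha):
    loss_ratio = task_loss / initial_task_loss
    relative_rate = loss_ratio / torch.mean(loss_ratio)
    return torch.mean(grad_norms) * relative_rate ** alpha

# e.g. with equal gradient norms but unequal progress:
# gradnorm_targets(torch.tensor([1., 1.]), torch.tensor([0.9, 0.5]),
#                  torch.tensor([1., 1.]), alpha=1.0)
# -> tensor([1.2857, 0.7143]); the slower task gets the larger target norm.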