import pickle

import numpy as np
import torch
from torch.autograd import Variable

# NOTE: the project-specific helpers used below (RegressionModel, RegressionTrain,
# getNumParams, EPO_LP, get_d_paretomtl_init, get_d_paretomtl) are assumed to be
# importable from this repository's own modules.


def train(dataset, base_model, niter, preference):
    # Linear-scalarization baseline: train with a fixed weighted sum of the task losses.
    print("Preference Vector = {}".format(preference))

    # LOAD DATASET
    # ------------
    if dataset == 'emotion':
        with open('data/emotion.pkl', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)
        trainX = torch.from_numpy(trainX).float()
        trainLabel = torch.from_numpy(trainLabel).float()
        testX = torch.from_numpy(testX).float()
        testLabel = torch.from_numpy(testLabel).float()
        n_tasks = testLabel.shape[1]
        n_feats = testX.shape[1]

    train_set = torch.utils.data.TensorDataset(trainX, trainLabel)
    test_set = torch.utils.data.TensorDataset(testX, testLabel)

    batch_size = 256
    train_loader = torch.utils.data.DataLoader(
        dataset=train_set, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        dataset=test_set, batch_size=batch_size, shuffle=False)

    print('==>>> total training batch number: {}'.format(len(train_loader)))
    print('==>>> total testing batch number: {}'.format(len(test_loader)))
    # ---------***---------

    # DEFINE MODEL
    # ---------------------
    model = RegressionTrain(RegressionModel(n_feats, n_tasks))
    # model.randomize()
    if torch.cuda.is_available():
        model.cuda()
    # ---------***---------

    # DEFINE OPTIMIZERS
    # -----------------
    # Choose different optimizers for different base models
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.)
    # scheduler = torch.optim.lr_scheduler.MultiStepLR(
    #     optimizer, milestones=[15, 30, 45, 60, 75, 90], gamma=0.8)
    # ---------***---------

    # CONTAINERS FOR KEEPING TRACK OF PROGRESS
    # ----------------------------------------
    task_train_losses = []
    train_accs = []
    # ---------***---------

    # TRAIN
    # -----
    for t in range(niter + 1):
        # scheduler.step()
        model.train()
        for (it, batch) in enumerate(train_loader):
            X = batch[0]
            ts = batch[1]
            if torch.cuda.is_available():
                X = X.cuda()
                ts = ts.cuda()

            if torch.cuda.is_available():
                alpha = torch.from_numpy(preference).cuda()
            else:
                alpha = torch.from_numpy(preference)

            # Optimization step: weighted sum of the per-task losses
            optimizer.zero_grad()
            task_losses = model(X, ts)
            weighted_loss = torch.sum(task_losses * alpha)  # * 5. * max(epo_lp.mu_rl, 0.2)
            weighted_loss.backward()
            optimizer.step()

        # Calculate and record performance
        if t == 0 or (t + 1) % 2 == 0:
            model.eval()
            with torch.no_grad():
                total_train_loss = []
                for (it, batch) in enumerate(test_loader):
                    X = batch[0]
                    ts = batch[1]
                    if torch.cuda.is_available():
                        X = X.cuda()
                        ts = ts.cuda()
                    valid_train_loss = model(X, ts)
                    total_train_loss.append(valid_train_loss)
                total_train_loss = torch.stack(total_train_loss)
                average_train_loss = torch.mean(total_train_loss, dim=0)

            # record and print
            if torch.cuda.is_available():
                task_train_losses.append(average_train_loss.data.cpu().numpy())
                print('{}/{}: train_loss={}'.format(
                    t + 1, niter, task_train_losses[-1]))

    # torch.save(model.model.state_dict(),
    #            f'./saved_model/{dataset}_{base_model}_niter_{niter}.pickle')

    result = {"training_losses": task_train_losses}
    return result
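
# ---------***---------
# Usage sketch for the linear-scalarization run (illustrative, not part of the
# original script): the preference values, base_model tag, and epoch count below
# are assumptions. `preference` must be a float32 numpy array with one entry per
# task, i.e. its length must match trainLabel.shape[1] of data/emotion.pkl.
if __name__ == '__main__':
    preference = np.array([0.2, 0.2, 0.15, 0.15, 0.15, 0.15], dtype=np.float32)
    result = train(dataset='emotion', base_model='MLP', niter=100,
                   preference=preference)
    print('recorded training losses:', result["training_losses"])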

def train(dataset, base_model, niter, preference):
    # Exact Pareto Optimal (EPO) search: at every step a linear program (EPO_LP)
    # picks the per-task loss weights alpha from the current losses and the
    # gradient Gram matrix, given the preference ray r.
    print("Preference Vector = {}".format(preference))

    # LOAD DATASET
    # ------------
    if dataset == 'rf1':
        with open('data/rf1.pkl', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)
        trainX = torch.from_numpy(trainX).float()
        trainLabel = torch.from_numpy(trainLabel).float()
        testX = torch.from_numpy(testX).float()
        testLabel = torch.from_numpy(testLabel).float()
        n_tasks = testLabel.shape[1]
        n_feats = testX.shape[1]

    train_set = torch.utils.data.TensorDataset(trainX, trainLabel)
    test_set = torch.utils.data.TensorDataset(testX, testLabel)

    batch_size = 256
    train_loader = torch.utils.data.DataLoader(
        dataset=train_set, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        dataset=test_set, batch_size=batch_size, shuffle=False)

    print('==>>> total training batch number: {}'.format(len(train_loader)))
    print('==>>> total testing batch number: {}'.format(len(test_loader)))
    # ---------***---------

    # DEFINE MODEL
    # ---------------------
    model = RegressionTrain(RegressionModel(n_feats, n_tasks))
    _, n_params = getNumParams(model.parameters())
    print(f"# params={n_params}; # layers={len(model.model.layers)}")
    # model.randomize()
    if torch.cuda.is_available():
        model.cuda()
    # ---------***---------

    # DEFINE OPTIMIZERS
    # -----------------
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.)
    # scheduler = torch.optim.lr_scheduler.MultiStepLR(
    #     optimizer, milestones=[15, 30, 45, 60, 75, 90], gamma=0.8)

    # Instantiate the EPO linear-program solver
    epo_lp = EPO_LP(m=n_tasks, n=n_params, r=preference)
    # ---------***---------

    # CONTAINERS FOR KEEPING TRACK OF PROGRESS
    # ----------------------------------------
    task_train_losses = []
    train_accs = []
    # ---------***---------

    # TRAIN
    # -----
    for t in range(niter):
        # scheduler.step()
        n_linscalar_adjusts = 0
        descent = 0.
        model.train()
        for (it, batch) in enumerate(train_loader):
            X = batch[0]
            ts = batch[1]
            if torch.cuda.is_available():
                X = X.cuda()
                ts = ts.cuda()

            # Obtain losses and gradients
            grads = {}
            losses = []
            for i in range(n_tasks):
                optimizer.zero_grad()
                task_loss = model(X, ts)
                losses.append(task_loss[i].data.cpu().numpy())
                task_loss[i].backward()

                # One can use the scalable method proposed in the MOO-MTL paper
                # for large-scale problems, but we use the gradient of all
                # parameters in this experiment.
                grads[i] = []
                for param in model.parameters():
                    if param.grad is not None:
                        grads[i].append(
                            Variable(param.grad.data.clone().flatten(),
                                     requires_grad=False))

            grads_list = [torch.cat(grads[i]) for i in range(len(grads))]
            G = torch.stack(grads_list)
            GG = G @ G.T
            losses = np.stack(losses)

            try:
                # Calculate the alphas from the LP solver
                alpha = epo_lp.get_alpha(losses, G=GG.cpu().numpy(), C=True)
                if epo_lp.last_move == "dom":
                    descent += 1
            except Exception as e:
                # print(e)
                # print(f'losses:{losses}')
                # print(f'C:\n{GG.cpu().numpy()}')
                # raise RuntimeError('manual tweak')
                alpha = None
            if alpha is None:
                # A patch for the issue in cvxpy: fall back to linear scalarization
                alpha = preference / preference.sum()
                n_linscalar_adjusts += 1

            if torch.cuda.is_available():
                alpha = n_tasks * torch.from_numpy(alpha).cuda()
            else:
                alpha = n_tasks * torch.from_numpy(alpha)

            # Optimization step
            optimizer.zero_grad()
            task_losses = model(X, ts)
            weighted_loss = torch.sum(task_losses * alpha)  # * 5. * max(epo_lp.mu_rl, 0.2)
            weighted_loss.backward()
            optimizer.step()

        print(f"\tdescent={descent/len(train_loader)}")
        if n_linscalar_adjusts > 0:
            print(f"\t # linscalar steps={n_linscalar_adjusts}")

        # Calculate and record performance
        if t == 0 or (t + 1) % 2 == 0:
            model.eval()
            with torch.no_grad():
                total_train_loss = []
                for (it, batch) in enumerate(test_loader):
                    X = batch[0]
                    ts = batch[1]
                    if torch.cuda.is_available():
                        X = X.cuda()
                        ts = ts.cuda()
                    valid_train_loss = model(X, ts)
                    total_train_loss.append(valid_train_loss)
                total_train_loss = torch.stack(total_train_loss)
                average_train_loss = torch.mean(total_train_loss, dim=0)

            # record and print
            if torch.cuda.is_available():
                task_train_losses.append(average_train_loss.data.cpu().numpy())
                print('{}/{}: train_loss={}'.format(
                    t + 1, niter, task_train_losses[-1]))

    # torch.save(model.model.state_dict(),
    #            f'./saved_model/{dataset}_{base_model}_niter_{niter}.pickle')

    result = {"training_losses": task_train_losses}
    return result
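
# ---------***---------
# Usage sketch for the EPO run (illustrative, not part of the original script):
# the task count, base_model tag, and epoch count are assumptions. `preference`
# must be a positive float32 numpy array with one entry per task of data/rf1.pkl,
# since it is passed to EPO_LP as the preference ray r and also used as the
# fallback linear-scalarization weights.
if __name__ == '__main__':
    n_tasks_rf1 = 8  # assumption; the function reads the true count from the data
    preference = np.ones(n_tasks_rf1, dtype=np.float32) / n_tasks_rf1
    result = train(dataset='rf1', base_model='MLP', niter=100,
                   preference=preference)
    print('recorded training losses:', result["training_losses"])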

def train(dataset, base_model, niter, npref, rvecs, pref_idx):
    # ParetoMTL: first find an initial solution in the sub-region of the chosen
    # preference ray, then run niter epochs of constrained descent with weights
    # from get_d_paretomtl.

    # the #npref preference (reference) vectors, given as rvecs
    ref_vec = torch.tensor(rvecs).cuda().float()

    # load dataset
    if dataset == 'emotion':
        with open('data/emotion.pkl', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)
        trainX = torch.from_numpy(trainX).float()
        trainLabel = torch.from_numpy(trainLabel).float()
        testX = torch.from_numpy(testX).float()
        testLabel = torch.from_numpy(testLabel).float()
        n_tasks = testLabel.shape[1]
        n_feats = testX.shape[1]

    train_set = torch.utils.data.TensorDataset(trainX, trainLabel)
    test_set = torch.utils.data.TensorDataset(testX, testLabel)

    batch_size = 256
    train_loader = torch.utils.data.DataLoader(
        dataset=train_set, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        dataset=test_set, batch_size=batch_size, shuffle=False)

    print('==>>> total training batch number: {}'.format(len(train_loader)))
    print('==>>> total testing batch number: {}'.format(len(test_loader)))
    # ---------***---------

    # DEFINE MODEL
    # ---------------------
    model = RegressionTrain(RegressionModel(n_feats, n_tasks))
    # model.randomize()
    if torch.cuda.is_available():
        model.cuda()
    # ---------***---------

    # DEFINE OPTIMIZERS
    # -----------------
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.)
    # scheduler = torch.optim.lr_scheduler.MultiStepLR(
    #     optimizer, milestones=[15, 30, 45, 60, 75, 90], gamma=0.8)

    # store information during optimization
    weights = []
    task_train_losses = []
    train_accs = []

    # print the current preference vector
    print('Preference Vector ({}/{}):'.format(pref_idx + 1, npref))
    print(ref_vec[pref_idx].cpu().numpy())

    # run at most 2 epochs to find the initial solution;
    # stop early once a feasible solution is found
    # (usually it can be found within a few steps)
    for t in range(2):
        model.train()
        for (it, batch) in enumerate(train_loader):
            X = batch[0]
            ts = batch[1]
            if torch.cuda.is_available():
                X = X.cuda()
                ts = ts.cuda()

            grads = {}
            losses_vec = []

            # obtain and store the gradient value
            for i in range(n_tasks):
                optimizer.zero_grad()
                task_loss = model(X, ts)
                losses_vec.append(task_loss[i].data)
                task_loss[i].backward()

                grads[i] = []
                # one can use the scalable method proposed in the MOO-MTL paper
                # for large-scale problems, but we keep using the gradient of
                # all parameters in this experiment
                for param in model.parameters():
                    if param.grad is not None:
                        grads[i].append(
                            Variable(param.grad.data.clone().flatten(),
                                     requires_grad=False))

            grads_list = [torch.cat(grads[i]) for i in range(len(grads))]
            grads = torch.stack(grads_list)

            # calculate the weights
            losses_vec = torch.stack(losses_vec)
            flag, weight_vec = get_d_paretomtl_init(grads, losses_vec, ref_vec, pref_idx)

            # early stop once a feasible solution is obtained
            if flag == True:
                print("feasible solution is obtained.")
                break

            # print(f'len(weight_vec)={len(weight_vec)}')
            # optimization step
            optimizer.zero_grad()
            for i in range(len(task_loss)):
                task_loss = model(X, ts)
                if i == 0:
                    loss_total = weight_vec[i] * task_loss[i]
                else:
                    loss_total = loss_total + weight_vec[i] * task_loss[i]

            loss_total.backward()
            optimizer.step()
        else:
            # continue if no feasible solution is found
            continue
        # break the outer loop once a feasible solution is found
        break

    # run niter epochs of ParetoMTL
    for t in range(niter):
        # scheduler.step()
        model.train()
        for (it, batch) in enumerate(train_loader):
            X = batch[0]
            ts = batch[1]
            if torch.cuda.is_available():
                X = X.cuda()
                ts = ts.cuda()

            # obtain and store the gradient
            grads = {}
            losses_vec = []
            for i in range(n_tasks):
                optimizer.zero_grad()
                task_loss = model(X, ts)
                losses_vec.append(task_loss[i].data)
                task_loss[i].backward()

                # one can use the scalable method proposed in the MOO-MTL paper
                # for large-scale problems, but we keep using the gradient of
                # all parameters in this experiment
                grads[i] = []
                for param in model.parameters():
                    if param.grad is not None:
                        grads[i].append(
                            Variable(param.grad.data.clone().flatten(),
                                     requires_grad=False))

            grads_list = [torch.cat(grads[i]) for i in range(len(grads))]
            grads = torch.stack(grads_list)

            # calculate the weights
            losses_vec = torch.stack(losses_vec)
            weight_vec = get_d_paretomtl(grads, losses_vec, ref_vec, pref_idx)

            # normalize_coeff = n_tasks / torch.sum(torch.abs(weight_vec))
            normalize_coeff = 1. / torch.sum(torch.abs(weight_vec))
            weight_vec = weight_vec * normalize_coeff

            # optimization step
            optimizer.zero_grad()
            for i in range(len(task_loss)):
                task_loss = model(X, ts)
                if i == 0:
                    loss_total = weight_vec[i] * task_loss[i]
                else:
                    loss_total = loss_total + weight_vec[i] * task_loss[i]

            loss_total.backward()
            optimizer.step()

        # calculate and record performance
        if t == 0 or (t + 1) % 2 == 0:
            model.eval()
            with torch.no_grad():
                total_train_loss = []
                for (it, batch) in enumerate(test_loader):
                    X = batch[0]
                    ts = batch[1]
                    if torch.cuda.is_available():
                        X = X.cuda()
                        ts = ts.cuda()
                    valid_train_loss = model(X, ts)
                    total_train_loss.append(valid_train_loss)
                total_train_loss = torch.stack(total_train_loss)
                average_train_loss = torch.mean(total_train_loss, dim=0)

            # record and print
            if torch.cuda.is_available():
                task_train_losses.append(average_train_loss.data.cpu().numpy())
                print('{}/{}: train_loss={}'.format(
                    t + 1, niter, task_train_losses[-1]))

    # torch.save(model.model.state_dict(),
    #            f'./saved_model/{dataset}_{base_model}_niter_{niter}.pickle')

    result = {"training_losses": task_train_losses}
    return result
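
# ---------***---------
# Usage sketch for the ParetoMTL run (illustrative, not part of the original
# script): the number of rays, the way they are sampled here, the base_model tag,
# and the epoch count are all assumptions; the original experiments use their own
# reference-vector generator. Each ray must have one entry per task, and a GPU is
# required because the function moves the reference vectors to CUDA unconditionally.
if __name__ == '__main__':
    npref = 5
    n_tasks_assumed = 6  # assumption for the 'emotion' dataset
    rng = np.random.RandomState(0)
    # sample npref points on the probability simplex as reference rays (assumption)
    rvecs = rng.dirichlet(np.ones(n_tasks_assumed), size=npref).astype(np.float32)
    for pref_idx in range(npref):
        train(dataset='emotion', base_model='MLP', niter=100,
              npref=npref, rvecs=rvecs, pref_idx=pref_idx)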