def NLL_with_grad(self, fW): #Returns NLL and its gradient w.r.t the input feature weights
    if not torch.is_tensor(fW):
        lh_r = torch.tensor(fW) #cast to tensor
    else:
        lh_r = fW
    if lh_r.shape != (self.mdp_data['states'], 5): #convert feature weights to full reward
        r = torch.matmul(self.F, lh_r)
        r = torch.reshape(r, (len(r), 1))
        r = r.repeat((1, 5))
    else:
        r = lh_r
    if r.shape != (int(self.mdp_data['states']), 5):
        print(r.shape)
        raise Exception("Reward shape not (states, 5)")

    #Solve MDP with current reward
    v, q, logp, p = linearvalueiteration(self.mdp_data, r)

    #Calculate likelihood from logp
    likelihood = torch.sum(torch.sum(logp * self.mu_sa))

    #Calculate gradient w.r.t the forward inputs
    D = linearmdpfrequency(self.mdp_data, p, self.initD) #Compute state visitation count D
    D = D.clone().detach().requires_grad_(True) #cast to tensor
    dr = self.muE - torch.matmul(torch.t(self.F), D) #Compute gradient

    return -likelihood.detach().numpy(), -dr.detach().numpy()
def likelihood(r, initD, mu_sa, muE, F, mdp_data): #Returns NLL w.r.t input r
    if not torch.is_tensor(r):
        r = torch.tensor(r) #cast to tensor
    if r.shape != (mdp_data['states'], 5): #convert feature weights to full reward
        r = torch.matmul(F, r)
        r = r.repeat((1, 5)) #broadcast to (states, actions), as in NLLFunction.forward

    #Solve MDP with current reward
    v, q, logp, p = linearvalueiteration(mdp_data, r)

    #Calculate likelihood from logp
    likelihood = torch.sum(torch.sum(logp * mu_sa)) #scalar likelihood
    #Alternative: per-state likelihood as a (states, 1) tensor via torch.sum(logp * mu_sa, dim=1)

    return -likelihood
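# --- Hedged sketch (not from this repo): a toy illustration of the likelihood term
# computed above. It assumes logp is a (states, actions) log-policy and mu_sa holds
# the empirical state-action visitation counts; all values below are made up.
import torch

toy_logp = torch.log(torch.full((4, 5), 0.2))     # uniform policy over 5 actions
toy_mu_sa = torch.tensor([[3., 0., 1., 0., 0.],
                          [0., 2., 0., 0., 0.],
                          [1., 0., 0., 4., 0.],
                          [0., 0., 0., 0., 2.]])  # hypothetical demonstration counts
toy_nll = -torch.sum(toy_logp * toy_mu_sa)        # negated log-likelihood of the demos
print(toy_nll)                                    # 13 visits * log(5) ~= 20.92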
def calculate_EVD(self, trueP, currR):
    if currR.shape != (len(currR), 5):
        currR = currR.repeat((1, 5))
    if currR.shape != trueP.shape:
        raise Exception("Reward shape not (states, 5), instead it's " + str(currR.shape))

    v, q, logp, currP = linearvalueiteration(self.mdp_data, currR)

    #Expected Value Difference = difference in policies, since the exact true R values
    #are never actually learned, only their structure
    evd = torch.max(torch.abs(currP - trueP))
    return evd
def backward(ctx, grad_output):
    #Should return as many gradient tensors as there were inputs to forward
    r, initD, mu_sa, muE, F, sa_p, sa_s, states, actions, discount, determinism = ctx.saved_tensors

    #Reconstruct mdp_data since ctx can't save dicts
    mdp_data = {
        'states': states,
        'actions': actions,
        'discount': discount,
        'determinism': determinism,
        'sa_s': sa_s,
        'sa_p': sa_p
    }

    if not torch.is_tensor(r):
        r = torch.tensor(r) #cast to tensor
    if r.shape != (int(mdp_data['states']), 5): #convert feature weights to full reward
        r = torch.matmul(F, r)
        r = r.repeat((1, 5))
    if r.shape != (int(mdp_data['states']), 5):
        raise Exception("Reward shape not (states, 5), instead it's " + str(r.shape))

    #Solve MDP with current reward
    v, q, logp, p = linearvalueiteration(mdp_data, r)

    #Calc gradient w.r.t forward inputs
    D = linearmdpfrequency(mdp_data, p, initD) #Compute state visitation count D
    D = D.clone().detach().requires_grad_(True) #cast to tensor
    dr = muE - torch.matmul(torch.t(F), D) #Compute gradient

    return -dr, None, None, None, None, None #return -dr for descent; one gradient per forward input
def forward(ctx, r, initD, mu_sa, muE, F, mdp_data): #Returns NLL w.r.t input r
    #ctx can't save dicts, so save mdp_data's fields as individual tensors
    ctx.save_for_backward(r, initD, mu_sa, muE, F, mdp_data['sa_p'], mdp_data['sa_s'],
                          torch.tensor(mdp_data['states']), torch.tensor(mdp_data['actions']),
                          torch.tensor(mdp_data['discount']), torch.tensor(mdp_data['determinism']))

    if not torch.is_tensor(r):
        r = torch.tensor(r) #cast to tensor
    if r.shape != (mdp_data['states'], 5): #convert feature weights to full reward
        r = torch.matmul(F, r)
        r = r.repeat((1, 5))
    if r.shape != (int(mdp_data['states']), 5):
        raise Exception("Reward shape not (states, 5), instead it's " + str(r.shape))

    #Solve MDP with current reward
    v, q, logp, p = linearvalueiteration(mdp_data, r)

    #Calculate likelihood from logp
    likelihood = torch.sum(torch.sum(logp * mu_sa)) #scalar likelihood
    #Alternative: per-state likelihood as a (states, 1) tensor via torch.sum(logp * mu_sa, dim=1)

    return -likelihood
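# --- Hedged sketch (not part of the repo): minimal pattern for a custom
# torch.autograd.Function like the forward/backward pair above. The toy Square
# function is made up purely to show how .apply wires a hand-written backward
# into autograd.
import torch

class Square(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)        # stash tensors needed in backward
        return x * x

    @staticmethod
    def backward(ctx, grad_output):
        (x,) = ctx.saved_tensors
        return grad_output * 2 * x      # one gradient per forward input

t = torch.tensor(3.0, requires_grad=True)
Square.apply(t).backward()
print(t.grad)  # tensor(6.)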
def forward(self, r): #Returns NLL w.r.t input r
    print('Init D {}'.format(self.initD))
    r = self.reshapeReward(r)
    self.initD = torch.reshape(self.initD, (self.mdp_data['states'], 1))

    v, q, logp, p = linearvalueiteration(self.mdp_data, r) #Solve MDP with current reward
    self.p = p #store policy w.r.t current r for backward

    #Calculate likelihood from logp
    likelihood = torch.sum(logp * self.mu_sa) #scalar likelihood
    #Alternative: per-state likelihood, e.g. torch.sum(logp * self.mu_sa, dim=1).view(self.mdp_data['states'], 1)

    return -likelihood
def scipy(self, lh):
    estR = np.random.randn(mdp_params['n']**2, 1)  # initial estimated R

    res = minimize(lh.negated_likelihood_with_grad, estR, jac=True, method="L-BFGS-B",
                   options={
                       'disp': True,
                       'gtol': 1e-05,
                       'eps': 1e-08,
                       'maxiter': 15000,
                       'ftol': 2.220446049250313e-09,
                       'maxcor': 10,
                       'maxfun': 15000})

    # reshape foundR & find its likelihood
    foundR = torch.reshape(torch.tensor(res.x), (4, 1))
    foundR = foundR.repeat(1, 5)
    print(foundR.dtype)
    foundLH = lh.negated_likelihood(foundR)

    # solve MDP with foundR for optimal policy
    v, q, logp, foundp = linearvalueiteration(mdp_data, foundR)
    found_optimal_policy = np.argmax(foundp.detach().cpu().numpy(), axis=1)

    # Print true R stats
    print("\nTrue R is \n{}\n with negated likelihood of {}\n and optimal policy {}\n".format(
        *trueRprintlist))

    # Print found R stats
    foundRprintlist = [foundR, foundLH, found_optimal_policy]
    print("\nFound R is \n{}\n with negated likelihood of {}\n and optimal policy {}\n".format(
        *foundRprintlist))
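# --- Hedged sketch (not from the repo): how scipy.optimize.minimize consumes a
# single objective that returns (value, gradient) when jac=True, which is the
# contract negated_likelihood_with_grad is expected to satisfy. The quadratic
# objective below is made up for illustration.
import numpy as np
from scipy.optimize import minimize

def value_and_grad(x):
    value = np.sum((x - 3.0) ** 2)   # f(x) = ||x - 3||^2
    grad = 2.0 * (x - 3.0)           # analytic gradient, same shape as x
    return value, grad

res = minimize(value_and_grad, x0=np.zeros(4), jac=True, method="L-BFGS-B")
print(res.x)  # ~= [3. 3. 3. 3.]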
if normalise:
    #Scale everything within 0 and 1
    scaler = MinMaxScaler()
    y_mc_relu = scaler.fit_transform(y_mc_relu.reshape(-1, 1))
    y_mc_std_relu = scaler.fit_transform(y_mc_std_relu.reshape(-1, 1))
    y_mc_tanh = scaler.fit_transform(y_mc_tanh.reshape(-1, 1))
    y_mc_std_tanh = scaler.fit_transform(y_mc_std_tanh.reshape(-1, 1))

# Extract full reward function
y_mc_relu_reward = torch.from_numpy(y_mc_relu)
y_mc_relu_reward = y_mc_relu_reward.reshape(len(y_mc_relu_reward), 1)
y_mc_relu_reward = y_mc_relu_reward.repeat((1, 5))

#Solve with learned reward functions
y_mc_relu_v, y_mc_relu_q, y_mc_relu_logp, y_mc_relu_P = linearvalueiteration(
    mdp_data, y_mc_relu_reward)

'''
# Print results
print("\nTrue R has:\n - negated likelihood: {}\n - EVD: {}".format(
    trueNLL, irl_model.NLL.calculate_EVD(truep, r)))
print("\nPred R with ReLU activation has:\n - negated likelihood: {}\n - EVD: {}".format(
    irl_model.NLL.apply(y_mc_relu_reward, initD, mu_sa, muE, feature_data['splittable'], mdp_data),
    irl_model.NLL.calculate_EVD(truep, y_mc_relu_reward)))
'''

# Initialise loss function
NLL = NLLFunction()

# Assign loss function constants
NLL.F = feature_data['splittable']
NLL.muE = muE
NLL.mu_sa = mu_sa
NLL.initD = initD
NLL.mdp_data = mdp_data
def torchbasic(self, lh, type_optim):
    # Initialise params
    countlist = []
    NLLlist = []
    gradList = []
    estRlist = []
    evdList = []
    lr = 1
    n_epochs = 1000
    NLL = 0
    prev = 0
    diff = 1
    threshold = 0.1
    i = 0

    # initial estimated R
    estR = torch.randn(mdp_data['states'], 1, dtype=torch.float64, requires_grad=True)

    if type_optim == 'LBFGS':
        optimizer = torch.optim.LBFGS([estR], lr=lr, max_iter=20, max_eval=None,
                                      tolerance_grad=1e-07, tolerance_change=1e-09,
                                      history_size=100, line_search_fn=None)

        def closure():
            if torch.is_grad_enabled():
                optimizer.zero_grad()
            NLL = lh.negated_likelihood(estR)
            if NLL.requires_grad:
                estR.grad = lh.calc_gradient(estR)
            return NLL

        print("... minimising likelihood with LBFGS...\n")
        while diff >= threshold:
            i += 1
            prev = NLL
            NLL = optimizer.step(closure)
            diff = abs(prev - NLL)
            print('Optimiser iteration {} with NLL {}, estR values of \n{} and gradient of \n{} and abs diff of {}\n'.format(
                i, NLL, estR.data, estR.grad, diff))

            # store values for plotting
            evd = lh.calculate_EVD(truep)
            evdList.append(evd)
            gradList.append(torch.sum(estR.grad))
            NLLlist.append(NLL)
            countlist.append(i)
            estRlist.append(torch.sum(estR.data))

    else:
        optimizer = torch.optim.Adam([estR], lr=lr)
        print("... minimising likelihood with Adam...\n")
        while diff >= threshold:
            optimizer.zero_grad()
            i += 1
            prev = NLL
            NLL = lh.negated_likelihood(estR)
            estR.grad = lh.calc_gradient(estR)
            optimizer.step()
            diff = abs(prev - NLL)
            print('Optimiser iteration {} with NLL {}, estR values of \n{} and gradient of \n{} and abs diff of {}\n'.format(
                i, NLL, estR.data, estR.grad, diff))

            # store values for plotting
            evd = lh.calculate_EVD(truep)
            evdList.append(evd)
            gradList.append(torch.sum(estR.grad))
            NLLlist.append(NLL)
            countlist.append(i)
            estRlist.append(torch.sum(estR.data))

    # Normalise data for plotting
    NLLlist = [float(v)/sum(NLLlist) for v in NLLlist]
    gradList = [float(v)/sum(gradList) for v in gradList]
    estRlist = [float(v)/sum(estRlist) for v in estRlist]

    # plot
    f, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, sharex=True)
    ax1.plot(countlist, NLLlist)
    ax1.set_title('Likelihood')
    ax2.plot(countlist, gradList)
    ax2.set_title('grad')
    ax3.plot(countlist, estRlist)
    ax3.set_title('estR')
    ax4.plot(countlist, evdList)
    ax4.set_title('Expected Value Diff')
    plt.show()

    # reshape foundR & find its likelihood
    foundR = torch.reshape(estR.detach().clone(), (4, 1))
    foundR = foundR.repeat(1, 5)
    print(foundR.dtype)
    foundLH = lh.negated_likelihood(foundR)

    # solve MDP with foundR for optimal policy
    v, q, logp, foundp = linearvalueiteration(mdp_data, foundR)
    found_optimal_policy = np.argmax(foundp.detach().cpu().numpy(), axis=1)

    # print true and found R stats
    print("\nTrue R is \n{}\n with negated likelihood of {}\n and optimal policy {}\n".format(
        r, trueNLL, optimal_policy))
    foundRprintlist = [foundR, foundLH, found_optimal_policy]
    print("\nFound R is \n{}\n with negated likelihood of {}\n and optimal policy {}\n".format(
        *foundRprintlist))
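# --- Hedged sketch (not from the repo): the manual-gradient pattern used in
# torchbasic above, where .grad is written directly (no loss.backward()) before
# optimizer.step(). The quadratic objective is made up for illustration.
import torch

x = torch.zeros(3, requires_grad=True)
opt = torch.optim.Adam([x], lr=0.1)
for _ in range(200):
    opt.zero_grad()
    x.grad = 2.0 * (x.detach() - 1.0)  # hand-computed gradient of ||x - 1||^2
    opt.step()
print(x)  # ~= tensor([1., 1., 1.])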
def linearNN(self, evdThreshold, optim_type):
    net = LinearNet()
    tester = testers()

    # initialise rewards by finding true weights for the NN; feed features through
    # the NN using the true weights to get a ground truth reward
    # initialise with some noise? can we still uncover a sensible reward
    # put an l2 regularisation weight decay on the network weights; fine tune the lambda value
    # bias = False on weight params seems to work when initial R is 0
    # check gradients with torch.autograd.gradcheck (see the sketch after this method)

    X = torch.Tensor([[0, 0], [1, 0], [2, 0], [3, 0]])  # for NN(state feature vector) = reward
    '''
    X = torch.Tensor([[0], [1], [2], [3]]) #for (4,4) NN
    '''
    evd = 10
    lr = 0.1
    finaloutput = None

    # lists for printing
    NLList = []
    iterations = []
    evdList = []
    i = 0

    if optim_type == 'Adam':
        print('\nOptimising with torch.Adam\n')
        # initial Adam optimiser, weight decay for l2 regularisation
        optimizer = torch.optim.Adam(
            net.parameters(), lr=lr, weight_decay=1e-2)

        while evd > evdThreshold:
            net.zero_grad()

            # build output vector as reward for each state w.r.t its features
            output = torch.empty(len(X))
            indexer = 0
            for f in X:
                thisR = net(f.view(-1, len(f)))
                output[indexer] = thisR
                indexer += 1

            # get loss from current output
            loss = NLL.apply(output, initD, mu_sa, muE, F, mdp_data)

            # check gradients
            #tester.checkgradients_NN(output, NLL)
            #print('Output {} with grad fn {}'.format(output, output.grad_fn))
            #print('Loss {} with grad fn {}'.format(loss, loss.grad_fn))

            loss.backward()  # propagate grad through network
            evd = NLL.calculate_EVD(truep, output)  # calc EVD
            optimizer.step()

            # print line for scalar LH
            print('{}: output: {} | EVD: {} | loss: {} '.format(
                i, output.detach().numpy(), evd, loss.detach().numpy()))

            # store metrics for printing
            NLList.append(loss.item())
            iterations.append(i)
            evdList.append(evd.item())
            finaloutput = output
            i += 1

    else:
        print('\nOptimising with torch.LBFGS\n')
        optimizer = torch.optim.LBFGS(net.parameters(), lr=lr)

        def closure():
            net.zero_grad()
            output = net(X.view(-1, 4))  # when NLL layer is (4,4)
            loss = NLL.negated_likelihood(output)
            loss = sum(loss)
            evd = NLL.calculate_EVD(truep)
            print('{}: output: {} | EVD: {} | loss: {}'.format(
                i, output.detach().numpy(), evd, loss.detach().numpy()))

            current_gradient = NLL.calc_gradient(output)
            #net.fc1.weight.grad = current_gradient.repeat(1, 4)  # much worse than above
            loss.backward(gradient=torch.argmax(current_gradient))

            # store metrics for printing
            NLList.append(sum(loss).item())
            iterations.append(i)
            evdList.append(evd.item())
            finaloutput = output
            return loss  # .max().detach().numpy()

        for i in range(500):
            optimizer.step(closure)

    # Normalise data
    #NLList = [float(i)/sum(NLList) for i in NLList]
    #evdList = [float(i)/sum(evdList) for i in evdList]

    # plot
    f, (ax1, ax2) = plt.subplots(1, 2, sharex=True)
    ax1.plot(iterations, NLList)
    ax1.plot(iterations, NLList, 'r+')
    ax1.set_title('NLL')
    ax2.plot(iterations, evdList)
    ax2.plot(iterations, evdList, 'r+')
    ax2.set_title('Expected Value Diff')
    plt.show()

    # calculate metrics for printing
    v, q, logp, thisp = linearvalueiteration(
        mdp_data, output.view(4, 1))  # to get policy under our R
    thisoptimal_policy = np.argmax(thisp.detach().cpu().numpy(), axis=1)

    print(
        '\nTrue R: \n{}\n - with optimal policy {}'.format(r[:, 0].view(4, 1), optimal_policy))
    print('\nFinal Estimated R after optimisation: \n{}\n - with optimal policy {}\n - avg EVD of {}'.format(
        finaloutput.view(4, 1), thisoptimal_policy, sum(evdList)/len(evdList)))
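# --- Hedged sketch (not from the repo): the torch.autograd.gradcheck idea referred
# to in linearNN above. gradcheck compares a Function's analytic backward against
# finite differences; the Cube function here is made up for illustration.
import torch
from torch.autograd import gradcheck

class Cube(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)
        return x ** 3

    @staticmethod
    def backward(ctx, grad_output):
        (x,) = ctx.saved_tensors
        return grad_output * 3 * x ** 2

inp = torch.randn(4, dtype=torch.double, requires_grad=True)  # double precision for gradcheck
print(gradcheck(Cube.apply, (inp,)))  # True if analytic and numerical gradients agree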
    mdp_data, r, feature_data, mdp_params = create_gridworld()
elif worldtype == "objectworld" or worldtype == "ow" or worldtype == "obj":
    print('\n... Creating ObjectWorld ...\n')
    mdp_data, r, feature_data, true_feature_map, mdp_params = create_objectworld()
else:
    worldtype = 'gridworld'
    print('\n... Creating GridWorld ...\n')
    mdp_data, r, feature_data, mdp_params = create_gridworld()

if normalise:
    scaler = MinMaxScaler()
    r = torch.tensor(scaler.fit_transform(r.data.cpu().numpy()))

#Solve MDP
print("\n... performing value iteration for v, q, logp and truep ...")
v, q, logp, truep = linearvalueiteration(mdp_data, r)
mdp_solution = {'v': v, 'q': q, 'p': truep, 'logp': logp}
optimal_policy = torch.argmax(truep, axis=1)
print("\n... done ...")

#Sample paths
if new_paths:
    print("\n... sampling paths from true R ...")
    example_samples = sampleexamples(N, T, mdp_solution, mdp_data)
    print("\n... done sampling", N, "paths ...")

NLL = NLLFunction()  # initialise NLL
if new_paths:
    initD, mu_sa, muE, F, mdp_data = NLL.calc_var_values(
        mdp_data, N, T, example_samples, feature_data)  # calculate required variables
else: