Example #1
    def NLL_with_grad(self, fW):
        #Returns the NLL and its gradient w.r.t. the input feature weights fW

        #Cast to tensor if needed
        lh_r = fW if torch.is_tensor(fW) else torch.tensor(fW)
        r = lh_r
        if lh_r.shape != (self.mdp_data['states'], 5):
            #convert feature weights to the full (states, 5) reward
            r = torch.matmul(self.F, lh_r)
            r = torch.reshape(r, (len(r), 1))
            r = r.repeat((1, 5))

        if r.shape != (int(self.mdp_data['states']), 5):
            print(r.shape)
            raise Exception("Reward shape not (states, 5)")

        #Solve MDP with current reward
        v, q, logp, p = linearvalueiteration(self.mdp_data, r)

        #Calculate likelihood from logp
        likelihood = torch.sum(logp * self.mu_sa)

        #Calculate gradient w.r.t. the forward inputs
        D = linearmdpfrequency(self.mdp_data, p,
                               self.initD)  #Compute state visitation count D
        D = D.clone().detach().requires_grad_(True)  #cast to tensor

        dr = self.muE - torch.matmul(torch.t(self.F), D)  #Compute gradient

        return -likelihood.detach().numpy(), -dr.detach().numpy()
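
# Because NLL_with_grad returns both the negated likelihood and its gradient as
# NumPy arrays, it can be fed straight to scipy.optimize.minimize with jac=True,
# which is the pattern the scipy() helper further down uses. A minimal sketch,
# assuming an initialised loss object `lh` and the `mdp_params` dict used
# elsewhere in these examples:

import numpy as np
from scipy.optimize import minimize

estR = np.random.randn(mdp_params['n']**2, 1)  # initial estimated feature weights
res = minimize(lh.NLL_with_grad, estR,
               jac=True,            # objective returns (nll, grad)
               method="L-BFGS-B",
               options={'disp': True, 'maxiter': 15000})
print(res.x)                        # learned feature weights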
Example #2
def likelihood(r, initD, mu_sa, muE, F, mdp_data):
    #Returns the NLL of the expert demonstrations w.r.t. the input reward r

    if not torch.is_tensor(r):
        r = torch.tensor(r)  #cast to tensor
    if r.shape != (mdp_data['states'], 5):
        #convert feature weights to the full (states, 5) reward
        r = torch.matmul(F, r)
        r = torch.reshape(r, (len(r), 1))
        r = r.repeat((1, 5))

    #Solve MDP with current reward
    v, q, logp, p = linearvalueiteration(mdp_data, r)

    #Calculate scalar likelihood from logp
    likelihood = torch.sum(logp * mu_sa)

    #Alternative: per-state likelihood as a tensor of size (states, 1)
    #likelihood = torch.sum(logp * mu_sa, dim=1)

    return -likelihood
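
# The core of this objective is the maximum-entropy IRL log-likelihood: the sum
# over state-action pairs of the expert visitation counts mu_sa weighted by the
# log-policy logp, negated for minimisation. A toy sketch with invented numbers
# (3 states, 5 actions), just to show the shapes involved:

import torch

toy_logp = torch.log(torch.full((3, 5), 0.2))     # uniform log-policy log p(a|s)
toy_mu_sa = torch.tensor([[2., 0., 1., 0., 0.],   # expert state-action counts
                          [0., 3., 0., 0., 0.],
                          [1., 0., 0., 0., 1.]])
toy_nll = -torch.sum(toy_logp * toy_mu_sa)        # 8 visits * log(5) ≈ 12.88
print(toy_nll)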
    def calculate_EVD(self, trueP, currR):
        if currR.shape != (len(currR), 5):
            currR = currR.repeat((1, 5))

        if currR.shape != trueP.shape:
            raise Exception("Reward shape not (states, 5); instead it's " + str(currR.shape))

        v, q, logp, currP = linearvalueiteration(self.mdp_data, currR)
        #Expected Value Difference = difference in policies, since the exact true R
        #values are never actually learned, only the structure of R
        evd = torch.max(torch.abs(currP - trueP))
        return evd
    def backward(ctx, grad_output):
        #print('Grad output {}'.format(grad_output))
        #Should return as many gradient tensors as there are forward inputs

        r, initD, mu_sa, muE, F, sa_p, sa_s, states, actions, discount, determinism = ctx.saved_tensors

        #reconstruct mdp_data since ctx can't save dicts
        mdp_data = {
            'states': states,
            'actions': actions,
            'discount': discount,
            'determinism': determinism,
            'sa_s': sa_s,
            'sa_p': sa_p
            }

        if not torch.is_tensor(r):
            r = torch.tensor(r)  #cast to tensor
        if r.shape != (mdp_data['states'], 5):
            #convert feature weights to the full (states, 5) reward
            r = torch.matmul(F, r)
            r = r.repeat((1, 5))

        if r.shape != (int(mdp_data['states']), 5):
            raise Exception("Reward shape not (states, 5); instead it's " + str(r.shape))

        #Solve MDP with current reward
        v, q, logp, p = linearvalueiteration(mdp_data, r) 
        
        #Calculate gradient w.r.t. the forward inputs
        D = linearmdpfrequency(mdp_data, p, initD)  #Compute state visitation count D
        D = D.clone().detach().requires_grad_(True)  #cast to tensor

        dr = muE - torch.matmul(torch.t(F), D)  #Compute gradient
        #print('gradient:\n', -dr)

        #One None per non-tensor forward input; return -dr for descent
        return -dr, None, None, None, None, None
    def forward(ctx, r, initD, mu_sa, muE, F, mdp_data):
        #Returns NLL w.r.t. input r

        #ctx can't save dicts, so the mdp_data fields are saved individually
        ctx.save_for_backward(r, initD, mu_sa, muE, F,
                              mdp_data['sa_p'], mdp_data['sa_s'],
                              torch.tensor(mdp_data['states']),
                              torch.tensor(mdp_data['actions']),
                              torch.tensor(mdp_data['discount']),
                              torch.tensor(mdp_data['determinism']))
       
        if not torch.is_tensor(r):
            r = torch.tensor(r)  #cast to tensor
        if r.shape != (mdp_data['states'], 5):
            #convert feature weights to the full (states, 5) reward
            r = torch.matmul(F, r)
            r = r.repeat((1, 5))

        if r.shape != (int(mdp_data['states']), 5):
            raise Exception("Reward shape not (states, 5); instead it's " + str(r.shape))

        #Solve MDP with current reward
        v, q, logp, p = linearvalueiteration(mdp_data, r)

        #Calculate scalar likelihood from logp
        likelihood = torch.sum(logp * mu_sa)

        #Alternative: per-state likelihood as a tensor of size (states, 1)
        #likelihood = torch.sum(logp * mu_sa, dim=1)

        return -likelihood
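
# Elsewhere in these snippets the loss is built as NLL = NLLFunction() and
# invoked through NLL.apply(...), which suggests the forward and backward above
# are the static methods of a torch.autograd.Function subclass. A minimal sketch
# of that wrapper, with the method bodies abbreviated; the helper names are the
# ones used throughout these examples:

import torch

class NLLFunction(torch.autograd.Function):
    """Sketch only: negated log-likelihood with a hand-coded IRL gradient."""

    @staticmethod
    def forward(ctx, r, initD, mu_sa, muE, F, mdp_data):
        # Save the tensors backward needs (ctx can't save dicts, so the
        # mdp_data fields go in individually), solve the MDP with
        # linearvalueiteration and return -sum(logp * mu_sa), as above.
        ...

    @staticmethod
    def backward(ctx, grad_output):
        # Recompute the policy, get state visitation counts D from
        # linearmdpfrequency, and return -(muE - F^T D) plus one None per
        # non-differentiable forward input, as above.
        ...

# Usage, matching the calls later in these examples:
# loss = NLLFunction.apply(r, initD, mu_sa, muE, F, mdp_data)
# loss.backward()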
    def forward(self, r):
        print('Init D {}'.format(self.initD))
        #Returns NLL w.r.t. input r
        r = self.reshapeReward(r)
        self.initD = torch.reshape(self.initD, (self.mdp_data['states'], 1))
        v, q, logp, p = linearvalueiteration(self.mdp_data,
                                             r)  #Solve MDP with current reward
        self.p = p  #store policy w.r.t. current r for backward

        #Calculate scalar likelihood from logp
        likelihood = torch.sum(logp * self.mu_sa)

        #Alternative: per-state likelihood as a column vector of size (states, 1)
        #likelihood = torch.sum(logp * self.mu_sa, dim=1).view(self.mdp_data['states'], 1)
        return -likelihood
    def scipy(self, lh):

        estR = np.random.randn(mdp_params['n']**2, 1)  # initial estimated R
        res = minimize(lh.negated_likelihood_with_grad, estR, jac=True, method="L-BFGS-B",
                       options={'disp': True, 'gtol': 1e-05, 'eps': 1e-08, 'maxiter': 15000,
                                'ftol': 2.220446049250313e-09, 'maxcor': 10, 'maxfun': 15000})
        # reshape foundR & find its likelihood
        foundR = torch.reshape(torch.tensor(res.x), (4, 1))
        foundR = foundR.repeat(1, 5)
        print(foundR.dtype)
        foundLH = lh.negated_likelihood(foundR)

        # solve MDP with foundR for optimal policy
        v, q, logp, foundp = linearvalueiteration(mdp_data, foundR)
        found_optimal_policy = np.argmax(foundp.detach().cpu().numpy(), axis=1)

        print("\nTrue R is \n{}\n with negated likelihood of {}\n and optimal policy {}\n".format(
            *trueRprintlist))

        # Print found R stats
        foundRprintlist = [foundR, foundLH, found_optimal_policy]
        print("\nFound R is \n{}\n with negated likelihood of {}\n and optimal policy {}\n".format(
            *foundRprintlist))
    if normalise:
        #Scale everything within 0 and 1
        scaler = MinMaxScaler()
        y_mc_relu = scaler.fit_transform(y_mc_relu.reshape(-1, 1))
        y_mc_std_relu = scaler.fit_transform(y_mc_std_relu.reshape(-1, 1))
        y_mc_tanh = scaler.fit_transform(y_mc_tanh.reshape(-1, 1))
        y_mc_std_tanh = scaler.fit_transform(y_mc_std_tanh.reshape(-1, 1))

    # Extract full reward function
    y_mc_relu_reward = torch.from_numpy(y_mc_relu)
    y_mc_relu_reward = y_mc_relu_reward.reshape(len(y_mc_relu_reward), 1)
    y_mc_relu_reward = y_mc_relu_reward.repeat((1, 5))

    #Solve with learned reward functions
    y_mc_relu_v, y_mc_relu_q, y_mc_relu_logp, y_mc_relu_P = linearvalueiteration(
        mdp_data, y_mc_relu_reward)
    '''
    # Print results
    print("\nTrue R has:\n - negated likelihood: {}\n - EVD: {}".format(trueNLL,  irl_model.NLL.calculate_EVD(truep, r)))
    print("\nPred R with ReLU activation has:\n - negated likelihood: {}\n - EVD: {}".format(irl_model.NLL.apply(y_mc_relu_reward, initD, mu_sa, muE, feature_data['splittable'], mdp_data), irl_model.NLL.calculate_EVD(truep, y_mc_relu_reward)))
    '''

    # Initialise loss function
    NLL = NLLFunction()
    # Assign loss function constants
    NLL.F = feature_data['splittable']
    NLL.muE = muE
    NLL.mu_sa = mu_sa
    NLL.initD = initD
    NLL.mdp_data = mdp_data
    def torchbasic(self, lh, type_optim):

        # Initialise params
        countlist = []
        NLLlist = []
        gradList = []
        estRlist = []
        evdList = []
        lr = 1
        n_epochs = 1000
        NLL = 0
        prev = 0
        diff = 1
        threshold = 0.1
        i = 0
        # initial estimated R
        estR = torch.randn(mdp_data['states'], 1,
                           dtype=torch.float64, requires_grad=True)
        if type_optim == 'LBFGS':
            optimizer = torch.optim.LBFGS([estR], lr=lr, max_iter=20, max_eval=None,
                                          tolerance_grad=1e-07, tolerance_change=1e-09, history_size=100, line_search_fn=None)

            def closure():
                if torch.is_grad_enabled():
                    optimizer.zero_grad()
                NLL = lh.negated_likelihood(estR)
                if NLL.requires_grad:
                    estR.grad = lh.calc_gradient(estR)
                return NLL
            print("... minimising likelihood with LBFGS...\n")
            while diff >= threshold:
                i += 1
                prev = NLL
                NLL = optimizer.step(closure)
                diff = abs(prev-NLL)
                print('Optimiser iteration {} with NLL {}, estR values of \n{} and gradient of \n{} and abs diff of {}\n'.format(
                    i, NLL, estR.data, estR.grad, diff))
                # store values for plotting
                evd = lh.calculate_EVD(truep)
                evdList.append(evd)
                gradList.append(torch.sum(estR.grad))
                NLLlist.append(NLL)
                countlist.append(i)
                estRlist.append(torch.sum(estR.data))

        else:
            optimizer = torch.optim.Adam([estR], lr=lr)
            print("... minimising likelihood with Adam...\n")
            while diff >= threshold:
                optimizer.zero_grad()
                i += 1
                prev = NLL
                NLL = lh.negated_likelihood(estR)
                estR.grad = lh.calc_gradient(estR)
                optimizer.step()
                diff = abs(prev-NLL)
                print('Optimiser iteration {} with NLL {}, estR values of \n{} and gradient of \n{} and abs diff of {}\n'.format(
                    i, NLL, estR.data, estR.grad, diff))  # store values for plotting
                evd = lh.calculate_EVD(truep)
                evdList.append(evd)
                gradList.append(torch.sum(estR.grad))
                NLLlist.append(NLL)
                countlist.append(i)
                estRlist.append(torch.sum(estR.data))

        # Normalise data for plotting
        NLLlist = [float(i)/sum(NLLlist) for i in NLLlist]
        gradList = [float(i)/sum(gradList) for i in gradList]
        estRlist = [float(i)/sum(estRlist) for i in estRlist]

        # plot
        f, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, sharex=True)
        ax1.plot(countlist, NLLlist)
        ax1.set_title('Likelihood')
        # ax1.xlabel('Iterations')
        ax2.plot(countlist, gradList)
        ax2.set_title('grad')
        # ax2.xlabel('Iterations')
        ax3.plot(countlist, estRlist)
        ax3.set_title('estR')
        # ax3.xlabel('Iterations')
        ax4.plot(countlist, evdList)
        ax4.set_title('Expected Value Diff')
        # ax4.xlabel('Iterations')
        plt.show()

        # reshape foundR & find its likelihood
        foundR = torch.reshape(estR.detach().clone(), (4, 1))
        foundR = foundR.repeat(1, 5)
        print(foundR.dtype)
        foundLH = lh.negated_likelihood(foundR)

        # solve MDP with foundR for optimal policy
        v, q, logp, foundp = linearvalueiteration(mdp_data, foundR)
        found_optimal_policy = np.argmax(foundp.detach().cpu().numpy(), axis=1)

        # print
        print("\nTrue R is \n{}\n with negated likelihood of {}\n and optimal policy {}\n".format(
            r, trueNLL, optimal_policy))
        foundRprintlist = [foundR, foundLH, found_optimal_policy]
        print("\nFound R is \n{}\n with negated likelihood of {}\n and optimal policy {}\n".format(
            *foundRprintlist))
    def linearNN(self, evdThreshold, optim_type):
        net = LinearNet()
        tester = testers()

        # Initialise rewards by finding true weights for the NN: feed features
        # through the NN using the true weights to get the ground-truth reward.

        # Initialise with some noise? Can we still uncover a sensible reward?

        # Put an L2 regularisation weight decay on the network weights and fine-tune
        # the lambda value; bias=False on the weight params seems to work when the
        # initial R is 0.

        # Check gradients with torch.autograd.gradcheck (see the sketch after this method).

        X = torch.Tensor([[0, 0],
                          [1, 0],
                          [2, 0],
                          [3, 0]])  # for NN(state feature vector) = reward

        '''
		X = torch.Tensor([[0],
				  [1],
				  [2],
				  [3]]) #for (4,4) NN
		'''

        evd = 10
        lr = 0.1
        finaloutput = None
        # lists for printing
        NLList = []
        iterations = []
        evdList = []
        i = 0

        if (optim_type == 'Adam'):
            print('\nOptimising with torch.Adam\n')
            # initial Adam optimiser, weight decay for L2 regularisation
            optimizer = torch.optim.Adam(
                net.parameters(), lr=lr, weight_decay=1e-2)
            while(evd > evdThreshold):
                net.zero_grad()

                # build output vector as reward for each state w.r.t its features
                output = torch.empty(len(X))
                indexer = 0
                for f in X:
                    thisR = net(f.view(-1, len(f)))
                    output[indexer] = thisR
                    indexer += 1

                # get loss from curr output
                loss = NLL.apply(output, initD, mu_sa, muE, F, mdp_data)

                # check gradients
                #tester.checkgradients_NN(output, NLL)

                #print('Output {} with grad fn {}'.format(output, output.grad_fn))
                #print('Loss {} with grad fn {}'.format(loss, loss.grad_fn))

                loss.backward()  # propagate grad through network
                evd = NLL.calculate_EVD(truep, output)  # calc EVD
                '''
				j = 1
				for p in net.parameters():
					print('Gradient of parameter {} with shape {} is {}'.format(j, p.shape, p.grad))
					j +=1
				j = 0
				'''

                optimizer.step()

                # Printline when LH is vector
                #print('{}: output: {} | EVD: {} | loss: {} | {}'.format(i, output.detach().numpy(), evd,loss.detach().numpy(), sum(loss).detach().numpy()))
                # Printline when LH scalar
                print('{}: output: {} | EVD: {} | loss: {} '.format(
                    i, output.detach().numpy(), evd, loss.detach().numpy()))

                # store metrics for printing
                NLList.append(loss.item())
                iterations.append(i)
                evdList.append(evd.item())
                finaloutput = output
                i += 1
        else:
            print('\nOptimising with torch.LBFGS\n')
            optimizer = torch.optim.LBFGS(net.parameters(), lr=lr)

            def closure():
                net.zero_grad()
                output = net(X.view(-1, 4))  # when NLL layer is (4,4)
                loss = NLL.negated_likelihood(output)
                loss = sum(loss)
                evd = NLL.calculate_EVD(truep)
                print('{}: output: {} | EVD: {} | loss: {}'.format(
                    i, output.detach().numpy(), evd, loss.detach().numpy()))
                current_gradient = NLL.calc_gradient(output)
                #print('Current gradient \n{}'.format(current_gradient))

                #net.fc1.weight.grad = current_gradient.repeat(1,4)
                # much worse than above
                loss.backward(gradient=torch.argmax(current_gradient))
                '''												 
				print('Calculated grad \n {}'.format(current_gradient))
				j = 1
				for p in net.parameters():
					print('Gradient of parameter {} \n {}'.format(j, p.grad))
					j +=1
				j = 0
				'''

                # store metrics for printing
                NLList.append(sum(loss).item())
                iterations.append(i)
                evdList.append(evd.item())
                finaloutput = output
                return loss  # .max().detach().numpy()
            for i in range(500):
                optimizer.step(closure)

        # Normalise data
        #NLList = [float(i)/sum(NLList) for i in NLList]
        #evdList = [float(i)/sum(evdList) for i in evdList]

        # plot
        f, (ax1, ax2) = plt.subplots(1, 2, sharex=True)
        ax1.plot(iterations, NLList)
        ax1.plot(iterations, NLList, 'r+')
        ax1.set_title('NLL')

        ax2.plot(iterations, evdList)
        ax2.plot(iterations, evdList, 'r+')
        ax2.set_title('Expected Value Diff')
        plt.show()

        # calculate metrics for printing
        v, q, logp, thisp = linearvalueiteration(
            mdp_data, output.view(4, 1))  # to get the policy under our R
        thisoptimal_policy = np.argmax(thisp.detach().cpu().numpy(), axis=1)

        print(
            '\nTrue R: \n{}\n - with optimal policy {}'.format(r[:, 0].view(4, 1), optimal_policy))
        print('\nFinal Estimated R after 100 optim steps: \n{}\n - with optimal policy {}\n - avg EVD of {}'.format(
            finaloutput.view(4, 1), thisoptimal_policy, sum(evdList)/len(evdList)))
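
# The comments at the top of linearNN mention checking gradients with
# torch.gradcheck; the relevant API is torch.autograd.gradcheck, which compares
# the analytic backward of a custom Function against finite differences. A
# minimal, hypothetical sketch (double-precision inputs and the small test shape
# are assumptions; NLLFunction, initD, mu_sa, muE, feature_data and mdp_data are
# the objects used elsewhere in these examples):

import torch

r_test = torch.randn(int(mdp_data['states']), 5,
                     dtype=torch.float64, requires_grad=True)
ok = torch.autograd.gradcheck(
    lambda rr: NLLFunction.apply(rr, initD, mu_sa, muE,
                                 feature_data['splittable'], mdp_data),
    (r_test,), eps=1e-6, atol=1e-4)
print('gradcheck passed:', ok)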
        mdp_data, r, feature_data, mdp_params = create_gridworld()
    elif worldtype == "objectworld" or worldtype == "ow" or worldtype == "obj":
        print('\n... Creating ObjectWorld ...\n')
        mdp_data, r, feature_data, true_feature_map, mdp_params = create_objectworld()
else:
    worldtype = 'gridworld'
    print('\n... Creating GridWorld ...\n')
    mdp_data, r, feature_data, mdp_params = create_gridworld()

if normalise:
    scaler = MinMaxScaler()
    r = torch.tensor(scaler.fit_transform(r.data.cpu().numpy()))

#Solve MDP
print("\n... performing value iteration for v, q, logp and truep ...")
v, q, logp, truep = linearvalueiteration(mdp_data, r)
mdp_solution = {'v': v, 'q': q, 'p': truep, 'logp': logp}
optimal_policy = torch.argmax(truep, axis=1)
print("\n... done ...")

#Sample paths
if new_paths:
    print("\n... sampling paths from true R ...")
    example_samples = sampleexamples(N, T, mdp_solution, mdp_data)
    print("\n... done sampling", N, "paths ...")


NLL = NLLFunction()  # initialise NLL
if new_paths:
    initD, mu_sa, muE, F, mdp_data = NLL.calc_var_values(
        mdp_data, N, T, example_samples, feature_data)  # calculate required variables
else: