Example #1
print "----- gradient descent ------"
for step in range(num_steps):
    # flag marks whether the backtracking line search (BTLS) has accepted a step size
    flag = 0
    #calculate optimal policy for current estimate of reward
    pi, V = mdp_solver.policy_iteration(mdp)
    #print "new policy"
    #print pi_star
    #calculate Q values for current estimate of reward
    Q = mdp_solver.calc_qvals(mdp, pi, V, gamma)
    #print "new Qvals"
    #print log-likelihood
    print "log-likelihood posterior", birl.demo_log_likelihood(demo, Q)

    #calculate gradient of posterior wrt reward
    grad = birl.calc_reward_gradient(demo, mdp, mdp.R, eta=1.0)
    # mdp_temp is assumed to be a working copy of mdp (constructed earlier),
    # used only to evaluate candidate rewards during the line search
    # log-likelihood at the current reward estimate (constant during the search)
    ll_current = birl.demo_log_likelihood(demo, Q)
    while flag == 0:
        # candidate reward: one step of size step_size along the gradient
        R_temp = mdp.R + step_size * grad
        mdp_temp.set_reward(R_temp)
        pi_temp, V_temp = mdp_solver.policy_iteration(mdp_temp)
        Q_temp = mdp_solver.calc_qvals(mdp_temp, pi_temp, V_temp, gamma)
        func = birl.demo_log_likelihood(demo, Q_temp)
        # Armijo sufficient-increase condition for gradient ascent
        approx = ll_current + alpha * step_size * pow(la.norm(grad, 2), 2)
        if func >= approx:
            flag = 1  # sufficient increase achieved: accept this step size
        else:
            step_size = beta * step_size  # otherwise shrink and try again
    #update reward
    R_new = mdp.R + step_size * grad
    #print "new reward"
    #print R_new
    #update mdp with new reward
    mdp.set_reward(R_new)
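
# For reference, a minimal sketch of the Boltzmann-rational demonstration
# likelihood that birl.demo_log_likelihood is assumed to compute; the actual
# birl implementation is not shown here, so treat this as an illustration
# only. Under this model the demonstrator picks action a in state s with
# probability proportional to exp(eta * Q[s, a]).
def demo_log_likelihood_sketch(demo, Q, eta=1.0):
    ll = 0.0
    for s, a in demo:
        # log of the softmax probability of the demonstrated action
        ll += eta * Q[s, a] - np.log(np.sum(np.exp(eta * Q[s, :])))
    return ll
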
Q_star = mdp_solver.calc_qvals(simple_world, pi_star, V_star, gamma)
print "q-vals"
print Q_star

#give optimal action in each state as demonstration
demo = [(state, np.argmax(Q_star[state, :]))
        for state in range(simple_world.num_states)]
print demo

#compute the gradient of R_guess
#TODO get an actual guess and update it towards real R
num_states = simple_world.num_states
num_actions = simple_world.num_actions

print "gradient"
print birl.calc_reward_gradient(demo, simple_world, simple_world.R, eta=1.0)
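
# As a rough sanity check of the analytic gradient printed above, it could be
# compared against a central finite-difference estimate. This helper is
# hypothetical: it only re-uses calls that already appear in this script and
# assumes the demonstration log-likelihood is locally smooth in the reward R.
def numerical_reward_gradient(demo, world, R, eps=1e-4):
    R_base = np.array(R, dtype=float)
    grad = np.zeros_like(R_base)
    for i in range(R_base.shape[0]):
        for sign in (1.0, -1.0):
            R_pert = R_base.copy()
            R_pert[i] += sign * eps
            world.set_reward(R_pert)
            pi_p, V_p = mdp_solver.policy_iteration(world)
            Q_p = mdp_solver.calc_qvals(world, pi_p, V_p, gamma)
            grad[i] += sign * birl.demo_log_likelihood(demo, Q_p) / (2 * eps)
    world.set_reward(R_base)  # restore the original reward
    return grad

#print numerical_reward_gradient(demo, simple_world, simple_world.R)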

#test out the log posterior

print "log-likelihood true reward", birl.demo_log_likelihood(demo, Q_star)
reward2 = np.reshape([[0, 0, 0], [0, -1, 0], [-1, -1, 1]], (num_states, 1))
#print reward2
#set reward to false values
simple_world.set_reward(reward2)
#calculate new policy
pi_star, V_star = mdp_solver.policy_iteration(simple_world)
print "false reward"
util.print_reward(simple_world)
util.print_policy(simple_world, pi_star)
Q_star = mdp_solver.calc_qvals(simple_world, pi_star, V_star, gamma)
print "log-likelihood false reward", birl.demo_log_likelihood(demo, Q_star)