Example #1
save_value5 = save_value1
step_size = 0.1  #we should experiment with step sizes
inc = 0.1
#c = 1.0 #decreasing stepsize
#print "----- gradient descent ------"
for step in range(num_steps):
    print "iter", step
    #calculate optimal policy for current estimate of reward
    pi, V = mdp_solver.policy_iteration(mdp)
    #print "new policy"
    #print pi_star
    #calculate Q values for current estimate of reward
    Q = mdp_solver.calc_qvals(mdp, pi, V, gamma)
    #print "new Qvals"
    #print log-likelihood
    print "log-likelihood posterior", birl.demo_log_likelihood(demo, Q)
    save_value1[step] = birl.demo_log_likelihood(demo, Q)
    #calculate subgradient of posterior wrt reward minus l1 regularization on the reward
    subgrad = birl.calc_l1regularized_reward_gradient(demo,
                                                      mdp,
                                                      mdp.R,
                                                      lam,
                                                      eta=1.0)
    #update stepsize
    #step_size = c / np.sqrt(step + 1)
    #print "stepsize", step_size
    #update reward
    R_new = mdp.R + step_size * subgrad
    #print "new reward"
    #print R_new
    #update mdp with new reward
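# The snippet above stops right at the "update mdp with new reward" comment.
# Below is a minimal sketch of that step, assuming mdp.R is a plain NumPy array
# that the next policy_iteration call reads; the helper name and the clipping
# bound r_max are illustrative assumptions, not part of the original code.
import numpy as np

def apply_reward_update(R, subgrad, step_size, r_max=1.0):
    #one subgradient-ascent step on the reward estimate,
    #then project back onto the box [-r_max, r_max]
    R_new = R + step_size * subgrad
    return np.clip(R_new, -r_max, r_max)

#inside the loop above this could be used as:  mdp.R = apply_reward_update(mdp.R, subgrad, step_size)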
Example #2
writer.write("hi\n")
num_steps = 300
#step_size = 0.1 #we should experiment with step sizes
c = 1.0  #decreasing stepsize
print "----- gradient descent ------"
for step in range(num_steps):
    print "iter", step
    #calculate optimal policy for current estimate of reward
    pi, V = mdp_solver.policy_iteration(mdp)
    #print "new policy"
    #print pi_star
    #calculate Q values for current estimate of reward
    Q = mdp_solver.calc_qvals(mdp, pi, V, gamma)
    #print "new Qvals"
    #print log-likelihood
    print "log-likelihood posterior", birl.demo_log_likelihood(demo, Q)

    #calculate subgradient of posterior wrt reward minus l1 regularization on the reward
    subgrad = birl.calc_l1regularized_reward_gradient(demo,
                                                      mdp,
                                                      mdp.R,
                                                      lam,
                                                      eta=1.0)
    #update stepsize
    step_size = c / np.sqrt(step + 2)
    print "stepsize", step_size
    #update reward
    R_new = mdp.R + step_size * subgrad
    #print "new reward"
    #print R_new
    #update mdp with new reward
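# For context: birl.demo_log_likelihood(demo, Q) scores the demonstration under the
# current Q-values, but its implementation is not shown in these snippets. Below is a
# minimal sketch of such a score under a Boltzmann (softmax) action model, assuming
# demo is a sequence of (state, action) index pairs and Q is indexed as Q[state, action];
# the function name and the beta confidence parameter are illustrative assumptions.
import numpy as np

def boltzmann_demo_log_likelihood(demo, Q, beta=1.0):
    #sum of log P(a | s) over demonstrated pairs, with
    #P(a | s) proportional to exp(beta * Q[s, a])
    log_lik = 0.0
    for s, a in demo:
        scaled = beta * Q[s] - np.max(beta * Q[s])  #shift for numerical stability
        log_lik += scaled[a] - np.log(np.sum(np.exp(scaled)))
    return log_lik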
    f.write("iter\treward-diff\treward-norm\tpolicy-diff\tlqog-lik\n")
    num_steps = 500
    #step_size = 0.1 #we should experiment with step sizes
    c = 0.5  #decreasing stepsize
    print "----- gradient descent ------"
    for step in range(num_steps):
        #print "iter",step
        #calculate optimal policy for current estimate of reward
        pi, V = mdp_solver.policy_iteration(mdp)
        #print "new policy"
        #print pi_star
        #calculate Q values for current estimate of reward
        Q = mdp_solver.calc_qvals(mdp, pi, V, gamma)
        #print "new Qvals"
        #print log-likelihood
        log_lik = birl.demo_log_likelihood(demo, Q)
        #print "log-likelihood posterior", log_lik

        #calculate subgradient of posterior wrt reward minus l1 regularization on the reward
        subgrad = birl.calc_l1regularized_reward_gradient(demo,
                                                          mdp,
                                                          mdp.R,
                                                          lam,
                                                          eta=1.0)
        #update stepsize
        step_size = c / np.sqrt(step + 2)
        #print "stepsize", step_size
        #update reward
        R_new = mdp.R + step_size * subgrad
        #print "new reward"
        #print R_new
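# The run above opens its log with the header "iter\treward-diff\treward-norm\tpolicy-diff\tlog-lik"
# but the snippet ends before any data rows are written. Below is a sketch of one such row,
# where reward-diff, reward-norm, and policy-diff are assumed to mean the L2 change in reward,
# the L1 norm of the current reward, and the number of states whose action changed; these
# definitions and the helper name are assumptions, not taken from the original code.
import numpy as np

def write_log_row(f, step, R_old, R_new, pi_old, pi_new, log_lik):
    #one tab-separated row matching the header written before the loop
    reward_diff = np.linalg.norm(R_new - R_old)
    reward_norm = np.sum(np.abs(R_new))
    policy_diff = np.sum(np.asarray(pi_new) != np.asarray(pi_old))
    f.write("%d\t%f\t%f\t%d\t%f\n" % (step, reward_diff, reward_norm, policy_diff, log_lik))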