save_value5 = save_value1
step_size = 0.1 #we should experiment with step sizes
inc = 0.1
#c = 1.0 #decreasing stepsize
#print "----- gradient descent ------"
for step in range(num_steps):
    print "iter", step
    #calculate optimal policy for current estimate of reward
    pi, V = mdp_solver.policy_iteration(mdp)
    #print "new policy"
    #print pi_star
    #calculate Q values for current estimate of reward
    Q = mdp_solver.calc_qvals(mdp, pi, V, gamma)
    #print "new Qvals"
    #print log-likelihood
    print "log-likelihood posterior", birl.demo_log_likelihood(demo, Q)
    save_value1[step] = birl.demo_log_likelihood(demo, Q)
    #calculate subgradient of posterior wrt reward minus l1 regularization on the reward
    subgrad = birl.calc_l1regularized_reward_gradient(demo, mdp, mdp.R, lam, eta=1.0)
    #update stepsize
    #step_size = c / np.sqrt(step + 1)
    #print "stepsize", step_size
    #update reward
    R_new = mdp.R + step_size * subgrad
    #print "new reward"
    #print R_new
    #update mdp with new reward
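# birl.demo_log_likelihood(demo, Q) is called above but not defined in this
# excerpt. A minimal sketch of one common BIRL choice -- a Boltzmann (softmax)
# likelihood over the Q-values -- is given below for reference. The confidence
# parameter `beta`, the (state, action) pair format of `demo`, and Q being an
# |S| x |A| array are assumptions here, not necessarily what
# birl.demo_log_likelihood actually implements.
import numpy as np

def softmax_demo_log_likelihood(demo, Q, beta=1.0):
    """Sum of log P(a | s) over demonstrated (s, a) pairs,
    with P(a | s) proportional to exp(beta * Q[s, a])."""
    total = 0.0
    for s, a in demo:
        logits = beta * Q[s, :]
        # log-sum-exp for numerical stability
        m = np.max(logits)
        log_norm = m + np.log(np.sum(np.exp(logits - m)))
        total += logits[a] - log_norm
    return total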
writer.write("hi\n") num_steps = 300 #step_size = 0.1 #we should experiment with step sizes c = 1.0 #decreasing stepsize print "----- gradient descent ------" for step in range(num_steps): print "iter", step #calculate optimal policy for current estimate of reward pi, V = mdp_solver.policy_iteration(mdp) #print "new policy" #print pi_star #calculate Q values for current estimate of reward Q = mdp_solver.calc_qvals(mdp, pi, V, gamma) #print "new Qvals" #print log-likelihood print "log-likelihood posterior", birl.demo_log_likelihood(demo, Q) #calculate subgradient of posterior wrt reward minus l1 regularization on the reward subgrad = birl.calc_l1regularized_reward_gradient(demo, mdp, mdp.R, lam, eta=1.0) #update stepsize step_size = c / np.sqrt(step + 2) print "stepsize", step_size #update reward R_new = mdp.R + step_size * subgrad #print "new reward" #print R_new #update mdp with new reward
f.write("iter\treward-diff\treward-norm\tpolicy-diff\tlqog-lik\n") num_steps = 500 #step_size = 0.1 #we should experiment with step sizes c = 0.5 #decreasing stepsize print "----- gradient descent ------" for step in range(num_steps): #print "iter",step #calculate optimal policy for current estimate of reward pi, V = mdp_solver.policy_iteration(mdp) #print "new policy" #print pi_star #calculate Q values for current estimate of reward Q = mdp_solver.calc_qvals(mdp, pi, V, gamma) #print "new Qvals" #print log-likelihood log_lik = birl.demo_log_likelihood(demo, Q) #print "log-likelihood posterior", log_lik #calculate subgradient of posterior wrt reward minus l1 regularization on the reward subgrad = birl.calc_l1regularized_reward_gradient(demo, mdp, mdp.R, lam, eta=1.0) #update stepsize step_size = c / np.sqrt(step + 2) #print "stepsize", step_size #update reward R_new = mdp.R + step_size * subgrad #print "new reward" #print R_new