                               step_stdev, debug=False, mcmc_norm=mcmc_norm)

# sample the Bayesian IRL posterior over reward weights given the demonstrations
map_w, map_u_sa, w_chain, u_chain = birl.sample_posterior(
    demonstrations, num_samples, False)
print("Birl complete")

if debug:
    print("-------")
    print("true weights", true_w)
    print("features")
    utils.display_onehot_state_features(mdp_env)
    print("optimal policy")
    utils.print_policy_from_occupancies(opt_u_sa, mdp_env)
    print("optimal values")
    v = mdp.get_state_values(opt_u_sa, mdp_env)
    utils.print_as_grid(v, mdp_env)

if debug:
    print("map_weights", map_w)
    map_r = np.dot(mdp_env.state_features, map_w)
    print("MAP reward")
    utils.print_as_grid(map_r, mdp_env)
    print("Map policy")
    utils.print_policy_from_occupancies(map_u_sa, mdp_env)

# discard burn-in samples and thin the MCMC chain
w_chain_burned = w_chain[burn::skip]

### compute mean reward policy
mean_w = np.mean(w_chain_burned, axis=0)
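
# Illustration only -- not part of the original script. A minimal sketch of one way to
# finish the "compute mean reward policy" step above: map the posterior-mean weights to
# a per-state reward (mirroring the map_w block earlier) and display it. The helper
# calls the original code actually uses here may differ.
mean_r = np.dot(mdp_env.state_features, mean_w)
print("posterior mean reward")
utils.print_as_grid(mean_r, mdp_env)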
    weights = utils.sample_l2_ball(num_features)
    print("weights", weights)

    gamma = 0.99

    # let's look at all starting states for now
    init_dist = np.ones(num_states) / num_states
    # init_states = [10]
    # for si in init_states:
    #     init_dist[si] = 1.0 / len(init_states)

    # no terminal
    term_states = []

    mdp_env = mdp.FeaturizedGridMDP(num_rows, num_cols, state_features,
                                    weights, gamma, init_dist, term_states)
    return mdp_env


if __name__ == "__main__":
    # mdp_env = machine_teaching_toy_featurized()
    # mdp_env = lava_ambiguous_aaai18()
    mdp_env = random_gridworld_corner_terminal(6, 6, 5)

    print("features")
    utils.display_onehot_state_features(mdp_env)

    u_sa = mdp.solve_mdp_lp(mdp_env, debug=True)
    print("optimal policy")
    utils.print_policy_from_occupancies(u_sa, mdp_env)
    print("optimal values")
    v = mdp.get_state_values(u_sa, mdp_env)
    utils.print_as_grid(v, mdp_env)
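
# Illustration only -- a hedged sketch of what utils.sample_l2_ball is assumed to do in
# the function above: draw a random reward-weight vector with unit L2 norm by normalizing
# a Gaussian sample. The actual helper in utils may be implemented differently.
def _example_sample_l2_ball(num_features):
    w = np.random.randn(num_features)
    return w / np.linalg.norm(w)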
init_distribution = np.ones(num_states) / num_states  # uniform distribution
mdp_env = mdp.MachineReplacementMDP(num_states, r_sa, gamma, init_distribution)
# print(mdp_env.Ps)

print("mean MDP reward", r_sa)

u_sa = mdp.solve_mdp_lp(mdp_env, debug=True)
print("mean policy from posterior")
utils.print_stochastic_policy_action_probs(u_sa, mdp_env)
print("MAP/Mean policy from posterior")
utils.print_policy_from_occupancies(u_sa, mdp_env)
print("rewards")
print(mdp_env.r_sa)
print("expected value = ", np.dot(u_sa, r_sa))

stoch_pi = utils.get_optimal_policy_from_usa(u_sa, mdp_env)
print("expected return", mdp.get_policy_expected_return(stoch_pi, mdp_env))
print("values", mdp.get_state_values(u_sa, mdp_env))
print('q-values', mdp.get_q_values(u_sa, mdp_env))

# print(posterior)
# print(posterior.shape)

# run CVaR optimization, maybe just the robust version for now
u_expert = np.zeros(mdp_env.num_actions * mdp_env.num_states)

# print("solving for CVaR optimal policy")
posterior_probs = np.ones(num_samples) / num_samples  # uniform dist since samples from MCMC
# cvar_opt_usa, cvar, exp_ret = mdp.solve_max_cvar_policy(mdp_env, u_expert, posterior,
#                                                         posterior_probs, alpha, False, lamda)
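
# Illustration only -- not part of the original script. A hedged sketch of how the CVaR
# optimization above might be invoked and inspected, reusing the commented-out call
# signature; alpha and lamda are assumed to be defined earlier in the script.
# cvar_opt_usa, cvar, exp_ret = mdp.solve_max_cvar_policy(
#     mdp_env, u_expert, posterior, posterior_probs, alpha, False, lamda)
# print("CVaR optimal policy")
# utils.print_policy_from_occupancies(cvar_opt_usa, mdp_env)
# print("CVaR = {}, expected return = {}".format(cvar, exp_ret))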