def calc_frontier(mdp_env, u_expert, reward_posterior, posterior_probs, lambda_range, alpha, debug=False):
    '''Takes an MDP and runs over a range of lambdas to output the expected value
    and CVaR of the resulting solutions to the LP.
        mdp_env: the MDP to run on
        u_expert: the baseline expert to try to beat (set to zeros to be purely robust)
        reward_posterior: the reward posterior from Bayesian IRL (already burned and skipped and ready to run in the LP)
        posterior_probs: the probabilities of each element in the posterior (uniform if from MCMC)
        lambda_range: a list of lambda values to try
        alpha: the CVaR alpha (risk sensitivity); higher is more risk-sensitive/conservative
    '''
    cvar_exprews = []
    for lamda in lambda_range:
        cvar_opt_usa, cvar_value, exp_ret = mdp.solve_max_cvar_policy(
            mdp_env, u_expert, reward_posterior, posterior_probs, alpha, debug, lamda)
        print("Policy for lambda={} and alpha={}".format(lamda, alpha))
        utils.print_policy_from_occupancies(cvar_opt_usa, mdp_env)
        print("stochastic policy")
        utils.print_stochastic_policy_action_probs(cvar_opt_usa, mdp_env)
        print("CVaR of policy = {}".format(cvar_value))
        print("Expected return of policy = {}".format(exp_ret))
        cvar_exprews.append((cvar_value, exp_ret))
    return cvar_exprews
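# A minimal usage sketch for calc_frontier (an assumption, not part of the original scripts):
# it assumes mdp_env_A, u_expert, r_chain_burned, and posterior_probs are already defined as in
# the snippets below, and plots the resulting CVaR / expected-return frontier over a
# hypothetical lambda sweep.
import matplotlib.pyplot as plt

lambda_range = [0.0, 0.25, 0.5, 0.75, 0.95]  # hypothetical lambda values
frontier = calc_frontier(mdp_env_A, u_expert, r_chain_burned.transpose(),
                         posterior_probs, lambda_range, alpha=0.95)
cvars = [c for c, _ in frontier]
exp_rets = [e for _, e in frontier]
plt.plot(cvars, exp_rets, 'o-')
plt.xlabel("CVaR of policy")
plt.ylabel("Expected return of policy")
plt.title("Frontier of CVaR vs. expected return over lambda")
plt.show()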
#Now let's see what CVaR optimization does.
alpha = 0.99
debug = False
lamda = 0.0
r_chain_burned = r_chain[burn::skip]
n = r_chain_burned.shape[0]
posterior_probs = np.ones(n) / n  #uniform dist since samples from MCMC

print("MDP A")
print("features")
utils.display_onehot_state_features(mdp_env_A)

print("------ Robust Solution ---------")
u_expert = np.zeros(mdp_env_A.num_actions * mdp_env_A.num_states)
cvar_opt_usa, cvar_value, exp_ret = mdp.solve_max_cvar_policy(
    mdp_env_A, u_expert, r_chain_burned.transpose(), posterior_probs, alpha, debug, lamda)
#utils.print_stochastic_policy_action_probs(cvar_opt_usa, mdp_env_A)
print("Policy for lambda={} and alpha={}".format(lamda, alpha))
utils.print_policy_from_occupancies(cvar_opt_usa, mdp_env_A)

print("solving for CVaR reward")
cvar_reward, q = mdp.solve_minCVaR_reward(
    mdp_env_A, u_expert, r_chain_burned.transpose(), posterior_probs, alpha)
# print("cvar reward weights", cvar_reward)
print("cvar reward weights", np.dot(q, r_chain_burned))

print("------ Regret Solution ---------")
traj_demonstrations = [demonstrations]
u_expert = utils.u_sa_from_demos(traj_demonstrations, mdp_env_A)
print('expert u_sa', u_expert)
#input()
worst_index = np.argmin(r_chain_burned[:, 1])
print(r_chain_burned[worst_index])
print(np.sum(r_chain_burned[:, 1] < -0.82), "out of", len(r_chain_burned))
#input()

print("MAP policy")
utils.print_policy_from_occupancies(map_u, mdp_env)

#let's actually try using the optimal policy to get the feature counts and see if the regret method works
u_expert = u_sa
alpha = 0.95
n = r_chain_burned.shape[0]
posterior_probs = np.ones(n) / n  #uniform dist since samples from MCMC
cvar_opt_usa_regret, cvar, exp_ret = mdp.solve_max_cvar_policy(
    mdp_env, u_expert, r_chain_burned.transpose(), posterior_probs, alpha, False)
print("{}-CVaR policy regret optimal u_E".format(alpha))
utils.print_policy_from_occupancies(cvar_opt_usa_regret, mdp_env)

#sanity check: recompute CVaR and expected return for the fixed policy;
#they should match the values returned by the optimizer above
cvar_2, exp_ret2 = mdp.solve_cvar_expret_fixed_policy(
    mdp_env, cvar_opt_usa_regret, u_expert, r_chain_burned.transpose(), posterior_probs, alpha, debug=False)
print(cvar, cvar_2)
print(exp_ret, exp_ret2)
input("same?")
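# A hedged alternative to the manual "same?" prompt above (a sketch, not in the original):
# check programmatically that the fixed-policy LP reproduces the optimizer's CVaR and
# expected return within a small tolerance.
assert np.isclose(cvar, cvar_2, atol=1e-6), "CVaR values disagree"
assert np.isclose(exp_ret, exp_ret2, atol=1e-6), "expected returns disagree"
print("fixed-policy LP matches the CVaR optimizer")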
posterior = generate_posterior_samples(num_samples, num_states)
r_sa = np.mean(posterior, axis=1)
#print("rsa", r_sa)
init_distribution = np.ones(num_states) / num_states  #uniform distribution
mdp_env = mdp.MachineReplacementMDP(num_states, r_sa, gamma, init_distribution)

#run CVaR optimization, just the robust version since we don't have demos
u_expert = np.zeros(mdp_env.num_actions * mdp_env.num_states)
# print("solving for CVaR optimal policy")
posterior_probs = np.ones(num_samples) / num_samples  #uniform dist since samples from MCMC

import time
t = time.time()
cvar_opt_usa, cvar, exp_ret = mdp.solve_max_cvar_policy(
    mdp_env, u_expert, posterior, posterior_probs, alpha, False, lamda)
run_times[rep, i] = time.time() - t

print(run_times)
print(np.mean(run_times, axis=0))
print(np.std(run_times, axis=0))

import os
if not os.path.exists('./results/stress_test/'):
    os.makedirs('./results/stress_test/')
np.savetxt("./results/stress_test/machine_replace_states.csv", run_times, delimiter=",")
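# Optional follow-up (a sketch, not part of the original script): reload the saved
# run times and summarize them; assumes the CSV written above exists and that each
# column corresponds to one state-count setting.
loaded_times = np.loadtxt("./results/stress_test/machine_replace_states.csv", delimiter=",")
print("mean run time per state count:", np.mean(loaded_times, axis=0))
print("std of run time per state count:", np.std(loaded_times, axis=0))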
###run CVaR IRL to get policy
print("optimizing CVAR")
#running just the robust version for now
traj_demonstrations = [demonstrations]
u_expert = utils.u_sa_from_demos(traj_demonstrations, mdp_env)  #np.zeros(mdp_env.num_actions * mdp_env.num_states)
n = w_chain_burned.shape[0]
posterior_probs = np.ones(n) / n  #uniform dist since samples from MCMC

cvar_losses = []
for lamda in lamdas:
    cvar_u_sa, cvar, exp_ret = mdp.solve_max_cvar_policy(
        mdp_env, u_expert, w_chain_burned.transpose(), posterior_probs, alpha, False, lamda=lamda)
    if debug:
        print("CVaR policy")
        utils.print_policy_from_occupancies(cvar_u_sa, mdp_env)
    cvar_ploss = utils.policy_loss(cvar_u_sa, mdp_env, opt_u_sa)
    cvar_losses.append(cvar_ploss)

cvar_ploss_str = ""
for loss in cvar_losses:
    cvar_ploss_str += ", {}".format(loss)
print("cvar = {}".format(cvar_ploss_str))
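# A small plotting sketch (an assumption, not in the original): visualize how the CVaR
# policy loss varies with lambda, using the lamdas and cvar_losses from the loop above.
import matplotlib.pyplot as plt
plt.plot(lamdas, cvar_losses, 'o-')
plt.xlabel("lambda")
plt.ylabel("policy loss vs. optimal")
plt.title("CVaR policy loss over lambda")
plt.show()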
debug = False
n = r_chain_burned.shape[0]
print("num reward hypotheses", n)
posterior_probs = np.ones(n) / n  #uniform dist since samples from MCMC

traj_demonstrations = [demonstrations]
u_expert = utils.u_sa_from_demos(traj_demonstrations, train_mdp)
print("u expert", u_expert)

run_times = []
for rep in range(num_reps):
    print(rep)
    t = time.time()
    regret_opt_usa, cvar_value, exp_ret = mdp.solve_max_cvar_policy(
        test_mdp, u_expert, r_chain_burned.transpose(), posterior_probs, alpha, debug, lamda)
    #utils.print_stochastic_policy_action_probs(cvar_opt_usa, test_mdp_A)
    elapsed = time.time() - t
    run_times.append(elapsed)
    print(elapsed)

#save run times
# import os
if not os.path.exists('./results/stress_test/'):
    os.makedirs('./results/stress_test/')
np.savetxt("./results/stress_test/grid_world_60_60.csv", run_times, delimiter=",")
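# Quick summary sketch (not in the original): report the mean and standard deviation
# of the recorded grid-world run times after saving them.
print("mean run time: {:.3f} s".format(np.mean(run_times)))
print("std of run time: {:.3f} s".format(np.std(run_times)))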