def calc_frontier(mdp_env, u_expert, reward_posterior, posterior_probs, lambda_range, alpha, debug=False):
    '''Takes an MDP and runs over a range of lambdas to output the expected value and CVaR of the resulting solutions to the LP.
        mdp_env: the MDP to run on
        u_expert: the baseline expert to try and beat (set to zeros just to be robust)
        reward_posterior: the reward posterior from B-IRL (already burned and skipped and ready to run in the LP)
        posterior_probs: the probabilities of each element in the posterior (uniform if from MCMC)
        lambda_range: a list of lambda values to try
        alpha: the CVaR alpha (risk sensitivity); higher is more risk-sensitive/conservative
    '''
    cvar_exprews = []
    for lamda in lambda_range:
        cvar_opt_usa, cvar_value, exp_ret = mdp.solve_max_cvar_policy(
            mdp_env, u_expert, reward_posterior, posterior_probs, alpha, debug, lamda)
        print("Policy for lambda={} and alpha={}".format(lamda, alpha))
        utils.print_policy_from_occupancies(cvar_opt_usa, mdp_env)
        print("stochastic policy")
        utils.print_stochastic_policy_action_probs(cvar_opt_usa, mdp_env)
        print("CVaR of policy = {}".format(cvar_value))
        print("Expected return of policy = {}".format(exp_ret))
        cvar_exprews.append((cvar_value, exp_ret))
    return cvar_exprews
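# Usage sketch (hedged, not part of the original file): assumes an mdp_env, a burned/skipped
# Bayesian IRL weight chain w_chain_burned, and the LP-optimal occupancies u_sa as the expert
# baseline; the lambda values and alpha below are illustrative only.
#
#   n = w_chain_burned.shape[0]
#   posterior_probs = np.ones(n) / n  # uniform since samples come from MCMC
#   lambda_range = [0.0, 0.25, 0.5, 0.75, 0.9]
#   frontier = calc_frontier(mdp_env, u_sa, w_chain_burned.transpose(),
#                            posterior_probs, lambda_range, alpha=0.95)
#   for cvar_value, exp_ret in frontier:
#       print("CVaR = {}, expected return = {}".format(cvar_value, exp_ret))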
def calc_max_ent_u_sa(mdp_env, demos, max_epochs=1000, horizon=None, learning_rate=0.01):
    import mdp
    import utils

    seed_weights = np.zeros(mdp_env.get_reward_dimensionality())

    # Parameters
    if horizon is None:
        horizon = mdp_env.num_states

    # Main algorithm call
    r_weights, grads, state_features, maxent_pi = maxEntIRL(
        mdp_env, demos, seed_weights, max_epochs, horizon, learning_rate, norm="l2")

    # Construct reward function from weights and state features
    reward_fxn = []
    for s_i in range(mdp_env.num_states):
        reward_fxn.append(np.dot(r_weights, state_features[s_i]))
    reward_fxn = np.reshape(reward_fxn, (mdp_env.num_rows, mdp_env.num_cols))
    print("learned reward function")
    print(reward_fxn)

    u_s = mdp.get_policy_state_occupancy_frequencies(maxent_pi, mdp_env)
    u_sa = mdp.stoch_policy_to_usa(maxent_pi, mdp_env)
    utils.print_policy_from_occupancies(u_sa, mdp_env)
    utils.print_stochastic_policy_action_probs(u_sa, mdp_env)
    return u_sa, r_weights, maxent_pi
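# Usage sketch (hedged, illustrative only): roll out demonstrations from the LP-optimal policy
# of one of the repo's gridworlds and fit a MaxEnt IRL policy/reward with the helper above.
# The world choice, rollout arguments, epoch count, and learning rate are assumptions.
if __name__ == "__main__":
    import mdp
    import mdp_worlds
    import utils

    env = mdp_worlds.lava_ambiguous_aaai18()
    opt_usa = mdp.solve_mdp_lp(env)
    demos = utils.rollout_from_usa(5, 10, opt_usa, env)  # demo from state 5, horizon 10
    u_sa, r_weights, maxent_pi = calc_max_ent_u_sa(env, demos, max_epochs=500, learning_rate=0.05)
    print("MaxEnt reward weights", r_weights)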
seed = 12131
np.random.seed(seed)
random.seed(seed)

#let's try out BIRL on a simpler version and see what happens
#first let's give a demo in the A version that doesn't have lava
mdp_env_A = mdp_worlds.lava_ird_simplified_a()
# mdp_env_A = mdp_worlds.lava_ambiguous_ird_fig2a()
u_sa_A = mdp.solve_mdp_lp(mdp_env_A)
print("mdp A")
print("Policy")
utils.print_policy_from_occupancies(u_sa_A, mdp_env_A)
print("reward")
utils.print_as_grid(mdp_env_A.r_s, mdp_env_A)
print("features")
utils.display_onehot_state_features(mdp_env_A)

#generate demo for Dylan's NeurIPS world
# demonstrations = utils.rollout_from_usa(51, 15, u_sa_A, mdp_env_A)
#generate demo for my simplified world
demonstrations = utils.rollout_from_usa(10, 100, u_sa_A, mdp_env_A)
print("demonstration")
print(demonstrations)

#Now let's run Bayesian IRL on this demo in this mdp with a placeholder feature to see what happens.
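#Hedged sketch of that next step (illustrative; the beta, step size, and sample count are
#assumptions rather than the original script's settings):
#   birl = bayesian_irl.BayesianIRL(mdp_env_A, 100.0, 0.1, debug=False)
#   map_w, map_u_sa, w_chain, u_chain = birl.sample_posterior(demonstrations, 2000)
#   print("map_weights", map_w)
#   utils.print_policy_from_occupancies(map_u_sa, mdp_env_A)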
worst_index = np.argmin(r_chain[:, 1])
print(r_chain[worst_index])
print(np.sum(r_chain[:, 1] < -0.82), "out of", len(r_chain))

r_chain_burned = r_chain[burn::skip]
# print("chain after burn and skip")
# for r in r_chain_burned:
#     print(r)
#input()
worst_index = np.argmin(r_chain_burned[:, 1])
print(r_chain_burned[worst_index])
print(np.sum(r_chain_burned[:, 1] < -0.82), "out of", len(r_chain_burned))
#input()

print("MAP policy")
utils.print_policy_from_occupancies(map_u, mdp_env)

#let's actually try using the optimal policy to get the feature counts and see if the regret method works?
u_expert = u_sa

alpha = 0.95
n = r_chain_burned.shape[0]
posterior_probs = np.ones(n) / n  #uniform dist since samples from MCMC
cvar_opt_usa_regret, cvar, exp_ret = mdp.solve_max_cvar_policy(
    mdp_env, u_expert, r_chain_burned.transpose(), posterior_probs, alpha, False)
print("{}-CVaR policy regret optimal u_E".format(alpha))
utils.print_policy_from_occupancies(cvar_opt_usa_regret, mdp_env)

cvar_2, exp_ret2 = mdp.solve_cvar_expret_fixed_policy(
    mdp_env, cvar_opt_usa_regret, u_expert,
    weights = utils.sample_l2_ball(num_features)
    print("weights", weights)
    gamma = 0.99
    #let's look at all starting states for now
    init_dist = np.ones(num_states) / num_states
    # init_states = [10]
    # for si in init_states:
    #     init_dist[si] = 1.0 / len(init_states)

    #no terminal
    term_states = []
    mdp_env = mdp.FeaturizedGridMDP(num_rows, num_cols, state_features, weights, gamma, init_dist, term_states)
    return mdp_env


if __name__ == "__main__":
    #mdp_env = machine_teaching_toy_featurized()
    # mdp_env = lava_ambiguous_aaai18()
    mdp_env = random_gridworld_corner_terminal(6, 6, 5)
    print("features")
    utils.display_onehot_state_features(mdp_env)

    u_sa = mdp.solve_mdp_lp(mdp_env, debug=True)
    print("optimal policy")
    utils.print_policy_from_occupancies(u_sa, mdp_env)
    print("optimal values")
    v = mdp.get_state_values(u_sa, mdp_env)
    utils.print_as_grid(v, mdp_env)
birl = bayesian_irl.BayesianIRL(mdp_env, beta, step_stdev, debug=False, mcmc_norm=mcmc_norm)
map_w, map_u_sa, w_chain, u_chain = birl.sample_posterior(demonstrations, num_samples, False)
print("Birl complete")

if debug:
    print("-------")
    print("true weights", true_w)
    print("features")
    utils.display_onehot_state_features(mdp_env)
    print("optimal policy")
    utils.print_policy_from_occupancies(opt_u_sa, mdp_env)
    print("optimal values")
    v = mdp.get_state_values(opt_u_sa, mdp_env)
    utils.print_as_grid(v, mdp_env)

if debug:
    print("map_weights", map_w)
    map_r = np.dot(mdp_env.state_features, map_w)
    print("MAP reward")
    utils.print_as_grid(map_r, mdp_env)
    print("Map policy")
    utils.print_policy_from_occupancies(map_u_sa, mdp_env)

w_chain_burned = w_chain[burn::skip]

###compute mean reward policy
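#Hedged sketch of the mean-reward policy step announced above (illustrative, not the original
#file's code): average the burned-in weight samples, map them to a state-action reward with
#transform_to_R_sa, and re-solve the LP with that reward.
mean_w = np.mean(w_chain_burned, axis=0)
mean_r_sa = mdp_env.transform_to_R_sa(mean_w)
mean_u_sa = mdp.solve_mdp_lp(mdp_env, reward_sa=mean_r_sa)
print("Mean reward policy")
utils.print_policy_from_occupancies(mean_u_sa, mdp_env)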
# mdp_env = mdp_worlds.machine_teaching_toy_featurized()
# demonstrations = [(2,3),(5,0),(4,0),(3,2)]

mdp_env = mdp_worlds.lava_ambiguous_aaai18()
u_sa = mdp.solve_mdp_lp(mdp_env)
#generate demo from state 5 to terminal
demonstrations = utils.rollout_from_usa(5, 10, u_sa, mdp_env)
print(demonstrations)

beta = 100.0
step_stdev = 0.01
birl = bayesian_irl.BayesianIRL(mdp_env, beta, step_stdev, debug=False)
map_w, map_u, r_chain, u_chain = birl.sample_posterior(demonstrations, 10000)
print("map_weights", map_w)
map_r = np.dot(mdp_env.state_features, map_w)
utils.print_as_grid(map_r, mdp_env)
print("Map policy")
utils.print_policy_from_occupancies(map_u, mdp_env)

# print("chain")
# for r in r_chain:
#     print(r)
worst_index = np.argmin(r_chain[:,1])
print(r_chain[worst_index])
print(np.sum(r_chain[:,1] < -0.82))
###BIRL
beta = 10.0
step_stdev = 0.2
burn = 50
skip = 5
num_samples = 200
mcmc_norm = "l2"
likelihood = "birl"

mdp_env = mdp_worlds.lava_ambiguous_corridor()
opt_sa = mdp.solve_mdp_lp(mdp_env)
print("Cliff world")
print("Optimal Policy")
utils.print_policy_from_occupancies(opt_sa, mdp_env)
print("reward")
utils.print_as_grid(mdp_env.r_s, mdp_env)
print("features")
utils.display_onehot_state_features(mdp_env)

init_demo_state = 1  #mdp_env.num_cols * (mdp_env.num_rows - 1)
traj_demonstrations = []
demo_set = set()
for d in range(num_demos):
    # np.random.seed(init_seed + d)
    # random.seed(init_seed + d)
    s = init_demo_state  #mdp_env.init_states[0] # only one initial state
    demo = utils.rollout_from_usa(s, demo_horizon, opt_sa, mdp_env)
    print("demo", d, demo)
    traj_demonstrations.append(demo)
###BIRL
beta = 10.0
step_stdev = 0.2
burn = 500
skip = 5
num_samples = 2000
mcmc_norm = "l2"
likelihood = "birl"

mdp_env = mdp_worlds.lava_ambiguous_corridor()
opt_sa = mdp.solve_mdp_lp(mdp_env)
print("Cliff world")
print("Optimal Policy")
utils.print_policy_from_occupancies(opt_sa, mdp_env)
print("reward")
utils.print_as_grid(mdp_env.r_s, mdp_env)
print("features")
utils.display_onehot_state_features(mdp_env)

init_demo_state = 1  #mdp_env.num_cols * (mdp_env.num_rows - 1)
traj_demonstrations = []
demo_set = set()
for d in range(num_demos):
    # np.random.seed(init_seed + d)
    # random.seed(init_seed + d)
    s = init_demo_state  #mdp_env.init_states[0] # only one initial state
    demo = utils.rollout_from_usa(s, demo_horizon, opt_sa, mdp_env)
    print("demo", d, demo)
    traj_demonstrations.append(demo)
# mdp_env_B = mdp_worlds.lava_ambiguous_ird_fig2b()
# u_sa_B = mdp.solve_mdp_lp(mdp_env_B)
# print("mdp B")
# print("Policy")
# utils.print_policy_from_occupancies(u_sa_B, mdp_env_B)
# print("reward")
# utils.print_as_grid(mdp_env_B.r_s, mdp_env_B)

#now let's evaluate the MAP reward (learned from the demo in the A version) on the B version that has lava
mdp_env_B = mdp_worlds.lava_ird_simplified_b()

map_w = np.array([-0.30380369, -0.9159926, 0.10477373, 0.24017357])
print("MAP")
print("map_weights", map_w)
map_r = np.dot(mdp_env_B.state_features, map_w)
print("map reward")
utils.print_as_grid(map_r, mdp_env_B)

#compute new policy for mdp_B for map rewards
map_r_sa = mdp_env_B.transform_to_R_sa(map_w)
map_u_sa = mdp.solve_mdp_lp(mdp_env_B, reward_sa=map_r_sa)  #use optional argument to replace standard rewards with sample
print("Map policy")
utils.print_policy_from_occupancies(map_u_sa, mdp_env_B)
def sample_posterior(self, demonstrations, num_samples, print_map_updates=False):
    #TODO: may require preprocessing of demos since this requires them to be a list of state-action pairs
    demos_sa = []
    if type(demonstrations[0]) is tuple and len(demonstrations[0]) == 2:
        #each element in demonstrations is a state-action pair so no preprocessing needed
        demos_sa = demonstrations
    else:
        #assume we have a list of lists of state-action tuples
        for d in demonstrations:
            for s, a in d:
                demos_sa.append((s, a))

    #only save reward functions and occupancy_frequencies
    reward_samples = []
    occupancy_frequencies = []
    map_weights = None
    map_occupancy = None

    #find size of reward function
    #sample random reward hypothesis to start
    curr_weights = self.sample_init_reward()
    #print(curr_weights)
    curr_occupancies, curr_q_values = self.solve_optimal_policy(curr_weights)
    #compute log likelihood over demonstrations
    curr_ll = self.log_likelihood(curr_weights, curr_q_values, demos_sa)
    best_ll = -np.inf

    #run MCMC
    accept_cnt = 0
    for step in range(num_samples):
        if self.debug: print("\n------\nstep", step)
        #compute proposal and log likelihood
        proposal_weights = self.generate_proposal_weights(curr_weights)
        #compute q_values and occupancy frequencies
        proposal_occupancies, proposal_q_values = self.solve_optimal_policy(proposal_weights)
        if self.debug: print("proposal reward", proposal_weights)
        if self.debug: print("proposal qvalues", proposal_q_values)
        if self.debug: utils.print_policy_from_occupancies(proposal_occupancies, self.mdp_env)
        prop_ll = self.log_likelihood(proposal_weights, proposal_q_values, demos_sa)
        if self.debug: print("prop_ll", prop_ll, "curr_ll", curr_ll)
        prob_accept = min(1.0, np.exp(prop_ll - curr_ll))
        if self.debug: print("prob accept", prob_accept)
        rand_sample = np.random.rand()
        if self.debug: print("rand prob", rand_sample)
        if rand_sample < prob_accept:
            accept_cnt += 1
            if self.debug: print("accept")
            #accept and add to chain
            reward_samples.append(proposal_weights)
            occupancy_frequencies.append(proposal_occupancies)
            curr_ll = prop_ll
            curr_weights = proposal_weights
            curr_occupancies = proposal_occupancies
            #update MAP
            if prop_ll > best_ll:
                if print_map_updates: print(step)
                if self.debug: utils.print_policy_from_occupancies(proposal_occupancies, self.mdp_env)
                if self.debug: print("Q(s,a)", proposal_q_values)
                best_ll = prop_ll
                map_weights = proposal_weights.copy()
                map_occupancy = proposal_occupancies.copy()
                if print_map_updates: print("w_map", map_weights, "loglik = {:.4f}".format(best_ll))
        else:
            if self.debug: print("reject")
            reward_samples.append(curr_weights)
            occupancy_frequencies.append(curr_occupancies)

    #print out last reward sampled
    #print(reward_samples[-1])
    print("w_map", map_weights, "loglik", best_ll)
    print("accepted/total = {}/{} = {}".format(accept_cnt, num_samples, accept_cnt / num_samples))
    # if best_ll < -10:
    #     input("Didn't seem to converge... Check likelihoods and demos... Continue?")
    return map_weights, map_occupancy, np.array(reward_samples), np.array(occupancy_frequencies)
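# Usage sketch (hedged, illustrative only; the class name BayesianIRL and the hyperparameters
# below are assumed based on the calling scripts elsewhere in this repo, not this file):
if __name__ == "__main__":
    import mdp
    import mdp_worlds
    import utils

    env = mdp_worlds.lava_ambiguous_corridor()
    opt_usa = mdp.solve_mdp_lp(env)
    demo = utils.rollout_from_usa(1, 20, opt_usa, env)  # one rollout of state-action pairs
    birl = BayesianIRL(env, 10.0, 0.2, debug=False, mcmc_norm="l2")  # beta, step_stdev
    map_w, map_u, w_chain, u_chain = birl.sample_posterior(demo, 2000)
    w_chain_burned = w_chain[500::5]  # burn-in and thinning; values assumed
    print("MAP weights", map_w)
    print("burned/skipped chain size", w_chain_burned.shape[0])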
np.random.seed(seed)
random.seed(seed)

#train mdp
mdp_env_A = mdp_worlds.lavaland_smaller(contains_lava=False)
#test mdp ((probably) has lava)
mdp_env_B = mdp_worlds.lavaland_smaller(contains_lava=True)

u_sa_A = mdp.solve_mdp_lp(mdp_env_A)
print("===========================")
print("Training MDP with No Lava")
print("===========================")
print("Optimal Policy")
utils.print_policy_from_occupancies(u_sa_A, mdp_env_A)
print("reward")
utils.print_as_grid(mdp_env_A.r_s, mdp_env_A)
print("features")
utils.display_onehot_state_features(mdp_env_A)

#generate demonstration from top left corner
traj_demonstrations = []
demo_set = set()
for s in demo_states:  #range(mdp_env_A.get_num_states()):
    if mdp_env_A.init_dist[s] > 0:
        demo = utils.rollout_from_usa(s, demo_horizon, u_sa_A, mdp_env_A)
        traj_demonstrations.append(demo)
        for s_a in demo:
            demo_set.add(s_a)
demonstrations = list(demo_set)
num_rows = 10
num_cols = 10
num_features = 6

train_mdp = mdp_worlds.negative_sideeffects_goal(num_rows, num_cols, num_features, unseen_feature=False)
train_mdp.set_reward_fn(np.array([-.1, -.6, -.1, -0.6, -2, 0]))
opt_sa_train = mdp.solve_mdp_lp(train_mdp)
print("===========================")
print("Training MDP with No Lava")
print("===========================")
print("Optimal Policy")
utils.print_policy_from_occupancies(opt_sa_train, train_mdp)
print("reward")
utils.print_as_grid(train_mdp.r_s, train_mdp)
print("features")
utils.display_onehot_state_features(train_mdp)

import numpy as np
np.random.randint(60)

init_demo_states = [0, 9, 90, 99]  #mdp_env.num_cols * (mdp_env.num_rows - 1)
traj_demonstrations = []
demo_set = set()
for d in range(num_demos):
    # np.random.seed(init_seed + d)
    # random.seed(init_seed + d)