def solve_optimal_policy(self, reward_weights):
    #print(reward_weights)
    # Returns occupancy_frequencies and q_values (a vector of length |S|*|A|,
    # ordered q(s0,a0), q(s1,a0), ...).
    reward_sa = self.mdp_env.transform_to_R_sa(reward_weights)
    # Use the optional argument to replace the standard rewards with the sampled reward.
    occupancy_frequencies = mdp.solve_mdp_lp(self.mdp_env, reward_sa=reward_sa)
    num_states, num_actions, gamma = self.mdp_env.num_states, self.mdp_env.num_actions, self.mdp_env.gamma
    stochastic_policy = utils.get_optimal_policy_from_usa(occupancy_frequencies, self.mdp_env)
    reward_policy = mdp.get_policy_rewards(stochastic_policy, reward_sa)
    transitions_policy = mdp.get_policy_transitions(stochastic_policy, self.mdp_env)
    # Policy evaluation: solve (I - gamma * P_pi) v = r_pi for the state values.
    A = np.eye(num_states) - gamma * transitions_policy
    b = reward_policy
    state_values = np.linalg.solve(A, b)
    # Stack the per-action transition matrices so all (s, a) pairs are evaluated at once.
    Ps = tuple(self.mdp_env.Ps[i] for i in range(num_actions))
    P_column = np.concatenate(Ps, axis=0)
    #print(P_column)
    q_values = reward_sa + gamma * np.dot(P_column, state_values)
    #q_values = mdp.get_q_values(occupancy_frequencies, self.mdp_env)
    return occupancy_frequencies, q_values
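
# Illustrative sketch: the policy-evaluation step above solves the linear
# Bellman system (I - gamma * P_pi) v = r_pi directly instead of iterating
# backups. A minimal, self-contained version with a made-up 3-state chain
# (gamma, P_pi, and r_pi below are hypothetical values, not from this repo):
import numpy as np

def _policy_evaluation_sketch():
    gamma = 0.95
    P_pi = np.array([[0.9, 0.1, 0.0],   # state-to-state transitions under the policy
                     [0.0, 0.8, 0.2],
                     [0.0, 0.0, 1.0]])
    r_pi = np.array([0.0, 0.0, 1.0])    # expected reward in each state under the policy
    # Solve (I - gamma * P_pi) v = r_pi for the state values v.
    v = np.linalg.solve(np.eye(len(r_pi)) - gamma * P_pi, r_pi)
    # Q-values then follow as r(s, a) + gamma * sum_s' P(s'|s, a) v(s'),
    # which is what np.dot(P_column, state_values) computes in solve_optimal_policy.
    return v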
# print("Policy") # utils.print_policy_from_occupancies(u_sa_B, mdp_env_B) # print("reward") # utils.print_as_grid(mdp_env_B.r_s, mdp_env_B) seed = 12131 np.random.seed(seed) random.seed(seed) #let's try out BIRL on a simpler version and see what happens #first let's give a demo in the A version that doesn't have lava mdp_env_A = mdp_worlds.lava_ird_simplified_a() # mdp_env_A = mdp_worlds.lava_ambiguous_ird_fig2a() u_sa_A = mdp.solve_mdp_lp(mdp_env_A) print("mdp A") print("Policy") utils.print_policy_from_occupancies(u_sa_A, mdp_env_A) print("reward") utils.print_as_grid(mdp_env_A.r_s, mdp_env_A) print("features") utils.display_onehot_state_features(mdp_env_A) #generate demo for Dylan's NeurIPS world # demonstrations = utils.rollout_from_usa(51, 15, u_sa_A, mdp_env_A) #generate demo for my simplified world demonstrations = utils.rollout_from_usa(10, 100, u_sa_A, mdp_env_A) print("demonstration")
import scipy
import random

if __name__ == "__main__":
    seed = 1234
    np.random.seed(seed)
    # Seeding NumPy covers scipy.random as well: it was just an alias for
    # np.random in older SciPy versions and has since been removed.
    random.seed(seed)

    # mdp_env = mdp_worlds.two_state_chain()
    # demonstrations = [(1, 0), (0, 0)]
    # mdp_env = mdp_worlds.machine_teaching_toy_featurized()
    # demonstrations = [(2, 3), (5, 0), (4, 0), (3, 2)]
    mdp_env = mdp_worlds.lava_ambiguous_aaai18()
    u_sa = mdp.solve_mdp_lp(mdp_env)

    # Generate a demo from state 5 to the terminal state.
    demonstrations = utils.rollout_from_usa(5, 10, u_sa, mdp_env)
    print(demonstrations)
    traj_demonstrations = [demonstrations]

    beta = 10.0
    step_stdev = 0.1
    birl = bayesian_irl.BayesianIRL(mdp_env, beta, step_stdev, debug=False)
    num_samples = 200
    burn = 50
    skip = 2
    map_w, map_u, r_chain, u_chain = birl.sample_posterior(demonstrations, num_samples)
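
    # A sketch of one way to summarize the returned chain (assumption: r_chain
    # holds one sampled weight vector per MCMC iteration): drop the burn-in
    # samples and thin by `skip` before averaging.
    r_samples = np.asarray(r_chain)
    posterior_mean_w = r_samples[burn::skip].mean(axis=0)
    print("posterior mean weights (sketch)", posterior_mean_w)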
    weights = utils.sample_l2_ball(num_features)
    print("weights", weights)
    gamma = 0.99

    # Let's look at all starting states for now.
    init_dist = np.ones(num_states) / num_states
    # init_states = [10]
    # for si in init_states:
    #     init_dist[si] = 1.0 / len(init_states)

    # No terminal states.
    term_states = []
    mdp_env = mdp.FeaturizedGridMDP(num_rows, num_cols, state_features, weights, gamma, init_dist, term_states)
    return mdp_env


if __name__ == "__main__":
    # mdp_env = machine_teaching_toy_featurized()
    # mdp_env = lava_ambiguous_aaai18()
    mdp_env = random_gridworld_corner_terminal(6, 6, 5)
    print("features")
    utils.display_onehot_state_features(mdp_env)
    u_sa = mdp.solve_mdp_lp(mdp_env, debug=True)
    print("optimal policy")
    utils.print_policy_from_occupancies(u_sa, mdp_env)
    print("optimal values")
    v = mdp.get_state_values(u_sa, mdp_env)
    utils.print_as_grid(v, mdp_env)
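
# A minimal sketch of drawing feature weights uniformly from the unit L2
# sphere, which is presumably what utils.sample_l2_ball provides above
# (an assumption about that helper, not its actual implementation):
def _sample_unit_l2_sphere_sketch(num_features):
    # A standard Gaussian draw normalized to unit length is uniform on the sphere.
    w = np.random.randn(num_features)
    return w / np.linalg.norm(w)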
num_samples = 3000
burn = 300
skip = 5
mcmc_norm = "l2"

for i in range(num_trials):
    print("=" * 10)
    print("iteration", i)
    print("=" * 10)
    seed = init_seed + i * 13
    np.random.seed(seed)
    random.seed(seed)

    mdp_env = mdp_worlds.random_gridworld(num_rows, num_cols, num_features)
    opt_u_sa = mdp.solve_mdp_lp(mdp_env, debug=debug)
    true_r_sa = mdp_env.r_sa
    true_w = mdp_env.feature_weights

    # Generate a demonstration from the bottom-left corner.
    demonstrations = utils.rollout_from_usa(demo_state, horizon, opt_u_sa, mdp_env)
    print("demonstration")
    print(demonstrations)

    ### Run Bayesian IRL to get the posterior.
    print("running B-IRL")
    birl = bayesian_irl.BayesianIRL(mdp_env, beta, step_stdev, debug=False,
slip_prob = 0.3
demo_horizon = 10
num_demos = 1

### BIRL
beta = 10.0
step_stdev = 0.2
burn = 500
skip = 5
num_samples = 2000
mcmc_norm = "l2"
likelihood = "birl"

mdp_env = mdp_worlds.lava_ambiguous_corridor()
opt_sa = mdp.solve_mdp_lp(mdp_env)
print("Cliff world")
print("Optimal Policy")
utils.print_policy_from_occupancies(opt_sa, mdp_env)
print("reward")
utils.print_as_grid(mdp_env.r_s, mdp_env)
print("features")
utils.display_onehot_state_features(mdp_env)

init_demo_state = 1  # mdp_env.num_cols * (mdp_env.num_rows - 1)
traj_demonstrations = []
demo_set = set()
for d in range(num_demos):
    # np.random.seed(init_seed + d)
# mdp_env_B = mdp_worlds.lava_ambiguous_ird_fig2b()
# u_sa_B = mdp.solve_mdp_lp(mdp_env_B)
# print("mdp B")
# print("Policy")
# utils.print_policy_from_occupancies(u_sa_B, mdp_env_B)
# print("reward")
# utils.print_as_grid(mdp_env_B.r_s, mdp_env_B)

# Now evaluate the MAP reward on the simplified B version of the world.
mdp_env_B = mdp_worlds.lava_ird_simplified_b()
map_w = np.array([-0.30380369, -0.9159926, 0.10477373, 0.24017357])
print("MAP")
print("map_weights", map_w)
map_r = np.dot(mdp_env_B.state_features, map_w)
print("map reward")
utils.print_as_grid(map_r, mdp_env_B)

# Compute the policy in mdp_env_B under the MAP rewards.
map_r_sa = mdp_env_B.transform_to_R_sa(map_w)
# Use the optional argument to replace the standard rewards with the sampled reward.
map_u_sa = mdp.solve_mdp_lp(mdp_env_B, reward_sa=map_r_sa)
print("Map policy")
utils.print_policy_from_occupancies(map_u_sa, mdp_env_B)
np.random.seed(init_seed)
random.seed(init_seed)

demo_horizon = 100
num_demos = 1
num_rows = 10
num_cols = 10
num_features = 6

train_mdp = mdp_worlds.negative_sideeffects_goal(num_rows, num_cols, num_features, unseen_feature=False)
train_mdp.set_reward_fn(np.array([-0.1, -0.6, -0.1, -0.6, -2, 0]))
opt_sa_train = mdp.solve_mdp_lp(train_mdp)
print("===========================")
print("Training MDP with No Lava")
print("===========================")
print("Optimal Policy")
utils.print_policy_from_occupancies(opt_sa_train, train_mdp)
print("reward")
utils.print_as_grid(train_mdp.r_s, train_mdp)
print("features")
utils.display_onehot_state_features(train_mdp)


import numpy as np

np.random.randint(60)