import random

import numpy as np

import bayesian_irl
import mdp
import mdp_worlds
import utils


if __name__ == "__main__":
    seed = 1234
    np.random.seed(seed)  # also covers scipy, which draws from NumPy's global RNG
    random.seed(seed)

    # mdp_env = mdp_worlds.two_state_chain()
    # demonstrations = [(1, 0), (0, 0)]

    # mdp_env = mdp_worlds.machine_teaching_toy_featurized()
    # demonstrations = [(2, 3), (5, 0), (4, 0), (3, 2)]

    mdp_env = mdp_worlds.lava_ambiguous_aaai18()
    u_sa = mdp.solve_mdp_lp(mdp_env)

    # generate a demo from state 5 to the terminal state
    demonstrations = utils.rollout_from_usa(5, 10, u_sa, mdp_env)
    print(demonstrations)
    traj_demonstrations = [demonstrations]

    beta = 10.0
    step_stdev = 0.1
    birl = bayesian_irl.BayesianIRL(mdp_env, beta, step_stdev, debug=False)
    num_samples = 200
    burn = 50
    skip = 2
    map_w, map_u, r_chain, u_chain = birl.sample_posterior(demonstrations, num_samples)
    print("map_weights", map_w)
    map_r = np.dot(mdp_env.state_features, map_w)
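    # Minimal follow-up sketch: inspect the MAP estimate. Both helpers are used
    # elsewhere in this file with these signatures; map_u is the occupancy
    # measure returned by sample_posterior above.
    print("MAP reward")
    utils.print_as_grid(map_r, mdp_env)
    print("MAP policy")
    utils.print_policy_from_occupancies(map_u, mdp_env)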
# mdp_env_A = mdp_worlds.lava_ambiguous_ird_fig2a()
u_sa_A = mdp.solve_mdp_lp(mdp_env_A)
print("mdp A")
print("Policy")
utils.print_policy_from_occupancies(u_sa_A, mdp_env_A)
print("reward")
utils.print_as_grid(mdp_env_A.r_s, mdp_env_A)
print("features")
utils.display_onehot_state_features(mdp_env_A)

# generate a demo for Dylan's NeurIPS world
# demonstrations = utils.rollout_from_usa(51, 15, u_sa_A, mdp_env_A)

# generate a demo for my simplified world
demonstrations = utils.rollout_from_usa(10, 100, u_sa_A, mdp_env_A)
print("demonstration")
print(demonstrations)

# Now run Bayesian IRL on this demo in this MDP with a placeholder feature to see what happens.
beta = 100.0
step_stdev = 0.1
burn = 100
skip = 5
num_samples = 2000
sample_norm = None
# mcmc_norm kwarg assumed here: sample_norm is defined just above, and the
# gridworld experiment below passes the same kwarg to this constructor.
birl = bayesian_irl.BayesianIRL(mdp_env_A, beta, step_stdev, debug=False,
                                mcmc_norm=sample_norm)
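# Hedged sketch: draw posterior samples and thin the weight chain with the
# burn/skip settings above. sample_posterior's return signature matches its
# other uses in this file; the thinning itself is plain NumPy.
map_w, map_u, w_chain, u_chain = birl.sample_posterior(demonstrations, num_samples)
w_samples = np.asarray(w_chain)[burn::skip]  # drop burn-in, keep every skip-th draw
print("posterior mean weights", w_samples.mean(axis=0))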
for i in range(num_trials):
    print("=" * 10)
    print("iteration", i)
    print("=" * 10)
    seed = init_seed + i * 13
    np.random.seed(seed)
    random.seed(seed)

    mdp_env = mdp_worlds.random_gridworld(num_rows, num_cols, num_features)
    opt_u_sa = mdp.solve_mdp_lp(mdp_env, debug=debug)
    true_r_sa = mdp_env.r_sa
    true_w = mdp_env.feature_weights

    # generate a demonstration from the bottom left corner
    demonstrations = utils.rollout_from_usa(demo_state, horizon, opt_u_sa, mdp_env)
    print("demonstration")
    print(demonstrations)

    ### Run Bayesian IRL to get the posterior
    print("running B-IRL")
    birl = bayesian_irl.BayesianIRL(mdp_env, beta, step_stdev, debug=False,
                                    mcmc_norm=mcmc_norm)
    map_w, map_u_sa, w_chain, u_chain = birl.sample_posterior(demonstrations,
                                                              num_samples, False)
    print("BIRL complete")

    if debug:
        # debug printout assumed to mirror the other experiments in this file
        print("Optimal Policy")
        utils.print_policy_from_occupancies(opt_u_sa, mdp_env)
        print("true reward")
        utils.print_as_grid(mdp_env.r_s, mdp_env)
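    # Hedged sketch: score weight recovery for this trial. true_w and map_w come
    # from the loop body above; cosine similarity is just one reasonable metric
    # and is plain NumPy, not part of this repo's API.
    cos_sim = np.dot(true_w, map_w) / (np.linalg.norm(true_w) * np.linalg.norm(map_w))
    print("cosine similarity between true and MAP weights:", cos_sim)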
print("Cliff world") print("Optimal Policy") utils.print_policy_from_occupancies(opt_sa, mdp_env) print("reward") utils.print_as_grid(mdp_env.r_s, mdp_env) print("features") utils.display_onehot_state_features(mdp_env) init_demo_state = 1#mdp_env.num_cols * (mdp_env.num_rows - 1) traj_demonstrations = [] demo_set = set() for d in range(num_demos): # np.random.seed(init_seed + d) # random.seed(init_seed + d) s = init_demo_state #mdp_env.init_states[0] # only one initial state demo = utils.rollout_from_usa(s, demo_horizon, opt_sa, mdp_env) print("demo", d, demo) traj_demonstrations.append(demo) for s_a in demo: demo_set.add(s_a) demonstrations = list(demo_set) print("demonstration") print(demonstrations) state_feature_list = [tuple(fs) for fs in mdp_env.state_features] pg.get_policy_string_from_trajectory(traj_demonstrations[0], state_feature_list, mdp_env) # In[4]:
print("Training MDP with No Lava") print("===========================") print("Optimal Policy") utils.print_policy_from_occupancies(u_sa_A, mdp_env_A) print("reward") utils.print_as_grid(mdp_env_A.r_s, mdp_env_A) print("features") utils.display_onehot_state_features(mdp_env_A) #generate demonstration from top left corner traj_demonstrations = [] demo_set = set() for s in demo_states: #range(mdp_env_A.get_num_states()): if mdp_env_A.init_dist[s] > 0: demo = utils.rollout_from_usa(s, demo_horizon, u_sa_A, mdp_env_A) traj_demonstrations.append(demo) for s_a in demo: demo_set.add(s_a) demonstrations = list(demo_set) print("demonstration") print(demonstrations) #CVaR stuff needs expected feature counts from a list of trajectories #traj_demonstrations = [demonstrations] #Now let's run Bayesian IRL on this demo in this mdp with a placeholder feature to see what happens. birl = bayesian_irl.BayesianIRL(mdp_env_A, beta, step_stdev, debug=False,
print("Training MDP with No Lava") print("===========================") print("Optimal Policy") utils.print_policy_from_occupancies(u_sa_A, mdp_env_A) print("reward") utils.print_as_grid(mdp_env_A.r_s, mdp_env_A) print("features") utils.display_onehot_state_features(mdp_env_A) #generate demonstration from top left corner traj_demonstrations = [] demo_set = set() for s in demo_states: #range(mdp_env_A.get_num_states()): if mdp_env_A.init_dist[s] > 0: demo = utils.rollout_from_usa(s, 100, u_sa_A, mdp_env_A) traj_demonstrations.append(demo) for s_a in demo: demo_set.add(s_a) demonstrations = list(demo_set) print("demonstration") print(demonstrations) #CVaR stuff needs expected feature counts from a list of trajectories #traj_demonstrations = [demonstrations] #Now let's run Bayesian IRL on this demo in this mdp with a placeholder feature to see what happens. birl = bayesian_irl.BayesianIRL(mdp_env_A, beta, step_stdev, debug=False) map_w, map_u, r_chain, u_chain = birl.sample_posterior( demonstrations, num_samples, False)
print("features") utils.display_onehot_state_features(train_mdp) import numpy as np np.random.randint(60) init_demo_states = [0, 9, 90, 99] #mdp_env.num_cols * (mdp_env.num_rows - 1) traj_demonstrations = [] demo_set = set() for d in range(num_demos): # np.random.seed(init_seed + d) # random.seed(init_seed + d) for s in init_demo_states: #s = init_demo_state #mdp_env.init_states[0] # only one initial state demo = utils.rollout_from_usa(s, demo_horizon, opt_sa_train, train_mdp) print("demo", d, demo) traj_demonstrations.append(demo) for s_a in demo: demo_set.add(s_a) demonstrations = list(demo_set) print("demonstration") print(demonstrations) state_feature_list = [tuple(fs) for fs in train_mdp.state_features] #Now let's run Bayesian IRL on this demo in this mdp with a placeholder feature to see what happens. beta = 10.0 step_stdev = 0.05 num_samples = 1000