# First let's generate a random MDP
state_features = eutils.create_random_features_row_col_m(num_rows, num_cols, num_features)
# print("state features\n", state_features)
true_weights = random_weights(num_features)
print("true weights: ", true_weights)
true_world = mdp.LinearFeatureGridWorld(state_features, true_weights, initials, terminals, gamma)
print("rewards")
true_world.print_rewards()
print("value function")
V = mdp.value_iteration(true_world)
true_world.print_map(V)
print("mdp features")
utils.display_onehot_state_features(true_world)

# Find the optimal policy under this MDP
Qopt = mdp.compute_q_values(true_world, V=V)
opt_policy = mdp.find_optimal_policy(true_world, Q=Qopt)
print("optimal policy")
true_world.print_map(true_world.to_arrows(opt_policy))
# input()

# Now find a bunch of other optimal policies for the same MDP but with different weight vectors.
# TODO: I wonder if there is a better way to create these eval policies?
#       Can we efficiently solve for all of them, or should they all be close
#       (e.g. rewards sampled from a Gaussian centered on the true reward)?
world = copy.deepcopy(true_world)
eval_policies = []
eval_Qvalues = []
eval_weights = []
num_eval_policies = 0
for i in range(num_eval_policies_tries):
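    # Hedged sketch (assumption) of one way to fill in this loop, following the TODO
    # above: sample a fresh weight vector (or perturb true_weights with Gaussian noise),
    # re-solve the MDP, and keep the policy only if it differs from the true optimum.
    # Assigning `world.weights` directly and comparing policies with `!=` are both
    # assumptions about how LinearFeatureGridWorld and find_optimal_policy behave.
    eval_weight_vector = random_weights(num_features)
    world.weights = eval_weight_vector
    V_eval = mdp.value_iteration(world)
    Q_eval = mdp.compute_q_values(world, V=V_eval)
    eval_policy = mdp.find_optimal_policy(world, Q=Q_eval)
    if eval_policy != opt_policy:  # only keep policies distinct from the true optimal one
        eval_policies.append(eval_policy)
        eval_Qvalues.append(Q_eval)
        eval_weights.append(eval_weight_vector)
        num_eval_policies += 1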
# Let's try out BIRL on a simpler version and see what happens.
# First let's give a demo in the A version that doesn't have lava.
mdp_env_A = mdp_worlds.lava_ird_simplified_a()
# mdp_env_A = mdp_worlds.lava_ambiguous_ird_fig2a()
u_sa_A = mdp.solve_mdp_lp(mdp_env_A)
print("mdp A")
print("Policy")
utils.print_policy_from_occupancies(u_sa_A, mdp_env_A)
print("reward")
utils.print_as_grid(mdp_env_A.r_s, mdp_env_A)
print("features")
utils.display_onehot_state_features(mdp_env_A)

# Generate demo for Dylan's NeurIPS world
# demonstrations = utils.rollout_from_usa(51, 15, u_sa_A, mdp_env_A)
# Generate demo for my simplified world
demonstrations = utils.rollout_from_usa(10, 100, u_sa_A, mdp_env_A)
print("demonstration")
print(demonstrations)

# Now let's run Bayesian IRL on this demo in this mdp with a placeholder feature to see what happens.
beta = 100.0
step_stdev = 0.1
burn = 100
skip = 5
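# Hedged sketch (assumption): the repo's actual Bayesian IRL call is not shown above,
# but the hyperparameters are the usual knobs of a Metropolis-Hastings sampler over
# reward weights: beta is the softmax rationality folded into the demo likelihood,
# step_stdev the Gaussian proposal noise, burn the number of discarded initial samples,
# and skip the thinning interval. `demo_log_likelihood` is a hypothetical placeholder
# for whatever (beta-scaled) demonstration likelihood the repo implements.
def birl_mcmc_sketch(demo_log_likelihood, num_features, num_samples, step_stdev, burn, skip):
    import numpy as np
    w = np.zeros(num_features)                 # current reward-weight sample
    ll = demo_log_likelihood(w)                # its log-likelihood under the demos
    chain = []
    for _ in range(num_samples):
        w_prop = w + np.random.normal(0.0, step_stdev, num_features)  # Gaussian proposal
        ll_prop = demo_log_likelihood(w_prop)
        if np.log(np.random.rand()) < ll_prop - ll:                   # MH accept/reject
            w, ll = w_prop, ll_prop
        chain.append(w.copy())
    return chain[burn::skip]                   # drop burn-in, keep every skip-th sample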
train_mdp = mdp_worlds.negative_sideeffects_goal(num_rows, num_cols, num_features, unseen_feature=False)
train_mdp.set_reward_fn(np.array([-.1, -.6, -.1, -0.6, -2, 0]))
opt_sa_train = mdp.solve_mdp_lp(train_mdp)
print("===========================")
print("Training MDP with No Lava")
print("===========================")
print("Optimal Policy")
utils.print_policy_from_occupancies(opt_sa_train, train_mdp)
print("reward")
utils.print_as_grid(train_mdp.r_s, train_mdp)
print("features")
utils.display_onehot_state_features(train_mdp)

import numpy as np
np.random.randint(60)

init_demo_states = [0, 9, 90, 99]  # mdp_env.num_cols * (mdp_env.num_rows - 1)
traj_demonstrations = []
demo_set = set()
for d in range(num_demos):
    # np.random.seed(init_seed + d)
    # random.seed(init_seed + d)
    for s in init_demo_states:
        # s = init_demo_state  # mdp_env.init_states[0]  # only one initial state
        demo = utils.rollout_from_usa(s, demo_horizon, opt_sa_train, train_mdp)
        print("demo", d, demo)
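        # Hedged sketch (assumption): the rest of this loop body is cut off above; given
        # the variables initialized earlier, it presumably stores the full trajectory and
        # collects the distinct (state, action) pairs seen across demos, e.g.:
        traj_demonstrations.append(demo)
        for s_a in demo:
            demo_set.add(s_a)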