print("--"*10) state_features = mdp_grid terminals = mdp_gen.get_terminals_from_grid(term_grid) #print("state features\n",state_features) state_features = mdp_gen.categorical_to_one_hot_features(state_features, num_features) print('one hot features', state_features) world = mdp.LinearFeatureGridWorld(state_features, true_weights, initials, terminals, gamma) mdp_family.append(world) #plot for visualization all_opts = [] all_features = [] for i,mdp_env in enumerate(mdp_family): V = mdp.value_iteration(mdp_env, epsilon=precision) Qopt = mdp.compute_q_values(mdp_env, V=V, eps=precision) opt_policy = mdp.find_optimal_policy(mdp_env, Q = Qopt, epsilon=precision) print(opt_policy) print(mdp_env.features) all_opts.append(opt_policy) all_features.append(mdp_env.features) #input() filename = "./data_analysis/figs/twoXtwo/firstthree.png" mdp_plot.plot_optimal_policy_vav_grid(all_opts[:3], all_features[:3], 1, 3, filename=filename) filename = "./data_analysis/figs/twoXtwo/lastthree.png" mdp_plot.plot_optimal_policy_vav_grid(all_opts[-3:], all_features[-3:], 1, 3, filename=filename) #plt.show() family_teacher = machine_teaching.MdpFamilyTeacher(mdp_family, precision, debug) mdp_set_cover = family_teacher.get_machine_teaching_mdps()
state_features = eutils.create_random_features_row_col_m(num_rows, num_cols, num_features)
#print("state features\n", state_features)
true_weights = random_weights(num_features)
print("true weights: ", true_weights)
true_world = mdp.LinearFeatureGridWorld(state_features, true_weights, initials, terminals, gamma)
print("rewards")
true_world.print_rewards()
print("value function")
V = mdp.value_iteration(true_world)
true_world.print_map(V)
print("mdp features")
utils.display_onehot_state_features(true_world)

#find the optimal policy under this MDP
Qopt = mdp.compute_q_values(true_world, V=V)
opt_policy = mdp.find_optimal_policy(true_world, Q=Qopt)
print("optimal policy")
true_world.print_map(true_world.to_arrows(opt_policy))
#input()

#now find a bunch of other optimal policies for the same MDP but with different weight vectors.
#TODO: is there a better way to create these eval policies?
# Can we efficiently solve for all of them, or should they all be close (e.g., rewards sampled from a Gaussian centered on the true reward)?
world = copy.deepcopy(true_world)
eval_policies = []
eval_Qvalues = []
eval_weights = []
num_eval_policies = 0
for i in range(num_eval_policies_tries):
    #print("trying", i)
    #change the reward weights
gamma = 0.9
seed = init_seed + r_iter
print("seed", seed)
np.random.seed(seed)
random.seed(seed)

#First let's generate a random MDP
state_features = eutils.create_random_features_row_col_m(num_rows, num_cols, num_features)
#print("state features\n", state_features)
true_weights = random_weights(num_features)
true_world = mdp.LinearFeatureGridWorld(state_features, true_weights, initials, terminals, gamma)
V = mdp.value_iteration(true_world, epsilon=precision)
Qopt = mdp.compute_q_values(true_world, V=V, eps=precision)
opt_policy = mdp.find_optimal_policy(true_world, Q=Qopt, epsilon=precision)

if debug:
    print("true weights: ", true_weights)
    print("rewards")
    true_world.print_rewards()
    print("value function")
    true_world.print_map(V)
    print("mdp features")
    utils.display_onehot_state_features(true_world)
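# Note: `random_weights` is called above but not defined in this snippet. As a point of
# reference only, a minimal sketch is below, assuming weights drawn i.i.d. from a standard
# normal and scaled to unit L2 norm (a common convention for linear reward features).
# The name `random_weights_sketch` and this sampling scheme are assumptions, not
# necessarily what the repo's helper actually does.
import numpy as np

def random_weights_sketch(num_features):
    """Hypothetical stand-in for random_weights: unit-norm random reward weights."""
    w = np.random.randn(num_features)  # i.i.d. standard normal draws
    return w / np.linalg.norm(w)       # scale so ||w||_2 = 1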