def __init__(self, world, Q, opt_policy, epsilon=0.0001, debug=False, remove_redundancy_lp=True):
    self.world = world
    self.precision = epsilon
    self.debug = debug
    self.remove_redundancy_lp = remove_redundancy_lp

    #Q-values and the optimal policy are expected to be precomputed for the already-solved MDP:
    # V = mdp.value_iteration(world, epsilon=epsilon)
    # Q = mdp.compute_q_values(world, V, eps=epsilon)
    # opt_policy = mdp.find_optimal_policy(world, Q=Q, epsilon=epsilon)
    self.Q = Q
    self.opt_policy = opt_policy

    if self.debug:
        print("rewards")
        world.print_rewards()
        V = mdp.value_iteration(world, epsilon=epsilon)
        print("value function")
        world.print_map(V)
        print("optimal policy")
        world.print_map(world.to_arrows(self.opt_policy))

    #expected feature counts for each (state, action) pair under the optimal policy
    self.sa_fcounts = mdp.calculate_sa_expected_feature_counts(self.opt_policy, world, epsilon=epsilon)
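# Usage sketch (hedged): Q and opt_policy are expected to be precomputed with the same
# `mdp` helpers referenced in the comments above before constructing this object. The
# class name `Verifier` below is a placeholder, not necessarily the repo's actual class.
#
#   V = mdp.value_iteration(world, epsilon=0.0001)
#   Q = mdp.compute_q_values(world, V, eps=0.0001)
#   opt_policy = mdp.find_optimal_policy(world, Q=Q, epsilon=0.0001)
#   verifier = Verifier(world, Q, opt_policy, epsilon=0.0001, debug=True)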
def debug_mdp(world):
    import time

    print("rewards")
    world.print_rewards()

    print("values")
    t0 = time.time()
    V = mdp.value_iteration(world)
    t1 = time.time()
    world.print_map(V)
    print("value iteration took", t1 - t0, "seconds")

    print("values inplace")
    t0 = time.time()
    V = mdp.value_iteration_inplace(world)
    t1 = time.time()
    world.print_map(V)
    print("in-place value iteration took", t1 - t0, "seconds")

    Q = mdp.compute_q_values(world, V)
    print("Q-values")
    print(Q)

    opt_policy = mdp.find_optimal_policy(world, Q=Q)
    print("optimal policy")
    print(opt_policy)
    world.print_map(world.to_arrows(opt_policy))
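# Example call (hedged): any grid world from this repo works, e.g. the safety-lava
# world constructed further below.
#
#   import src.grid_worlds as gw
#   debug_mdp(gw.create_safety_lava_world_nowalls())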
state_features = mdp_grid
terminals = mdp_gen.get_terminals_from_grid(term_grid)
#convert categorical features to one-hot encodings
state_features = mdp_gen.categorical_to_one_hot_features(state_features, num_features)
print('one hot features', state_features)
world = mdp.LinearFeatureGridWorld(state_features, true_weights, initials, terminals, gamma)
mdp_family.append(world)

#plot for visualization: solve each MDP in the family and collect policies and features
all_opts = []
all_features = []
for i, mdp_env in enumerate(mdp_family):
    V = mdp.value_iteration(mdp_env, epsilon=precision)
    Qopt = mdp.compute_q_values(mdp_env, V=V, eps=precision)
    opt_policy = mdp.find_optimal_policy(mdp_env, Q=Qopt, epsilon=precision)
    print(opt_policy)
    print(mdp_env.features)
    all_opts.append(opt_policy)
    all_features.append(mdp_env.features)

filename = "./data_analysis/figs/twoXtwo/firstthree.png"
mdp_plot.plot_optimal_policy_vav_grid(all_opts[:3], all_features[:3], 1, 3, filename=filename)
filename = "./data_analysis/figs/twoXtwo/lastthree.png"
mdp_plot.plot_optimal_policy_vav_grid(all_opts[-3:], all_features[-3:], 1, 3, filename=filename)
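#Sketch (hedged): instead of hard-coding the first and last three MDPs, the whole family
#can be plotted in groups of three by reusing the plot_optimal_policy_vav_grid call shown
#above; the filenames below are hypothetical.
for start in range(0, len(all_opts), 3):
    chunk_opts = all_opts[start:start + 3]
    chunk_feats = all_features[start:start + 3]
    chunk_file = "./data_analysis/figs/twoXtwo/family_chunk{}.png".format(start // 3)
    mdp_plot.plot_optimal_policy_vav_grid(chunk_opts, chunk_feats, 1, len(chunk_opts), filename=chunk_file)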
terminals = []  #e.g., [(num_rows-1, num_cols-1)] to make the bottom-right cell terminal
gamma = 0.9
seed = 1237  #init_seed + r_iter
print("seed", seed)
np.random.seed(seed)
random.seed(seed)

#First, generate a random MDP
state_features = eutils.create_random_features_row_col_m(num_rows, num_cols, num_features)
true_weights = random_weights(num_features)
true_world = mdp.LinearFeatureGridWorld(state_features, true_weights, initials, terminals, gamma)
V = mdp.value_iteration(true_world, epsilon=precision)
#expected return of the optimal policy, averaged over the initial states
true_exp_return = np.mean([V[s] for s in true_world.initials])
Qopt = mdp.compute_q_values(true_world, V=V, eps=precision)
opt_policy = mdp.find_optimal_policy(true_world, Q=Qopt, epsilon=precision)
if debug:
    print("true weights: ", true_weights)
    print("rewards")
    true_world.print_rewards()
    print("value function")
    true_world.print_map(V)
    print("mdp features")
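#`random_weights` is used above but not defined in this snippet. A minimal sketch of one
#plausible implementation, assuming the true reward weights are drawn uniformly at random
#on the unit sphere; the repo's actual version may normalize or constrain the weights
#differently.
def random_weights_sketch(num_features):
    w = np.random.randn(num_features)   #isotropic Gaussian sample
    return w / np.linalg.norm(w)        #project onto the unit sphere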
from src.traj_pair import TrajPair
import src.grid_worlds as gw
import src.value_alignment_verification as vav
import src.alignment_heuristics as ah
import data_analysis.plot_grid as mdp_plot
import random

seed = 1222
np.random.seed(seed)
random.seed(seed)

world = gw.create_safety_lava_world_nowalls()
print("rewards")
world.print_rewards()

V = mdp.value_iteration(world)
Q = mdp.compute_q_values(world, V)
print("values")
world.print_map(V)

opt_policy = mdp.find_optimal_policy(world, Q=Q)
print("optimal policy")
world.print_map(world.to_arrows(opt_policy))
print(opt_policy)

lava_colors = [
    'black', 'tab:green', 'white', 'tab:red', 'tab:blue',
    'tab:gray', 'tab:green', 'tab:purple', 'tab:orange', 'tab:cyan'
]
mdp_plot.plot_optimal_policy_vav(opt_policy,
def get_machine_teaching_mdps(self):
    """Greedy set cover over the MDP family: repeatedly select the MDP whose induced
    halfspace constraints cover the most still-uncovered constraints, until the entire
    family constraint set is covered."""
    constraint_set = self.family_halfspaces
    candidate_mdps = self.mdp_family
    candidate_halfspaces = self.mdp_halfspaces

    #boolean bookkeeping to track which constraints in the set have been covered
    covered = [False for _ in constraint_set]

    #for each candidate MDP, check how many uncovered constraints its optimal-policy
    #halfspaces cover, and greedily add the one that covers the most
    total_covered = 0
    opt_mdps = []
    while total_covered < len(constraint_set):
        if self.debug:
            print("set cover iteration")
        constraints_to_add = None
        best_mdp = None
        max_count = 0
        for i, mdp_env in enumerate(candidate_mdps):
            #halfspaces induced by an optimal policy in this MDP
            constraints_new = candidate_halfspaces[i]
            count = self.count_new_covers(constraints_new, constraint_set, covered)
            if count > max_count:
                max_count = count
                constraints_to_add = constraints_new
                best_mdp = mdp_env
                if self.debug:
                    print()
                    print("best mdp so far")
                    print("-" * 20)
                    print("MDP", i)
                    V = mdp.value_iteration(mdp_env, epsilon=self.precision)
                    Qopt = mdp.compute_q_values(mdp_env, V=V, eps=self.precision)
                    opt_policy = mdp.find_optimal_policy(mdp_env, Q=Qopt, epsilon=self.precision)
                    print("rewards")
                    mdp_env.print_rewards()
                    print("value function")
                    mdp_env.print_map(V)
                    print("mdp features")
                    utils.display_onehot_state_features(mdp_env)
                    print("optimal policy")
                    mdp_env.print_map(mdp_env.to_arrows(opt_policy))
                    print("halfspace")
                    print(constraints_to_add)
                    print("covered", count)

        if best_mdp is None:
            #no remaining candidate covers any uncovered constraint; stop rather than loop forever
            break

        #update covered flags and add the best MDP to the teaching set
        opt_mdps.append(best_mdp)
        covered = self.update_covered_constraints(constraints_to_add, constraint_set, covered)
        total_covered += max_count
    #TODO: optimize by removing candidate MDPs once they have been added to opt_mdps
    return opt_mdps
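#The helpers count_new_covers and update_covered_constraints used above are not shown in
#this snippet. A minimal sketch of how they might work, assuming each halfspace constraint
#is a numpy vector (numpy imported as np, as elsewhere in this repo) and equality is tested
#with np.allclose; the real implementations may use exact hashing or LP-based redundancy
#checks instead.
def count_new_covers_sketch(constraints_new, constraint_set, covered):
    #count how many still-uncovered constraints are matched by constraints_new
    count = 0
    for j, c in enumerate(constraint_set):
        if not covered[j] and any(np.allclose(c, c_new) for c_new in constraints_new):
            count += 1
    return count

def update_covered_constraints_sketch(constraints_to_add, constraint_set, covered):
    #mark every constraint matched by constraints_to_add as covered
    for j, c in enumerate(constraint_set):
        if not covered[j] and any(np.allclose(c, c_new) for c_new in constraints_to_add):
            covered[j] = True
    return covered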