    def __init__(self,
                 world,
                 Q,
                 opt_policy,
                 epsilon=0.0001,
                 debug=False,
                 remove_redundancy_lp=True):
        self.world = world
        self.precision = epsilon
        self.debug = debug
        self.remove_redundancy_lp = remove_redundancy_lp
        #print("self.debug", debug)
        #solve MDP
        if self.debug:
            print("rewards")
            world.print_rewards()
        #V = mdp.value_iteration(world, epsilon=epsilon)
        self.Q = Q  #mdp.compute_q_values(world, V, eps=epsilon)
        if self.debug:
            V = mdp.value_iteration(world, epsilon=epsilon)
            print("values function")
            world.print_map(V)

        self.opt_policy = opt_policy  #mdp.find_optimal_policy(world, Q=self.Q, epsilon=epsilon)
        if self.debug:
            print("optimal policy")
            world.print_map(world.to_arrows(self.opt_policy))
        self.sa_fcounts = mdp.calculate_sa_expected_feature_counts(
            self.opt_policy, world, epsilon=epsilon)
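
    # A minimal usage sketch (the enclosing class is not shown in this excerpt;
    # `PolicyTeacher` below is a hypothetical name for it). Q and opt_policy
    # are solved outside and passed in, mirroring the commented-out calls above:
    #
    #   V = mdp.value_iteration(world, epsilon=epsilon)
    #   Q = mdp.compute_q_values(world, V, eps=epsilon)
    #   opt_policy = mdp.find_optimal_policy(world, Q=Q, epsilon=epsilon)
    #   teacher = PolicyTeacher(world, Q, opt_policy, epsilon=epsilon)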
def debug_mdp(world):
    import time

    print("rewards")
    world.print_rewards()

    print("values")
    t0 = time.time()
    V = mdp.value_iteration(world)
    t1 = time.time()
    world.print_map(V)
    print("value iteration time (s):", t1 - t0)

    print("values inplace")
    t0 = time.time()
    V = mdp.value_iteration_inplace(world)
    t1 = time.time()
    world.print_map(V)
    print("in-place value iteration time (s):", t1 - t0)

    Q = mdp.compute_q_values(world, V)
    print("Q-values")
    print(Q)

    print("optimal policy")
    opt_policy = mdp.find_optimal_policy(world, Q=Q)
    print(opt_policy)
    print("optimal policy")
    world.print_map(world.to_arrows(opt_policy))
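
    # Fragment from a different script: each categorical feature grid
    # (mdp_grid) and terminal grid (term_grid) is converted to one-hot
    # features, wrapped in a LinearFeatureGridWorld, and appended to
    # mdp_family (the enclosing loop is not shown in this excerpt).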
    state_features = mdp_grid
    terminals = mdp_gen.get_terminals_from_grid(term_grid)
    #print("state features\n",state_features)
    state_features = mdp_gen.categorical_to_one_hot_features(
        state_features, num_features)
    print('one hot features', state_features)

    world = mdp.LinearFeatureGridWorld(state_features, true_weights, initials,
                                       terminals, gamma)
    mdp_family.append(world)
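
# A minimal sketch of what mdp_gen.categorical_to_one_hot_features is assumed
# to do above: map a grid of categorical feature indices to a grid of one-hot
# vectors of length num_features (the actual implementation may differ).
def categorical_to_one_hot_features_sketch(feature_grid, num_features):
    import numpy as np
    one_hot_grid = []
    for row in feature_grid:
        one_hot_row = []
        for feature_index in row:
            vec = np.zeros(num_features)
            vec[feature_index] = 1.0
            one_hot_row.append(vec)
        one_hot_grid.append(one_hot_row)
    return one_hot_grid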

#plot for visualization
all_opts = []
all_features = []
for i, mdp_env in enumerate(mdp_family):
    V = mdp.value_iteration(mdp_env, epsilon=precision)
    Qopt = mdp.compute_q_values(mdp_env, V=V, eps=precision)
    opt_policy = mdp.find_optimal_policy(mdp_env, Q=Qopt, epsilon=precision)
    print(opt_policy)
    print(mdp_env.features)
    all_opts.append(opt_policy)
    all_features.append(mdp_env.features)
    #input()
filename = "./data_analysis/figs/twoXtwo/firstthree.png"
mdp_plot.plot_optimal_policy_vav_grid(all_opts[:3],
                                      all_features[:3],
                                      1,
                                      3,
                                      filename=filename)
filename = "./data_analysis/figs/twoXtwo/lastthree.png"
mdp_plot.plot_optimal_policy_vav_grid(all_opts[-3:],
                                      all_features[-3:],
                                      1,
                                      3,
                                      filename=filename)
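
            # Fragment from a random-MDP generation loop: a random feature grid
            # and random reward weights define the true world, which is then
            # solved for its optimal policy and expected return.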
            terminals = []  #[(num_rows-1,num_cols-1)]
            gamma = 0.9
            seed = 1237  #init_seed + r_iter
            print("seed", seed)
            np.random.seed(seed)
            random.seed(seed)

            #First let's generate a random MDP
            state_features = eutils.create_random_features_row_col_m(
                num_rows, num_cols, num_features)
            #print("state features\n",state_features)
            true_weights = random_weights(num_features)
            true_world = mdp.LinearFeatureGridWorld(state_features,
                                                    true_weights, initials,
                                                    terminals, gamma)
            V = mdp.value_iteration(true_world, epsilon=precision)
            true_exp_return = np.mean([V[s] for s in true_world.initials])
            Qopt = mdp.compute_q_values(true_world, V=V, eps=precision)
            opt_policy = mdp.find_optimal_policy(true_world,
                                                 Q=Qopt,
                                                 epsilon=precision)

            if debug:
                print("true weights: ", true_weights)

                print("rewards")
                true_world.print_rewards()
                print("value function")

                true_world.print_map(V)
                print("mdp features")
import random

import numpy as np

import src.mdp as mdp  # assumed module path for the `mdp` helpers used below
from src.traj_pair import TrajPair
import src.grid_worlds as gw
import src.value_alignment_verification as vav
import src.alignment_heuristics as ah
import data_analysis.plot_grid as mdp_plot

seed = 1222
np.random.seed(seed)
random.seed(seed)

world = gw.create_safety_lava_world_nowalls()

print("rewards")
world.print_rewards()
V = mdp.value_iteration(world)
Q = mdp.compute_q_values(world, V)
print("values")
world.print_map(V)

opt_policy = mdp.find_optimal_policy(world, Q=Q)
print("optimal policy")
world.print_map(world.to_arrows(opt_policy))
print(opt_policy)

lava_colors = [
    'black', 'tab:green', 'white', 'tab:red', 'tab:blue', 'tab:gray',
    'tab:green', 'tab:purple', 'tab:orange', 'tab:cyan'
]

mdp_plot.plot_optimal_policy_vav(opt_policy, world.features,
                                 lava_colors)  # arguments after opt_policy are assumed
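
    # Method from the machine-teaching class (enclosing class not shown):
    # greedily selects a small set of MDPs whose induced halfspace constraints
    # cover all of the family's constraints (greedy set cover).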
    def get_machine_teaching_mdps(self):

        constraint_set = self.family_halfspaces
        candidate_mdps = self.mdp_family
        candidate_halfspaces = self.mdp_halfspaces
        #boolean bookkeeping of which constraints in the set have been covered
        covered = [False for _ in constraint_set]

        #greedy set cover: on each iteration, pick the candidate MDP whose
        #halfspace constraints cover the most still-uncovered constraints
        total_covered = 0
        opt_mdps = []
        while total_covered < len(constraint_set):
            if self.debug: print("set cover iteration")
            constraints_to_add = None
            best_mdp = None
            max_count = 0
            for i, mdp_env in enumerate(candidate_mdps):
                #get the halfspaces induced by an optimal policy in this MDP
                constraints_new = candidate_halfspaces[i]

                count = self.count_new_covers(constraints_new, constraint_set,
                                              covered)
                #if self.debug: print("covered", count)
                if count > max_count:
                    max_count = count
                    constraints_to_add = constraints_new
                    best_mdp = mdp_env
                    if self.debug:
                        print()
                        print("best mdp so far")
                        print("-" * 20)
                        print("MDP", i)

                        V = mdp.value_iteration(mdp_env,
                                                epsilon=self.precision)
                        Qopt = mdp.compute_q_values(mdp_env,
                                                    V=V,
                                                    eps=self.precision)
                        opt_policy = mdp.find_optimal_policy(
                            mdp_env, Q=Qopt, epsilon=self.precision)
                        print("rewards")
                        mdp_env.print_rewards()
                        print("value function")

                        mdp_env.print_map(V)
                        print("mdp features")
                        utils.display_onehot_state_features(mdp_env)

                        print("optimal policy")
                        mdp_env.print_map(mdp_env.to_arrows(opt_policy))

                        print("halfspace")
                        print(constraints_to_add)

                        print("covered", count)

            #update covered flags and add best_mdp to opt_mdps
            if best_mdp is None:
                break  # no candidate covers any remaining constraint; avoid an infinite loop
            opt_mdps.append(best_mdp)
            covered = self.update_covered_constraints(constraints_to_add,
                                                      constraint_set, covered)
            total_covered += max_count
            #TODO: optimize by removing candidate MDPs once they have been added to opt_mdps

        return opt_mdps
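
    # The two helpers used above are not included in this excerpt. Below is a
    # minimal sketch of the assumed behavior, treating each halfspace as a
    # numpy normal vector and matching constraints by near-equality; the real
    # implementation may normalize or deduplicate differently.

    def count_new_covers(self, constraints_new, constraint_set, covered):
        #count how many still-uncovered constraints in constraint_set are
        #matched by at least one constraint in constraints_new
        count = 0
        for i, constraint in enumerate(constraint_set):
            if covered[i]:
                continue
            for new_constraint in constraints_new:
                if np.allclose(constraint, new_constraint, atol=self.precision):
                    count += 1
                    break
        return count

    def update_covered_constraints(self, constraints_to_add, constraint_set,
                                   covered):
        #mark every constraint in constraint_set matched by constraints_to_add
        #as covered and return the updated flags
        for i, constraint in enumerate(constraint_set):
            if covered[i]:
                continue
            for new_constraint in constraints_to_add:
                if np.allclose(constraint, new_constraint, atol=self.precision):
                    covered[i] = True
                    break
        return covered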