def debug_mdp(world):
    print("rewards")
    world.print_rewards()

    import time

    print("values")
    t0 = time.time()
    V = mdp.value_iteration(world)
    t1 = time.time()
    world.print_map(V)
    print("elapsed (s):", t1 - t0)

    print("values inplace")
    t0 = time.time()
    V = mdp.value_iteration_inplace(world)
    t1 = time.time()
    world.print_map(V)
    print("elapsed (s):", t1 - t0)

    Q = mdp.compute_q_values(world, V)
    print("Q-values")
    print(Q)

    print("optimal policy")
    opt_policy = mdp.find_optimal_policy(world, Q=Q)
    print(opt_policy)
    print("optimal policy")
    world.print_map(world.to_arrows(opt_policy))
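
# The two timed calls above differ only in whether values are overwritten during a
# sweep. A minimal sketch of the in-place variant, assuming a world object exposing
# states, actions(s), transitions(s, a) -> [(prob, next_state)], reward(s), and
# gamma -- illustrative names, not necessarily this repo's LinearFeatureGridWorld API.
def value_iteration_inplace_sketch(world, epsilon=1e-4):
    V = {s: 0.0 for s in world.states}
    while True:
        delta = 0.0
        for s in world.states:
            acts = world.actions(s)
            if not acts:  # terminal / absorbing state: value stays fixed
                continue
            # Bellman backup that immediately reuses values already updated this sweep
            best = max(sum(p * (world.reward(s) + world.gamma * V[s2])
                           for p, s2 in world.transitions(s, a))
                       for a in acts)
            delta = max(delta, abs(best - V[s]))
            V[s] = best
        if delta < epsilon:
            return V
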
Example #2
    def __init__(self,
                 mdp_world,
                 critical_threshold,
                 precision=0.0001,
                 debug=False):
        self.mdp_world = mdp_world
        self.entropy_threshold = critical_threshold
        self.precision = precision
        self.debug = debug
        self.q_values = mdp.compute_q_values(mdp_world)
        self.optimal_policy = mdp.find_optimal_policy(mdp_world,
                                                      Q=self.q_values)

        #find critical states
        if debug:
            print("finding critical states")
        self.critical_state_actions = []
        for s in self.mdp_world.states:
            if debug:
                print(s)
            #calculate entropy of optimal policy (assumes it is stochastic optimal)
            num_optimal_actions = len(self.optimal_policy[s])
            action_probs = np.zeros(len(self.mdp_world.actions(s)))
            for i in range(num_optimal_actions):
                action_probs[i] = 1.0 / num_optimal_actions
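            # unfilled entries stay zero; assuming utils.entropy treats 0*log(0) as 0, they contribute nothing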
            entropy = utils.entropy(action_probs)
            if debug:
                print(s, entropy)
            best_action = utils.argmax(self.mdp_world.actions(s),
                                       lambda a: self.q_values[s, a])
            if entropy < self.entropy_threshold:
                self.critical_state_actions.append((s, best_action))
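
# The critical-state test above relies on utils.entropy; a minimal Shannon-entropy
# sketch consistent with how it is called (an assumption, not necessarily the repo's
# implementation; np is numpy, as in the surrounding code):
def entropy_sketch(action_probs):
    p = np.asarray(action_probs, dtype=float)
    p = p[p > 0]  # drop zeros: 0*log(0) is taken to be 0
    return float(-np.sum(p * np.log2(p)))
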
    def __init__(self, mdp_world, precision, debug=False, remove_redundancy_lp=True):
        self.mdp_world = mdp_world
        self.precision = precision
        self.debug = debug
        self.q_values = mdp.compute_q_values(mdp_world, eps=precision)
        self.optimal_policy = mdp.find_optimal_policy(mdp_world, Q=self.q_values, epsilon=precision)

        teacher = machine_teaching.StateActionRankingTeacher(
            mdp_world,
            debug=self.debug,
            remove_redundancy_lp=remove_redundancy_lp,
            epsilon=precision)

        tests, _ = teacher.get_optimal_value_alignment_tests(use_suboptimal_rankings = False)

        #for now let's just select the first question for each halfspace
        self.test = [questions[0] for questions in tests]
def debug_demonstrations():

    world = create_random_10x10_3feature()

    print("rewards")
    world.print_rewards()

    print("features")
    utils.display_onehot_state_features(world)

    Q = mdp.compute_q_values(world)
    #print("Q-values")
    #print(Q)

    print("optimal policy")
    opt_policy = mdp.find_optimal_policy(world, Q=Q)
    #print(opt_policy)
    print("optimal policy")
    world.print_map(world.to_arrows(opt_policy))

    print(world.terminals)
    print("demo 1")
    demoA = utils.optimal_rollout_from_Qvals((1, 1), 3, Q, world, 0.0001)
    for (s, a) in demoA:
        print("({},{})".format(s, world.to_arrow(a)))
    print(mdp.calculate_trajectory_feature_counts(demoA, world))

    print()
    print("demo 2")
    demoB = utils.sa_optimal_rollout_from_Qvals((1, 1), (0, 1), 3, Q, world,
                                                0.0001)
    for (s, a) in demoB:
        print("({},{})".format(s, world.to_arrow(a)))
    print(mdp.calculate_trajectory_feature_counts(demoB, world))

    tpair = TrajPair(demoA, demoB, world, 0.0001)
    print(world.weights)
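
# calculate_trajectory_feature_counts (printed above) is assumed to return discounted
# feature counts, Phi(tau) = sum_t gamma^t * phi(s_t). A minimal sketch under that
# assumption, with phi passed in as a state -> feature-vector function so that no
# particular world API is presumed:
def trajectory_feature_counts_sketch(trajectory, phi, gamma):
    counts = np.zeros_like(np.asarray(phi(trajectory[0][0]), dtype=float))
    for t, (s, a) in enumerate(trajectory):
        counts += (gamma ** t) * np.asarray(phi(s), dtype=float)
    return counts
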
    terminals = mdp_gen.get_terminals_from_grid(term_grid)
    #print("state features\n",state_features)
    state_features = mdp_gen.categorical_to_one_hot_features(
        state_features, num_features)
    print('one hot features', state_features)

    world = mdp.LinearFeatureGridWorld(state_features, true_weights, initials,
                                       terminals, gamma)
    mdp_family.append(world)

#plot for visualization
all_opts = []
all_features = []
for i, mdp_env in enumerate(mdp_family):
    V = mdp.value_iteration(mdp_env, epsilon=precision)
    Qopt = mdp.compute_q_values(mdp_env, V=V, eps=precision)
    opt_policy = mdp.find_optimal_policy(mdp_env, Q=Qopt, epsilon=precision)
    print(opt_policy)
    print(mdp_env.features)
    all_opts.append(opt_policy)
    all_features.append(mdp_env.features)
    #input()
filename = "./data_analysis/figs/twoXtwo/firstthree.png"
mdp_plot.plot_optimal_policy_vav_grid(all_opts[:3],
                                      all_features[:3],
                                      1,
                                      3,
                                      filename=filename)
filename = "./data_analysis/figs/twoXtwo/lastthree.png"
mdp_plot.plot_optimal_policy_vav_grid(all_opts[-3:],
                                      all_features[-3:],
                                      1,
                                      3,
                                      filename=filename)
Example #6
            seed = 1237  # fixed seed for reproducibility (normally init_seed + r_iter)
            print("seed", seed)
            np.random.seed(seed)
            random.seed(seed)

            #First let's generate a random MDP
            state_features = eutils.create_random_features_row_col_m(
                num_rows, num_cols, num_features)
            #print("state features\n",state_features)
            true_weights = random_weights(num_features)
            true_world = mdp.LinearFeatureGridWorld(state_features,
                                                    true_weights, initials,
                                                    terminals, gamma)
            V = mdp.value_iteration(true_world, epsilon=precision)
            true_exp_return = np.mean([V[s] for s in true_world.initials])
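            # expected return under the optimal policy, averaged uniformly over the initial states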
            Qopt = mdp.compute_q_values(true_world, V=V, eps=precision)
            opt_policy = mdp.find_optimal_policy(true_world,
                                                 Q=Qopt,
                                                 epsilon=precision)

            if debug:
                print("true weights: ", true_weights)

                print("rewards")
                true_world.print_rewards()
                print("value function")

                true_world.print_map(V)
                print("mdp features")
                utils.display_onehot_state_features(true_world)
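
# random_weights (used above) is assumed to draw a random reward-weight vector for
# the given number of features; a minimal sketch under the common unit-norm
# convention -- an assumption, not necessarily the repo's implementation:
def random_weights_sketch(num_features):
    w = np.random.randn(num_features)
    return w / np.linalg.norm(w)
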
import src.grid_worlds as gw
import src.value_alignment_verification as vav
import src.alignment_heuristics as ah
import data_analysis.plot_grid as mdp_plot

seed = 1222
np.random.seed(seed)
import random
random.seed(seed)

world = gw.create_safety_lava_world_nowalls()

print("rewards")
world.print_rewards()
V = mdp.value_iteration(world)
Q = mdp.compute_q_values(world, V)
print("values")
world.print_map(V)

opt_policy = mdp.find_optimal_policy(world, Q=Q)
print("optimal policy")
world.print_map(world.to_arrows(opt_policy))
print(opt_policy)

lava_colors = [
    'black', 'tab:green', 'white', 'tab:red', 'tab:blue', 'tab:gray',
    'tab:green', 'tab:purple', 'tab:orange', 'tab:cyan'
]
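# assumed convention: one color per one-hot feature index, consumed by the plotting call below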

mdp_plot.plot_optimal_policy_vav(opt_policy,
                                 world.features,
    def get_machine_teaching_mdps(self):

        constraint_set = self.family_halfspaces
        candidate_mdps = self.mdp_family
        candidate_halfspaces = self.mdp_halfspaces
        #create boolean bookkeeping to see what has been covered in the set
        covered = [False for _ in constraint_set]

        #for each candidate MDP, check how many uncovered constraints its induced halfspaces cover, and greedily add the one covering the most
        total_covered = 0
        opt_mdps = []
        while total_covered < len(constraint_set):
            if self.debug: print("set cover iteration")
            constraints_to_add = None
            best_mdp = None
            max_count = 0
            for i, mdp_env in enumerate(candidate_mdps):
                # if self.debug:
                #     print("-"*20)
                #     print("MDP", i)

                #     V = mdp.value_iteration(mdp_env, epsilon=self.precision)
                #     Qopt = mdp.compute_q_values(mdp_env, V=V, eps=self.precision)
                #     opt_policy = mdp.find_optimal_policy(mdp_env, Q = Qopt, epsilon=self.precision)
                #     print("rewards")
                #     mdp_env.print_rewards()
                #     print("value function")

                #     mdp_env.print_map(V)
                #     print("mdp features")
                #     utils.display_onehot_state_features(mdp_env)

                #     print("optimal policy")
                #     mdp_env.print_map(mdp_env.to_arrows(opt_policy))

                #     print("halfspace")
                #     print(candidate_halfspaces[i])
                #get the halfspaces induced by an optimal policy in this MDP
                constraints_new = candidate_halfspaces[i]

                count = self.count_new_covers(constraints_new, constraint_set,
                                              covered)
                #if self.debug: print("covered", count)
                if count > max_count:
                    max_count = count
                    constraints_to_add = constraints_new
                    best_mdp = mdp_env
                    if self.debug:
                        print()
                        print("best mdp so far")
                        print("-" * 20)
                        print("MDP", i)

                        V = mdp.value_iteration(mdp_env,
                                                epsilon=self.precision)
                        Qopt = mdp.compute_q_values(mdp_env,
                                                    V=V,
                                                    eps=self.precision)
                        opt_policy = mdp.find_optimal_policy(
                            mdp_env, Q=Qopt, epsilon=self.precision)
                        print("rewards")
                        mdp_env.print_rewards()
                        print("value function")

                        mdp_env.print_map(V)
                        print("mdp features")
                        utils.display_onehot_state_features(mdp_env)

                        print("optimal policy")
                        mdp_env.print_map(mdp_env.to_arrows(opt_policy))

                        print("halfspace")
                        print(constraints_to_add)

                        print("covered", count)

            #update covered flags and add best_mdp to opt_mdps
            opt_mdps.append(best_mdp)
            covered = self.update_covered_constraints(constraints_to_add,
                                                      constraint_set, covered)
            total_covered += max_count
            #TODO: optimize by removing candidate MDPs from consideration once they are added to opt_mdps

        return opt_mdps
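
    # The greedy loop above relies on two bookkeeping helpers. Minimal sketches
    # consistent with how they are called (signatures and the element-equality test
    # are assumptions, not necessarily the repo's implementations):
    def count_new_covers_sketch(self, constraints_new, constraint_set, covered):
        # how many not-yet-covered constraints this candidate's halfspaces would cover
        return sum(1 for j, c in enumerate(constraint_set)
                   if not covered[j]
                   and any(np.array_equal(c, c_new) for c_new in constraints_new))

    def update_covered_constraints_sketch(self, constraints_to_add, constraint_set, covered):
        # mark every constraint matched by the chosen MDP's halfspaces as covered
        return [flag or any(np.array_equal(c, c_new) for c_new in constraints_to_add)
                for flag, c in zip(covered, constraint_set)]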
Example #9
precision = 0.00001

eval_policies = []
eval_Qvalues = []
eval_weights = []
eval_halfspaces = []
all_halfspaces = []
num_eval_policies = 0
for i in range(num_eval_policies_tries):
    rand_world = copy.deepcopy(world)
    #print("trying", i)
    #change the reward weights
    eval_weight_vector = random_weights(num_features)
    rand_world.weights = eval_weight_vector
    #find the optimal policy under this MDP
    Qval = mdp.compute_q_values(rand_world, eps=precision)
    eval_policy = mdp.find_optimal_policy(rand_world,
                                          Q=Qval,
                                          epsilon=precision)
    #only save if distinct from the eval policies found so far
    if eval_policy not in eval_policies:
        if debug:
            print("found distinct eval policy")
            print("weights", eval_weight_vector)

            rand_world.print_map(rand_world.to_arrows(eval_policy))

        eval_policies.append(eval_policy)
        eval_Qvalues.append(Qval)
        eval_weights.append(eval_weight_vector)
        teacher = machine_teaching.StateActionRankingTeacher(rand_world,