Example No. 1
    def solve_optimal_policy(self, reward_weights):
        #print(reward_weights)
        #returns occupancy_frequencies and q_values (q_values is a length |S|*|A| vector ordered q(s0,a0), q(s1,a0), ...)
        reward_sa = self.mdp_env.transform_to_R_sa(reward_weights)
        occupancy_frequencies = mdp.solve_mdp_lp(
            self.mdp_env, reward_sa=reward_sa
        )  #use optional argument to replace standard rewards with sample

        num_states, num_actions, gamma = self.mdp_env.num_states, self.mdp_env.num_actions, self.mdp_env.gamma
        stochastic_policy = utils.get_optimal_policy_from_usa(
            occupancy_frequencies, self.mdp_env)
        reward_policy = mdp.get_policy_rewards(stochastic_policy, reward_sa)
        transitions_policy = mdp.get_policy_transitions(
            stochastic_policy, self.mdp_env)
        A = np.eye(num_states) - gamma * transitions_policy
        b = reward_policy

        state_values = np.linalg.solve(A, b)
        Ps = tuple(self.mdp_env.Ps[i] for i in range(num_actions))
        P_column = np.concatenate(Ps, axis=0)
        #print(P_column)
        q_values = reward_sa + gamma * np.dot(P_column, state_values)
        #q_values = mdp.get_q_values(occupancy_frequencies, self.mdp_env)
        return occupancy_frequencies, q_values
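The last few lines above are standard policy evaluation plus a one-step backup: solve (I - gamma * P_policy) v = r_policy for the state values, then form Q = r_sa + gamma * P_column v. A minimal, self-contained sketch of just that step on a made-up 2-state, 2-action MDP (the transition matrices and rewards below are illustrative, not taken from the repo):

import numpy as np

gamma = 0.95
# per-action transition matrices for a toy 2-state MDP (rows sum to 1)
P_a0 = np.array([[0.9, 0.1],
                 [0.2, 0.8]])
P_a1 = np.array([[0.5, 0.5],
                 [0.0, 1.0]])
# rewards stacked action-major, matching the q(s0,a0), q(s1,a0), ... ordering above
reward_sa = np.array([0.0, 1.0, 0.5, 0.0])

# evaluate the deterministic policy "always take action 0"
P_policy = P_a0
r_policy = reward_sa[:2]
state_values = np.linalg.solve(np.eye(2) - gamma * P_policy, r_policy)

# one-step backup gives Q-values in the same stacking as reward_sa
P_column = np.concatenate((P_a0, P_a1), axis=0)
q_values = reward_sa + gamma * np.dot(P_column, state_values)
print(state_values, q_values)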
Example No. 2
# print("Policy")
# utils.print_policy_from_occupancies(u_sa_B, mdp_env_B)
# print("reward")
# utils.print_as_grid(mdp_env_B.r_s, mdp_env_B)

seed = 12131
np.random.seed(seed)
random.seed(seed)

#let's try out BIRL on a simpler version and see what happens

#first let's give a demo in the A version that doesn't have lava

mdp_env_A = mdp_worlds.lava_ird_simplified_a()
# mdp_env_A = mdp_worlds.lava_ambiguous_ird_fig2a()
u_sa_A = mdp.solve_mdp_lp(mdp_env_A)

print("mdp A")
print("Policy")
utils.print_policy_from_occupancies(u_sa_A, mdp_env_A)
print("reward")
utils.print_as_grid(mdp_env_A.r_s, mdp_env_A)
print("features")
utils.display_onehot_state_features(mdp_env_A)

#generate demo for Dylan's NeurIPS world
# demonstrations = utils.rollout_from_usa(51, 15, u_sa_A, mdp_env_A)
#generate demo for my simplified world

demonstrations = utils.rollout_from_usa(10, 100, u_sa_A, mdp_env_A)
print("demonstration")
Example No. 3
import scipy
import random

if __name__ == "__main__":
    seed = 1234
    np.random.seed(seed)
    # scipy.random.seed(seed)  # scipy.random was just an alias for numpy.random and has been removed; np.random.seed above suffices
    random.seed(seed)
    #mdp_env = mdp_worlds.two_state_chain()
    #demonstrations = [(1,0), (0,0)]

    # mdp_env = mdp_worlds.machine_teaching_toy_featurized()
    # demonstrations = [(2,3),(5,0),(4,0),(3,2)]

    mdp_env = mdp_worlds.lava_ambiguous_aaai18()
    u_sa = mdp.solve_mdp_lp(mdp_env)
    #generate demo from state 5 to terminal
    demonstrations = utils.rollout_from_usa(5, 10, u_sa, mdp_env)
    print(demonstrations)

    traj_demonstrations = [demonstrations]

    beta = 10.0
    step_stdev = 0.1
    birl = bayesian_irl.BayesianIRL(mdp_env, beta, step_stdev, debug=False)

    num_samples = 200
    burn = 50
    skip = 2
    map_w, map_u, r_chain, u_chain = birl.sample_posterior(
        demonstrations, num_samples)
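`burn` and `skip` are defined but not used in the lines shown; a common follow-up is to drop the burn-in prefix of the returned chain and thin the rest before summarizing it. A hedged sketch, assuming the reward chain is an array with one MCMC sample per row (the shape and the posterior-mean summary are illustrative assumptions, not the repo's exact post-processing):

import numpy as np

# stand-in for r_chain: 200 samples of a 4-dimensional reward weight vector
r_chain = np.random.randn(200, 4)
burn, skip = 50, 2

thinned_chain = r_chain[burn::skip]           # discard burn-in, keep every 2nd sample
posterior_mean_w = thinned_chain.mean(axis=0)
print(posterior_mean_w)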
Example No. 4
    weights = utils.sample_l2_ball(num_features)

    print("weights", weights)
    gamma = 0.99
    #let's look at all starting states for now
    init_dist = np.ones(num_states) / num_states
    # init_states = [10]
    # for si in init_states:
    #     init_dist[si] = 1.0 / len(init_states)

    #no terminal
    term_states = []

    mdp_env = mdp.FeaturizedGridMDP(num_rows, num_cols, state_features,
                                    weights, gamma, init_dist, term_states)
    return mdp_env


if __name__ == "__main__":
    #mdp_env = machine_teaching_toy_featurized()
    # mdp_env = lava_ambiguous_aaai18()
    mdp_env = random_gridworld_corner_terminal(6, 6, 5)
    print("features")
    utils.display_onehot_state_features(mdp_env)
    u_sa = mdp.solve_mdp_lp(mdp_env, debug=True)
    print("optimal policy")
    utils.print_policy_from_occupancies(u_sa, mdp_env)
    print("optimal values")
    v = mdp.get_state_values(u_sa, mdp_env)
    utils.print_as_grid(v, mdp_env)
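`utils.sample_l2_ball(num_features)` presumably returns a random feature-weight vector with unit L2 norm; the standard way to draw one is to normalize an isotropic Gaussian sample. A sketch using a hypothetical helper (named differently so as not to imply this is the repo's implementation):

import numpy as np

def sample_unit_l2_weights(num_features):
    # normalizing a standard Gaussian draw gives a direction that is
    # uniformly distributed on the L2 unit sphere
    w = np.random.randn(num_features)
    return w / np.linalg.norm(w)

weights = sample_unit_l2_weights(5)
print(weights, np.linalg.norm(weights))  # the norm is 1.0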
Example No. 5
num_samples = 3000
burn = 300
skip = 5
mcmc_norm = "l2"

for i in range(num_trials):
    print("=" * 10)
    print("iteration", i)
    print("=" * 10)

    seed = init_seed + i * 13
    np.random.seed(seed)
    random.seed(seed)

    mdp_env = mdp_worlds.random_gridworld(num_rows, num_cols, num_features)
    opt_u_sa = mdp.solve_mdp_lp(mdp_env, debug=debug)
    true_r_sa = mdp_env.r_sa
    true_w = mdp_env.feature_weights

    #generate demonstration from bottom left corner
    demonstrations = utils.rollout_from_usa(demo_state, horizon, opt_u_sa,
                                            mdp_env)
    print("demonstration")
    print(demonstrations)

    ###Run Bayesian IRL to get posterior
    print("running B-IRL")
    birl = bayesian_irl.BayesianIRL(mdp_env,
                                    beta,
                                    step_stdev,
                                    debug=False,
Example No. 6
slip_prob = 0.3
demo_horizon = 10
num_demos = 1

###BIRL
beta = 10.0
step_stdev = 0.2
burn = 500
skip = 5
num_samples = 2000
mcmc_norm = "l2"
likelihood = "birl"

mdp_env = mdp_worlds.lava_ambiguous_corridor()
opt_sa = mdp.solve_mdp_lp(mdp_env)


print("Cliff world")
print("Optimal Policy")
utils.print_policy_from_occupancies(opt_sa, mdp_env)
print("reward")
utils.print_as_grid(mdp_env.r_s, mdp_env)
print("features")
utils.display_onehot_state_features(mdp_env)

init_demo_state = 1  # mdp_env.num_cols * (mdp_env.num_rows - 1)
traj_demonstrations = []
demo_set = set()
for d in range(num_demos):
    # np.random.seed(init_seed + d)
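The demo-collection loop is cut off by the snippet. A plausible, self-contained continuation is sketched below with a stand-in rollout function in place of `utils.rollout_from_usa`; it appends each trajectory and uses a set to deduplicate (state, action) pairs, which is presumably what `traj_demonstrations` and `demo_set` are for (the repo's actual loop body may differ):

import numpy as np

def fake_rollout(start_state, horizon):
    # stand-in for utils.rollout_from_usa: start at start_state, then random (state, action) pairs
    states = [start_state] + [int(np.random.randint(10)) for _ in range(horizon - 1)]
    return [(s, int(np.random.randint(4))) for s in states]

num_demos, init_demo_state, demo_horizon = 1, 1, 10
traj_demonstrations = []
demo_set = set()
for d in range(num_demos):
    demo = fake_rollout(init_demo_state, demo_horizon)
    traj_demonstrations.append(demo)   # keep the full trajectory
    demo_set.update(demo)              # deduplicate repeated (state, action) pairs
demonstrations = list(demo_set)
print(demonstrations)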
Example No. 7
# mdp_env_B = mdp_worlds.lava_ambiguous_ird_fig2b()

# u_sa_B = mdp.solve_mdp_lp(mdp_env_B)

# print("mdp B")
# print("Policy")
# utils.print_policy_from_occupancies(u_sa_B, mdp_env_B)
# print("reward")
# utils.print_as_grid(mdp_env_B.r_s, mdp_env_B)


#let's try out BIRL on a simpler version and see what happens

#first let's give a demo in the A version that doesn't have lava

mdp_env_B = mdp_worlds.lava_ird_simplified_b()
map_w = np.array([-0.30380369, -0.9159926,   0.10477373,  0.24017357])

print("MAP")
print("map_weights", map_w)
map_r = np.dot(mdp_env_B.state_features, map_w)
print("map reward")
utils.print_as_grid(map_r, mdp_env_B)
#compute new policy for mdp_B for map rewards
map_r_sa = mdp_env_B.transform_to_R_sa(map_w)
map_u_sa = mdp.solve_mdp_lp(mdp_env_B, reward_sa=map_r_sa) #use optional argument to replace standard rewards with sample
print("Map policy")
utils.print_policy_from_occupancies(map_u_sa, mdp_env_B)
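Since the state features are one-hot (that is what `display_onehot_state_features` prints), each entry of `map_r` is just the MAP weight of the feature active in that state. A quick check with the `map_w` values above and a hypothetical one-hot feature vector:

import numpy as np

map_w = np.array([-0.30380369, -0.9159926, 0.10477373, 0.24017357])
phi = np.array([0.0, 1.0, 0.0, 0.0])   # hypothetical state whose second feature is active
print(np.dot(phi, map_w))              # -0.9159926, that state's MAP reward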

Example No. 8
np.random.seed(init_seed)
random.seed(init_seed)

demo_horizon = 100
num_demos = 1

num_rows = 10
num_cols = 10
num_features = 6

train_mdp = mdp_worlds.negative_sideeffects_goal(num_rows,
                                                 num_cols,
                                                 num_features,
                                                 unseen_feature=False)
train_mdp.set_reward_fn(np.array([-.1, -.6, -.1, -0.6, -2, 0]))
opt_sa_train = mdp.solve_mdp_lp(train_mdp)
print("===========================")
print("Training MDP with No Lava")
print("===========================")

print("Optimal Policy")
utils.print_policy_from_occupancies(opt_sa_train, train_mdp)
print("reward")
utils.print_as_grid(train_mdp.r_s, train_mdp)
print("features")
utils.display_onehot_state_features(train_mdp)

import numpy as np

np.random.randint(60)