def calc_frontier(mdp_env,
                  u_expert,
                  reward_posterior,
                  posterior_probs,
                  lambda_range,
                  alpha,
                  debug=False):
    '''Takes an MDP and sweeps over a range of lambda values, outputting the expected value and CVaR of the resulting LP solutions.
        mdp_env: the MDP to run on
        u_expert: the baseline expert occupancies to try to beat (set to zeros just to be robust)
        reward_posterior: the reward posterior from B-IRL (already burned and skipped and ready to run in the LP)
        posterior_probs: the probabilities of each element in the posterior (uniform if from MCMC)
        lambda_range: a list of lambda values to try
        alpha: the CVaR alpha (risk sensitivity); higher is more risk-sensitive/conservative
    '''

    cvar_exprews = []

    for lamda in lambda_range:
        cvar_opt_usa, cvar_value, exp_ret = mdp.solve_max_cvar_policy(
            mdp_env, u_expert, reward_posterior, posterior_probs, alpha, debug,
            lamda)

        print("Policy for lambda={} and alpha={}".format(lamda, alpha))
        utils.print_policy_from_occupancies(cvar_opt_usa, mdp_env)
        print("stochastic policy")
        utils.print_stochastic_policy_action_probs(cvar_opt_usa, mdp_env)
        print("CVaR of policy = {}".format(cvar_value))
        print("Expected return of policy = {}".format(exp_ret))
        cvar_exprews.append((cvar_value, exp_ret))
    return cvar_exprews
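
# ------------------------------------------------------------------
# A minimal usage sketch (illustrative, not from the original source):
# sweep a small lambda grid and print the (CVaR, expected return)
# frontier. reward_posterior_T is assumed to be the burned/skipped
# chain transposed so each column is one posterior sample, and
# u_expert is assumed to be a zero occupancy vector as the docstring
# suggests. The lambda grid and alpha below are placeholder values.
# ------------------------------------------------------------------
def sketch_print_frontier(mdp_env, u_expert, reward_posterior_T, alpha=0.95):
    import numpy as np
    n_samples = reward_posterior_T.shape[1]
    posterior_probs = np.ones(n_samples) / n_samples  # uniform over MCMC samples
    lambda_grid = [0.0, 0.3, 0.5, 0.7, 0.9, 1.0]      # illustrative grid
    frontier = calc_frontier(mdp_env, u_expert, reward_posterior_T,
                             posterior_probs, lambda_grid, alpha)
    for lam, (cvar_value, exp_ret) in zip(lambda_grid, frontier):
        print("lambda = {:.2f}: CVaR = {:.3f}, expected return = {:.3f}".format(
            lam, cvar_value, exp_ret))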
Example #2
def calc_max_ent_u_sa(mdp_env,
                      demos,
                      max_epochs=1000,
                      horizon=None,
                      learning_rate=0.01):
    import mdp
    import utils
    import numpy as np  # needed for np.zeros / np.dot / np.reshape below
    seed_weights = np.zeros(mdp_env.get_reward_dimensionality())

    # Parameters
    if horizon is None:
        horizon = mdp_env.num_states

    # Main algorithm call
    r_weights, grads, state_features, maxent_pi = maxEntIRL(mdp_env,
                                                            demos,
                                                            seed_weights,
                                                            max_epochs,
                                                            horizon,
                                                            learning_rate,
                                                            norm="l2")

    # Construct reward function from weights and state features
    reward_fxn = []
    for s_i in range(mdp_env.num_states):
        reward_fxn.append(np.dot(r_weights, state_features[s_i]))
    reward_fxn = np.reshape(reward_fxn, (mdp_env.num_rows, mdp_env.num_cols))
    print("learned reward function")
    print(reward_fxn)

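    # occupancy frequencies of the learned MaxEnt policy:
    # u_s is state-only (computed for inspection), u_sa is state-action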
    u_s = mdp.get_policy_state_occupancy_frequencies(maxent_pi, mdp_env)
    u_sa = mdp.stoch_policy_to_usa(maxent_pi, mdp_env)
    utils.print_policy_from_occupancies(u_sa, mdp_env)
    utils.print_stochastic_policy_action_probs(u_sa, mdp_env)

    return u_sa, r_weights, maxent_pi
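
# ------------------------------------------------------------------
# A minimal usage sketch (illustrative, not from the original source):
# roll out a demonstration from the optimal LP policy of a featurized
# gridworld and run MaxEnt IRL on it. start_state and demo_horizon are
# placeholder values; the helpers follow the patterns used elsewhere
# in this file.
# ------------------------------------------------------------------
def sketch_run_max_ent_irl(mdp_env, start_state=0, demo_horizon=20):
    import mdp
    import utils
    opt_usa = mdp.solve_mdp_lp(mdp_env)
    demos = utils.rollout_from_usa(start_state, demo_horizon, opt_usa, mdp_env)
    u_sa, r_weights, maxent_pi = calc_max_ent_u_sa(mdp_env, demos,
                                                   max_epochs=500,
                                                   learning_rate=0.01)
    return u_sa, r_weights, maxent_pi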
Example #3
seed = 12131
np.random.seed(seed)
random.seed(seed)

#let's try out BIRL on a simpler version and see what happens

#first let's give a demo in the A version that doesn't have lava

mdp_env_A = mdp_worlds.lava_ird_simplified_a()
# mdp_env_A = mdp_worlds.lava_ambiguous_ird_fig2a()
u_sa_A = mdp.solve_mdp_lp(mdp_env_A)

print("mdp A")
print("Policy")
utils.print_policy_from_occupancies(u_sa_A, mdp_env_A)
print("reward")
utils.print_as_grid(mdp_env_A.r_s, mdp_env_A)
print("features")
utils.display_onehot_state_features(mdp_env_A)

#generate demo for Dylan's NeurIPS world
# demonstrations = utils.rollout_from_usa(51, 15, u_sa_A, mdp_env_A)
#generate demo for my simplified world

demonstrations = utils.rollout_from_usa(10, 100, u_sa_A, mdp_env_A)
print("demonstration")
print(demonstrations)

#Now let's run Bayesian IRL on this demo in this mdp with a placeholder feature to see what happens.
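# ------------------------------------------------------------------
# Hedged sketch of that BIRL step (illustrative; the original code is
# not shown in this excerpt). It follows the bayesian_irl.BayesianIRL
# pattern used in the later examples; beta, step_stdev, and the number
# of MCMC samples below are placeholder values.
# ------------------------------------------------------------------
beta = 100.0
step_stdev = 0.01
birl = bayesian_irl.BayesianIRL(mdp_env_A, beta, step_stdev, debug=False)
map_w, map_u, r_chain, u_chain = birl.sample_posterior(demonstrations, 2000)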
    worst_index = np.argmin(r_chain[:, 1])
    print(r_chain[worst_index])
    print(np.sum(r_chain[:, 1] < -0.82), "out of ", len(r_chain))

    r_chain_burned = r_chain[burn::skip]
    # print("chain after burn and skip")
    # for r in r_chain_burned:
    #     print(r)
    #input()
    worst_index = np.argmin(r_chain_burned[:, 1])
    print(r_chain_burned[worst_index])
    print(np.sum(r_chain_burned[:, 1] < -0.82), "out of", len(r_chain_burned))
    #input()

    print("MAP policy")
    utils.print_policy_from_occupancies(map_u, mdp_env)

    #let's try using the optimal policy's feature counts as the expert baseline and see if the regret method works
    u_expert = u_sa
    alpha = 0.95
    n = r_chain_burned.shape[0]
    posterior_probs = np.ones(n) / n  #uniform dist since samples from MCMC
    cvar_opt_usa_regret, cvar, exp_ret = mdp.solve_max_cvar_policy(
        mdp_env, u_expert, r_chain_burned.transpose(), posterior_probs, alpha,
        False)
    print("{}-CVaR policy regret optimal u_E".format(alpha))
    utils.print_policy_from_occupancies(cvar_opt_usa_regret, mdp_env)
    cvar_2, exp_ret2 = mdp.solve_cvar_expret_fixed_policy(
        mdp_env,
        cvar_opt_usa_regret,
        u_expert,
Example #5
    weights = utils.sample_l2_ball(num_features)

    print("weights", weights)
    gamma = 0.99
    #let's look at all starting states for now
    init_dist = np.ones(num_states) / num_states
    # init_states = [10]
    # for si in init_states:
    #     init_dist[si] = 1.0 / len(init_states)

    #no terminal
    term_states = []

    mdp_env = mdp.FeaturizedGridMDP(num_rows, num_cols, state_features,
                                    weights, gamma, init_dist, term_states)
    return mdp_env


if __name__ == "__main__":
    #mdp_env = machine_teaching_toy_featurized()
    # mdp_env = lava_ambiguous_aaai18()
    mdp_env = random_gridworld_corner_terminal(6, 6, 5)
    print("features")
    utils.display_onehot_state_features(mdp_env)
    u_sa = mdp.solve_mdp_lp(mdp_env, debug=True)
    print("optimal policy")
    utils.print_policy_from_occupancies(u_sa, mdp_env)
    print("optimal values")
    v = mdp.get_state_values(u_sa, mdp_env)
    utils.print_as_grid(v, mdp_env)
    birl = bayesian_irl.BayesianIRL(mdp_env,
                                    beta,
                                    step_stdev,
                                    debug=False,
                                    mcmc_norm=mcmc_norm)
    map_w, map_u_sa, w_chain, u_chain = birl.sample_posterior(
        demonstrations, num_samples, False)
    print("Birl complete")

    if debug:
        print("-------")
        print("true weights", true_w)
        print("features")
        utils.display_onehot_state_features(mdp_env)
        print("optimal policy")
        utils.print_policy_from_occupancies(opt_u_sa, mdp_env)
        print("optimal values")
        v = mdp.get_state_values(opt_u_sa, mdp_env)
        utils.print_as_grid(v, mdp_env)

    if debug:
        print("map_weights", map_w)
        map_r = np.dot(mdp_env.state_features, map_w)
        print("MAP reward")
        utils.print_as_grid(map_r, mdp_env)
        print("Map policy")
        utils.print_policy_from_occupancies(map_u_sa, mdp_env)

    w_chain_burned = w_chain[burn::skip]

    ###compute mean reward policy
Example #7
    # mdp_env = mdp_worlds.machine_teaching_toy_featurized()
    # demonstrations = [(2,3),(5,0),(4,0),(3,2)]


    mdp_env = mdp_worlds.lava_ambiguous_aaai18()
    u_sa = mdp.solve_mdp_lp(mdp_env)
    #generate demo from state 5 to terminal
    demonstrations = utils.rollout_from_usa(5, 10, u_sa, mdp_env)
    print(demonstrations)


    beta = 100.0
    step_stdev = 0.01
    birl = bayesian_irl.BayesianIRL(mdp_env, beta, step_stdev, debug=False)

    
    map_w, map_u, r_chain, u_chain = birl.sample_posterior(demonstrations, 10000)
    print("map_weights", map_w)
    map_r = np.dot(mdp_env.state_features, map_w)
    utils.print_as_grid(map_r, mdp_env)
    print("Map policy")
    utils.print_policy_from_occupancies(map_u, mdp_env)

    # print("chain")
    # for r in r_chain:
    #     print(r)

    worst_index = np.argmin(r_chain[:,1])
    print(r_chain[worst_index])
    print(np.sum(r_chain[:,1] < -0.82))
    
Example #8
###BIRL
beta = 10.0
step_stdev = 0.2
burn = 50
skip = 5
num_samples = 200
mcmc_norm = "l2"
likelihood = "birl"

mdp_env = mdp_worlds.lava_ambiguous_corridor()
opt_sa = mdp.solve_mdp_lp(mdp_env)


print("Cliff world")
print("Optimal Policy")
utils.print_policy_from_occupancies(opt_sa, mdp_env)
print("reward")
utils.print_as_grid(mdp_env.r_s, mdp_env)
print("features")
utils.display_onehot_state_features(mdp_env)

init_demo_state = 1  #mdp_env.num_cols * (mdp_env.num_rows - 1)
traj_demonstrations = []
demo_set = set()
for d in range(num_demos):
    # np.random.seed(init_seed + d)
    # random.seed(init_seed + d)
    s = init_demo_state #mdp_env.init_states[0] # only one initial state
    demo = utils.rollout_from_usa(s, demo_horizon, opt_sa, mdp_env)
    print("demo", d, demo)
    traj_demonstrations.append(demo)
###BIRL
beta = 10.0
step_stdev = 0.2
burn = 500
skip = 5
num_samples = 2000
mcmc_norm = "l2"
likelihood = "birl"

mdp_env = mdp_worlds.lava_ambiguous_corridor()
opt_sa = mdp.solve_mdp_lp(mdp_env)


print("Cliff world")
print("Optimal Policy")
utils.print_policy_from_occupancies(opt_sa, mdp_env)
print("reward")
utils.print_as_grid(mdp_env.r_s, mdp_env)
print("features")
utils.display_onehot_state_features(mdp_env)

init_demo_state = 1  #mdp_env.num_cols * (mdp_env.num_rows - 1)
traj_demonstrations = []
demo_set = set()
for d in range(num_demos):
    # np.random.seed(init_seed + d)
    # random.seed(init_seed + d)
    s = init_demo_state #mdp_env.init_states[0] # only one initial state
    demo = utils.rollout_from_usa(s, demo_horizon, opt_sa, mdp_env)
    print("demo", d, demo)
    traj_demonstrations.append(demo)
Example #10
# mdp_env_B = mdp_worlds.lava_ambiguous_ird_fig2b()

# u_sa_B = mdp.solve_mdp_lp(mdp_env_B)

# print("mdp B")
# print("Policy")
# utils.print_policy_from_occupancies(u_sa_B, mdp_env_B)
# print("reward")
# utils.print_as_grid(mdp_env_B.r_s, mdp_env_B)


#let's try out BIRL on a simpler version and see what happens

#first let's give a demo in the A version that doesn't have lava

mdp_env_B = mdp_worlds.lava_ird_simplified_b()
map_w = np.array([-0.30380369, -0.9159926, 0.10477373, 0.24017357])

print("MAP")
print("map_weights", map_w)
map_r = np.dot(mdp_env_B.state_features, map_w)
print("map reward")
utils.print_as_grid(map_r, mdp_env_B)
#compute new policy for mdp_B for map rewards
map_r_sa = mdp_env_B.transform_to_R_sa(map_w)
map_u_sa = mdp.solve_mdp_lp(mdp_env_B, reward_sa=map_r_sa) #use optional argument to replace standard rewards with sample
print("Map policy")
utils.print_policy_from_occupancies(map_u_sa, mdp_env_B)

Example #11
    def sample_posterior(self,
                         demonstrations,
                         num_samples,
                         print_map_updates=False):
        #TODO: may require preprocessing of demos since this requires them to be a list of state-action pairs
        demos_sa = []
        if type(demonstrations[0]) is tuple and len(demonstrations[0]) == 2:
            #each element in demonstrations is a state-action pair so no preprocessing needed

            demos_sa = demonstrations

        else:
            #assume we have a list of lists of state-action tuples
            for d in demonstrations:
                for s, a in d:
                    demos_sa.append((s, a))

        #only save reward functions and occupancy_frequencies
        reward_samples = []
        occupancy_frequencies = []
        map_weights = None
        map_occupancy = None
        #find size of reward function

        #sample random reward hypothesis to start
        curr_weights = self.sample_init_reward()
        #print(curr_weights)
        curr_occupancies, curr_q_values = self.solve_optimal_policy(
            curr_weights)
        #compute log likelihood over demonstrations
        curr_ll = self.log_likelihood(curr_weights, curr_q_values, demos_sa)
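        # best_ll tracks the highest log likelihood seen so far; the matching
        # weights and occupancies are kept as the MAP estimate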
        best_ll = -np.inf

        #run MCMC
        accept_cnt = 0
        for step in range(num_samples):
            if self.debug: print("\n------\nstep", step)
            #compute proposal and log likelihood
            proposal_weights = self.generate_proposal_weights(curr_weights)

            #compute q_values and occupancy frequencies
            proposal_occupancies, proposal_q_values = self.solve_optimal_policy(
                proposal_weights)

            if self.debug: print("proposal reward", proposal_weights)
            if self.debug: print("proposal qvalues", proposal_q_values)
            if self.debug:
                utils.print_policy_from_occupancies(proposal_occupancies,
                                                    self.mdp_env)

            prop_ll = self.log_likelihood(proposal_weights, proposal_q_values,
                                          demos_sa)
            if self.debug: print("prop_ll", prop_ll, "curr_ll", curr_ll)
            prob_accept = min(1.0, np.exp(prop_ll - curr_ll))
            if self.debug: print("prob accept", prob_accept)
            rand_sample = np.random.rand()
            if self.debug: print("rand prob", rand_sample)
            if rand_sample < prob_accept:
                accept_cnt += 1
                if self.debug: print("accept")
                #accept and add to chain
                reward_samples.append(proposal_weights)
                occupancy_frequencies.append(proposal_occupancies)
                curr_ll = prop_ll
                curr_weights = proposal_weights
                curr_occupancies = proposal_occupancies
                #update MAP
                if prop_ll > best_ll:
                    if print_map_updates: print(step)
                    if self.debug:
                        utils.print_policy_from_occupancies(
                            proposal_occupancies, self.mdp_env)
                    if self.debug: print("Q(s,a)", proposal_q_values)
                    best_ll = prop_ll
                    map_weights = proposal_weights.copy()
                    map_occupancy = proposal_occupancies.copy()
                    if print_map_updates:
                        print("w_map", map_weights,
                              "loglik = {:.4f}".format(best_ll))
            else:
                if self.debug: print("reject")
                reward_samples.append(curr_weights)
                occupancy_frequencies.append(curr_occupancies)
            #print out last reward sampled
            #print(reward_samples[-1])

        print("w_map", map_weights, "loglik", best_ll)
        print("accepted/total = {}/{} = {}".format(accept_cnt, num_samples,
                                                   accept_cnt / num_samples))
        # if best_ll < -10:
        #     input("Didn't seem to converge... Check likelihoods and demos... Continue?")
        return map_weights, map_occupancy, np.array(reward_samples), np.array(
            occupancy_frequencies)
    np.random.seed(seed)
    random.seed(seed)

    #train mdp
    mdp_env_A = mdp_worlds.lavaland_smaller(contains_lava=False)
    #test mdp ((probably) has lava)
    mdp_env_B = mdp_worlds.lavaland_smaller(contains_lava=True)

    u_sa_A = mdp.solve_mdp_lp(mdp_env_A)

    print("===========================")
    print("Training MDP with No Lava")
    print("===========================")

    print("Optimal Policy")
    utils.print_policy_from_occupancies(u_sa_A, mdp_env_A)
    print("reward")
    utils.print_as_grid(mdp_env_A.r_s, mdp_env_A)
    print("features")
    utils.display_onehot_state_features(mdp_env_A)

    #generate demonstration from top left corner
    traj_demonstrations = []
    demo_set = set()
    for s in demo_states:  #range(mdp_env_A.get_num_states()):
        if mdp_env_A.init_dist[s] > 0:
            demo = utils.rollout_from_usa(s, demo_horizon, u_sa_A, mdp_env_A)
            traj_demonstrations.append(demo)
            for s_a in demo:
                demo_set.add(s_a)
    demonstrations = list(demo_set)
Example #13
num_rows = 10
num_cols = 10
num_features = 6

train_mdp = mdp_worlds.negative_sideeffects_goal(num_rows,
                                                 num_cols,
                                                 num_features,
                                                 unseen_feature=False)
train_mdp.set_reward_fn(np.array([-0.1, -0.6, -0.1, -0.6, -2.0, 0.0]))
opt_sa_train = mdp.solve_mdp_lp(train_mdp)
print("===========================")
print("Training MDP with No Lava")
print("===========================")

print("Optimal Policy")
utils.print_policy_from_occupancies(opt_sa_train, train_mdp)
print("reward")
utils.print_as_grid(train_mdp.r_s, train_mdp)
print("features")
utils.display_onehot_state_features(train_mdp)

init_demo_states = [0, 9, 90, 99]  #mdp_env.num_cols * (mdp_env.num_rows - 1)
traj_demonstrations = []
demo_set = set()
for d in range(num_demos):
    # np.random.seed(init_seed + d)
    # random.seed(init_seed + d)