step_stdev,
                                    debug=False,
                                    mcmc_norm=mcmc_norm)
    map_w, map_u_sa, w_chain, u_chain = birl.sample_posterior(
        demonstrations, num_samples, False)
    print("Birl complete")

    if debug:
        print("-------")
        print("true weights", true_w)
        print("features")
        utils.display_onehot_state_features(mdp_env)
        print("optimal policy")
        utils.print_policy_from_occupancies(opt_u_sa, mdp_env)
        print("optimal values")
        v = mdp.get_state_values(opt_u_sa, mdp_env)
        utils.print_as_grid(v, mdp_env)

    if debug:
        print("map_weights", map_w)
        map_r = np.dot(mdp_env.state_features, map_w)
        print("MAP reward")
        utils.print_as_grid(map_r, mdp_env)
        print("Map policy")
        utils.print_policy_from_occupancies(map_u_sa, mdp_env)

    # discard the first `burn` samples (burn-in) and keep every `skip`-th sample (thinning)
    w_chain_burned = w_chain[burn::skip]

    ### compute mean reward policy

    mean_w = np.mean(w_chain_burned, axis=0)
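
    # a minimal sketch of the remaining "mean reward policy" step, mirroring the MAP
    # block above; solving for the corresponding policy would require an MDP whose
    # reward is rebuilt from mean_w, which is not shown in this snippet
    mean_r = np.dot(mdp_env.state_features, mean_w)
    print("mean reward")
    utils.print_as_grid(mean_r, mdp_env)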

# ===== Example #2 =====

    weights = utils.sample_l2_ball(num_features)

    print("weights", weights)
    gamma = 0.99
    #let's look at all starting states for now
    init_dist = np.ones(num_states) / num_states
    # init_states = [10]
    # for si in init_states:
    #     init_dist[si] = 1.0 / len(init_states)

    #no terminal states
    term_states = []

    mdp_env = mdp.FeaturizedGridMDP(num_rows, num_cols, state_features,
                                    weights, gamma, init_dist, term_states)
    return mdp_env


if __name__ == "__main__":
    #mdp_env = machine_teaching_toy_featurized()
    # mdp_env = lava_ambiguous_aaai18()
    mdp_env = random_gridworld_corner_terminal(6, 6, 5)
    print("features")
    utils.display_onehot_state_features(mdp_env)
    u_sa = mdp.solve_mdp_lp(mdp_env, debug=True)
    print("optimal policy")
    utils.print_policy_from_occupancies(u_sa, mdp_env)
    print("optimal values")
    v = mdp.get_state_values(u_sa, mdp_env)
    utils.print_as_grid(v, mdp_env)
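
    # the remainder of this script evaluates a machine replacement MDP; num_states,
    # the state-action reward vector r_sa, and gamma are assumed to be defined
    # earlier in the full script and are not part of this snippet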
    init_distribution = np.ones(num_states)/num_states  #uniform distribution
    mdp_env = mdp.MachineReplacementMDP(num_states, r_sa, gamma, init_distribution)
    #print(mdp_env.Ps)
    print("mean MDP reward", r_sa)

    u_sa = mdp.solve_mdp_lp(mdp_env, debug=True)
    print("mean policy from posterior")
    utils.print_stochastic_policy_action_probs(u_sa, mdp_env)
    print("MAP/Mean policy from posterior")
    utils.print_policy_from_occupancies(u_sa, mdp_env) 
    print("rewards")
    print(mdp_env.r_sa)
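    # the expected return of the policy equals the dot product of its state-action
    # occupancy measure u_sa with the state-action reward vector r_sa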
    print("expected value = ", np.dot(u_sa, r_sa))
    stoch_pi = utils.get_optimal_policy_from_usa(u_sa, mdp_env)
    print("expected return", mdp.get_policy_expected_return(stoch_pi, mdp_env))
    print("values", mdp.get_state_values(u_sa, mdp_env))
    print('q-values', mdp.get_q_values(u_sa, mdp_env))

    
    
    #print(posterior)
    #print(posterior.shape)


    #run CVaR optimization, maybe just the robust version for now
    u_expert = np.zeros(mdp_env.num_actions * mdp_env.num_states)
    
    # print("solving for CVaR optimal policy")
    posterior_probs = np.ones(num_samples) / num_samples  #uniform dist since samples from MCMC
    # cvar_opt_usa, cvar, exp_ret = mdp.solve_max_cvar_policy(mdp_env, u_expert, posterior, posterior_probs, alpha, False, lamda)
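
    # if the call above were uncommented (with `posterior`, `alpha`, and `lamda` defined
    # earlier in the full script -- they are not part of this snippet), the resulting
    # occupancy measure could be inspected with the same utilities used above, e.g.:
    # print("CVaR optimal policy")
    # utils.print_policy_from_occupancies(cvar_opt_usa, mdp_env)
    # print("CVaR =", cvar, "expected return =", exp_ret)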