Пример #1
0
                   learning_rate, "alpha_%d.pkl", "alpha_205.pkl", 205)

    pkl.dump(r, open("maxent_reward.pkl", 'wb'))

    return r


if __name__ == '__main__':
    # Script entry point: train, evaluate the learned reward under the
    # environment's policy and under the optimal policy, then validate and
    # persist every intermediate result as a pickle in the working directory.
    train(0.01, 1, 400, 0.01)

    # "maxent_reward.pkl" is expected to have been written by train().
    # Every file below is opened in a ``with`` block so the handle is flushed
    # and closed deterministically instead of being left to the garbage
    # collector.
    with open("maxent_reward.pkl", 'rb') as reward_file:
        rewards = pkl.load(reward_file)

    env = Env(prepare_tp=True)

    # 0.3 is passed in the final position of both calls.
    # NOTE(review): presumably the discount factor -- confirm against vi.
    value = vi.value(env.get_policy(), env.n_states,
                     env.transition_probability, rewards, 0.3)
    opt_value = vi.optimal_value(env.n_states, env.n_actions,
                                 env.transition_probability, rewards, 0.3)
    with open("maxent_value.pkl", 'wb') as value_file:
        pkl.dump(value, value_file)
    with open("maxent_opt_value.pkl", 'wb') as opt_value_file:
        pkl.dump(opt_value, opt_value_file)

    # The values are read straight back from disk, as in the original script,
    # so validation runs on exactly what was persisted.
    with open("maxent_value.pkl", 'rb') as value_file:
        value = pkl.load(value_file)
    with open("maxent_opt_value.pkl", 'rb') as opt_value_file:
        opt_value = pkl.load(opt_value_file)

    # Validate each artefact, echo the status and store it next to the data.
    for result, status_path in ((value, "maxent_status.pkl"),
                                (opt_value, "maxent_opt_status.pkl"),
                                (rewards, "maxent_rewards_status.pkl")):
        status = validate(result)
        print(status)
        with open(status_path, 'wb') as status_file:
            pkl.dump(status, status_file)
def test_gw_once(grid_size, feature_map, n_samples, epochs, structure):
    """
    Test MaxEnt and DeepMaxEnt on a gw of size grid_size with the feature
    map feature_map with n_samples paths.

    Besides returning the two scores, a 3x3 figure (rewards, policies and
    values for ground truth / MaxEnt / DeepMaxEnt) is saved as a PNG in the
    current working directory.

    grid_size: Grid size. int.
    feature_map: Which feature map to use. String in {ident, coord, proxi}.
    n_samples: Number of paths to sample.
    epochs: Number of epochs to run MaxEnt with.
    structure: Neural network structure tuple, e.g. (3, 3) would be a
        3-layer neural network with assumed inputs.
    -> Expected value difference for MaxEnt, DeepMaxEnt
    """

    # Basic gist of what we're doing here: Get the reward function using our
    # different IRL methods, use those to get a policy, evaluate that policy
    # using the true reward, and then return the difference in expected values.

    # Setup parameters.
    wind = 0.3
    discount = 0.9
    learning_rate = 0.01
    trajectory_length = 3*grid_size

    # Make the gridworld and associated data.
    gw = Gridworld(grid_size, wind, discount)
    feature_matrix = gw.feature_matrix(feature_map)
    ground_reward = np.array([gw.reward(i) for i in range(gw.n_states)])
    optimal_policy = value_iteration.find_policy(gw.n_states,
                                                 gw.n_actions,
                                                 gw.transition_probability,
                                                 ground_reward,
                                                 discount).argmax(axis=1)
    trajectories = gw.generate_trajectories(n_samples,
                                            trajectory_length,
                                            optimal_policy.take)
    # Empirical distribution over the first state of every trajectory.
    # The original code used ``ow.n_states`` here, but no ``ow`` exists in
    # this function; the world being tested is ``gw``.
    p_start_state = (
        np.bincount(trajectories[:, 0, 0], minlength=gw.n_states) /
        trajectories.shape[0])

    # True value.
    optimal_V = value_iteration.optimal_value(gw.n_states,
                                              gw.n_actions,
                                              gw.transition_probability,
                                              ground_reward, gw.discount)

    # MaxEnt reward; policy; value. Only the input width is given as the
    # network structure, i.e. no extra layers are requested.
    maxent_reward = deep_maxent.irl((feature_matrix.shape[1],),
                                    feature_matrix,
                                    gw.n_actions,
                                    gw.discount,
                                    gw.transition_probability,
                                    trajectories, epochs, learning_rate)

    maxent_policy = value_iteration.find_policy(gw.n_states,
                                                gw.n_actions,
                                                gw.transition_probability,
                                                maxent_reward,
                                                discount).argmax(axis=1)
    # The learned policy is scored against the *true* reward.
    maxent_V = value_iteration.value(maxent_policy,
                                     gw.n_states,
                                     gw.transition_probability,
                                     ground_reward,
                                     gw.discount)
    maxent_EVD = optimal_V.dot(p_start_state) - maxent_V.dot(p_start_state)

    # DeepMaxEnt reward; policy; value.
    deep_maxent_reward = deep_maxent.irl((feature_matrix.shape[1],)+structure,
                                         feature_matrix,
                                         gw.n_actions,
                                         gw.discount,
                                         gw.transition_probability,
                                         trajectories, epochs, learning_rate)
    deep_maxent_policy = value_iteration.find_policy(gw.n_states,
                                                     gw.n_actions,
                                                     gw.transition_probability,
                                                     deep_maxent_reward,
                                                     discount).argmax(axis=1)
    deep_maxent_V = value_iteration.value(deep_maxent_policy,
                                          gw.n_states,
                                          gw.transition_probability,
                                          ground_reward,
                                          gw.discount)
    deep_maxent_EVD = (optimal_V.dot(p_start_state) -
                       deep_maxent_V.dot(p_start_state))

    # 3x3 overview, filled row by row: rewards, policies, values.
    # Policies are drawn on a fixed 0..3 colour scale so the three policy
    # panels are directly comparable.
    grid_shape = (grid_size, grid_size)
    policy_scale = {"vmin": 0, "vmax": 3}
    panels = (
        (ground_reward, "Groundtruth reward", {}),
        (maxent_reward, "MaxEnt reward", {}),
        (deep_maxent_reward, "DeepMaxEnt reward", {}),
        (optimal_policy, "Optimal policy", policy_scale),
        (maxent_policy, "MaxEnt policy", policy_scale),
        (deep_maxent_policy, "DeepMaxEnt policy", policy_scale),
        (optimal_V, "Optimal value", {}),
        (maxent_V, "MaxEnt value", {}),
        (deep_maxent_V, "DeepMaxEnt value", {}),
    )
    for position, (values, title, scale) in enumerate(panels, start=1):
        plt.subplot(3, 3, position)
        plt.pcolor(values.reshape(grid_shape), **scale)
        plt.title(title)
        # Hide every tick and tick label on the panel.
        plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                        bottom=False, top=False, left=False, right=False,
                        labelright=False)

    # Six placeholders for six arguments. The previous template had only
    # five, so str.format silently dropped the random suffix and repeated
    # runs with the same settings overwrote each other's figure.
    plt.savefig("{}_{}_{}_{}_{}_gridworld_{}.png".format(
        grid_size, feature_map, n_samples, epochs, structure,
        np.random.randint(10000000)))

    return maxent_EVD, deep_maxent_EVD
def test_ow_once(grid_size, n_objects, n_colours, discrete, l1, l2, n_samples,
                 epochs, structure):
    """
    Compare MaxEnt and DeepMaxEnt on an objectworld of size grid_size using
    n_samples demonstration paths, and save a 3x3 overview figure.

    grid_size: Grid size. int.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    discrete: Whether the features should be discrete. bool.
    l1: L1 regularisation. float.
    l2: L2 regularisation. float.
    n_samples: Number of paths to sample.
    epochs: Number of epochs to run MaxEnt with.
    structure: Neural network structure tuple, e.g. (3, 3) would be a
        3-layer neural network with assumed inputs.
    -> Expected value difference for MaxEnt, DeepMaxEnt
    """

    # Pipeline: recover a reward with each IRL method, derive a greedy policy
    # from it, score that policy under the true reward, and report how far it
    # falls short of the optimal value at the observed start states.

    # Fixed experiment settings.
    wind, discount = 0.3, 0.9
    learning_rate = 0.01
    deep_learning_rate = 0.005  # For the 32 x 32 experiments.
    trajectory_length = 3 * grid_size

    # World, features and ground-truth quantities.
    ow = Objectworld(grid_size, n_objects, n_colours, wind, discount)
    n_states, n_actions = ow.n_states, ow.n_actions
    transitions = ow.transition_probability
    feature_matrix = ow.feature_matrix(discrete)
    ground_reward = np.array([ow.reward(state) for state in range(n_states)])

    optimal_scores = value_iteration.find_policy(
        n_states, n_actions, transitions, ground_reward, discount)
    optimal_policy = optimal_scores.argmax(axis=1)

    trajectories = ow.generate_trajectories(
        n_samples, trajectory_length, optimal_policy.take)

    # Empirical distribution of each trajectory's first state.
    start_counts = np.bincount(trajectories[:, 0, 0], minlength=ow.n_states)
    p_start_state = start_counts / trajectories.shape[0]

    # True value and its expectation over the start distribution.
    optimal_V = value_iteration.optimal_value(
        n_states, n_actions, transitions, ground_reward, ow.discount)
    optimal_expected = optimal_V.dot(p_start_state)

    n_features = feature_matrix.shape[1]

    # MaxEnt: only the input width is given as the network structure.
    maxent_reward = deep_maxent.irl(
        (n_features, ), feature_matrix, n_actions, ow.discount, transitions,
        trajectories, epochs, learning_rate, l1=l1, l2=l2)
    maxent_scores = value_iteration.find_policy(
        n_states, n_actions, transitions, maxent_reward, discount)
    maxent_policy = maxent_scores.argmax(axis=1)
    maxent_V = value_iteration.value(
        maxent_policy, n_states, transitions, ground_reward, ow.discount)
    maxent_EVD = optimal_expected - maxent_V.dot(p_start_state)

    # DeepMaxEnt: the input width followed by the requested layers.
    deep_maxent_reward = deep_maxent.irl(
        (n_features, ) + structure, feature_matrix, n_actions, ow.discount,
        transitions, trajectories, epochs, deep_learning_rate, l1=l1, l2=l2)
    deep_scores = value_iteration.find_policy(
        n_states, n_actions, transitions, deep_maxent_reward, discount)
    deep_maxent_policy = deep_scores.argmax(axis=1)
    deep_maxent_V = value_iteration.value(
        deep_maxent_policy, n_states, transitions, ground_reward, ow.discount)
    deep_maxent_EVD = optimal_expected - deep_maxent_V.dot(p_start_state)

    # Figure: one row each for rewards, policies and values. Policy panels
    # share a fixed 0..3 colour range.
    shape = (grid_size, grid_size)
    fixed_range = dict(vmin=0, vmax=3)
    no_ticks = dict(labeltop=False, labelbottom=False, labelleft=False,
                    bottom=False, top=False, left=False, right=False,
                    labelright=False)
    layout = [
        ("Groundtruth reward", ground_reward, {}),
        ("MaxEnt reward", maxent_reward, {}),
        ("DeepMaxEnt reward", deep_maxent_reward, {}),
        ("Optimal policy", optimal_policy, fixed_range),
        ("MaxEnt policy", maxent_policy, fixed_range),
        ("DeepMaxEnt policy", deep_maxent_policy, fixed_range),
        ("Optimal value", optimal_V, {}),
        ("MaxEnt value", maxent_V, {}),
        ("DeepMaxEnt value", deep_maxent_V, {}),
    ]
    for slot, (caption, data, colour_args) in enumerate(layout, start=1):
        plt.subplot(3, 3, slot)
        plt.pcolor(data.reshape(shape), **colour_args)
        plt.title(caption)
        plt.tick_params(**no_ticks)

    # The trailing random integer keeps repeated runs from colliding.
    figure_name = "{}_{}_{}_{}_{}_{}_{}_{}_{}_objectworld_{}.png".format(
        grid_size, n_objects, n_colours, discrete, n_samples, epochs,
        structure, l1, l2, np.random.randint(10000000))
    plt.savefig(figure_name)

    return maxent_EVD, deep_maxent_EVD
Пример #4
0
def main(grid_size,
         discount,
         n_objects,
         n_colours,
         n_trajectories,
         epochs,
         learning_rate,
         start_state,
         wind=0.0,
         algo="maxnet",
         mdp="gridworld"):
    """
    Run inverse reinforcement learning on a gridworld or objectworld MDP.

    Saves PNG plots of the expert and recovered policy, state visitation
    frequencies (SVF) and rewards to the current working directory.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects (only used for objectworld). int.
    n_colours: Number of colours (only used for objectworld). int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    start_state: (x, y) pair forwarded to generate_trajectories when rolling
        out the recovered policy.
    wind: Forwarded to the MDP constructor. float.
    algo: IRL algorithm. "maxent" (alias "maxnet") or "deep_maxent"
        (alias "deep_maxnet"). The value is used verbatim in file names.
    mdp: Which MDP to build. String in {gridworld, objectworld}.
    Raises ValueError for an unknown algo or mdp.
    """
    # find_svf is needed on every path, so import it unconditionally. The
    # original imported it only in the MaxEnt branch and then used it after
    # the branch, which broke the deep path.
    import irl.maxent as maxent

    # The default ("maxnet") and the documented names never matched the
    # spelling the code tested for ("maxent"), so both spellings of each
    # algorithm are accepted. Unknown values fail fast instead of crashing
    # later on an empty reward.
    if algo in ("maxent", "maxnet"):
        use_deep = False
    elif algo in ("deep_maxent", "deep_maxnet"):
        use_deep = True
    else:
        raise ValueError("Unknown algo: {!r}".format(algo))

    sx, sy = start_state
    trajectory_length = 8

    if mdp == "objectworld":
        import irl.mdp.objectworld as objectworld
        ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                     discount)
    elif mdp == "gridworld":
        import irl.mdp.gridworld as gridworld
        ow = gridworld.Gridworld(grid_size, wind, discount)
    else:
        raise ValueError("Unknown mdp: {!r}".format(mdp))

    # Expert side: true reward, its policy, and demonstrations drawn from it.
    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy = find_policy(ow.n_states,
                         ow.n_actions,
                         ow.transition_probability,
                         ground_r,
                         ow.discount,
                         stochastic=False)
    optimal_v = optimal_value(ow.n_states,
                              ow.n_actions, ow.transition_probability,
                              normalize(ground_r), ow.discount)
    trajectories = ow.generate_trajectories(n_trajectories,
                                            trajectory_length,
                                            lambda s: policy[s],
                                            random_start=True)

    feature_matrix = ow.feature_matrix()

    print("trajectories = ", trajectories.shape)
    print("epochs = ", epochs)
    print("feature_matrix.shape = ", feature_matrix.shape)
    print("policy.shape = ", policy.shape)

    # Every output file shares this tail; only the prefix differs.
    file_suffix = "{}_t{}_e{}_w{}.png".format(algo, n_trajectories, epochs,
                                              wind)
    ow.plot_grid("policy_" + file_suffix, policy=policy, value=optimal_v)

    # Visitation frequencies of the demonstrations, computed for either
    # algorithm so the "Groundtruth SVF" plot always has data.
    ground_svf = maxent.find_svf(ow.n_states, trajectories)

    if use_deep:
        import irl.deep_maxent as deep_maxent
        l1 = l2 = 0
        structure = (3, 3)
        r = deep_maxent.irl((feature_matrix.shape[1], ) + structure,
                            feature_matrix,
                            ow.n_actions,
                            discount,
                            ow.transition_probability,
                            trajectories,
                            epochs,
                            learning_rate,
                            l1=l1,
                            l2=l2)
    else:
        r = maxent.irl(feature_matrix, ow.n_actions, discount,
                       ow.transition_probability, trajectories, epochs,
                       learning_rate)

    # Learner side: policy and value implied by the recovered reward.
    recovered_policy = find_policy(ow.n_states,
                                   ow.n_actions,
                                   ow.transition_probability,
                                   normalize(r),
                                   ow.discount,
                                   stochastic=False)
    recovered_v = value(recovered_policy, ow.n_states,
                        ow.transition_probability, normalize(r), ow.discount)

    new_trajectory = ow.generate_trajectories(n_trajectories,
                                              trajectory_length,
                                              lambda s: recovered_policy[s],
                                              True, (sx, sy))
    recovered_svf = maxent.find_svf(ow.n_states, new_trajectory)

    ow.plot_grid("recovered_policy_" + file_suffix,
                 policy=recovered_policy,
                 value=recovered_v)

    # Cell-edge coordinates for pcolor.
    y, x = np.mgrid[-0.5:grid_size + 0.5, -0.5:grid_size + 0.5]
    grid_shape = (grid_size, grid_size)

    plt.subplot(111)

    # NOTE(review): all four heatmaps are drawn onto the same axes and the
    # colorbar is created only for the first one (as in the original), so the
    # colorbar in the later files still refers to the first plot -- confirm
    # whether each plot should get a fresh figure.
    heatmaps = (
        ("ground_svf_", "Groundtruth SVF", ground_svf),
        ("recovered_svf_", "Recovered SVF", recovered_svf),
        ("ground_reward_", "Groundtruth reward", normalize(ground_r)),
        ("recovered_reward_", "Recovered reward", normalize(r)),
    )
    for index, (prefix, title, values) in enumerate(heatmaps):
        plt.pcolor(x, y, values.reshape(grid_shape))
        if index == 0:
            plt.colorbar()
        plt.title(title)
        plt.savefig(prefix + file_suffix, format="png", dpi=150)