示例#1
0
def expected_value_difference(n_states, n_actions, transition_probability,
                              reward, discount, p_start_state, optimal_value,
                              true_reward):
    """
    Compute the expected value difference (EVD) of a recovered reward: how
    much start-state-weighted value is lost by acting on the recovered reward
    instead of using the supplied optimal values.

    n_states: Number of states. int.
    n_actions: Number of actions. int.
    transition_probability: NumPy array mapping (state_i, action, state_k) to
        the probability of transitioning from state_i to state_k under action.
        Shape (N, A, N).
    reward: Recovered reward vector mapping state int to reward. Shape (N,).
    discount: Discount factor. float.
    p_start_state: Probability vector with the ith component as the probability
        that the ith state is the start state. Shape (N,).
    optimal_value: Value vector for the ground reward with optimal policy.
        The ith component is the value of the ith state. Shape (N,).
    true_reward: True reward vector. Shape (N,).
    -> Expected value difference. float.
    """

    # Solve the MDP for the recovered reward, then keep the highest-scoring
    # action of every state (argmax over the action axis).
    recovered_policy = value_iteration.find_policy(
        n_states, n_actions, transition_probability, reward, discount)
    greedy_actions = recovered_policy.argmax(axis=1)

    # Evaluate that greedy policy under the true reward.
    recovered_value = value_iteration.value(
        greedy_actions, n_states, transition_probability, true_reward,
        discount)

    # Weight both value vectors by the start-state distribution and compare.
    expected_optimal = optimal_value.dot(p_start_state)
    expected_recovered = recovered_value.dot(p_start_state)
    return expected_optimal - expected_recovered
示例#2
0
def main(grid_size, discount, L):
    """
    Run the large-state-space linear IRL routine (linear_irl.large_irl) on the
    gridworld MDP.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    L: Passed unchanged as the last argument of linear_irl.large_irl;
        presumably a regularisation coefficient (confirm against linear_irl).
    -> Tuple (ground_r, rl1, rl2, rl1l2): the ground-truth reward vector
        followed by the three results returned by linear_irl.large_irl.
    """
    wind = 0.3
    gw = gridworld.Gridworld(grid_size, wind, discount)
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])

    # The expert policy is obtained by solving the MDP for the ground-truth
    # reward rather than by using the hand-written deterministic optimal
    # policy of the gridworld.
    policy = find_policy(
        gw.n_states,
        gw.n_actions,
        gw.transition_probability,
        ground_r,
        discount,
    )

    # Need a value function for each basis function: treat every feature
    # column as a reward vector and evaluate the expert policy under it.
    feature_matrix = gw.feature_matrix()
    values = np.array([
        value(policy, gw.n_states, gw.transition_probability,
              feature_matrix[:, dim], gw.discount)
        for dim in range(feature_matrix.shape[1])
    ]).T

    rl1, rl2, rl1l2 = linear_irl.large_irl(values, gw.transition_probability,
                                           feature_matrix, gw.n_states,
                                           gw.n_actions, policy, L)
    return ground_r, rl1, rl2, rl1l2
示例#3
0
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate, structure):
    """
    Run deep maximum entropy inverse reinforcement learning on the objectworld
    MDP and show the ground-truth and recovered rewards side by side.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    structure: Neural network structure. Tuple of hidden layer dimensions, e.g.,
        () is no neural network (linear maximum entropy) and (3, 4) is two
        hidden layers with dimensions 3 and 4.
    """

    wind = 0.3
    trajectory_length = 8
    l1 = l2 = 0  # No regularisation.

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                 discount)
    ground_r = np.array([ow.reward(state) for state in range(ow.n_states)])

    # Expert demonstrations come from the policy that solves the MDP for the
    # ground-truth reward.
    policy = find_policy(ow.n_states,
                         ow.n_actions,
                         ow.transition_probability,
                         ground_r,
                         ow.discount,
                         stochastic=False)

    def expert_action(state):
        return policy[state]

    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length,
                                            expert_action)

    feature_matrix = ow.feature_matrix(discrete=False)
    # The input layer width is the feature dimension; hidden layers follow.
    layer_sizes = (feature_matrix.shape[1], ) + structure
    r = deep_maxent.irl(layer_sizes,
                        feature_matrix,
                        ow.n_actions,
                        discount,
                        ow.transition_probability,
                        trajectories,
                        epochs,
                        learning_rate,
                        l1=l1,
                        l2=l2)

    panels = ((ground_r, "Groundtruth reward"), (r, "Recovered reward"))
    for position, (rewards, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.pcolor(rewards.reshape((grid_size, grid_size)))
        plt.colorbar()
        plt.title(title)
    plt.show()
示例#4
0
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the objectworld MDP,
    printing the intermediate data along the way, and show the ground-truth
    and recovered rewards side by side.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """

    wind = 0.3
    trajectory_length = 8

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                 discount)
    ground_r = np.array([ow.reward(state) for state in range(ow.n_states)])

    # Debug dump of the MDP definition.
    print("ow.n_states", ow.n_states)
    print("ow.n_actions", ow.n_actions)
    print("ow.transition_probability", ow.transition_probability,
          len(ow.transition_probability), len(ow.transition_probability[0]),
          len(ow.transition_probability[0][0]))
    print("ground_r", ground_r, len(ground_r))
    print("ow.discount", ow.discount)

    # Expert demonstrations come from the policy that solves the MDP for the
    # ground-truth reward.
    policy = find_policy(ow.n_states,
                         ow.n_actions,
                         ow.transition_probability,
                         ground_r,
                         ow.discount,
                         stochastic=False)

    def expert_action(state):
        return policy[state]

    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length,
                                            expert_action)
    print(trajectories)

    feature_matrix = ow.feature_matrix(discrete=False)
    print("feature_matrix", feature_matrix, len(feature_matrix),
          len(feature_matrix[0]))

    r = maxent.irl(feature_matrix, ow.n_actions, discount,
                   ow.transition_probability, trajectories, epochs,
                   learning_rate)

    panels = ((ground_r, "Groundtruth reward"), (r, "Recovered reward"))
    for position, (rewards, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.pcolor(rewards.reshape((grid_size, grid_size)))
        plt.colorbar()
        plt.title(title)
    plt.show()
示例#5
0
def find_expected_svf(n_states, r, n_actions, discount,
                      transition_probability, trajectories):
    """
    Find the expected state visitation frequencies (SVF) using algorithm 1
    from Ziebart et al. 2008.

    n_states: Number of states N. int.
    r: Reward. NumPy array with shape (N,).
    n_actions: Number of actions A. int.
    discount: Discount factor of the MDP. float.
    transition_probability: NumPy array mapping (state_i, action, state_k) to
        the probability of transitioning from state_i to state_k under action.
        Shape (N, A, N).
    trajectories: 3D array of state/action pairs. States are ints, actions
        are ints. NumPy array with shape (T, L, 2) where T is the number of
        trajectories and L is the trajectory length.
    -> Expected state visitation frequencies vector with shape (N,).
    """

    n_trajectories = trajectories.shape[0]
    trajectory_length = trajectories.shape[1]

    # Policy for the current reward estimate; indexed below as
    # policy[state, action], i.e. it is used as a stochastic policy.
    policy = value_iteration.find_policy(n_states, n_actions,
                                         transition_probability, r, discount)

    # Empirical start-state distribution: fraction of trajectories that begin
    # in each state.
    start_state_count = np.zeros(n_states)
    for trajectory in trajectories:
        start_state_count[trajectory[0, 0]] += 1
    p_start_state = start_state_count / n_trajectories

    # Column t holds the state distribution at time step t; column 0 is the
    # start distribution and every later column is overwritten below.
    expected_svf = np.tile(p_start_state, (trajectory_length, 1)).T
    for t in range(1, trajectory_length):
        # Propagate one step:
        #   svf[k, t] = sum_{i, j} svf[i, t-1] * policy[i, j] * P[i, j, k]
        # done as a single contraction instead of a Python loop over every
        # (state, action, state) triple.
        expected_svf[:, t] = np.einsum("i,ij,ijk->k",
                                       expected_svf[:, t - 1],
                                       policy,
                                       transition_probability)

    # Sum over time to get the expected visitation count of every state.
    return expected_svf.sum(axis=1)
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs, learning_rate, structure):
    """
    Recover a reward on the objectworld MDP with deep maximum entropy inverse
    reinforcement learning, then plot it next to the ground-truth reward.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    structure: Neural network structure. Tuple of hidden layer dimensions, e.g.,
        () is no neural network (linear maximum entropy) and (3, 4) is two
        hidden layers with dimensions 3 and 4.
    """

    wind = 0.3
    trajectory_length = 8
    l1 = l2 = 0  # Regularisation is switched off.

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind, discount)
    ground_r = np.array([ow.reward(state) for state in range(ow.n_states)])

    # Solve the MDP for the true reward; this policy plays the expert.
    policy = find_policy(
        ow.n_states,
        ow.n_actions,
        ow.transition_probability,
        ground_r,
        ow.discount,
        stochastic=False,
    )
    trajectories = ow.generate_trajectories(
        n_trajectories, trajectory_length, lambda state: policy[state]
    )

    feature_matrix = ow.feature_matrix(discrete=False)
    n_features = feature_matrix.shape[1]
    r = deep_maxent.irl(
        (n_features,) + structure,
        feature_matrix,
        ow.n_actions,
        discount,
        ow.transition_probability,
        trajectories,
        epochs,
        learning_rate,
        l1=l1,
        l2=l2,
    )

    grid_shape = (grid_size, grid_size)
    titled_rewards = [("Groundtruth reward", ground_r), ("Recovered reward", r)]
    for column, (title, rewards) in enumerate(titled_rewards, start=1):
        plt.subplot(1, 2, column)
        plt.pcolor(rewards.reshape(grid_shape))
        plt.colorbar()
        plt.title(title)
    plt.show()
示例#7
0
def main(grid_size, discount, L, trust):
    """
    Run linear IRL (linear_irl.irl) on the gridworld MDP.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    L: Regularisation coefficient, passed as the last argument of
        linear_irl.irl.
    trust: The gridworld wind (the expert's random-action coefficient) is set
        to 1 - trust. float.
    -> Tuple (ground_r, rl1, rl2, rl1l2): the ground-truth reward vector
        followed by the three results returned by linear_irl.irl.
    """
    wind = 1 - trust  # Expert random-action coefficient.
    gw = gridworld.Gridworld(grid_size, wind, discount)
    # Ground-truth reward function.
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])

    # The expert policy is obtained by solving the MDP for the ground-truth
    # reward. Earlier experiments used the gridworld's built-in optimal
    # policies instead; the original notes report good results with the
    # deterministic one and worse results with the stochastic one.
    policy = find_policy(
        gw.n_states,
        gw.n_actions,
        gw.transition_probability,
        ground_r,
        discount,
    )

    # Rmax is fixed to 1; only the regularisation coefficient L varies.
    rl1, rl2, rl1l2 = linear_irl.irl(gw.n_states, gw.n_actions,
                                     gw.transition_probability, policy,
                                     gw.discount, 1, L)
    return ground_r, rl1, rl2, rl1l2
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs, learning_rate):
    """
    Recover a reward on the objectworld MDP with maximum entropy inverse
    reinforcement learning, then plot it next to the ground-truth reward.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """

    wind = 0.3
    trajectory_length = 8

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind, discount)
    ground_r = np.array([ow.reward(state) for state in range(ow.n_states)])

    # Solve the MDP for the true reward; this policy plays the expert.
    policy = find_policy(
        ow.n_states,
        ow.n_actions,
        ow.transition_probability,
        ground_r,
        ow.discount,
        stochastic=False,
    )
    trajectories = ow.generate_trajectories(
        n_trajectories, trajectory_length, lambda state: policy[state]
    )

    feature_matrix = ow.feature_matrix(discrete=False)
    r = maxent.irl(
        feature_matrix,
        ow.n_actions,
        discount,
        ow.transition_probability,
        trajectories,
        epochs,
        learning_rate,
    )

    grid_shape = (grid_size, grid_size)
    titled_rewards = [("Groundtruth reward", ground_r), ("Recovered reward", r)]
    for column, (title, rewards) in enumerate(titled_rewards, start=1):
        plt.subplot(1, 2, column)
        plt.pcolor(rewards.reshape(grid_shape))
        plt.colorbar()
        plt.title(title)
    plt.show()
def test_ow_once(grid_size, n_objects, n_colours, discrete, l1, l2, n_samples,
                 epochs, structure):
    """
    Test MaxEnt and DeepMaxEnt on an objectworld of size grid_size with
    n_samples paths, and save a 3x3 figure comparing rewards, policies and
    values.

    grid_size: Grid size. int.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    discrete: Whether the features should be discrete. bool.
    l1: L1 regularisation. float.
    l2: L2 regularisation. float.
    n_samples: Number of paths to sample.
    epochs: Number of epochs to run MaxEnt with.
    structure: Neural network structure tuple, e.g. (3, 3) would be a
        3-layer neural network with assumed inputs.
    -> Expected value difference for MaxEnt, DeepMaxEnt
    """

    # Basic gist of what we're doing here: Get the reward function using our
    # different IRL methods, use those to get a policy, evaluate that policy
    # using the true reward, and then return the difference in expected values.

    # Setup parameters.
    wind = 0.3
    discount = 0.9
    learning_rate = 0.01
    trajectory_length = 3 * grid_size

    # Make the objectworld and associated data.
    ow = Objectworld(grid_size, n_objects, n_colours, wind, discount)
    feature_matrix = ow.feature_matrix(discrete)
    ground_reward = np.array([ow.reward(i) for i in range(ow.n_states)])
    optimal_policy = value_iteration.find_policy(ow.n_states, ow.n_actions,
                                                 ow.transition_probability,
                                                 ground_reward,
                                                 discount).argmax(axis=1)
    trajectories = ow.generate_trajectories(n_samples, trajectory_length,
                                            optimal_policy.take)
    # Empirical start-state distribution over the sampled trajectories.
    p_start_state = (
        np.bincount(trajectories[:, 0, 0], minlength=ow.n_states) /
        trajectories.shape[0])

    # True value.
    optimal_V = value_iteration.optimal_value(ow.n_states, ow.n_actions,
                                              ow.transition_probability,
                                              ground_reward, ow.discount)

    # MaxEnt reward; policy; value. Linear MaxEnt is run through the deep
    # implementation with no hidden layers.
    maxent_reward = deep_maxent.irl((feature_matrix.shape[1], ),
                                    feature_matrix,
                                    ow.n_actions,
                                    ow.discount,
                                    ow.transition_probability,
                                    trajectories,
                                    epochs,
                                    learning_rate,
                                    l1=l1,
                                    l2=l2)

    maxent_policy = value_iteration.find_policy(ow.n_states, ow.n_actions,
                                                ow.transition_probability,
                                                maxent_reward,
                                                discount).argmax(axis=1)
    maxent_V = value_iteration.value(maxent_policy, ow.n_states,
                                     ow.transition_probability, ground_reward,
                                     ow.discount)
    maxent_EVD = optimal_V.dot(p_start_state) - maxent_V.dot(p_start_state)

    # DeepMaxEnt reward; policy; value.
    deep_learning_rate = 0.005  # For the 32 x 32 experiments.
    deep_maxent_reward = deep_maxent.irl(
        (feature_matrix.shape[1], ) + structure,
        feature_matrix,
        ow.n_actions,
        ow.discount,
        ow.transition_probability,
        trajectories,
        epochs,
        deep_learning_rate,
        l1=l1,
        l2=l2)

    deep_maxent_policy = value_iteration.find_policy(ow.n_states, ow.n_actions,
                                                     ow.transition_probability,
                                                     deep_maxent_reward,
                                                     discount).argmax(axis=1)
    deep_maxent_V = value_iteration.value(deep_maxent_policy, ow.n_states,
                                          ow.transition_probability,
                                          ground_reward, ow.discount)

    deep_maxent_EVD = (optimal_V.dot(p_start_state) -
                       deep_maxent_V.dot(p_start_state))

    # 3x3 comparison figure, filled row by row: rewards, policies, values.
    # Policy panels share a fixed colour range so the three are comparable.
    grid_shape = (grid_size, grid_size)
    policy_range = {"vmin": 0, "vmax": 3}
    panels = (
        ("Groundtruth reward", ground_reward, {}),
        ("MaxEnt reward", maxent_reward, {}),
        ("DeepMaxEnt reward", deep_maxent_reward, {}),
        ("Optimal policy", optimal_policy, policy_range),
        ("MaxEnt policy", maxent_policy, policy_range),
        ("DeepMaxEnt policy", deep_maxent_policy, policy_range),
        ("Optimal value", optimal_V, {}),
        ("MaxEnt value", maxent_V, {}),
        ("DeepMaxEnt value", deep_maxent_V, {}),
    )
    for index, (title, data, pcolor_kwargs) in enumerate(panels, start=1):
        plt.subplot(3, 3, index)
        plt.pcolor(data.reshape(grid_shape), **pcolor_kwargs)
        plt.title(title)
        # Hide all ticks and tick labels; only the colour map matters.
        plt.tick_params(labeltop=False,
                        labelbottom=False,
                        labelleft=False,
                        bottom=False,
                        top=False,
                        left=False,
                        right=False,
                        labelright=False)

    # A random suffix keeps repeated runs with the same settings from
    # overwriting each other's figures.
    plt.savefig("{}_{}_{}_{}_{}_{}_{}_{}_{}_objectworld_{}.png".format(
        grid_size, n_objects, n_colours, discrete, n_samples, epochs,
        structure, l1, l2, np.random.randint(10000000)))

    return maxent_EVD, deep_maxent_EVD
示例#10
0
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate, start_state):
    """
    Run maximum entropy inverse reinforcement learning on the objectworld MDP,
    roll out one trajectory under the recovered reward, and save a figure of
    the ground-truth and recovered rewards to "reward.png".

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    start_state: Pair (x, y) forwarded to ow.generate_trajectories for the
        rollout under the recovered policy; presumably the start cell of that
        rollout (confirm against objectworld).
    """

    sx, sy = start_state
    wind = 0.3
    trajectory_length = 8

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                 discount)
    ow.plot_grid()

    ground_r = np.array([ow.reward(state) for state in range(ow.n_states)])

    # Expert demonstrations come from the policy that solves the MDP for the
    # ground-truth reward.
    policy = find_policy(ow.n_states,
                         ow.n_actions,
                         ow.transition_probability,
                         ground_r,
                         ow.discount,
                         stochastic=False)
    print("Policy = ", policy.shape)

    def expert_action(state):
        return policy[state]

    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length,
                                            expert_action)
    print("trajectories = ", trajectories.shape)

    feature_matrix = ow.feature_matrix(discrete=False)
    r = maxent.irl(feature_matrix, ow.n_actions, discount,
                   ow.transition_probability, trajectories, epochs,
                   learning_rate)

    # Solve the MDP again, this time for the recovered reward.
    recovered_policy = find_policy(ow.n_states,
                                   ow.n_actions,
                                   ow.transition_probability,
                                   r,
                                   ow.discount,
                                   stochastic=False)

    def recovered_action(state):
        return recovered_policy[state]

    # NOTE(review): the trailing positional arguments look like
    # "no random start" and the start point - confirm against
    # Objectworld.generate_trajectories.
    new_trajectory = ow.generate_trajectories(1, trajectory_length,
                                              recovered_action, False,
                                              (sx, sy))
    print("new trajectory")
    for rollout in new_trajectory:
        ow.plot_grid("new_trajectory.png", rollout)
        for state, action, step_reward in rollout:
            print(ow.int_to_point(state), ow.actions[action], step_reward)
        print("---------")

    panels = ((ground_r, "Groundtruth reward"), (r, "Recovered reward"))
    for position, (rewards, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.pcolor(rewards.reshape((grid_size, grid_size)))
        plt.colorbar()
        plt.title(title)
    plt.savefig("reward.png", format="png", dpi=150)
示例#11
0
def main(grid_size, discount, n_trajectories, epochs, learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the gridworld MDP
    with three sets of demonstrations and compare the results.

    The three sets are: the raw expert trajectories, the pre-processed expert
    trajectories, and the pre-processed expert trajectories extended with
    trajectories generated by a supervised-learning policy.

    Plots the trajectories, the three recovered rewards (3D bar charts) and
    the per-epoch reward error of each run.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """

    # Wind models disturbance/noise: the probability that the expert slips
    # and takes a non-optimal action.
    wind = 0.1
    trajectory_length = 3 * grid_size

    gw = gridworld.Gridworld(grid_size, wind, discount)
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])

    def draw_states(trajs, title):
        # Plot only the visited states (first element of every step).
        draw_path(gw.grid_size,
                  [[step[0] for step in traj] for traj in trajs], title)

    def to_state_action_reward(trajs):
        # maxent.irl expects (state, action, reward) steps; drop the fourth
        # element carried by the generated trajectories.
        return [[(s, a, r) for (s, a, r, _) in traj] for traj in trajs]

    def recover_reward(trajs):
        # Run MaxEnt IRL; return the final reward scaled by its maximum and
        # the absolute error against the ground truth after every epoch.
        feature_matrix = gw.feature_matrix()
        final_r, reward_history = maxent.irl(feature_matrix, gw.n_actions,
                                             discount,
                                             gw.transition_probability,
                                             np.array(trajs), epochs,
                                             learning_rate)
        losses = [abs(r / max(r) - ground_r).sum() for r in reward_history]
        return final_r / max(final_r), losses

    # The policy solving the MDP for the true reward plays the expert and
    # produces the demonstration trajectories.
    policy = find_policy(gw.n_states, gw.n_actions, gw.transition_probability,
                         ground_r, discount)
    trajectories = gw.generate_trajectories(n_trajectories,
                                            trajectory_length,
                                            policy,
                                            random_start=True)
    draw_states(trajectories, '预处理前专家示例轨迹')

    # Pre-process the expert trajectories.
    new_trajectories = pre_treated(gw.n_states, gw.n_actions, trajectories)
    draw_states(new_trajectories, '预处理后专家示例轨迹')

    # Run 1: raw expert trajectories.
    trajectories = to_state_action_reward(trajectories)
    r1, loss1 = recover_reward(trajectories)

    # Run 2: pre-processed expert trajectories.
    new_trajectories = to_state_action_reward(new_trajectories)
    r2, loss2 = recover_reward(new_trajectories)

    # Supervised learning of a policy from the pre-processed trajectories.
    policy_sl = supervised_learning(new_trajectories, policy)
    n_policy_entries = len(policy)
    matches = sum(1 for i in range(n_policy_entries)
                  if policy_sl[i] == policy[i])
    # Divide once instead of accumulating 1/n so the rate is not distorted by
    # floating-point drift.
    equal = matches / n_policy_entries if n_policy_entries else 0
    print("监督学习得到的策略正确率{}%".format(100 * equal))

    # Generate and pre-process trajectories from the supervised policy.
    sl_trajectories = gw.generate_trajectories(n_trajectories,
                                               trajectory_length,
                                               policy_sl,
                                               random_start=True)
    new_sl_trajectories = pre_treated(gw.n_states, gw.n_actions,
                                      sl_trajectories)
    draw_states(new_sl_trajectories, '监督学习策略估计出的专家轨迹')
    new_sl_trajectories = to_state_action_reward(new_sl_trajectories)

    # Run 3: expert trajectories plus every supervised trajectory whose final
    # step equals the final step of some expert trajectory. Work on a copy so
    # the expert list is not mutated while it is being iterated.
    mix_trajectories = list(new_trajectories)
    for trajectory in new_sl_trajectories:
        if any(trajectory[-1] == expert[-1] for expert in new_trajectories):
            mix_trajectories.append(trajectory)
    r3, loss3 = recover_reward(mix_trajectories)

    # 3D bar charts of the recovered rewards. One (x, y) pair per grid cell:
    # meshgrid expands the two axes and ravel flattens them to the same
    # length as the reward vector.
    xx, yy = np.meshgrid(range(gw.grid_size), range(gw.grid_size))
    X, Y = xx.ravel(), yy.ravel()
    base = np.zeros_like(r1)  # Bars start at z = 0.
    width = depth = 1  # Footprint of every bar.
    colours = ['y'] * len(r1)

    reward_plots = (
        (r1, "未进行预处理恢复的R"),
        (r2, "预处理后恢复的R"),
        (r3, "预处理且监督学习恢复的R"),
    )
    for rewards, title in reward_plots:
        fig = plt.figure()
        # Figure.gca() no longer accepts projection keywords on current
        # matplotlib; add_subplot creates the 3D axes on old and new versions.
        ax = fig.add_subplot(111, projection='3d')
        # bar3d takes (x, y, z, dx, dy, dz): z is the base of the bar and dz
        # its height, so the rewards go last.
        ax.bar3d(X, Y, base, width, depth, rewards, color=colours,
                 shade=True)
        ax.set_xlabel('X')
        ax.set_ylabel('Y')
        ax.set_zlabel('reward_vale')
        plt.title(title)
        plt.show()

    # Error curves of the three runs.
    plt.plot(range(epochs), loss1, color='r', label='未加预处理')
    plt.plot(range(epochs), loss2, color='g', label='加了预处理')
    plt.plot(range(epochs), loss3, color='b', label='预处理且监督学习')
    plt.legend(loc=1)  # loc=1 places the legend in the upper right.
    plt.xlabel('epochs')
    plt.ylabel('Error')
    # Report the parameters actually used instead of fixed values.
    plt.title('grid_size={},discount={}'.format(grid_size, discount))
    plt.show()
def test_gw_once(grid_size, feature_map, n_samples, epochs, structure):
    """
    Test MaxEnt and DeepMaxEnt on a gw of size grid_size with the feature
    map feature_map with n_samples paths.

    A 3x3 figure (reward / policy / value rows; ground truth / MaxEnt /
    DeepMaxEnt columns) is saved to a PNG in the current directory.

    grid_size: Grid size. int.
    feature_map: Which feature map to use. String in {ident, coord, proxi}.
    n_samples: Number of paths to sample.
    epochs: Number of epochs to run MaxEnt with.
    structure: Neural network structure tuple, e.g. (3, 3) would be a
        3-layer neural network with assumed inputs.
    -> Expected value difference for MaxEnt, DeepMaxEnt
    """

    # Basic gist of what we're doing here: Get the reward function using our
    # different IRL methods, use those to get a policy, evaluate that policy
    # using the true reward, and then return the difference in expected values.

    # Setup parameters.
    wind = 0.3
    discount = 0.9
    learning_rate = 0.01
    trajectory_length = 3*grid_size

    # Make the gridworld and associated data.
    gw = Gridworld(grid_size, wind, discount)
    feature_matrix = gw.feature_matrix(feature_map)
    ground_reward = np.array([gw.reward(i) for i in range(gw.n_states)])
    optimal_policy = value_iteration.find_policy(gw.n_states,
                                                 gw.n_actions,
                                                 gw.transition_probability,
                                                 ground_reward,
                                                 discount).argmax(axis=1)
    trajectories = gw.generate_trajectories(n_samples,
                                            trajectory_length,
                                            optimal_policy.take)
    # Empirical start distribution: how often each value appears as the first
    # entry of a trajectory's first step (assumed to be the state index --
    # TODO confirm against generate_trajectories). The original referenced an
    # undefined name `ow` here; the gridworld in this function is `gw`.
    p_start_state = (np.bincount(trajectories[:, 0, 0], minlength=gw.n_states) /
                     trajectories.shape[0])

    # True value.
    optimal_V = value_iteration.optimal_value(gw.n_states,
                                              gw.n_actions,
                                              gw.transition_probability,
                                              ground_reward, gw.discount)

    def recover(layers):
        """Run IRL with the given layer tuple; return reward, policy, V, EVD."""
        reward = deep_maxent.irl(layers,
                                 feature_matrix,
                                 gw.n_actions,
                                 gw.discount,
                                 gw.transition_probability,
                                 trajectories, epochs, learning_rate)
        policy = value_iteration.find_policy(gw.n_states,
                                             gw.n_actions,
                                             gw.transition_probability,
                                             reward,
                                             discount).argmax(axis=1)
        # The recovered policy is scored against the TRUE reward.
        V = value_iteration.value(policy,
                                  gw.n_states,
                                  gw.transition_probability,
                                  ground_reward,
                                  gw.discount)
        evd = optimal_V.dot(p_start_state) - V.dot(p_start_state)
        return reward, policy, V, evd

    # MaxEnt reward; policy; value. Only the input width is passed as the
    # structure, i.e. none of the hidden layers from `structure` are used.
    maxent_reward, maxent_policy, maxent_V, maxent_EVD = recover(
        (feature_matrix.shape[1],))

    # DeepMaxEnt reward; policy; value.
    (deep_maxent_reward, deep_maxent_policy,
     deep_maxent_V, deep_maxent_EVD) = recover(
        (feature_matrix.shape[1],)+structure)

    # Policies are drawn with a fixed colour range of 0..3; the other panels
    # use matplotlib's automatic scaling.
    policy_range = {"vmin": 0, "vmax": 3}
    panels = [
        (ground_reward, "Groundtruth reward", {}),
        (maxent_reward, "MaxEnt reward", {}),
        (deep_maxent_reward, "DeepMaxEnt reward", {}),
        (optimal_policy, "Optimal policy", policy_range),
        (maxent_policy, "MaxEnt policy", policy_range),
        (deep_maxent_policy, "DeepMaxEnt policy", policy_range),
        (optimal_V, "Optimal value", {}),
        (maxent_V, "MaxEnt value", {}),
        (deep_maxent_V, "DeepMaxEnt value", {}),
    ]
    for position, (data, title, pcolor_kwargs) in enumerate(panels, start=1):
        plt.subplot(3, 3, position)
        plt.pcolor(data.reshape((grid_size, grid_size)), **pcolor_kwargs)
        plt.title(title)
        plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                        bottom=False, top=False, left=False, right=False,
                        labelright=False)

    # The original format string had five placeholders for six arguments, so
    # the random suffix was silently dropped and repeated runs overwrote the
    # same file. The sixth placeholder restores the per-run suffix.
    plt.savefig("{}_{}_{}_{}gridworld{}_{}.png".format(
        grid_size, feature_map, n_samples, epochs, structure,
        np.random.randint(10000000)))

    return maxent_EVD, deep_maxent_EVD
示例#13
0
def main(grid_size,
         discount,
         n_objects,
         n_colours,
         n_trajectories,
         epochs,
         learning_rate,
         start_state,
         wind=0.0,
         algo="maxnet",
         mdp="gridworld"):
    """
    Run inverse reinforcement learning on the objectworld or gridworld MDP.

    Plots the policy, state visitation frequencies and reward function, and
    saves each plot as a PNG in the current directory.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects (only used when mdp is "objectworld"). int.
    n_colours: Number of colours (only used when mdp is "objectworld"). int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    start_state: start location to generate trajectory from. (x, y) pair.
    wind: Value forwarded to the MDP constructor. float.
    algo: IRL algo to run. "maxnet" / "maxent" select irl.maxent;
        "deep_maxnet" / "deep_maxent" select irl.deep_maxent. The string is
        used verbatim in output file names.
    mdp: Which MDP to build: "objectworld" or "gridworld".
    Raises ValueError for an unknown algo or mdp.
    """

    sx, sy = start_state
    trajectory_length = 8

    # Validate selectors up front. The original compared against "maxent"
    # while the default and the documentation say "maxnet", so the default
    # call matched no branch and crashed later; both spellings are accepted.
    if algo in ("maxnet", "maxent"):
        use_deep = False
    elif algo in ("deep_maxnet", "deep_maxent"):
        use_deep = True
    else:
        raise ValueError("Unknown algo {!r}; expected 'maxnet' or "
                         "'deep_maxnet'.".format(algo))

    if mdp == "objectworld":
        import irl.mdp.objectworld as objectworld
        ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                     discount)
    elif mdp == "gridworld":
        import irl.mdp.gridworld as gridworld
        ow = gridworld.Gridworld(grid_size, wind, discount)
    else:
        raise ValueError("Unknown mdp {!r}; expected 'objectworld' or "
                         "'gridworld'.".format(mdp))

    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy = find_policy(ow.n_states,
                         ow.n_actions,
                         ow.transition_probability,
                         ground_r,
                         ow.discount,
                         stochastic=False)
    optimal_v = optimal_value(ow.n_states,
                              ow.n_actions, ow.transition_probability,
                              normalize(ground_r), ow.discount)
    trajectories = ow.generate_trajectories(n_trajectories,
                                            trajectory_length,
                                            lambda s: policy[s],
                                            random_start=True)

    feature_matrix = ow.feature_matrix()

    print("trajectories = ", trajectories.shape)
    print("epochs = ", epochs)
    print("feature_matrix.shape = ", feature_matrix.shape)
    print("policy.shape = ", policy.shape)
    ow.plot_grid("policy_{}_t{}_e{}_w{}.png".format(algo, n_trajectories,
                                                    epochs, wind),
                 policy=policy,
                 value=optimal_v)

    # find_svf is needed for both algorithms (ground-truth and recovered
    # visitation frequencies), so maxent is imported unconditionally. The
    # original imported it only in the maxent branch, which left the name
    # unbound further down when the deep algorithm was selected.
    import irl.maxent as maxent
    ground_svf = maxent.find_svf(ow.n_states, trajectories)

    if use_deep:
        import irl.deep_maxent as deep_maxent
        l1 = l2 = 0
        structure = (3, 3)
        r = deep_maxent.irl((feature_matrix.shape[1], ) + structure,
                            feature_matrix,
                            ow.n_actions,
                            discount,
                            ow.transition_probability,
                            trajectories,
                            epochs,
                            learning_rate,
                            l1=l1,
                            l2=l2)
    else:
        r = maxent.irl(feature_matrix, ow.n_actions, discount,
                       ow.transition_probability, trajectories, epochs,
                       learning_rate)

    recovered_policy = find_policy(ow.n_states,
                                   ow.n_actions,
                                   ow.transition_probability,
                                   normalize(r),
                                   ow.discount,
                                   stochastic=False)
    recovered_v = value(recovered_policy, ow.n_states,
                        ow.transition_probability, normalize(r), ow.discount)

    # Roll out the recovered policy from the requested start location.
    new_trajectory = ow.generate_trajectories(n_trajectories,
                                              trajectory_length,
                                              lambda s: recovered_policy[s],
                                              True, (sx, sy))
    recovered_svf = maxent.find_svf(ow.n_states, new_trajectory)

    ow.plot_grid("recovered_policy_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind),
                 policy=recovered_policy,
                 value=recovered_v)

    # Cell-edge coordinates for pcolor.
    y, x = np.mgrid[-0.5:grid_size + 0.5, -0.5:grid_size + 0.5]

    def save_heatmap(values, title, prefix):
        """Draw one grid heatmap on a cleared figure and save it as PNG."""
        # Clearing first keeps each image independent: the original drew all
        # four heatmaps on the same axes, so every later image carried the
        # colorbar (and colour scale) of the first plot.
        plt.clf()
        plt.subplot(111)
        plt.pcolor(x, y, values.reshape((grid_size, grid_size)))
        plt.colorbar()
        plt.title(title)
        plt.savefig("{}_{}_t{}_e{}_w{}.png".format(prefix, algo,
                                                   n_trajectories, epochs,
                                                   wind),
                    format="png",
                    dpi=150)

    save_heatmap(ground_svf, "Groundtruth SVF", "ground_svf")
    save_heatmap(recovered_svf, "Recovered SVF", "recovered_svf")
    save_heatmap(normalize(ground_r), "Groundtruth reward", "ground_reward")
    save_heatmap(normalize(r), "Recovered reward", "recovered_reward")