Example #1

import value_iteration  # value-iteration helpers used below; the exact import path depends on the project layout

def expected_value_difference(n_states, n_actions, transition_probability,
                              reward, discount, p_start_state, optimal_value,
                              true_reward):
    """
    Calculate the expected value difference, which is a proxy for how good a
    recovered reward function is.

    n_states: Number of states. int.
    n_actions: Number of actions. int.
    transition_probability: NumPy array mapping (state_i, action, state_k) to
        the probability of transitioning from state_i to state_k under action.
        Shape (N, A, N).
    reward: Reward vector mapping state int to reward. Shape (N,).
    discount: Discount factor. float.
    p_start_state: Probability vector with the ith component as the probability
        that the ith state is the start state. Shape (N,).
    optimal_value: Value vector for the ground reward with optimal policy.
        The ith component is the value of the ith state. Shape (N,).
    true_reward: True reward vector. Shape (N,).
    -> Expected value difference. float.
    """

    # Policy that is optimal with respect to the recovered reward.
    policy = value_iteration.find_policy(n_states, n_actions,
                                         transition_probability, reward,
                                         discount)
    # Value of that policy when evaluated under the true reward.
    value = value_iteration.value(policy.argmax(axis=1), n_states,
                                  transition_probability, true_reward,
                                  discount)

    # EVD: expected return of the optimal policy minus expected return of the
    # recovered-reward policy, both taken over the start-state distribution.
    evd = optimal_value.dot(p_start_state) - value.dot(p_start_state)
    return evd
Example #2

import value_iteration  # as in Example #1; the exact import path depends on the project layout

def expected_value_difference(n_states, n_actions, transition_probability,
                              reward, discount, p_start_state, optimal_value,
                              true_reward):
    """
    Calculate the expected value difference, which is a proxy for how good a
    recovered reward function is.
    """

    policy = value_iteration.find_policy(n_states, n_actions,
                                         transition_probability, reward,
                                         discount)
    value = value_iteration.value(policy.argmax(axis=1), n_states,
                                  transition_probability, true_reward,
                                  discount)

    evd = optimal_value.dot(p_start_state) - value.dot(p_start_state)
    return evd