Example #1
import numpy as np
from emdp.common import MDP

def build_twostate_MDP():
    """
    MDP with transition probabilities
    P(s_0 | s_0, a_0) = 0.5
    P(s_1 | s_0, a_0) = 0.5
    P(s_0 | s_0, a_1) = 0
    P(s_1 | s_0, a_1) = 1
    P(s_1 | s_0, a_2) = 0
    P(s_1 | s_1, a_2) = 1
    Rewards: r(s_0, a_0) = 5, r(s_0, a_1) = 10, r(s_1, a_2) = -1
    Discount factor: 0.9
    :return:
    """
    P = np.zeros((2, 3, 2))
    P[0, 0] = [0.5, 0.5]
    P[0, 1] = [0, 1]
    P[0, 2] = [1, 0]  # no op
    P[1, 2] = [0, 1]
    P[1, 1] = [0, 1]
    P[1, 0] = [0, 1]
    gamma = 0.9
    R = np.zeros((2, 3))
    R[0, 0] = 5
    R[0, 1] = 10
    R[1, 2] = -1

    return MDP(P, R, gamma, p0=np.array([0.5, 0.5]), terminal_states=[])
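
The tensors above follow the (state, action, next_state) layout the MDP constructor expects: P[s, a] is a distribution over next states and R[s, a] is the immediate reward. As a minimal sketch of consuming these arrays directly (illustrative only, not the emdp API; the .P and .R attribute names are assumptions):

import numpy as np

def sample_step(P, R, state, action, rng):
    """Sample s' ~ P[state, action, .] and return (next_state, reward)."""
    next_state = rng.choice(P.shape[-1], p=P[state, action])
    return next_state, R[state, action]

mdp = build_twostate_MDP()
rng = np.random.default_rng(0)
# Assumes the MDP object keeps its constructor arguments as .P and .R.
s_next, r = sample_step(mdp.P, mdp.R, state=0, action=0, rng=rng)  # s' is 0 or 1 with prob 0.5, reward 5
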
Example #2
def build_imani_counterexample():
    """
    MDP counterexample given in Fig. 1a of Imani et al.,
    "An Off-policy Policy Gradient Theorem Using Emphatic Weightings,"
    NeurIPS 2018.
    :return:
    """
    # |S| = 4, |A| = 2
    STATES = 4
    ACTIONS = 2
    P = np.zeros((STATES, ACTIONS, STATES))
    P[0, 0] = [0, 1, 0, 0]
    P[0, 1] = [0, 0, 1, 0]
    P[1, 0] = [0, 0, 0, 1]
    P[1, 1] = [0, 0, 0, 1]
    P[2, 0] = [0, 0, 0, 1]
    P[2, 1] = [0, 0, 0, 1]
    P[3, 0] = [0, 0, 0, 1]
    P[3, 1] = [0, 0, 0, 1]
    gamma = 0.99999
    R = np.zeros((STATES, ACTIONS))
    R[0, 0] = 0
    R[0, 1] = 1
    R[1, 0] = 2
    R[1, 1] = 0
    R[2, 0] = 0
    R[2, 1] = 1

    return MDP(P, R, gamma, p0=np.array([1, 0, 0, 0]), terminal_states=[3])
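
To inspect this counterexample numerically, any fixed policy can be evaluated in closed form via V_pi = (I - gamma * P_pi)^(-1) r_pi. The helper below is an illustrative sketch, not code from the paper or from emdp, and it assumes the MDP object exposes its constructor arguments as .P, .R and .gamma:

import numpy as np

def evaluate_policy(P, R, gamma, pi):
    """Exact policy evaluation for P of shape (S, A, S), R of shape (S, A),
    and a row-stochastic policy pi of shape (S, A)."""
    P_pi = np.einsum('sa,sat->st', pi, P)  # state-to-state transitions under pi
    r_pi = np.einsum('sa,sa->s', pi, R)    # expected immediate reward under pi
    n_states = P.shape[0]
    return np.linalg.solve(np.eye(n_states) - gamma * P_pi, r_pi)

mdp = build_imani_counterexample()
pi_uniform = np.full((4, 2), 0.5)  # uniform-random policy over both actions
V = evaluate_policy(mdp.P, mdp.R, mdp.gamma, pi_uniform)  # assumed attribute names
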
Example #3
def build_two_circle_MDP(discount=0.6, good_reward=10., distractor_reward=5.):
    """MDP counter example given in Fig 1a of Zhang, et al.

    See "Generalized Off-Policy Actor-Critic" https://arxiv.org/pdf/1903.11329.pdf

    :param discount: The discount factor.
    :param good_reward: The good reward that the agent must find.
    :param distractor_reward: The distractor reward.
    :returns: An emdp.common.MDP object.
    """
    ACTIONS = 2
    STATES = 11
    # Reference labels for the MDP states, as in the paper.
    A = 0
    C = 1
    B = 5
    ACTUAL_REWARD_STATE = 3
    JOINER_STATE = 4

    # State 0 (A) is the starting state
    # States 1 - 3 are states in the first chain.
    FIRST_CHAIN = [C, 2, ACTUAL_REWARD_STATE, JOINER_STATE]
    # States 5 - 7 are states in the second chain.
    SECOND_CHAIN = [B, 6, 7, JOINER_STATE]
    # State 4 joins the two chains.
    # States 8 - 10 are states that lead back to A
    CONNECTION_CHAIN = [JOINER_STATE, 8, 9, 10, A]

    # DEFINING TRANSITION MATRIX.
    P = np.zeros((STATES, ACTIONS, STATES))

    # From the start state A, the two actions branch into different chains.
    P[A, 0, C] = 1.
    P[A, 1, B] = 1.

    # Within the chains, any action should lead to the next state in the chain.
    for chain in [FIRST_CHAIN, SECOND_CHAIN, CONNECTION_CHAIN]:
        for state_t, state_tp1 in zip(chain[:-1], chain[1:]):
            P[state_t, :, state_tp1] = 1.

    # DEFINING DISCOUNT FACTOR.
    gamma = discount

    # DEFINING REWARDS.
    # Both actions in the reward state yield the good reward.
    R = np.zeros((STATES, ACTIONS))
    R[ACTUAL_REWARD_STATE, :] = good_reward
    # Both actions in state B yield the distractor reward.
    R[B, :] = distractor_reward

    # DEFINING START STATES.
    p0 = np.zeros(STATES)
    p0[A] = 1.

    return MDP(P, R, gamma, p0=p0, terminal_states=[])
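
Because the two-circle structure is assembled by looping over chains, a quick sanity check that every (state, action) slice of P is a proper distribution, and that the deterministic dynamics loop back to A, can catch indexing mistakes. This is an illustrative check only; the .P attribute name is an assumption:

import numpy as np

mdp = build_two_circle_MDP()
P = mdp.P  # assumed attribute name for the transition tensor
# Every (state, action) slice must sum to 1.
assert np.allclose(P.sum(axis=-1), 1.0)
# Following action 0 from A traverses the first chain, then the connecting
# chain, and returns to A after 8 deterministic steps.
state = 0
for _ in range(8):
    state = int(P[state, 0].argmax())
assert state == 0
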
Example #4
def build_cake_world_mdp(epsilon, discount, cake_reward=1.0):
    r"""Cake world MDP from Action Gap Paper (Fig 1 of Bellemare et al. 2016).

    Increasing the Action Gap: New Operators for Reinforcement Learning.
    https://arxiv.org/pdf/1512.04860.pdf

    The action gap is modulated by epsilon, since the difference between the
    Q-values of the two actions in x1 is `Q(x1, a2) - Q(x1, a1) = epsilon`.

    :param epsilon: Float epsilon for the action gap.
    :param discount: Float discount factor.
    :param cake_reward: Float reward for eating cake.
    :returns: An emdp.common.MDP object.
    """
    STATES = 2
    ACTIONS = 2

    # Short hand to make following paper easy.
    x1, x2 = 0, 1
    a1, a2 = 0, 1

    P = np.zeros((STATES, ACTIONS, STATES))

    # Taking action a1 in state x1 takes you to x1 or x2 with equal likelihood.
    P[x1, a1, :] = .5

    # Taking the abstain action leads you back to x1.
    P[x1, a2, x1] = 1.

    # All actions from x2 should lead to x2 (Terminal state).
    P[x2, :, x2] = 1.

    # Found by solving for `r` in `V(x2) = r + discount * V(x2)`, with the
    # target value V(x2) = -2(1+e)/gamma (which makes the action gap at x1
    # equal to epsilon):
    #   -2(1+e)/gamma = r + gamma * (-2(1+e)/gamma).
    #   Let r = rhat / gamma.
    #   => -2(1+e) = rhat - 2 * gamma * (1+e)
    #   => rhat = -2 (1+e)(1 - gamma)
    #   => r = -2 (1+e)(1 - gamma) / gamma.
    forever_reward = -2.0 * (1 + epsilon) * (1 - discount) / discount

    R = np.zeros((STATES, ACTIONS))
    R[x2, :] = forever_reward  # Small negative forever reward.
    R[x1, a1] = cake_reward  # Cake!
    R[x1, a2] = 0.  # Abstain cake!

    p0 = np.array([1.0, 0.0])

    return MDP(P, R, discount, p0=p0, terminal_states=[x2])
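
The action-gap claim in the docstring can be checked numerically with a small Q-value iteration over the raw P and R arrays (treating x2 as absorbing, as in the derivation above, rather than using emdp's terminal-state handling). This is an illustrative sketch; the .P, .R and .gamma attribute names are assumptions:

import numpy as np

epsilon, discount = 0.1, 0.9
mdp = build_cake_world_mdp(epsilon, discount)
P, R, gamma = mdp.P, mdp.R, mdp.gamma  # assumed attribute names

# Q-value iteration: Q[s, a] = R[s, a] + gamma * sum_s' P[s, a, s'] * max_a' Q[s', a'].
Q = np.zeros_like(R)
for _ in range(2000):
    Q = R + gamma * (P @ Q.max(axis=1))

x1, a1, a2 = 0, 0, 1
# Abstaining beats eating the cake by exactly epsilon.
assert np.isclose(Q[x1, a2] - Q[x1, a1], epsilon)
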
Example #5
File: mdp.py Project: d3sm0/pg
def get_shamdp(horizon=20, c=1.6):
    gamma = horizon / (horizon + 1)
    n_actions = 4
    # Modified MDP: going right anywhere except in the absorbing state incurs a penalty.
    penalty = -gamma**(horizon // c)
    P, r = build_chain_mdp(horizon, n_actions, 1,
                           penalty)  # same transition dynamics as original MDP
    n_states = P.shape[0]

    initial_state_distribution = np.zeros(n_states)
    initial_state_distribution[0] = 1.  # always start in the first state
    from emdp.common import MDP
    return MDP(P,
               r,
               gamma,
               initial_state_distribution,
               terminal_states=[n_states - 2])
Example #6
def test_simple_reset_MDP():
    P = np.array([
        # State 0: LEFT stays in state 0, RIGHT moves to state 1.
        [[1, 0],
         [0, 1]],
        # State 1 (terminal): any action stays in state 1.
        [[0, 1],
         [0, 1]],
    ])
    p0 = np.array([1, 0])
    R = np.array([
        [0, 5],  # RIGHT action from state 0 gives +5 reward.
        [0, 0]
    ])
    # 2-state MDP where moving RIGHT from state 0 gives +5 reward and
    # leads into the terminal state 1.
    mdp = MDP(P, R, 0.9, p0, [1])

    # check if we are indeed in the starting state:
    assert np.all(np.equal(mdp.current_state, np.array([1, 0])))

    # simulate an episode
    state, reward, done, _ = mdp.step(0)  # left step, no end
    assert np.all(np.equal(state, np.array([1, 0])))
    assert reward == 0
    assert not done

    # simulate another step
    state, reward, done, _ = mdp.step(1)  # right step
    assert np.all(np.equal(state, np.array([0, 1])))
    assert reward == +5
    assert not done

    # simulate another step (should return done)
    state, reward, done, _ = mdp.step(1)  # right step
    assert np.all(np.equal(state, np.array([0, 1])))
    assert reward == 0
    assert done

    try:
        mdp.step(0)
        assert False, 'This should throw an EpisodeDoneError'
    except EpisodeDoneError:
        assert True