Example #1
import numpy as np
from emdp.common import MDP

def build_twostate_MDP():
    """
    MDP with transition probabilities
    P(s_0 | s_0, a_0) = 0.5
    P(s_1 | s_0, a_0) = 0.5
    P(s_0 | s_0, a_1) = 0
    P(s_1 | s_0, a_1) = 1
    P(s_1 | s_0, a_2) = 0
    P(s_1 | s_1, a_2) = 1
    Rewards: r(s_0, a_0) = 5, r(s_0, a_1) = 10, r(s_1, a_2) = -1
    Discount factor: 0.9
    :return:
    """
    P = np.zeros((2, 3, 2))
    P[0, 0] = [0.5, 0.5]
    P[0, 1] = [0, 1]
    P[0, 2] = [1, 0]  # no op
    P[1, 2] = [0, 1]
    P[1, 1] = [0, 1]
    P[1, 0] = [0, 1]
    gamma = 0.9
    R = np.zeros((2, 3))
    R[0, 0] = 5
    R[0, 1] = 10
    R[1, 2] = -1

    return MDP(P, R, gamma, p0=np.array([0.5, 0.5]), terminal_states=[])
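
The tensors above follow the (state, action, next_state) layout the MDP constructor expects: P[s, a] is a distribution over next states and R[s, a] is the immediate reward. As a minimal sketch of consuming these arrays directly (illustrative only, not the emdp API; the .P and .R attribute names are assumptions):

import numpy as np

def sample_step(P, R, state, action, rng):
    """Sample s' ~ P[state, action, .] and return (next_state, reward)."""
    next_state = rng.choice(P.shape[-1], p=P[state, action])
    return next_state, R[state, action]

mdp = build_twostate_MDP()
rng = np.random.default_rng(0)
# Assumes the MDP object keeps its constructor arguments as .P and .R.
s_next, r = sample_step(mdp.P, mdp.R, state=0, action=0, rng=rng)  # s' is 0 or 1 with prob 0.5, reward 5
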
Example #2
def build_imani_counterexample():
    """
    MDP counterexample given in Fig. 1a of Imani et al.,
    "An Off-policy Policy Gradient Theorem Using Emphatic Weightings,"
    NeurIPS 2018.
    :return:
    """
    # |S| = 4, |A| = 2
    STATES = 4
    ACTIONS = 2
    P = np.zeros((STATES, ACTIONS, STATES))
    P[0, 0] = [0, 1, 0, 0]
    P[0, 1] = [0, 0, 1, 0]
    P[1, 0] = [0, 0, 0, 1]
    P[1, 1] = [0, 0, 0, 1]
    P[2, 0] = [0, 0, 0, 1]
    P[2, 1] = [0, 0, 0, 1]
    P[3, 0] = [0, 0, 0, 1]
    P[3, 1] = [0, 0, 0, 1]
    gamma = 0.99999
    R = np.zeros((STATES, ACTIONS))
    R[0, 0] = 0
    R[0, 1] = 1
    R[1, 0] = 2
    R[1, 1] = 0
    R[2, 0] = 0
    R[2, 1] = 1

    return MDP(P, R, gamma, p0=np.array([1, 0, 0, 0]), terminal_states=[3])
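
To inspect this counterexample numerically, any fixed policy can be evaluated in closed form via V_pi = (I - gamma * P_pi)^(-1) r_pi. The helper below is an illustrative sketch, not code from the paper or from emdp, and it assumes the MDP object exposes its constructor arguments as .P, .R and .gamma:

import numpy as np

def evaluate_policy(P, R, gamma, pi):
    """Exact policy evaluation for P of shape (S, A, S), R of shape (S, A),
    and a row-stochastic policy pi of shape (S, A)."""
    P_pi = np.einsum('sa,sat->st', pi, P)  # state-to-state transitions under pi
    r_pi = np.einsum('sa,sa->s', pi, R)    # expected immediate reward under pi
    n_states = P.shape[0]
    return np.linalg.solve(np.eye(n_states) - gamma * P_pi, r_pi)

mdp = build_imani_counterexample()
pi_uniform = np.full((4, 2), 0.5)  # uniform-random policy over both actions
V = evaluate_policy(mdp.P, mdp.R, mdp.gamma, pi_uniform)  # assumed attribute names
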
Example #3
def build_two_circle_MDP(discount=0.6, good_reward=10., distractor_reward=5.):
    """MDP counter example given in Fig 1a of Zhang, et al.

    See "Generalized Off-Policy Actor-Critic" https://arxiv.org/pdf/1903.11329.pdf

    :param discount: The discount factor.
    :param good_reward: The good reward that the agent must find.
    :param distractor_reward: The distractor reward.
    :returns: An emdp.common.MDP object.
    """
    ACTIONS = 2
    STATES = 11
    # Reference labels for the MDP states, as in the paper.
    A = 0
    C = 1
    B = 5
    ACTUAL_REWARD_STATE = 3
    JOINER_STATE = 4

    # State 0 (A) is the starting state
    # States 1 - 3 are states in the first chain.
    FIRST_CHAIN = [C, 2, ACTUAL_REWARD_STATE, JOINER_STATE]
    # States 5 - 7 are states in the second chain.
    SECOND_CHAIN = [B, 6, 7, JOINER_STATE]
    # State 4 joins the two chains.
    # States 8 - 10 are states that lead back to A
    CONNECTION_CHAIN = [JOINER_STATE, 8, 9, 10, A]

    # DEFINING TRANSITION MATRIX.
    P = np.zeros((STATES, ACTIONS, STATES))

    # From the start state A, the two actions branch into different chains.
    P[A, 0, C] = 1.
    P[A, 1, B] = 1.

    # Within the chains, any action should lead to the next state in the chain.
    for chain in [FIRST_CHAIN, SECOND_CHAIN, CONNECTION_CHAIN]:
        for state_t, state_tp1 in zip(chain[:-1], chain[1:]):
            P[state_t, :, state_tp1] = 1.

    # DEFINING DISCOUNT FACTOR.
    gamma = discount

    # DEFINING REWARDS.
    # Both actions in the reward state yield the good reward.
    R = np.zeros((STATES, ACTIONS))
    R[ACTUAL_REWARD_STATE, :] = good_reward
    # Both actions in state B yield the distractor reward.
    R[B, :] = distractor_reward

    # DEFINING START STATES.
    p0 = np.zeros(STATES)
    p0[A] = 1.

    return MDP(P, R, gamma, p0=p0, terminal_states=[])
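
Because the two-circle structure is assembled by looping over chains, a quick sanity check that every (state, action) slice of P is a proper distribution, and that the deterministic dynamics loop back to A, can catch indexing mistakes. This is an illustrative check only; the .P attribute name is an assumption:

import numpy as np

mdp = build_two_circle_MDP()
P = mdp.P  # assumed attribute name for the transition tensor
# Every (state, action) slice must sum to 1.
assert np.allclose(P.sum(axis=-1), 1.0)
# Following action 0 from A traverses the first chain, then the connecting
# chain, and returns to A after 8 deterministic steps.
state = 0
for _ in range(8):
    state = int(P[state, 0].argmax())
assert state == 0
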
Example #4
def build_cake_world_mdp(epsilon, discount, cake_reward=1.0):
    r"""Cake world MDP from Action Gap Paper (Fig 1 of Bellemare et al. 2016).

    Increasing the Action Gap: New Operators for Reinforcement Learning.
    https://arxiv.org/pdf/1512.04860.pdf

    The action gap is modulated by epsilon, since the difference between the
    Q-values of the two actions in x1 is `Q(x1, a2) - Q(x1, a1) = epsilon`.

    :param epsilon: Float epsilon for the action gap.
    :param discount: Float discount factor.
    :param cake_reward: Float reward for eating cake.
    :returns: An emdp.common.MDP object.
    """
    STATES = 2
    ACTIONS = 2

    # Short hand to make following paper easy.
    x1, x2 = 0, 1
    a1, a2 = 0, 1

    P = np.zeros((STATES, ACTIONS, STATES))

    # Taking action a1 in state x1 takes you to x1 or x2 with equal likelihood.
    P[x1, a1, :] = .5

    # Taking the abstain action leads you back to x1.
    P[x1, a2, x1] = 1.

    # All actions from x2 should lead to x2 (Terminal state).
    P[x2, :, x2] = 1.

    # Found by solving for `r` in `V(x2) = r + discount * V(x2)`, with the
    # target value V(x2) = -2(1+e)/gamma (which makes the action gap at x1
    # equal to epsilon):
    #   -2(1+e)/gamma = r + gamma * (-2(1+e)/gamma).
    #   Let r = rhat / gamma.
    #   => -2(1+e) = rhat - 2 * gamma * (1+e)
    #   => rhat = -2 (1+e)(1 - gamma)
    #   => r = -2 (1+e)(1 - gamma) / gamma.
    forever_reward = -2.0 * (1 + epsilon) * (1 - discount) / discount

    R = np.zeros((STATES, ACTIONS))
    R[x2, :] = forever_reward  # Small negative forever reward.
    R[x1, a1] = cake_reward  # Cake!
    R[x1, a2] = 0.  # Abstain cake!

    p0 = np.array([1.0, 0.0])

    return MDP(P, R, discount, p0=p0, terminal_states=[x2])
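
The action-gap claim in the docstring can be checked numerically with a small Q-value iteration over the raw P and R arrays (treating x2 as absorbing, as in the derivation above, rather than using emdp's terminal-state handling). This is an illustrative sketch; the .P, .R and .gamma attribute names are assumptions:

import numpy as np

epsilon, discount = 0.1, 0.9
mdp = build_cake_world_mdp(epsilon, discount)
P, R, gamma = mdp.P, mdp.R, mdp.gamma  # assumed attribute names

# Q-value iteration: Q[s, a] = R[s, a] + gamma * sum_s' P[s, a, s'] * max_a' Q[s', a'].
Q = np.zeros_like(R)
for _ in range(2000):
    Q = R + gamma * (P @ Q.max(axis=1))

x1, a1, a2 = 0, 0, 1
# Abstaining beats eating the cake by exactly epsilon.
assert np.isclose(Q[x1, a2] - Q[x1, a1], epsilon)
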
Example #5
File: mdp.py Project: d3sm0/pg
def get_shamdp(horizon=20, c=1.6):
    gamma = horizon / (horizon + 1)
    n_actions = 4
    # Modified MDP: going right anywhere except in the absorbing state incurs a penalty.
    penalty = -gamma**(horizon // c)
    P, r = build_chain_mdp(horizon, n_actions, 1,
                           penalty)  # same transition dynamics as original MDP
    n_states = P.shape[0]

    initial_state_distribution = np.zeros(n_states)
    initial_state_distribution[0] = 1.  # always start in the first state
    from emdp.common import MDP
    return MDP(P,
               r,
               gamma,
               initial_state_distribution,
               terminal_states=[n_states - 2])
Example #6
def test_simple_reset_MDP():
    P = np.array([
        # State 0: LEFT stays in state 0, RIGHT moves to state 1.
        [[1, 0],
         [0, 1]],
        # State 1 (terminal): any action stays in state 1.
        [[0, 1],
         [0, 1]],
    ])
    p0 = np.array([1, 0])
    R = np.array([
        [0, 5],  # RIGHT action from state 0 gives +5 reward.
        [0, 0]
    ])
    # 2-state MDP where moving RIGHT from state 0 gives +5 reward and
    # leads into the terminal state 1.
    mdp = MDP(P, R, 0.9, p0, [1])

    # check if we are indeed in the starting state:
    assert np.all(np.equal(mdp.current_state, np.array([1, 0])))

    # simulate an episode
    state, reward, done, _ = mdp.step(0)  # left step, no end
    assert np.all(np.equal(state, np.array([1, 0])))
    assert reward == 0
    assert not done

    # simulate another step
    state, reward, done, _ = mdp.step(1)  # right step
    assert np.all(np.equal(state, np.array([0, 1])))
    assert reward == +5
    assert not done

    # simulate another step (should return done)
    state, reward, done, _ = mdp.step(1)  # right step
    assert np.all(np.equal(state, np.array([0, 1])))
    assert reward == 0
    assert done

    try:
        mdp.step(0)
        assert False, 'This should throw an EpisodeDoneError'
    except EpisodeDoneError:
        assert True