import numpy as np

from emdp.common import MDP
from emdp.exceptions import EpisodeDoneError


def build_twostate_MDP():
    """MDP with transition probabilities:

        P(s_0 | s_0, a_0) = 0.5    P(s_1 | s_0, a_0) = 0.5
        P(s_0 | s_0, a_1) = 0      P(s_1 | s_0, a_1) = 1
        P(s_0 | s_0, a_2) = 1      P(s_1 | s_1, a_2) = 1

    Rewards: r(s_0, a_0) = 5, r(s_0, a_1) = 10, r(s_1, a_2) = -1.
    Discount factor: 0.9.

    :return: An emdp.common.MDP object.
    """
    P = np.zeros((2, 3, 2))
    P[0, 0] = [0.5, 0.5]
    P[0, 1] = [0, 1]
    P[0, 2] = [1, 0]  # no-op: a_2 keeps the agent in s_0
    # All actions from s_1 lead back to s_1.
    P[1, 0] = [0, 1]
    P[1, 1] = [0, 1]
    P[1, 2] = [0, 1]

    gamma = 0.9

    R = np.zeros((2, 3))
    R[0, 0] = 5
    R[0, 1] = 10
    R[1, 2] = -1
    return MDP(P, R, gamma, p0=np.array([0.5, 0.5]), terminal_states=[])
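
# A minimal sketch of exact policy evaluation on the two-state MDP above,
# assuming the returned emdp.common.MDP object exposes its constructor
# arguments as the attributes `P`, `R`, and `gamma`. The helper name
# `evaluate_policy` is hypothetical, not part of emdp.
def evaluate_policy(P, R, gamma, pi):
    """Solves V = (I - gamma * P_pi)^{-1} r_pi for a fixed policy pi."""
    P_pi = np.einsum('sa,sat->st', pi, P)  # state-to-state transitions under pi
    r_pi = np.einsum('sa,sa->s', pi, R)    # expected one-step reward under pi
    return np.linalg.solve(np.eye(P.shape[0]) - gamma * P_pi, r_pi)


# Example: evaluate the uniform policy on the two-state MDP.
#   mdp = build_twostate_MDP()
#   V = evaluate_policy(mdp.P, mdp.R, mdp.gamma, np.full((2, 3), 1 / 3))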
def build_imani_counterexample():
    """MDP counterexample given in Fig. 1a of Imani et al.,
    "An Off-Policy Policy Gradient Theorem Using Emphatic Weightings",
    NeurIPS 2018.

    :return: An emdp.common.MDP object.
    """
    # |S| = 4, |A| = 2.
    STATES = 4
    ACTIONS = 2
    P = np.zeros((STATES, ACTIONS, STATES))
    P[0, 0] = [0, 1, 0, 0]
    P[0, 1] = [0, 0, 1, 0]
    # States 1 and 2 lead to the terminal state 3 under both actions,
    # and state 3 self-loops.
    P[1, 0] = [0, 0, 0, 1]
    P[1, 1] = [0, 0, 0, 1]
    P[2, 0] = [0, 0, 0, 1]
    P[2, 1] = [0, 0, 0, 1]
    P[3, 0] = [0, 0, 0, 1]
    P[3, 1] = [0, 0, 0, 1]

    gamma = 0.99999

    R = np.zeros((STATES, ACTIONS))
    R[0, 0] = 0
    R[0, 1] = 1
    R[1, 0] = 2
    R[1, 1] = 0
    R[2, 0] = 0
    R[2, 1] = 1
    return MDP(P, R, gamma, p0=np.array([1, 0, 0, 0]), terminal_states=[3])
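
# A quick, hedged way to inspect the counterexample: plain value iteration
# (numpy only, using the same (S, A, S') conventions as the builders above)
# recovers V* and the greedy action in each state. Because every trajectory
# reaches the terminal state 3 within two steps, this converges in a handful
# of sweeps despite gamma = 0.99999. The helper name is not part of emdp.
def value_iteration(P, R, gamma, tol=1e-8):
    """Returns (V*, greedy policy) for tabular P of shape (S, A, S')."""
    V = np.zeros(P.shape[0])
    while True:
        Q = R + gamma * P @ V  # shape (S, A)
        V_new = Q.max(axis=1)
        if np.max(np.abs(V_new - V)) < tol:
            return V_new, Q.argmax(axis=1)
        V = V_new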
def build_two_circle_MDP(discount=0.6, good_reward=10., distractor_reward=5.):
    """Two-circle MDP counterexample given in Fig. 1a of Zhang et al.,
    "Generalized Off-Policy Actor-Critic", https://arxiv.org/pdf/1903.11329.pdf

    :param discount: The discount factor.
    :param good_reward: The good reward that the agent must find.
    :param distractor_reward: The distractor reward.
    :returns: An emdp.common.MDP object.
    """
    ACTIONS = 2
    STATES = 11

    # References to the MDP states as named in the paper.
    A = 0
    C = 1
    B = 5
    ACTUAL_REWARD_STATE = 3
    JOINER_STATE = 4

    # State 0 (A) is the starting state.
    # States 1-3 are the states in the first chain.
    FIRST_CHAIN = [C, 2, ACTUAL_REWARD_STATE, JOINER_STATE]
    # States 5-7 are the states in the second chain.
    SECOND_CHAIN = [B, 6, 7, JOINER_STATE]
    # State 4 joins the two chains; states 8-10 lead back to A.
    CONNECTION_CHAIN = [JOINER_STATE, 8, 9, 10, A]

    # DEFINING TRANSITION MATRIX.
    P = np.zeros((STATES, ACTIONS, STATES))
    # From the starting state, the two actions lead to different chains.
    P[A, 0, C] = 1.
    P[A, 1, B] = 1.
    # Within a chain, any action leads to the next state in the chain.
    for chain in [FIRST_CHAIN, SECOND_CHAIN, CONNECTION_CHAIN]:
        for state_t, state_tp1 in zip(chain[:-1], chain[1:]):
            P[state_t, :, state_tp1] = 1.

    # DEFINING DISCOUNT FACTOR.
    gamma = discount

    # DEFINING REWARDS.
    R = np.zeros((STATES, ACTIONS))
    # Both actions at the reward state yield the good reward.
    R[ACTUAL_REWARD_STATE, :] = good_reward
    # Both actions at B yield the distractor reward.
    R[B, :] = distractor_reward

    # DEFINING START STATES.
    p0 = np.zeros(STATES)
    p0[A] = 1.
    return MDP(P, R, gamma, p0=p0, terminal_states=[])
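
# Why the distractor works at the default discount: from A, the B-circle pays
# distractor_reward one step in, while the C-circle pays good_reward only
# three steps in. A back-of-the-envelope check (a sketch; the helper name is
# hypothetical):
def _two_circle_first_pass_returns(discount=0.6, good_reward=10.,
                                   distractor_reward=5.):
    # Reward is received upon acting in B (t = 1) vs. acting in the
    # actual-reward state (t = 3).
    return discount * distractor_reward, discount**3 * good_reward
# With the defaults this gives (3.0, 2.16): a myopic agent prefers the
# distractor circle even though the other circle's reward is twice as large.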
def build_cake_world_mdp(epsilon, discount, cake_reward=1.0):
    r"""Cake world MDP from the Action Gap paper (Fig. 1 of Bellemare et al. 2016),
    "Increasing the Action Gap: New Operators for Reinforcement Learning",
    https://arxiv.org/pdf/1512.04860.pdf

    The action gap is modulated by epsilon, since the difference between the
    optimal Q-values of the two actions in x1 is
    `Q(x1, a2) - Q(x1, a1) = epsilon` (for the default cake_reward of 1).

    :param epsilon: Float epsilon for the action gap.
    :param discount: Float discount factor.
    :param cake_reward: Float reward for eating cake.
    :returns: An emdp.common.MDP object.
    """
    STATES = 2
    ACTIONS = 2

    # Shorthand to make following the paper easy.
    x1, x2 = 0, 1
    a1, a2 = 0, 1

    P = np.zeros((STATES, ACTIONS, STATES))
    # Taking action a1 in state x1 leads to x1 or x2 with equal likelihood.
    P[x1, a1, :] = .5
    # Taking the abstain action leads back to x1.
    P[x1, a2, x1] = 1.
    # All actions from x2 lead back to x2 (terminal state).
    P[x2, :, x2] = 1.

    # Found by solving for `r` in `V(x2) = r + discount * V(x2)` with the
    # target value V(x2) = -2 (1+e) / gamma:
    #   -2 (1+e)/gamma = r + gamma * (-2 (1+e)/gamma).
    # Let r = rhat / gamma; multiplying through by gamma gives
    #   -2 (1+e) = rhat - 2 * gamma * (1+e)
    #   => rhat = -2 [(1+e) - gamma * (1+e)] = -2 (1+e)(1 - gamma)
    #   => r = -2 (1+e)(1 - gamma) / gamma.
    forever_reward = -2.0 * (1 + epsilon) * (1 - discount) / discount

    R = np.zeros((STATES, ACTIONS))
    R[x2, :] = forever_reward  # Small negative reward, forever.
    R[x1, a1] = cake_reward  # Cake!
    R[x1, a2] = 0.  # Abstain from cake!

    p0 = np.array([1.0, 0.0])
    return MDP(P, R, discount, p0=p0, terminal_states=[x2])
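
# A small numerical sanity check (a sketch, not part of the original code,
# assuming the MDP attributes noted above): fixed-point iteration on the
# optimal Bellman operator should reproduce the advertised action gap
# Q*(x1, a2) - Q*(x1, a1) = epsilon.
def _check_cake_world_action_gap(epsilon=0.1, discount=0.9):
    mdp = build_cake_world_mdp(epsilon, discount)
    V = np.zeros(2)
    for _ in range(2000):  # plenty of sweeps for a 2-state problem
        V = (mdp.R + mdp.gamma * mdp.P @ V).max(axis=1)
    Q = mdp.R + mdp.gamma * mdp.P @ V
    assert np.isclose(Q[0, 1] - Q[0, 0], epsilon)  # abstaining beats cake by epsilon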
def test_simple_reset_MDP():
    # 2-state MDP where transitioning into state 1 terminates the episode
    # and gives a +5 reward.
    P = np.array([
        [[1, 0], [0, 1]],  # LEFT stays in the same state, RIGHT moves to the next.
        [[0, 1], [0, 1]],  # From the terminal state, any action stays there.
    ])
    p0 = np.array([1, 0])
    R = np.array([
        [0, 5],  # RIGHT action from state 0 gives +5 reward.
        [0, 0],
    ])
    mdp = MDP(P, R, 0.9, p0, [1])

    # Check that we are indeed in the starting state.
    assert np.all(np.equal(mdp.current_state, np.array([1, 0])))

    # Simulate an episode.
    state, reward, done, _ = mdp.step(0)  # LEFT step, no end.
    assert np.all(np.equal(state, np.array([1, 0])))
    assert reward == 0
    assert not done

    # Simulate another step.
    state, reward, done, _ = mdp.step(1)  # RIGHT step.
    assert np.all(np.equal(state, np.array([0, 1])))
    assert reward == +5
    assert not done

    # Simulate another step (should return done).
    state, reward, done, _ = mdp.step(1)  # RIGHT step.
    assert np.all(np.equal(state, np.array([0, 1])))
    assert reward == 0
    assert done

    # Stepping after the episode is done should raise an error.
    try:
        mdp.step(0)
        assert False, 'This should throw an EpisodeDoneError'
    except EpisodeDoneError:
        pass
def get_shamdp(horizon=20, c=1.6):
    gamma = horizon / (horizon + 1)
    n_actions = 4
    # Modified MDP: going right anywhere except in the absorbing state
    # incurs a penalty.
    penalty = -gamma**(horizon // c)
    # Same transition dynamics as the original chain MDP.
    P, r = build_chain_mdp(horizon, n_actions, 1, penalty)
    n_states = P.shape[0]
    initial_state_distribution = np.zeros(n_states)
    initial_state_distribution[0] = 1.
    return MDP(P, r, gamma, initial_state_distribution,
               terminal_states=[n_states - 2])
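
# Minimal usage sketch, assuming the same step API exercised in
# test_simple_reset_MDP above (build_chain_mdp must be available in this
# module's scope):
#   mdp = get_shamdp(horizon=20, c=1.6)
#   state, reward, done, _ = mdp.step(0)  # take action 0 from the start state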