Example no. 1
def test_rescale_reward():
    # tolerance
    tol = 1e-14

    rng = Seeder(123).rng

    for _ in range(10):
        # generate random MDP
        S, A = 5, 2
        R = rng.uniform(0.0, 1.0, (S, A))
        P = rng.uniform(0.0, 1.0, (S, A, S))
        for ss in range(S):
            for aa in range(A):
                P[ss, aa, :] /= P[ss, aa, :].sum()
        env = FiniteMDP(R, P)

        # test
        wrapped = RescaleRewardWrapper(env, (-10, 10))
        _ = wrapped.reset()
        for _ in range(100):
            _, reward, _, _ = wrapped.sample(
                wrapped.observation_space.sample(),
                wrapped.action_space.sample())
            assert reward <= 10 + tol and reward >= -10 - tol

        _ = wrapped.reset()
        for _ in range(100):
            _, reward, _, _ = wrapped.step(wrapped.action_space.sample())
            assert reward <= 10 + tol and reward >= -10 - tol
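The snippets in this listing omit their imports. The names used above (Seeder, FiniteMDP, RescaleRewardWrapper) presumably come from the rlberry package; the import block below is a sketch, with module paths assumed from rlberry's layout rather than taken from this listing:

import numpy as np

from rlberry.seeding import Seeder
from rlberry.envs.finite import FiniteMDP
from rlberry.wrappers import RescaleRewardWrapper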
Example no. 2
def test_value_iteration_agent(horizon, gamma, S, A):
    for sim in range(5):
        # generate random MDP
        R, P = get_random_mdp(S, A)
        # create env and agent
        env = FiniteMDP(R, P)
        agent = ValueIterationAgent(env, gamma=gamma, horizon=horizon)
        # run
        agent.fit()
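get_random_mdp is not shown in this listing. A sketch consistent with the random-MDP construction used in Example no. 1 (the helper below is an assumption, not necessarily the library's own implementation):

import numpy as np

def get_random_mdp(S, A, seed=0):
    # Random mean rewards in [0, 1] and a row-stochastic transition kernel.
    rng = np.random.default_rng(seed)
    R = rng.uniform(0.0, 1.0, (S, A))
    P = rng.uniform(0.0, 1.0, (S, A, S))
    P /= P.sum(axis=-1, keepdims=True)
    return R, P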
Example no. 3
    def __init__(self, L=5, fail_prob=0.1):
        assert L >= 2
        self.L = L
        self.fail_prob = fail_prob

        # transition probabilities
        P = np.zeros((L, 2, L))
        for ss in range(L):
            if ss == 0:
                P[ss, 0, ss] = 1.0 - fail_prob  # action 0 = don't move
                P[ss, 1, ss + 1] = 1.0 - fail_prob  # action 1 = right
                P[ss, 0, ss + 1] = fail_prob
                P[ss, 1, ss] = fail_prob
            elif ss == L - 1:
                P[ss, 0, ss - 1] = 1.0 - fail_prob  # action 0 = left
                P[ss, 1, ss] = 1.0 - fail_prob  # action 1 = don't move
                P[ss, 0, ss] = fail_prob
                P[ss, 1, ss - 1] = fail_prob
            else:
                P[ss, 0, ss - 1] = 1.0 - fail_prob  # action 0 = left
                P[ss, 1, ss + 1] = 1.0 - fail_prob  # action 1 = right
                P[ss, 0, ss + 1] = fail_prob
                P[ss, 1, ss - 1] = fail_prob

        # mean reward
        S = L
        A = 2
        R = np.zeros((S, A))
        R[L - 1, :] = 1.0
        R[0, :] = 0.05

        # init base classes
        FiniteMDP.__init__(self, R, P, initial_state_distribution=0)
        RenderInterface2D.__init__(self)
        self.reward_range = (0.0, 1.0)

        # rendering info
        self.set_clipping_area((0, L, 0, 1))
        self.set_refresh_interval(100)  # in milliseconds
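Assuming the constructor above belongs to a chain-walk environment class (called Chain below; the class name and import path are assumptions, not shown in this listing), typical usage would look roughly like:

from rlberry.envs.finite import Chain  # assumed import path

env = Chain(L=10, fail_prob=0.1)
state = env.reset()
for _ in range(20):
    state, reward, done, info = env.step(1)  # action 1: try to move right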
Example no. 4
def test_mbqvi(S, A):
    rng = Seeder(123).rng

    for sim in range(5):
        # generate random MDP with deterministic transitions
        R = rng.uniform(0.0, 1.0, (S, A))
        P = np.zeros((S, A, S))
        for ss in range(S):
            for aa in range(A):
                ns = rng.integers(0, S)
                P[ss, aa, ns] = 1

        # run MBQVI and check exactness of estimators
        env = FiniteMDP(R, P)
        agent = MBQVIAgent(env, n_samples=1)
        agent.fit()
        assert np.abs(R - agent.R_hat).max() < 1e-16
        assert np.abs(P - agent.P_hat).max() < 1e-16
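The exact-recovery assertions rely on the transitions being deterministic: a single sample per (state, action) pair then reveals the true next state, and the mean reward is observed directly, so the empirical estimators coincide with R and P. A standalone sketch of such a counting estimator (an illustration of the idea, not the agent's internals):

import numpy as np

def estimate_model(R, P, rng, n_samples=1):
    # Count-based estimates of the reward and transition model.
    S, A, _ = P.shape
    R_hat = np.zeros((S, A))
    P_hat = np.zeros((S, A, S))
    for ss in range(S):
        for aa in range(A):
            for _ in range(n_samples):
                ns = rng.choice(S, p=P[ss, aa])
                R_hat[ss, aa] += R[ss, aa] / n_samples
                P_hat[ss, aa, ns] += 1.0 / n_samples
    return R_hat, P_hat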
Example no. 5
def test_autoreset(horizon):
    # dummy MDP
    S, A = 5, 2
    R = np.ones((S, A))
    P = np.ones((S, A, S))
    for ss in range(S):
        for aa in range(A):
            P[ss, aa, :] /= P[ss, aa, :].sum()
    # initial state = 3
    env = FiniteMDP(R, P, initial_state_distribution=3)
    env = AutoResetWrapper(env, horizon)

    env.reset()
    for tt in range(5 * horizon + 1):
        action = env.action_space.sample()
        next_s, reward, done, info = env.step(action)
        if (tt + 1) % horizon == 0:
            assert next_s == 3
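AutoResetWrapper is expected to reset the environment every `horizon` steps and return the fresh initial state as the next observation, which is why next_s equals 3 (the initial state) at every horizon boundary. A simplified stand-in illustrating that behaviour (not the library's wrapper):

class SimpleAutoReset:
    # Minimal sketch: reset the wrapped env every `horizon` steps.
    def __init__(self, env, horizon):
        self.env = env
        self.horizon = horizon
        self.counter = 0

    def reset(self):
        self.counter = 0
        return self.env.reset()

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        self.counter += 1
        if self.counter % self.horizon == 0:
            obs = self.env.reset()  # return the initial state as the next observation
        return obs, reward, done, info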
Example no. 6
def test_rescale_reward_2(rmin, rmax):
    # tolerance
    tol = 1e-15

    # dummy MDP
    S, A = 5, 2
    R = np.ones((S, A))
    P = np.ones((S, A, S))
    for ss in range(S):
        for aa in range(A):
            P[ss, aa, :] /= P[ss, aa, :].sum()
    env = FiniteMDP(R, P)

    # test bounded case
    env.reward_range = (-100, 50)
    wrapped = RescaleRewardWrapper(env, (rmin, rmax))
    xx = np.linspace(-100, 50, num=100)
    for x in xx:
        y = wrapped._rescale(x)
        assert y >= rmin - tol and y <= rmax + tol

    # test unbounded above
    env.reward_range = (-1.0, np.inf)
    wrapped = RescaleRewardWrapper(env, (rmin, rmax))
    xx = np.linspace(-1, 1e2, num=100)
    for x in xx:
        y = wrapped._rescale(x)
        assert y >= rmin - tol and y <= rmax + tol

    # test unbounded below
    env.reward_range = (-np.inf, 1.0)
    wrapped = RescaleRewardWrapper(env, (rmin, rmax))
    xx = np.linspace(-1e2, 1, num=100)
    for x in xx:
        y = wrapped._rescale(x)
        assert y >= rmin - tol and y <= rmax + tol

    # test unbounded
    env.reward_range = (-np.inf, np.inf)
    wrapped = RescaleRewardWrapper(env, (rmin, rmax))
    xx = np.linspace(-1e2, 1e2, num=200)
    for x in xx:
        y = wrapped._rescale(x)
        assert y >= rmin - tol and y <= rmax + tol
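For half-bounded or fully unbounded reward ranges an affine map is not available, so some squashing function has to be applied before scaling into (rmin, rmax). The sketch below covers the four cases exercised above; it illustrates the idea and is not necessarily the wrapper's exact formulas:

import numpy as np

def rescale(x, reward_range, target_range):
    a, b = reward_range
    rmin, rmax = target_range
    if np.isfinite(a) and np.isfinite(b):
        z = (x - a) / (b - a)              # bounded: affine map to [0, 1]
    elif np.isfinite(a):
        z = 1.0 - np.exp(-(x - a))         # unbounded above: squash [a, inf) into [0, 1)
    elif np.isfinite(b):
        z = np.exp(x - b)                  # unbounded below: squash (-inf, b] into (0, 1]
    else:
        z = 1.0 / (1.0 + np.exp(-x))       # unbounded: sigmoid into (0, 1)
    return rmin + z * (rmax - rmin)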
Example no. 7
    def __init__(self,
                 nrows=5,
                 ncols=5,
                 start_coord=(0, 0),
                 terminal_states=None,
                 success_probability=0.9,
                 reward_at=None,
                 walls=((1, 1), (2, 2)),
                 default_reward=0.0):
        # Grid dimensions
        self.nrows = nrows
        self.ncols = ncols

        # Reward parameters
        self.default_reward = default_reward

        # Rewards, walls and terminal states (defaults are used when not provided)
        if reward_at is not None:
            self.reward_at = reward_at
            # keys given as strings, e.g. "(0, 1)", are converted to coordinate tuples
            if isinstance(next(iter(self.reward_at.keys())), str):
                self.reward_at = {
                    eval(key): value
                    for key, value in self.reward_at.items()
                }
        else:
            self.reward_at = {(nrows - 1, ncols - 1): 1}
        if walls is not None:
            self.walls = walls
        else:
            self.walls = ()
        if terminal_states is not None:
            self.terminal_states = terminal_states
        else:
            self.terminal_states = ((nrows - 1, ncols - 1), )

        # Probability of going left/right/up/down when choosing the
        # corresponding action.
        # The remaining probability mass is distributed uniformly over the
        # other available actions.
        self.success_probability = success_probability

        # Start coordinate
        self.start_coord = tuple(start_coord)

        # Actions (string to index & index to string)
        self.a_str2idx = {'left': 0, 'right': 1, 'up': 2, 'down': 3}
        self.a_idx2str = {0: 'left', 1: 'right', 2: 'up', 3: 'down'}

        # --------------------------------------------
        # The variables below are defined in _build()
        # --------------------------------------------

        # Mappings (state index) <-> (state coordinate)
        self.index2coord = {}
        self.coord2index = {}

        # MDP parameters for base class
        self.P = None
        self.R = None
        self.Ns = None
        self.Na = 4

        # Build
        self._build()
        init_state_index = self.coord2index[self.start_coord]
        FiniteMDP.__init__(self,
                           self.R,
                           self.P,
                           initial_state_distribution=init_state_index)
        RenderInterface2D.__init__(self)
        self.reset()
        self.reward_range = (self.R.min(), self.R.max())

        # rendering info
        self.set_clipping_area((0, self.ncols, 0, self.nrows))
        self.set_refresh_interval(100)  # in milliseconds
        self.renderer_type = 'pygame'
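Assuming the constructor above belongs to a grid-world environment class (called GridWorld below; the class name and import path are assumptions), instantiation and a single step would look roughly like:

from rlberry.envs.finite import GridWorld  # assumed import path

env = GridWorld(nrows=4, ncols=6,
                reward_at={(3, 5): 1.0, (2, 3): 0.1},
                walls=((1, 1), (2, 2)),
                success_probability=0.9)
state = env.reset()
next_state, reward, done, info = env.step(env.a_str2idx['right'])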