Example #1
    def __init__(self,
                 reward_free=False,
                 difficulty=0,
                 array_observation=False):
        self.reward_free = reward_free
        self.difficulty = difficulty
        self.array_observation = array_observation

        if difficulty not in [0, 1, 2]:
            raise ValueError("FourRoom difficulty must be in [0, 1, 2]")

        # Common parameters
        nrows = 9
        ncols = 9
        start_coord = (0, 0)
        terminal_states = ((8, 0), )
        success_probability = 0.95
        #
        walls = ()
        for ii in range(9):
            if ii not in [2, 6]:
                walls += ((ii, 4), )
        for jj in range(9):
            if jj != 7:
                walls += ((4, jj), )

        # Default reward according to the difficulty
        if difficulty in [0, 1]:
            default_reward = 0.0
        elif difficulty == 2:
            default_reward = -0.005

        # Rewards according to the difficulty
        if self.reward_free:
            reward_at = {}
        else:
            if difficulty == 0:
                reward_at = {(8, 0): 1.0}
            elif difficulty in [1, 2]:
                reward_at = {
                    (8, 0): 1.0,
                    (3, 3): 0.1,
                }

        # Init base class
        GridWorld.__init__(
            self,
            nrows=nrows,
            ncols=ncols,
            start_coord=start_coord,
            terminal_states=terminal_states,
            success_probability=success_probability,
            reward_at=reward_at,
            walls=walls,
            default_reward=default_reward,
        )

        # spaces
        if self.array_observation:
            self.observation_space = spaces.Box(0.0, 1.0, shape=(2, ))
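The constructor above raises "FourRoom difficulty must be in [0, 1, 2]", so it belongs to a FourRoom environment; the snippet itself is truncated. A minimal usage sketch follows; the import path is an assumption and is not shown in the snippet.

# Hypothetical usage of the constructor above; the import path is an
# assumption and may differ across rlberry versions.
from rlberry.envs.benchmarks.grid_exploration.four_room import FourRoom

env = FourRoom(difficulty=1)
observation = env.reset()
for _ in range(10):
    observation, reward, done, info = env.step(env.action_space.sample())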
Example #2
def test_lsvi_random_exploration():
    env = GridWorld(nrows=2, ncols=2, walls=(), success_probability=0.95)
    env.reseed(123)

    def feature_map_fn(_env):
        return OneHotFeatureMap(_env.observation_space.n, _env.action_space.n)

    agent = LSVIUCBAgent(
        env,
        feature_map_fn=feature_map_fn,
        horizon=20,
        gamma=0.99,
        reg_factor=1e-5,
        bonus_scale_factor=0.0,
    )
    agent.reseed(123)
    agent.fit(budget=250)

    # estimated Q
    S = env.observation_space.n
    Q_est = agent._run_lsvi(bonus_factor=0.0)[0, :].reshape((S, -1))

    # near optimal Q
    agent_opt = ValueIterationAgent(env, gamma=0.99, horizon=20)
    agent_opt.fit()
    Q = agent_opt.Q[0, :, :]

    print(Q)
    print("---")
    print(Q_est)

    print("-------")
    print(np.abs(Q - Q_est))
    # Check error
    assert np.abs(Q - Q_est).mean() < 0.1
Example #3
    def __init__(self, reward_free=False, array_observation=False):
        self.reward_free = reward_free
        self.array_observation = array_observation

        # Common parameters
        nrows = 13
        ncols = 17
        start_coord = (5, 1)
        terminal_states = ((7, 7),)
        success_probability = 0.95
        #
        walls = ()
        for ii in range(13):
            walls += ((ii, 0),)
            walls += ((ii, 16),)
        for jj in range(17):
            walls += ((0, jj),)
            walls += ((12, jj),)
        for ii in range(13):
            if ii not in [1, 11]:
                walls += ((ii, 6),)
                walls += ((ii, 10),)
        walls += ((11, 6),)
        for jj in range(17):
            if jj not in [1, 15]:
                walls += ((6, jj),)

        # Default reward
        default_reward = 0

        # Rewards
        if self.reward_free:
            reward_at = {}
        else:
            reward_at = {
                        (7, 7): 10.0,
                        (8, 2): 1.0,
                        (10, 3): 1.0
                        }
            for jj in range(7, 16):
                for ii in range(1, 12):
                    if (ii, jj) not in walls and (ii, jj) != (7, 7):
                        reward_at[(ii, jj)] = -0.05

        # Init base class
        GridWorld.__init__(self,
                           nrows=nrows,
                           ncols=ncols,
                           start_coord=start_coord,
                           terminal_states=terminal_states,
                           success_probability=success_probability,
                           reward_at=reward_at,
                           walls=walls,
                           default_reward=default_reward)

        # spaces
        if self.array_observation:
            self.observation_space = spaces.Box(0.0, 1.0, shape=(2,))
Example #4
def test_gridworld_aux_functions():
    env = GridWorld(nrows=5,
                    ncols=5,
                    walls=((1, 1), ),
                    reward_at={
                        (4, 4): 1,
                        (4, 3): -1
                    })
    env.log()  # from FiniteMDP
    env.render_ascii()  # from GridWorld
    vals = np.ones(env.observation_space.n)
    env.display_values(vals)
    env.print_transition_at(0, 0, 'up')
Example #5
    def __init__(self, reward_free=False, array_observation=False):
        self.reward_free = reward_free
        self.array_observation = array_observation

        # Common parameters
        nrows = 11
        ncols = 17
        start_coord = (0, 0)
        terminal_states = ((10, 0), )
        success_probability = 0.95
        #
        walls = ()
        for ii in range(11):
            if ii not in [2, 8]:
                walls += ((ii, 5), )
                walls += ((ii, 11), )
        for jj in range(17):
            if jj != 15:
                walls += ((5, jj), )

        # Default reward
        default_reward = -0.001

        # Rewards
        if self.reward_free:
            reward_at = {}
        else:
            reward_at = {
                (10, 0): 10.0,
                (4, 4): 0.1,
            }

        # Init base class
        GridWorld.__init__(
            self,
            nrows=nrows,
            ncols=ncols,
            start_coord=start_coord,
            terminal_states=terminal_states,
            success_probability=success_probability,
            reward_at=reward_at,
            walls=walls,
            default_reward=default_reward,
        )

        # spaces
        if self.array_observation:
            self.observation_space = spaces.Box(0.0, 1.0, shape=(2, ))
Example #6
def test_lsvi_optimism():
    env = GridWorld(nrows=2, ncols=2, walls=())

    def feature_map_fn(_env):
        return OneHotFeatureMap(_env.observation_space.n, _env.action_space.n)

    agent = LSVIUCBAgent(env, n_episodes=250, gamma=0.99,
                         feature_map_fn=feature_map_fn,
                         horizon=3,
                         bonus_scale_factor=3,
                         reg_factor=0.000001)
    agent.fit()

    # near optimal Q
    agent_opt = ValueIterationAgent(env, gamma=0.99, horizon=3)
    agent_opt.fit()
    Q = agent_opt.Q[0, :, :]

    # optimistic Q
    S = env.observation_space.n
    A = env.action_space.n
    Q_optimistic = np.zeros((S, A))
    for ss in range(S):
        Q_optimistic[ss, :] = agent._compute_q_vec(
                                    agent.w_vec[0, :],
                                    ss,
                                    agent.bonus_scale_factor)

    print(Q)
    print(Q_optimistic)
    assert (Q_optimistic - Q).min() >= -1e-5
Example #7
def test_lsvi_ucb_matrix_inversion(FeatMapClass):
    env = GridWorld(nrows=3, ncols=3, walls=())

    def feature_map_fn(_env):
        return FeatMapClass(_env.observation_space.n, _env.action_space.n)

    reg_factor = 0.1
    agent = LSVIUCBAgent(env, n_episodes=50,
                         feature_map_fn=feature_map_fn,
                         horizon=10,
                         reg_factor=reg_factor)
    agent.fit()
    assert np.allclose(np.linalg.inv(agent.lambda_mat), agent.lambda_mat_inv)
    assert agent.episode == 50
    agent.policy(env.observation_space.sample())

    # Check counts
    if FeatMapClass != OneHotFeatureMap:
        return

    S = env.observation_space.n
    A = env.action_space.n
    N_sa = np.zeros((S, A))
    for state, action in zip(agent.state_hist, agent.action_hist):
        N_sa[state, action] += 1.0

    assert np.allclose(agent.lambda_mat_inv.diagonal(),
                       1.0/(N_sa.flatten()+reg_factor))

    for ss in range(S):
        for aa in range(A):
            feat = agent.feature_map.map(ss, aa)
            assert np.allclose(feat @ (agent.lambda_mat_inv.T @ feat),
                               1.0/(N_sa[ss, aa]+reg_factor))
Example #8
def test_rlsvi(gamma, stage_dependent):
    env = GridWorld(walls=(), nrows=5, ncols=5)
    agent = RLSVIAgent(env,
                       horizon=11,
                       stage_dependent=stage_dependent,
                       gamma=gamma)
    agent.fit(budget=50)
    agent.policy(env.observation_space.sample())
Example #9
def test_optql():
    env = GridWorld(walls=(), nrows=5, ncols=5)
    agent = OptQLAgent(env,
                       n_episodes=50,
                       horizon=11,
                       gamma=0.99,
                       bonus_scale_factor=0.1)
    agent.fit()
    agent.policy(env.observation_space.sample())
Example #10
def test_gridworld_from_layout():
    layout = """
    IOOOO # OOOOO  O OOOOR
    OOOOO # OOOOO  # OOOOO
    OOOOO O OOOOO  # OOTOO
    OOOOO # OOOOO  # OOOOO
    IOOOO # OOOOO  # OOOOr"""
    env = GridWorld.from_layout(layout)
    env.reset()
Example #11
def test_discrete2onehot():
    env = DiscreteToOneHotWrapper(GridWorld())
    env.reseed(123)
    assert isinstance(env.observation_space, spaces.Box)
    for ii in range(env.unwrapped.observation_space.n):
        initial_distr = np.zeros(env.unwrapped.observation_space.n)
        initial_distr[ii] = 1.0
        env.unwrapped.set_initial_state_distribution(initial_distr)
        obs = env.reset()
        assert np.array_equal(obs, initial_distr)
Example #12
def test_ucbvi(gamma, stage_dependent, real_time_dp):
    env = GridWorld(walls=(), nrows=5, ncols=5)
    agent = UCBVIAgent(env,
                       n_episodes=50,
                       horizon=11,
                       stage_dependent=stage_dependent,
                       gamma=gamma,
                       real_time_dp=real_time_dp,
                       bonus_scale_factor=0.1)
    agent.fit()
    agent.policy(env.observation_space.sample())
Example #13
def test_psrl(gamma, stage_dependent, bernoullized_reward):
    env = GridWorld(walls=(), nrows=5, ncols=5)
    agent = PSRLAgent(
        env,
        horizon=11,
        bernoullized_reward=bernoullized_reward,
        stage_dependent=stage_dependent,
        gamma=gamma,
    )
    agent.fit(budget=50)
    agent.policy(env.observation_space.sample())
Example #14
def test_lsvi_ucb_matrix_inversion(FeatMapClass):
    env = GridWorld(nrows=3, ncols=3, walls=())

    def feature_map_fn():
        return FeatMapClass(env.observation_space.n, env.action_space.n)

    agent = LSVIUCBAgent(env, n_episodes=10,
                         feature_map_fn=feature_map_fn,
                         horizon=10)
    agent.fit()
    assert np.allclose(np.linalg.inv(agent.lambda_mat), agent.lambda_mat_inv)
    assert agent.episode == 10
    agent.policy(env.observation_space.sample())
Example #15
def test_uncertainty_est_wrapper():
    env = GridWorld()

    def uncertainty_est_fn(observation_space, action_space):
        return DiscreteCounter(observation_space, action_space)

    w_env = UncertaintyEstimatorWrapper(env,
                                        uncertainty_est_fn,
                                        bonus_scale_factor=1.0)

    for ii in range(10):
        w_env.reset()
        _, _, _, info = w_env.step(0)
        nn = w_env.uncertainty_estimator.count(0, 0)
        assert nn == ii + 1
        assert info["exploration_bonus"] == pytest.approx(1 / np.sqrt(nn))
Example #16
def test_mc_policy_eval(gamma, horizon, stationary_policy):
    env = GridWorld(nrows=3,
                    ncols=3,
                    start_coord=(0, 0),
                    success_probability=1.0,
                    walls=(),
                    default_reward=0.0,
                    reward_at={(2, 2): 1.0})
    agent = ValueIterationAgent(env, gamma=gamma, horizon=horizon)
    agent.fit()

    episode_rewards = mc_policy_evaluation(agent,
                                           env,
                                           n_sim=5,
                                           gamma=gamma,
                                           stationary_policy=stationary_policy)
    assert episode_rewards.mean() == 1.0 * np.power(gamma, 4)
Example #17
def _get_filled_replay(max_replay_size):
    """runs env for ~ 2 * max_replay_size timesteps."""
    env = GridWorld(terminal_states=None)
    env = TimeLimit(env, max_episode_steps=200)
    env.reseed(123)

    rng = np.random.default_rng(456)
    buffer = replay.ReplayBuffer(
        max_replay_size,
        rng,
        max_episode_steps=env._max_episode_steps,
        enable_prioritized=True,
    )
    buffer.setup_entry("observations", np.float32)
    buffer.setup_entry("actions", np.uint32)
    buffer.setup_entry("rewards", np.float32)
    buffer.setup_entry("dones", bool)

    # Fill the replay buffer
    total_time = 0
    while True:
        if total_time > 2 * buffer._max_replay_size:
            break
        done = False
        obs = env.reset()
        while not done:
            total_time += 1
            action = env.action_space.sample()
            next_obs, reward, done, _ = env.step(action)
            buffer.append({
                "observations": obs,
                "actions": action,
                "rewards": reward,
                "dones": done,
            })
            obs = next_obs
            if done:
                buffer.end_episode()
    return buffer, env
Example #18
    def reset(self):
        self.state = GridWorld.reset(self)
        state_to_return = self.state
        if self.array_observation:
            state_to_return = self._convert_index_to_float_coord(self.state)
        return state_to_return
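A matching step() override is sketched below for completeness; it applies the same index-to-coordinate conversion as reset(). This is an assumption-based sketch rather than code from the snippet, using the (observation, reward, done, info) step signature seen elsewhere in these examples.

    def step(self, action):
        # Sketch only: convert the discrete observation when array_observation is set.
        observation, reward, done, info = GridWorld.step(self, action)
        if self.array_observation:
            observation = self._convert_index_to_float_coord(observation)
        return observation, reward, done, info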
Example #19
from rlberry.agents.ucbvi import UCBVIAgent
from rlberry.agents.optql import OptQLAgent
from rlberry.envs.finite import GridWorld
from rlberry.stats import AgentStats, plot_episode_rewards
from rlberry.stats import MultipleStats

N_EP = 3000
HORIZON = 20
GAMMA = 1.0

env = GridWorld(nrows=5, ncols=10)

params = {}

params['ucbvi'] = {
    'n_episodes': N_EP,
    'horizon': HORIZON,
    'stage_dependent': True,
    'gamma': GAMMA,
    'real_time_dp': True,
    'bonus_scale_factor': 1.0,
}

params['optql'] = {
    'n_episodes': N_EP,
    'horizon': HORIZON,
    'gamma': GAMMA,
    'bonus_scale_factor': 1.0,
}

mstats = MultipleStats()
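The snippet stops right after creating MultipleStats. A hypothetical continuation is sketched below; the AgentStats keyword names (init_kwargs, n_fit), the append/run calls and the allstats attribute are assumptions about the rlberry version in use.

# Hypothetical continuation; exact AgentStats/MultipleStats signatures are
# assumptions and may differ across rlberry versions.
mstats.append(AgentStats(UCBVIAgent, env, init_kwargs=params['ucbvi'], n_fit=2))
mstats.append(AgentStats(OptQLAgent, env, init_kwargs=params['optql'], n_fit=2))
mstats.run()
plot_episode_rewards(mstats.allstats, cumulative=True)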
Example #20
def test_lsvi_without_bonus():
    seeding.set_global_seed(123)

    def lsvi_debug_gather_data(agent):
        """
        Function to gather data sampling uniformly
        states and actions
        """
        N = agent.n_episodes*agent.horizon
        count = 0
        while count < N:
            state = agent.env.observation_space.sample()
            action = agent.env.action_space.sample()
            next_state, reward, done, info = agent.env.sample(state, action)
            # features of the sampled pair and current inverse of the Gram matrix
            feat = agent.feature_map.map(state, action)
            outer_prod = np.outer(feat, feat)
            inv = agent.lambda_mat_inv

            # rank-one update of the Gram matrix
            agent.lambda_mat += np.outer(feat, feat)
            # update inverse
            agent.lambda_mat_inv -= \
                (inv @ outer_prod @ inv) / (1 + feat @ inv.T @ feat)

            # update history
            agent.reward_hist[count] = reward
            agent.state_hist.append(state)
            agent.action_hist.append(action)
            agent.nstate_hist.append(next_state)

            # store features of the current pair and of all next-state actions
            tt = agent.total_time_steps
            agent.feat_hist[tt, :] = agent.feature_map.map(state, action)
            for aa in range(agent.env.action_space.n):
                agent.feat_ns_all_actions[tt, aa, :] = \
                    agent.feature_map.map(next_state, aa)

            # increments
            agent.total_time_steps += 1
            count += 1

    env = GridWorld(nrows=2, ncols=2, walls=(), success_probability=0.95)

    def feature_map_fn(_env):
        return OneHotFeatureMap(_env.observation_space.n, _env.action_space.n)

    agent = LSVIUCBAgent(env, n_episodes=100,
                         feature_map_fn=feature_map_fn,
                         horizon=20,
                         gamma=0.99,
                         reg_factor=1e-5)

    lsvi_debug_gather_data(agent)
    # estimated Q
    S = env.observation_space.n
    Q_est = agent._run_lsvi(bonus_factor=0.0)[0, :].reshape((S, -1))

    # near optimal Q
    agent_opt = ValueIterationAgent(env, gamma=0.99, horizon=20)
    agent_opt.fit()
    Q = agent_opt.Q[0, :, :]

    print(Q)
    print("---")
    print(Q_est)

    print("-------")
    print(np.abs(Q-Q_est))
    # Check error
    assert Q_est == pytest.approx(Q, rel=0.01)
Example #21
from rlberry.envs.finite import GridWorld

env = GridWorld(7, 10, walls=((2, 2), (3, 3)))
env.enable_rendering()
for tt in range(50):
    env.step(env.action_space.sample())
env.render()
Example #22
from rlberry.agents.dynprog import ValueIterationAgent
from rlberry.envs.finite import GridWorld

env = GridWorld(7, 10, walls=((2, 2), (3, 3)))
agent = ValueIterationAgent(env, gamma=0.95)
info = agent.fit()
print(info)

env.enable_rendering()

state = env.reset()
for tt in range(200):
    action = agent.policy(state)
    next_s, _, done, _ = env.step(action)
    if done:
        break
    state = next_s

env.save_video("gridworld.mp4", framerate=5)
Example #23
import numpy as np
from rlberry.agents.features import FeatureMap
from rlberry.envs.finite import GridWorld
from rlberry.stats import AgentStats, plot_episode_rewards,\
    compare_policies
from rlberry.agents.dynprog import ValueIterationAgent
from rlberry.agents.linear import LSVIUCBAgent

# Define environment
env = GridWorld(nrows=2, ncols=4, walls=(), success_probability=1.0)


# Create feature map
class OneHotFeatureMap(FeatureMap):
    def __init__(self, S, A):
        self.S = S
        self.A = A
        self.shape = (S * A, )

    def map(self, observation, action):
        feat = np.zeros((self.S, self.A))
        feat[observation, action] = 1.0
        return feat.flatten()


# Function that returns an instance of a feature map
def feature_map_fn(env):
    return OneHotFeatureMap(env.observation_space.n, env.action_space.n)


params = {
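The example is truncated at the opening of the params dictionary. For completeness, a hedged sketch of how LSVIUCBAgent could consume the feature map above, reusing constructor arguments that appear in the other LSVIUCBAgent snippets in this collection (the specific values are placeholders, not the original ones):

# Hypothetical continuation of the truncated example; argument values are
# placeholders taken from the other LSVIUCBAgent snippets in this collection.
agent = LSVIUCBAgent(env,
                     n_episodes=100,
                     feature_map_fn=feature_map_fn,
                     horizon=10,
                     gamma=0.99,
                     bonus_scale_factor=1.0,
                     reg_factor=1e-5)
agent.fit()
agent.policy(env.observation_space.sample())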
Example #24
def test_gridworld_aux_functions():
    env = GridWorld(nrows=5,
                    ncols=8,
                    walls=((1, 1), ),
                    reward_at={
                        (4, 4): 1,
                        (4, 3): -1
                    })
    env.log()  # from FiniteMDP
    env.render_ascii()  # from GridWorld
    vals = np.arange(env.observation_space.n)
    env.display_values(vals)
    env.print_transition_at(0, 0, "up")

    layout = env.get_layout_array(vals, fill_walls_with=np.inf)
    for rr in range(env.nrows):
        for cc in range(env.ncols):
            if (rr, cc) in env.walls:
                assert layout[rr, cc] == np.inf
            else:
                assert layout[rr, cc] == vals[env.coord2index[(rr, cc)]]
Example #25
    def __init__(
        self,
        nrooms=7,
        reward_free=False,
        array_observation=False,
        room_size=5,
        success_probability=0.95,
        remove_walls=False,
        initial_state_distribution="center",
        include_traps=False,
    ):

        assert nrooms > 0, "nrooms must be > 0"
        assert initial_state_distribution in ("center", "uniform")

        self.reward_free = reward_free
        self.array_observation = array_observation
        self.nrooms = nrooms
        self.room_size = room_size
        self.success_probability = success_probability
        self.remove_walls = remove_walls
        self.initial_state_distribution = initial_state_distribution
        self.include_traps = include_traps

        # Max number of rooms/columns per row
        self.max_rooms_per_row = 5

        # Room size (default = 5x5)
        self.room_size = room_size

        # Grid size
        self.room_nrows = math.ceil(nrooms / self.max_rooms_per_row)
        if self.room_nrows > 1:
            self.room_ncols = self.max_rooms_per_row
        else:
            self.room_ncols = nrooms
        nrows = self.room_size * self.room_nrows + (self.room_nrows - 1)
        ncols = self.room_size * self.room_ncols + (self.room_ncols - 1)

        # walls
        walls = []
        for room_col in range(self.room_ncols - 1):
            col = (room_col + 1) * (self.room_size + 1) - 1
            for jj in range(nrows):
                if (jj % (self.room_size + 1)) != (self.room_size // 2):
                    walls.append((jj, col))

        for room_row in range(self.room_nrows - 1):
            row = (room_row + 1) * (self.room_size + 1) - 1
            for jj in range(ncols):
                walls.append((row, jj))

        # process each room
        start_coord = None
        terminal_state = None
        self.traps = []
        count = 0
        for room_r in range(self.room_nrows):
            if room_r % 2 == 0:
                cols_iterator = range(self.room_ncols)
            else:
                cols_iterator = reversed(range(self.room_ncols))
            for room_c in cols_iterator:
                # existing rooms
                if count < self.nrooms:
                    # remove top wall
                    if ((room_c == self.room_ncols - 1) and (room_r % 2 == 0)) or (
                        (room_c == 0) and (room_r % 2 == 1)
                    ):
                        if room_r != self.room_nrows - 1:
                            wall_to_remove = self._convert_room_coord_to_global(
                                room_r, room_c, self.room_size, self.room_size // 2
                            )
                            if wall_to_remove in walls:
                                walls.remove(wall_to_remove)
                # rooms to remove
                else:
                    for ii in range(-1, self.room_size + 1):
                        for jj in range(-1, self.room_size + 1):
                            wall_to_include = self._convert_room_coord_to_global(
                                room_r, room_c, ii, jj
                            )
                            if (
                                wall_to_include[0] >= 0
                                and wall_to_include[0] < nrows
                                and wall_to_include[1] >= 0
                                and wall_to_include[1] < ncols
                                and (wall_to_include not in walls)
                            ):
                                walls.append(wall_to_include)
                    pass

                # start coord
                if count == nrooms // 2:
                    start_coord = self._convert_room_coord_to_global(
                        room_r, room_c, self.room_size // 2, self.room_size // 2
                    )
                # terminal state
                if count == nrooms - 1:
                    terminal_state = self._convert_room_coord_to_global(
                        room_r, room_c, self.room_size // 2, self.room_size // 2
                    )
                # trap
                if include_traps:
                    self.traps.append(
                        self._convert_room_coord_to_global(
                            room_r,
                            room_c,
                            self.room_size // 2 + 1,
                            self.room_size // 2 + 1,
                        )
                    )
                count += 1

        terminal_states = (terminal_state,) + tuple(self.traps)

        if self.reward_free:
            reward_at = {}
        else:
            reward_at = {
                terminal_state: 1.0,
                start_coord: 0.01,
                (self.room_size // 2, self.room_size // 2): 0.1,
            }

        # Check remove_walls
        if remove_walls:
            walls = ()

        # Init base class
        GridWorld.__init__(
            self,
            nrows=nrows,
            ncols=ncols,
            start_coord=start_coord,
            terminal_states=terminal_states,
            success_probability=success_probability,
            reward_at=reward_at,
            walls=walls,
            default_reward=0.0,
        )

        # Check initial distribution
        if initial_state_distribution == "uniform":
            distr = np.ones(self.observation_space.n) / self.observation_space.n
            self.set_initial_state_distribution(distr)

        # spaces
        if self.array_observation:
            self.discrete_observation_space = self.observation_space
            self.observation_space = spaces.Box(0.0, 1.0, shape=(2,))
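This constructor lays out room_size x room_size rooms in a snake pattern, carves doors between adjacent rooms and optionally adds traps. A usage sketch follows, assuming the constructor belongs to rlberry's NRoom environment; the class name and import path are not shown in the snippet and are therefore assumptions.

# Hypothetical usage; the NRoom class name and import path are assumptions.
from rlberry.envs.benchmarks.grid_exploration.nroom import NRoom

env = NRoom(nrooms=5, room_size=5, include_traps=True)
observation = env.reset()
for _ in range(10):
    observation, reward, done, info = env.step(env.action_space.sample())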
Example #26
"""
Illustration of how to set up an MBQVI algorithm in rlberry.
The environment chosen here is the GridWorld environment.

.. video:: ../../video_plot_mbqvi.mp4
   :width: 600

"""
# sphinx_gallery_thumbnail_path = 'thumbnails/video_plot_mbqvi.jpg'
from rlberry.agents.mbqvi import MBQVIAgent
from rlberry.envs.finite import GridWorld

params = {}
params["n_samples"] = 100  # samples per state-action pair
params["gamma"] = 0.99
params["horizon"] = None

env = GridWorld(7, 10, walls=((2, 2), (3, 3)), success_probability=0.6)
agent = MBQVIAgent(env, **params)
info = agent.fit()
print(info)

# evaluate policy in a deterministic version of the environment
env_eval = GridWorld(7, 10, walls=((2, 2), (3, 3)), success_probability=1.0)
env_eval.enable_rendering()
state = env_eval.reset()
for tt in range(50):
    action = agent.policy(state)
    next_s, _, _, _ = env_eval.step(action)
    state = next_s
video = env_eval.save_video("_video/video_plot_mbqvi.mp4")
Example #27
from rlberry.agents.dynprog import ValueIterationAgent
from rlberry.envs.finite import GridWorld, Chain

for env in [Chain(), GridWorld(7, 10, walls=((2, 2), (3, 3)))]:
    agent = ValueIterationAgent(env, gamma=0.95)
    info = agent.fit()
    print(info)

    env.enable_rendering()

    state = env.reset()
    for tt in range(50):
        action = agent.policy(state)
        next_s, _, done, _ = env.step(action)
        if done:
            break
        state = next_s
    env.render()