Example #1
# Assumed imports, inferred from the rlutil project (exact module paths may differ):
import itertools
import random

import numpy as np

from rlutil import math_utils
from rlutil.envs import env_wrapper
from rlutil.envs.gridcraft import grid_env_cy, grid_spec_cy
from rlutil.envs.gridcraft.grid_spec_cy import TileType


def random_grid_env(size_x,
                    size_y,
                    dim_obs=32,
                    time_limit=50,
                    wall_ratio=0.1,
                    smooth_obs=False,
                    distance_reward=True,
                    one_hot_obs=False,
                    seed=None,
                    absorb=False,
                    tabular=False):
    total_size = size_x * size_y
    locations = list(itertools.product(range(size_x), range(size_y)))
    # The agent starts at the center cell; exclude it from wall/reward candidates.
    start_loc = (size_x // 2, size_y // 2)
    locations.remove(start_loc)

    with math_utils.np_seed(seed):
        # randomly place walls
        wall_locs = random.sample(locations, int(total_size * wall_ratio))
        for loc in wall_locs:
            locations.remove(loc)

        # sample candidate reward cells and pick the one furthest from the start
        cand_reward_locs = random.sample(locations, int(0.25 * total_size))
        cand_reward_dists = [
            np.linalg.norm(np.array(loc) - start_loc) for loc in cand_reward_locs
        ]
        reward_loc = cand_reward_locs[np.argmax(cand_reward_dists)]
        locations.remove(reward_loc)

        gs = grid_spec_cy.spec_from_sparse_locations(
            size_x, size_y, {
                TileType.START: [start_loc],
                TileType.WALL: wall_locs,
                TileType.REWARD: [reward_loc]
            })

        if distance_reward:
            env = grid_env_cy.DistanceRewardGridEnv(gs, reward_loc[0],
                                                    reward_loc[1],
                                                    start_loc[0], start_loc[1])
        else:
            env = grid_env_cy.GridEnv(gs)
        env = env_wrapper.StochasticActionWrapper(env, eps=0.05)

        if absorb:
            # wrap terminal transitions into an explicit absorbing state
            env = env_wrapper.AbsorbingStateWrapper(env)
        if tabular:
            # keep tabular state indices; just enforce the time limit
            env = wrap_time(env, time_limit=time_limit)
        else:
            # project tabular states to dim_obs-dimensional observations
            env = wrap_obs_time(env,
                                time_limit=time_limit,
                                one_hot_obs=one_hot_obs,
                                dim_obs=dim_obs,
                                smooth_obs=smooth_obs)
    return env
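
A hypothetical usage sketch, not part of the original example: building a small grid environment and stepping it. The Gym-style reset()/step() interface and the action_space attribute are assumptions about rlutil's wrappers, not confirmed by the code above.

# Hypothetical usage sketch; assumes the wrapped env exposes a Gym-style API.
env = random_grid_env(16, 16, dim_obs=16, time_limit=50, seed=0)
obs = env.reset()
for _ in range(10):
    obs, rew, done, info = env.step(env.action_space.sample())
    if done:
        break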
Example #2

    def testSample1(self):
        buffer = replay_buffer_fqi.TabularReplayBuffer(self.env)
        buffer.add(0, 1, 0, 1.0)  # add(state, action, next_state, reward)
        buffer.add(1, 0, 1, 0.0)

        with math_utils.np_seed(0):
            samples = buffer.sample(100)
        self.assertEqual(np.sum(samples[0]), 49)  # s
        self.assertEqual(np.sum(samples[1]), 51)  # a
        self.assertEqual(np.sum(samples[2]), 49)  # ns
        self.assertEqual(np.sum(samples[3]), 51)  # r
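
The buffer interface exercised above is add(s, a, ns, r) plus sample(batch_size), which returns parallel arrays of states, actions, next states, and rewards. A minimal sketch of the same pattern outside the test class, hedged: only the calls visible in the test are assumed.

# Sketch reusing only the API shown in testSample1.
buffer = replay_buffer_fqi.TabularReplayBuffer(env)
buffer.add(0, 1, 0, 1.0)  # (s, a, ns, r)
buffer.add(1, 0, 1, 0.0)
with math_utils.np_seed(0):
    s, a, ns, r = buffer.sample(100)  # four arrays of length 100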
Example #3
File: simple_env.py | Project: afcarl/rlutil
# Assumed imports (np_seed is rlutil's seeding context manager; exact path may differ):
import numpy as np
from rlutil.math_utils import np_seed


def random_env_register(Nstates,
                        Nact,
                        max_timesteps=20,
                        seed=None,
                        terminate=False,
                        t_sparsity=0.75,
                        deterministic=False,
                        dim_obs=-1):
    assert Nstates >= 2
    if seed is None:
        seed = 0
    # state 0 yields reward; start states are drawn from the first half of the rest
    reward_state = 0
    start_state = list(range(1, Nstates // 2))
    with np_seed(seed):
        if not deterministic:
            # dense random transitions: exponentiate, sparsify, then renormalize
            transition_matrix = np.random.rand(Nstates, Nact, Nstates)
            transition_matrix = np.exp(transition_matrix)
            for s in range(Nstates):
                for a in range(Nact):
                    zero_idxs = np.random.randint(
                        0, Nstates, size=int(Nstates * t_sparsity))
                    transition_matrix[s, a, zero_idxs] = 0.0
            transition_matrix = transition_matrix / np.sum(
                transition_matrix, axis=2, keepdims=True)
        else:
            # deterministic: exactly one successor per (state, action) pair
            transition_matrix = np.zeros((Nstates, Nact, Nstates))
            trans_idx = np.random.randint(0, Nstates, size=(Nstates, Nact))
            for state in range(Nstates):
                for act in range(Nact):
                    transition_matrix[state, act, trans_idx[state, act]] = 1.0

        if dim_obs > 0:
            # optional random linear observation features per state
            obs_matrix = np.random.randn(dim_obs, Nstates)
        else:
            obs_matrix = None

        reward = np.zeros((Nstates, Nact))
        reward[reward_state, :] = 1.0
        #reward = np.random.randn(Nstates,1 ) + reward

        # make the reward state absorbing under one fixed action
        stable_action = seed % Nact  #np.random.randint(0, Nact)
        transition_matrix[reward_state, stable_action] = np.zeros(Nstates)
        transition_matrix[reward_state, stable_action, reward_state] = 1
    return {
        'reward': reward,
        'init_state': start_state,
        'terminate_on_reward': terminate,
        'transition_matrix': transition_matrix,
        'max_timesteps': max_timesteps,
        'obs_matrix': obs_matrix,
    }
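
A quick sanity check on the returned spec, as hypothetical usage of the function above: every (state, action) slice of transition_matrix must be a probability distribution, and the reward state must be absorbing under stable_action = seed % Nact.

# Hypothetical check of the generated MDP spec.
import numpy as np

spec = random_env_register(Nstates=8, Nact=2, seed=1)
P = spec['transition_matrix']
assert np.allclose(P.sum(axis=2), 1.0)  # each (s, a) row sums to 1
assert P[0, 1, 0] == 1.0  # reward state 0 absorbs under action seed % Nact = 1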
Example #4
# Relies on random_grid_env (Example #1) plus the wrap_time / wrap_obs_time
# helpers defined alongside it.
def get_env(name):
    if name == 'grid16randomobs':
        env = random_grid_env(16,
                              16,
                              dim_obs=16,
                              time_limit=50,
                              wall_ratio=0.2,
                              smooth_obs=False,
                              seed=0)
    elif name == 'grid16onehot':
        env = random_grid_env(16,
                              16,
                              time_limit=50,
                              wall_ratio=0.2,
                              one_hot_obs=True,
                              seed=0)
    elif name == 'grid16sparse':
        env = random_grid_env(16,
                              16,
                              time_limit=50,
                              wall_ratio=0.2,
                              one_hot_obs=True,
                              seed=0,
                              distance_reward=False)
    elif name == 'grid64randomobs':
        env = random_grid_env(64,
                              64,
                              dim_obs=64,
                              time_limit=100,
                              wall_ratio=0.2,
                              smooth_obs=False,
                              seed=0)
    elif name == 'grid64onehot':
        env = random_grid_env(64,
                              64,
                              time_limit=100,
                              wall_ratio=0.2,
                              one_hot_obs=True,
                              seed=0)
    elif name == 'cliffwalk':
        with math_utils.np_seed(0):
            env = tabular_env.CliffwalkEnv(25)
            # Cliffwalk is unsolvable by Q-iteration (QI) with moderate entropy;
            # raise the absorbing reward to reduce the effect.
            env = env_wrapper.AbsorbingStateWrapper(env, absorb_reward=10.0)
            env = wrap_obs_time(env, dim_obs=16, time_limit=50)
    elif name == 'pendulum':
        env = tabular_env.InvertedPendulum(state_discretization=32,
                                           action_discretization=5)
        env = wrap_time(env, time_limit=50)
    elif name == 'mountaincar':
        env = tabular_env.MountainCar(posdisc=56, veldisc=32)
        # MountainCar is unsolvable by Q-iteration (QI) with moderate entropy;
        # raise the absorbing reward to reduce the effect.
        env = env_wrapper.AbsorbingStateWrapper(env, absorb_reward=10.0)
        env = wrap_time(env, time_limit=100)
    elif name == 'sparsegraph':
        with math_utils.np_seed(0):
            env = tabular_env.RandomTabularEnv(num_states=500,
                                               num_actions=3,
                                               transitions_per_action=1,
                                               self_loop=True)
            env = env_wrapper.AbsorbingStateWrapper(env, absorb_reward=10.0)
            env = wrap_obs_time(env, dim_obs=4, time_limit=10)
    else:
        raise NotImplementedError('Unknown env id: %s' % name)
    return env
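
Lookup is by plain string id; a short sketch using ids taken from the branches above:

# Example lookups; ids are the strings matched in get_env.
env = get_env('grid16randomobs')  # 16x16 grid, 16-dim random observations
env = get_env('mountaincar')      # discretized MountainCar with absorbing bonus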