Example #1
import gym

import environments  # project module providing the tabular build_* helpers used below


def get_env(args):
    """Create a training env, an evaluation env and, where available, a tabular MDP model."""
    mdp = None  # only some environments have a matching build_* helper
    if args.env == 'GridWorld':
        from envs.gridworld import GridworldEnv
        env = GridworldEnv()
        eval_env = GridworldEnv()
        mdp = environments.build_gridworld()
    elif args.env == 'WindyGridWorld':
        from envs.windy_gridworld import WindyGridworldEnv
        env = WindyGridworldEnv()
        eval_env = WindyGridworldEnv()
        mdp = environments.build_windy_gridworld()
    elif args.env == 'CliffWalking':
        env = gym.make("CliffWalking-v0")
        eval_env = gym.make("CliffWalking-v0")
    elif args.env == 'FrozenLake':
        env = gym.make("FrozenLake-v0")
        eval_env = gym.make("FrozenLake-v0")
        mdp = environments.build_FrozenLake()
    elif args.env == 'FrozenLake8':
        env = gym.make("FrozenLake8x8-v0")
        eval_env = gym.make("FrozenLake8x8-v0")
    elif args.env == 'Taxi':
        env = gym.make("Taxi-v2")  # environment IDs follow the older gym API
        eval_env = gym.make("Taxi-v2")
    elif args.env == 'twostateMDP':
        from envs.twostateMDP import twostateMDP
        env = gym.make('twostateMDP-v0')
        eval_env = gym.make('twostateMDP-v0')
        mdp = environments.mdp_fig2d()
        args.env = environments.mdp_fig2d
    return env, eval_env, mdp
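
A minimal call sketch, assuming an argparse-style namespace with an `env` attribute (the flag name and default shown here are illustrative, not from the source):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--env', default='GridWorld')
args = parser.parse_args([])

env, eval_env, mdp = get_env(args)
if mdp is not None:
    # build_gridworld() (Example #2) returns dense model matrices.
    P, R, gamma, initial_distribution = mdp
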
Example #2
import numpy as np

from envs.gridworld import GridworldEnv


def build_gridworld():
    """Turn GridworldEnv's transition dictionary into dense model matrices."""
    env = GridworldEnv()
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    # P[a, s, ns]: probability of landing in ns after taking action a in state s.
    # R[s, a]: expected immediate reward of taking action a in state s.
    P = np.zeros((num_actions, num_states, num_states))
    R = np.zeros((num_states, num_actions))
    for s in range(num_states):
        for a in range(num_actions):
            for p, ns, r, _ in env.P[s][a]:
                P[a, s, ns] += p
                R[s, a] += p * r
    # Uniform distribution over initial states.
    initial_distribution = np.ones(num_states) / num_states
    gamma = 0.9
    return P, R, gamma, initial_distribution
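
These dense (P, R) matrices are the natural input to exact dynamic programming; below is a minimal value-iteration sketch under that assumption (the `value_iteration` helper is illustrative, not part of the source):

import numpy as np

def value_iteration(P, R, gamma, tol=1e-8):
    # Bellman optimality backups on the dense model from build_gridworld().
    num_actions, num_states, _ = P.shape
    V = np.zeros(num_states)
    while True:
        # Q[s, a] = R[s, a] + gamma * sum_ns P[a, s, ns] * V[ns]
        Q = R + gamma * np.einsum('asn,n->sa', P, V)
        V_new = Q.max(axis=1)
        if np.max(np.abs(V_new - V)) < tol:
            return V_new, Q.argmax(axis=1)
        V = V_new

P, R, gamma, initial_distribution = build_gridworld()
V_star, greedy_policy = value_iteration(P, R, gamma)
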
Example #3
    # (Tail of the elided gather_elite_xp() helper called in the training loop
    # below: it returns the elite observations, elite actions and the reward
    # threshold that defines the elite set.)
    return np.array(unpacked_elite_batch_obs), np.array(unpacked_elite_batch_actions), reward_threshold

def gen_action_distribution(action_index, action_dim=5):
    """Return a one-hot action distribution of shape (1, action_dim)."""
    action_distribution = np.zeros(action_dim).astype(type(action_index))
    action_distribution[action_index] = 1
    action_distribution = np.expand_dims(action_distribution, 0)
    return action_distribution
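
For reference, what the helper produces for action index 2 with the default action_dim of 5:

dist = gen_action_distribution(2)
print(dist)        # [[0 0 1 0 0]]
print(dist.shape)  # (1, 5)
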

if __name__ == "__main__":
    # Cross-entropy-style training loop: roll out trajectories, keep the elite
    # ones above a reward threshold, and fit the agent to their state/action pairs.
    total_trajectory_rollouts = 70
    elitism_criterion = 70  # passed to gather_elite_xp (typically a reward percentile)
    num_epochs = 100
    mean_rewards = []
    elite_reward_thresholds = []

    env = GridworldEnv()
    agent = Agent(env.action_space.n, env.observation_space.shape)
    for i in tqdm(range(num_epochs)):
        # Collect a batch of (observations, actions, reward) trajectories.
        trajectories = [Trajectory(*rollout(agent, env)) for _ in range(total_trajectory_rollouts)]

        _, _, batch_reward = zip(*trajectories)

        # Keep only the experience from trajectories above the elite threshold.
        elite_obs, elite_actions, elite_threshold = gather_elite_xp(trajectories, elitism_criterion=elitism_criterion)

        # Turn the elite actions into one-hot targets for supervised fitting.
        elite_action_distributions = np.array([gen_action_distribution(a.item()) for a in elite_actions])

        elite_obs, elite_action_distributions = elite_obs.astype("float16"), elite_action_distributions.astype("float16")

        agent.learn(elite_obs, elite_action_distributions, batch_size=128, epochs=3, verbose=0)

        mean_rewards.append(np.mean(batch_reward))
        elite_reward_thresholds.append(elite_threshold)
        # (Separate fragment: tail of the stepping loop from the elided
        # run_episode() helper used in the Monte Carlo agent example below.
        # The latest transition is recorded, and the loop stops on termination
        # or after `timeout` steps.)
        states.append(state)
        actions.append(action)
        rewards.append(reward)

        if done:
            break
        else:
            i += 1
            if i >= timeout:
                timeouted = True
                break

    # Only learn from episodes that terminated before the timeout.
    if not timeouted:
        episode = (states, actions, rewards)
        agent.update(episode)


if __name__ == '__main__':
    from envs.gridworld import GridworldEnv

    nx, ny = 5, 5
    env = GridworldEnv([ny, nx])

    mc_agent = MCAgent(gamma=1.0,
                       lr=1e-3,
                       num_states=nx * ny,
                       num_actions=4,
                       epsilon=1.0)

    run_episode(env, mc_agent)