import gym

import environments


def get_env(args):
    """Build the training env, the evaluation env, and (when available) the tabular MDP model."""
    mdp = None  # not every environment provides an explicit MDP model
    if args.env == 'GridWorld':
        from envs.gridworld import GridworldEnv
        env = GridworldEnv()
        eval_env = GridworldEnv()
        mdp = environments.build_gridworld()
    elif args.env == 'WindyGridWorld':
        from envs.windy_gridworld import WindyGridworldEnv
        env = WindyGridworldEnv()
        eval_env = WindyGridworldEnv()
        mdp = environments.build_windy_gridworld()
    elif args.env == 'CliffWalking':
        env = gym.make("CliffWalking-v0")
        eval_env = gym.make("CliffWalking-v0")
    elif args.env == 'FrozenLake':
        env = gym.make("FrozenLake-v0")
        eval_env = gym.make("FrozenLake-v0")
        mdp = environments.build_FrozenLake()
    elif args.env == 'FrozenLake8':
        env = gym.make("FrozenLake8x8-v0")
        eval_env = gym.make("FrozenLake8x8-v0")
    elif args.env == 'Taxi':
        env = gym.make("Taxi-v2")
        eval_env = gym.make("Taxi-v2")
    elif args.env == 'twostateMDP':
        from envs.twostateMDP import twostateMDP
        env = gym.make('twostateMDP-v0')
        eval_env = gym.make('twostateMDP-v0')
        mdp = environments.mdp_fig2d()
        args.env = environments.mdp_fig2d
    return env, eval_env, mdp
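# A minimal usage sketch (assumptions: callers pass an argparse Namespace whose
# `env` attribute matches one of the branches above; the `--env` flag name and
# the print statement are illustrative, not part of the original interface).
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='GridWorld',
                        help='One of: GridWorld, WindyGridWorld, CliffWalking, '
                             'FrozenLake, FrozenLake8, Taxi, twostateMDP')
    args = parser.parse_args()

    env, eval_env, mdp = get_env(args)
    # mdp is (P, R, gamma, initial_distribution) for the tabular branches, else None.
    print(env, eval_env, type(mdp))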
import numpy as np

from envs.gridworld import GridworldEnv


def build_gridworld():
    """Convert GridworldEnv into explicit (P, R, gamma, initial_distribution) matrices."""
    env = GridworldEnv()
    # Transition tensor P[a, s, s'] and expected-reward matrix R[s, a].
    P = np.zeros((env.action_space.n, env.observation_space.n, env.observation_space.n))
    R = np.zeros((env.observation_space.n, env.action_space.n))
    for s in range(env.observation_space.n):
        for a in range(env.action_space.n):
            for p, ns, r, _ in env.P[s][a]:
                P[a, s, ns] += p
                R[s, a] += p * r
    # Uniform distribution over initial states.
    initial_distribution = np.ones(env.observation_space.n) / env.observation_space.n
    gamma = 0.9
    return P, R, gamma, initial_distribution
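# A sketch of how the (P, R, gamma, initial_distribution) tuple can be consumed:
# exact evaluation of a uniformly random policy by solving the linear Bellman
# equations. This helper is illustrative only and not part of the original module.
def evaluate_uniform_policy(P, R, gamma):
    n_actions, n_states, _ = P.shape
    pi = np.ones((n_states, n_actions)) / n_actions       # uniform random policy
    P_pi = np.einsum('sa,asn->sn', pi, P)                  # state-to-state transitions under pi
    r_pi = np.einsum('sa,sa->s', pi, R)                    # expected per-state reward under pi
    # Solve (I - gamma * P_pi) v = r_pi for the state-value vector v.
    return np.linalg.solve(np.eye(n_states) - gamma * P_pi, r_pi)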
    # (tail of the elite-experience helper; see the gather_elite_xp call below)
    return np.array(unpacked_elite_batch_obs), np.array(unpacked_elite_batch_actions), reward_threshold


def gen_action_distribution(action_index, action_dim=5):
    """One-hot encode an action index into a (1, action_dim) distribution."""
    action_distribution = np.zeros(action_dim).astype(type(action_index))
    action_distribution[action_index] = 1
    action_distribution = np.expand_dims(action_distribution, 0)
    return action_distribution


if __name__ == "__main__":
    total_trajectory_rollouts = 70
    elitism_criterion = 70  # percentile of episode returns used to select elite trajectories
    num_epochs = 100

    mean_rewards = []
    elite_reward_thresholds = []

    env = GridworldEnv()
    agent = Agent(env.action_space.n, env.observation_space.shape)

    for i in tqdm(range(num_epochs)):
        # Collect a batch of trajectories with the current policy.
        trajectories = [Trajectory(*rollout(agent, env))
                        for _ in range(total_trajectory_rollouts)]
        _, _, batch_reward = zip(*trajectories)
        # Keep only the elite (top-percentile) experience.
        elite_obs, elite_actions, elite_threshold = gather_elite_xp(
            trajectories, elitism_criterion=elitism_criterion)
        elite_action_distributions = np.array(
            [gen_action_distribution(a.item()) for a in elite_actions])
        elite_obs = elite_obs.astype("float16")
        elite_action_distributions = elite_action_distributions.astype("float16")
        # Fit the policy to imitate the elite actions.
        agent.learn(elite_obs, elite_action_distributions,
                    batch_size=128, epochs=3, verbose=0)
        mean_rewards.append(np.mean(batch_reward))
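# A plausible sketch of the elite-selection step whose tail appears above (the
# function body, its name suffix, and the assumed Trajectory layout of
# (obs, actions, reward) are assumptions, not the original implementation):
# keep only trajectories whose return reaches the given percentile of the batch.
def gather_elite_xp_sketch(trajectories, elitism_criterion):
    batch_obs, batch_actions, batch_rewards = zip(*trajectories)
    reward_threshold = np.percentile(batch_rewards, elitism_criterion)
    elite_obs = [obs for obs, r in zip(batch_obs, batch_rewards) if r >= reward_threshold]
    elite_actions = [act for act, r in zip(batch_actions, batch_rewards) if r >= reward_threshold]
    # Flatten per-trajectory lists into one batch of observations and actions.
    unpacked_elite_batch_obs = [o for traj in elite_obs for o in traj]
    unpacked_elite_batch_actions = [a for traj in elite_actions for a in traj]
    return np.array(unpacked_elite_batch_obs), np.array(unpacked_elite_batch_actions), reward_threshold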
        # (body of the rollout loop inside run_episode)
        states.append(state)
        actions.append(action)
        rewards.append(reward)

        if done:
            break
        else:
            i += 1
            if i >= timeout:
                timeouted = True
                break

    # Only update the agent on episodes that terminated naturally.
    if not timeouted:
        episode = (states, actions, rewards)
        agent.update(episode)


if __name__ == '__main__':
    from envs.gridworld import GridworldEnv

    nx, ny = 5, 5
    env = GridworldEnv([ny, nx])
    mc_agent = MCAgent(gamma=1.0, lr=1e-3, num_states=nx * ny,
                       num_actions=4, epsilon=1.0)
    run_episode(env, mc_agent)
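# Illustrative only: the kind of return computation a first-visit Monte Carlo
# agent such as MCAgent typically performs on a completed episode. The real
# MCAgent.update is defined elsewhere in the repository; this is not its implementation.
def monte_carlo_returns(rewards, gamma):
    returns, g = [], 0.0
    for r in reversed(rewards):        # accumulate G_t = r_t + gamma * G_{t+1}
        g = r + gamma * g
        returns.append(g)
    return list(reversed(returns))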