def test(env_name, episodes, params, render):
    # Create a Gym environment
    env = gym.make(env_name)

    # Get dimensionalities of actions and observations
    action_space_dim = env.action_space.shape[-1]
    observation_space_dim = env.observation_space.shape[-1]

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    policy.load_state_dict(params)
    agent = Agent(policy)

    test_reward, test_len = 0, 0
    for ep in range(episodes):
        done = False
        observation = env.reset()
        while not done:
            # Similar to the training loop above -
            # get the action, act on the environment, save total reward
            # (evaluation=True makes the agent always return what it thinks to be
            # the best action - there is no exploration at this point)
            action, _ = agent.get_action(observation, evaluation=True)
            observation, reward, done, info = env.step(
                action.detach().cpu().numpy())

            if render:
                env.render()

            test_reward += reward
            test_len += 1

    print("Average test reward:", test_reward / episodes,
          "episode length:", test_len / episodes)
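# --- Usage sketch (not part of the original file) ---
# Evaluate a checkpoint produced by train(). The checkpoint filename and the
# environment name below are illustrative assumptions; substitute the ones your
# training run actually saved.
def evaluate_checkpoint(env_name="CartPole-v0",
                        checkpoint="model_CartPole-v0_0.mdl"):
    import torch
    params = torch.load(checkpoint)  # state dict saved by train()
    test(env_name, episodes=10, params=params, render=True)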
def main():
    import argparse
    parser = argparse.ArgumentParser()
    # Boolean flags: argparse's type=bool treats any non-empty string as True,
    # so plain on/off flags are declared with action='store_true' instead.
    parser.add_argument('--train', action='store_true')
    parser.add_argument('--seed', type=int, default=101)
    parser.add_argument('--save_path', '-s', type=str, default='save_model/ckpt.h5')
    parser.add_argument('--load', '-l', action='store_true')
    parser.add_argument('--load_from', '-lf', type=str, default='save_model/ckpt.h5')
    args = parser.parse_args()

    # Set up the environment and the agent
    env = AirTrafficGym(args.seed)
    agent = Agent(state_size=env.observation_space[0],
                  action_size=env.action_space.n)

    # Optionally restore a previous checkpoint before training
    if args.load:
        agent.load(args.load_from)
    if args.train:
        train(env, agent, save_path=args.save_path)
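# --- Entry-point sketch (assumption: main() lives in a standalone script) ---
# With the store_true flags above, training could be launched e.g. as
#   python main.py --train --seed 101 -s save_model/ckpt.h5
# where main.py is a hypothetical filename.
if __name__ == '__main__':
    main()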
# Create a Gym environment
env = CartPoleEnv()

# For CartPole - maximum episode length
env._max_episode_steps = 1000

# Get dimensionalities of actions and observations
action_space_dim = 1
observation_space_dim = 4

# Create the agent, value estimates and the policy
policy = Policy(observation_space_dim)
value_nn = Value(observation_space_dim)
agent = Agent(policy, value_nn)


def plot_heatmaps():
    # Evaluate the critic on a grid of cart positions and pole angles
    # (cart velocity and pole angular velocity are fixed at 0)
    xspace = np.linspace(-2.4, 2.4, 40)
    tspace = np.linspace(-0.3, 0.3, 40)
    val_estimates = np.zeros((40, 40))
    with torch.no_grad():
        for i, x in enumerate(xspace):
            for j, t in enumerate(tspace):
                state = torch.from_numpy(np.array([x, 0, t, 0])).float()
                val_estimates[i, j] = agent.value.forward(state).item()
    # Return the grid; rendering is left to the caller (see the sketch below)
    return val_estimates
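# --- Rendering sketch (not part of the original fragment) ---
# plot_heatmaps() above only fills the value grid; this shows one way to display
# it, with cart position on the x-axis and pole angle on the y-axis. The axis
# labels and colorbar are illustrative choices.
def show_value_heatmap():
    import matplotlib.pyplot as plt
    values = plot_heatmaps()
    # Transpose so that rows correspond to pole angle (y-axis) and
    # columns to cart position (x-axis)
    plt.imshow(values.T, origin="lower", extent=[-2.4, 2.4, -0.3, 0.3],
               aspect="auto")
    plt.colorbar(label="Estimated state value")
    plt.xlabel("Cart position x")
    plt.ylabel("Pole angle (rad)")
    plt.title("Value estimates (velocities fixed at 0)")
    plt.show()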
def train(env_name, print_things=True, train_run_id=0, train_episodes=5000):
    # Create a Gym environment
    env = gym.make(env_name)

    # Get dimensionalities of actions and observations
    action_space_dim = env.action_space.shape[-1]
    observation_space_dim = env.observation_space.shape[-1]

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    agent = Agent(policy, normalize=True)

    # Arrays to keep track of rewards
    reward_history, timestep_history = [], []
    average_reward_history = []

    # Run actual training
    for episode_number in range(train_episodes):
        reward_sum, timesteps = 0, 0
        done = False
        # Reset the environment and observe the initial state
        observation = env.reset()

        # Loop until the episode is over
        while not done:
            # Get action from the agent
            action, action_probabilities, state_val = agent.get_action(
                observation, ep=episode_number)
            previous_observation = observation

            # Perform the action on the environment, get new state and reward
            observation, reward, done, info = env.step(
                action.detach().cpu().numpy())

            # Store action's outcome (so that the agent can improve its policy)
            agent.store_outcome(previous_observation, action_probabilities,
                                action, reward, state_val)

            # Store total episode reward
            reward_sum += reward
            timesteps += 1

        if print_things:
            print("Episode {} finished. Total reward: {:.3g} ({} timesteps)"
                  .format(episode_number, reward_sum, timesteps))

        # Bookkeeping (mainly for generating plots)
        reward_history.append(reward_sum)
        timestep_history.append(timesteps)
        if episode_number > 100:
            avg = np.mean(reward_history[-100:])
        else:
            avg = np.mean(reward_history)
        average_reward_history.append(avg)

        # Let the agent do its magic (update the policy)
        agent.episode_finished(episode_number)

    # Training is finished - plot rewards
    if print_things:
        plt.plot(reward_history)
        plt.plot(average_reward_history)
        plt.legend(["Reward", "100-episode average"])
        plt.title("Reward history")
        plt.show()
        print("Training finished.")

    data = pd.DataFrame({
        "episode": np.arange(len(reward_history)),
        "train_run_id": [train_run_id] * len(reward_history),
        # TODO: Change algorithm name for plots, if you want
        "algorithm": ["PG"] * len(reward_history),
        "reward": reward_history
    })
    torch.save(agent.policy.state_dict(),
               "model_%s_%d.mdl" % (env_name, train_run_id))

    return data
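# --- Aggregation sketch (not part of the original file) ---
# train() returns a tidy DataFrame per run, so several runs can be concatenated
# and averaged per episode for a smoother learning curve. The environment name
# and number of runs below are illustrative assumptions.
def run_multiple_trainings(env_name="CartPole-v0", n_runs=3):
    import pandas as pd
    import matplotlib.pyplot as plt
    runs = [train(env_name, print_things=False, train_run_id=run_id)
            for run_id in range(n_runs)]
    data = pd.concat(runs, ignore_index=True)
    data.groupby("episode")["reward"].mean().plot(
        title="Mean reward over %d runs" % n_runs)
    plt.xlabel("Episode")
    plt.ylabel("Reward")
    plt.show()
    return data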
def train(env_name, print_things=True, train_run_id=0, train_episodes=5000):
    # Create a Gym environment
    env = gym.make(env_name)

    # Get dimensionalities of actions and observations
    action_space_dim = env.action_space.shape[-1]
    observation_space_dim = env.observation_space.shape[-1]

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    agent = Agent(policy)

    # Arrays to keep track of rewards
    reward_history, timestep_history = [], []
    average_reward_history = []

    # Global timestep counter, used to concatenate transitions across episodes
    # and to update the network every 10 timesteps
    external_timestep = 0

    # Run actual training
    for episode_number in range(train_episodes):
        reward_sum, timesteps = 0, 0
        done = False
        # Reset the environment and observe the initial state
        observation = env.reset()

        # Loop until the episode is over
        while not done:
            action, action_probabilities = agent.get_action(observation)
            previous_observation = observation

            # Perform the action on the environment, get new state and reward
            observation, reward, done, info = env.step(action.detach().numpy())

            # Store action's outcome (so that the agent can improve its policy)
            agent.store_outcome(previous_observation, action_probabilities,
                                action, reward)

            # Store the value of the next state as well:
            # if the episode is done the next value is 0,
            # otherwise compute it with the value head of the network
            if done:
                agent.store_next_values(torch.tensor([0.0]))
            else:
                x = torch.from_numpy(observation).float().to(agent.train_device)
                _, v_next = agent.policy.forward(x)
                agent.store_next_values(v_next)

            # Store total episode reward
            reward_sum += reward
            timesteps += 1

            # TASK 4
            # The external timestep counter concatenates transitions across
            # episode boundaries; the network is updated every 10 timesteps
            external_timestep += 1
            if external_timestep % 10 == 0:
                agent.episode_finished()

        if print_things:
            print("Episode {} finished. Total reward: {:.3g} ({} timesteps)"
                  .format(episode_number, reward_sum, timesteps))

        # Bookkeeping (mainly for generating plots)
        reward_history.append(reward_sum)
        timestep_history.append(timesteps)
        if episode_number > 100:
            avg = np.mean(reward_history[-100:])
        else:
            avg = np.mean(reward_history)
        average_reward_history.append(avg)

    # Training is finished - plot rewards
    if print_things:
        plt.plot(reward_history)
        plt.plot(average_reward_history)
        plt.legend(["Reward", "100-episode average"])
        plt.title("Reward history")
        plt.savefig('./train_rew.jpg')
        plt.show()
        print("Training finished.")

    data = pd.DataFrame({
        "episode": np.arange(len(reward_history)),
        "train_run_id": [train_run_id] * len(reward_history),
        # TODO: Change algorithm name for plots, if you want
        "algorithm": ["PG"] * len(reward_history),
        "reward": reward_history
    })
    torch.save(agent.policy.state_dict(),
               "model_%s_%d.mdl" % (env_name, train_run_id))

    return data
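# --- Compatibility note (not part of the original code) ---
# The loops above use the pre-0.26 Gym API (reset() returns only the observation,
# step() returns obs, reward, done, info). On newer gym/gymnasium releases a thin
# wrapper like this sketch can restore that interface without modifying train().
class OldGymAPIWrapper:
    def __init__(self, env):
        self.env = env
        self.action_space = env.action_space
        self.observation_space = env.observation_space

    def reset(self):
        observation, _info = self.env.reset()
        return observation

    def step(self, action):
        observation, reward, terminated, truncated, info = self.env.step(action)
        # Collapse the two termination flags into the old single "done" flag
        return observation, reward, terminated or truncated, info

    def render(self):
        return self.env.render()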