Example #1
def test(env_name, episodes, params, render):
    # Create a Gym environment
    env = gym.make(env_name)

    # Get dimensionalities of actions and observations
    action_space_dim = env.action_space.shape[-1]
    observation_space_dim = env.observation_space.shape[-1]

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    policy.load_state_dict(params)
    agent = Agent(policy)

    test_reward, test_len = 0, 0
    for ep in range(episodes):
        done = False
        observation = env.reset()
        while not done:
            # As in the training loops below: get the action, act on the
            # environment, and accumulate the total reward.
            # (evaluation=True makes the agent always return what it considers
            # the best action - there is no exploration at this point)
            action, _ = agent.get_action(observation, evaluation=True)
            observation, reward, done, info = env.step(
                action.detach().cpu().numpy())

            if render:
                env.render()
            test_reward += reward
            test_len += 1
    print("Average test reward:", test_reward / episodes, "episode length:",
          test_len / episodes)
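
For reference, this routine is typically called with a state_dict produced by one of the train() functions below. The environment name and checkpoint file here are placeholders for whichever continuous-control task the policy was trained on, not names taken from the original code:

import torch

# Placeholder names - substitute the environment and checkpoint you trained with
env_name = "InvertedPendulum-v2"
params = torch.load("model_%s_0.mdl" % env_name)
test(env_name, episodes=10, params=params, render=False)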
Example #2
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--train', action='store_true')
    parser.add_argument('--seed', type=int, default=101)
    parser.add_argument('--save_path',
                        '-s',
                        type=str,
                        default='save_model/ckpt.h5')
    parser.add_argument('--load', '-l', action='store_true')
    parser.add_argument('--load_from',
                        '-lf',
                        type=str,
                        default='save_model/ckpt.h5')
    args = parser.parse_args()
    env = AirTrafficGym(args.seed)
    agent = Agent(state_size=env.observation_space[0],
                  action_size=env.action_space.n)
    if args.load:
        agent.load(args.load_from)
    if args.train:
        train(env, agent, save_path=args.save_path)
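
Note that the boolean options use action='store_true' rather than type=bool: argparse passes the raw command-line string through bool(), so a flag declared with type=bool cannot be switched off with --train False. A quick check of that pitfall:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--train', type=bool, default=False)

# Any non-empty string is truthy, so the flag cannot be switched off this way
print(parser.parse_args(['--train', 'False']).train)  # True
print(bool('False'))                                  # True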
Example #3

# Create a Gym environment
env = CartPoleEnv()

# For CartPole - maximum episode length
env._max_episode_steps = 1000

# Get dimensionalities of actions and observations
action_space_dim = 1
observation_space_dim = 4

# Create the agent, value estimates and the policy
policy = Policy(observation_space_dim)
value_nn = Value(observation_space_dim)
agent = Agent(policy, value_nn)


def plot_heatmaps():
    xspace = np.linspace(-2.4, 2.4, 40)
    tspace = np.linspace(-0.3, 0.3, 40)

    val_estimates = np.zeros((40, 40))

    # Evaluate the value network over the (cart position, pole angle) grid,
    # keeping the velocity components of the state at zero
    for i, x in enumerate(xspace):
        for j, t in enumerate(tspace):
            state = torch.from_numpy(np.array([x, 0, t, 0])).float()
            with torch.no_grad():
                val_estimates[i, j] = agent.value.forward(state).item()
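
The snippet is cut off before anything is drawn. One way to finish the function body, assuming matplotlib.pyplot is imported as plt and that the heatmap should show cart position on the x-axis and pole angle on the y-axis (both assumptions, since the original plotting code is not shown):

    # Render the value estimates as a heatmap over the state grid
    plt.imshow(val_estimates.T, origin="lower", aspect="auto",
               extent=[xspace[0], xspace[-1], tspace[0], tspace[-1]])
    plt.xlabel("Cart position x")
    plt.ylabel("Pole angle theta")
    plt.colorbar(label="Estimated state value")
    plt.show()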
Example #4
def train(env_name, print_things=True, train_run_id=0, train_episodes=5000):
    # Create a Gym environment
    env = gym.make(env_name)

    # Get dimensionalities of actions and observations
    action_space_dim = env.action_space.shape[-1]
    observation_space_dim = env.observation_space.shape[-1]

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    agent = Agent(policy, normalize=True)

    # Arrays to keep track of rewards
    reward_history, timestep_history = [], []
    average_reward_history = []

    # Run actual training
    for episode_number in range(train_episodes):
        reward_sum, timesteps = 0, 0
        done = False
        # Reset the environment and observe the initial state
        observation = env.reset()

        # Loop until the episode is over
        while not done:
            # Get action from the agent
            action, action_probabilities, state_val = agent.get_action(
                observation, ep=episode_number)
            previous_observation = observation

            # Perform the action on the environment, get new state and reward
            observation, reward, done, info = env.step(
                action.detach().cpu().numpy())

            # Store action's outcome (so that the agent can improve its policy)
            agent.store_outcome(previous_observation, action_probabilities,
                                action, reward, state_val)

            # Store total episode reward
            reward_sum += reward
            timesteps += 1

        if print_things:
            print("Episode {} finished. Total reward: {:.3g} ({} timesteps)".
                  format(episode_number, reward_sum, timesteps))

        # Bookkeeping (mainly for generating plots)
        reward_history.append(reward_sum)
        timestep_history.append(timesteps)
        if episode_number > 100:
            avg = np.mean(reward_history[-100:])
        else:
            avg = np.mean(reward_history)
        average_reward_history.append(avg)

        # Let the agent do its magic (update the policy)
        agent.episode_finished(episode_number)

    # Training is finished - plot rewards
    if print_things:
        plt.plot(reward_history)
        plt.plot(average_reward_history)
        plt.legend(["Reward", "100-episode average"])
        plt.title("Reward history")
        plt.show()
        print("Training finished.")
    data = pd.DataFrame({
        "episode": np.arange(len(reward_history)),
        "train_run_id": [train_run_id] * len(reward_history),
        # TODO: Change algorithm name for plots, if you want
        "algorithm": ["PG"] * len(reward_history),
        "reward": reward_history
    })
    torch.save(agent.policy.state_dict(),
               "model_%s_%d.mdl" % (env_name, train_run_id))
    return data
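
The policy update itself is hidden inside Agent.episode_finished. As an illustration only, a REINFORCE-with-baseline update over the quantities stored by store_outcome (log-probabilities of the chosen actions, rewards, and state-value estimates) might look like the sketch below; the function and argument names are assumptions, not the repository's actual Agent code:

import torch
import torch.nn.functional as F

def reinforce_with_baseline_update(action_log_probs, rewards, state_values,
                                   optimizer, gamma=0.99):
    # action_log_probs: log pi(a_t | s_t) for the actions taken, shape (T,)
    # rewards: list of per-timestep rewards for one episode
    # state_values: V(s_t) predicted by the value network, shape (T,)

    # Discounted returns G_t, computed backwards through the episode
    returns = torch.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running

    # Advantage = return minus the value baseline (baseline detached so the
    # policy loss does not backpropagate into the critic)
    advantages = returns - state_values.detach()

    policy_loss = -(action_log_probs * advantages).sum()
    value_loss = F.mse_loss(state_values, returns)

    optimizer.zero_grad()
    (policy_loss + value_loss).backward()
    optimizer.step()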
Example #5
def train(env_name, print_things=True, train_run_id=0, train_episodes=5000):
    # Create a Gym environment
    env = gym.make(env_name)

    # Get dimensionalities of actions and observations
    action_space_dim = env.action_space.shape[-1]
    observation_space_dim = env.observation_space.shape[-1]

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    agent = Agent(policy)

    # Arrays to keep track of rewards
    reward_history, timestep_history = [], []
    average_reward_history = []
    # Global timestep counter: lets transitions be chained across episodes and
    # triggers a network update every 10 timesteps
    external_timestep = 0
    # Run actual training
    for episode_number in range(train_episodes):
        reward_sum, timesteps = 0, 0
        done = False
        # Reset the environment and observe the initial state
        observation = env.reset()

        # Loop until the episode is over
        while not done:

            action, action_probabilities = agent.get_action(observation)
            previous_observation = observation
            # Perform the action on the environment, get new state and reward
            observation, reward, done, info = env.step(action.detach().numpy())

            # Store action's outcome (so that the agent can improve its policy)
            agent.store_outcome(previous_observation, action_probabilities,
                                action, reward)

            # If the episode is done, the value of the next state is 0;
            # otherwise estimate it with the network
            if done:
                # Store the (zero) value of the terminal next state
                agent.store_next_values(torch.tensor([0.0]))
            else:
                x = torch.from_numpy(observation).float().to(
                    agent.train_device)
                _, v_next = agent.policy.forward(x)
                # Store the estimated value of the next state
                agent.store_next_values(v_next)

            # Store total episode reward
            reward_sum += reward
            timesteps += 1
            # TASK 4:
            # use the global timestep counter so that update batches can span
            # episode boundaries
            external_timestep += 1
            if external_timestep % 10 == 0:  # update the network every 10 timesteps
                agent.episode_finished()

        if print_things:
            print("Episode {} finished. Total reward: {:.3g} ({} timesteps)".
                  format(episode_number, reward_sum, timesteps))

        # Bookkeeping (mainly for generating plots)
        reward_history.append(reward_sum)
        timestep_history.append(timesteps)
        if episode_number > 100:
            avg = np.mean(reward_history[-100:])
        else:
            avg = np.mean(reward_history)
        average_reward_history.append(avg)

    # Training is finished - plot rewards
    if print_things:
        plt.plot(reward_history)
        plt.plot(average_reward_history)
        plt.legend(["Reward", "100-episode average"])
        plt.title("Reward history")
        plt.savefig('./train_rew.jpg')
        plt.show()
        print("Training finished.")
    data = pd.DataFrame({
        "episode": np.arange(len(reward_history)),
        "train_run_id": [train_run_id] * len(reward_history),
        # TODO: Change algorithm name for plots, if you want
        "algorithm": ["PG"] * len(reward_history),
        "reward": reward_history
    })
    torch.save(agent.policy.state_dict(),
               "model_%s_%d.mdl" % (env_name, train_run_id))
    return data
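
This variant stores the value of the next state and calls agent.episode_finished() every 10 timesteps, which points to a bootstrapped (one-step TD) actor-critic update rather than full Monte Carlo returns. A minimal sketch of such an update over a 10-step batch of stored transitions, with the tensor names and the function itself being assumptions rather than the Agent's actual internals:

import torch.nn.functional as F

def td_actor_critic_update(action_log_probs, rewards, state_values,
                           next_state_values, optimizer, gamma=0.99):
    # One-step TD targets: r_t + gamma * V(s_{t+1}).
    # Terminal next states are stored with value 0 in the snippet above,
    # so no separate done mask is needed.
    targets = rewards + gamma * next_state_values.detach()

    # TD advantage drives the actor; the critic regresses V(s_t) to the target
    advantages = targets - state_values.detach()
    actor_loss = -(action_log_probs * advantages).mean()
    critic_loss = F.mse_loss(state_values, targets)

    optimizer.zero_grad()
    (actor_loss + critic_loss).backward()
    optimizer.step()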