Example #1
def ppo(env, brain_name, policy, config, train):
    if train:
        optimizer = optim.Adam(
            policy.parameters(),
            config['hyperparameters']['adam_learning_rate'],
            eps=config['hyperparameters']['adam_epsilon'])
        agent = PPOAgent(env, brain_name, policy, optimizer, config)
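        # per-episode scores, their running 100-episode average, and the best
        # average seen so far (starting from the 30.0 target)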
        all_scores = []
        averages = []
        last_max = 30.0

        for i in tqdm.tqdm(range(config['hyperparameters']['episode_count'])):
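            # advance the agent by one training step, then evaluate the current policy with play_round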
            agent.step()
            last_mean_reward = play_round(env, brain_name, policy, config)
            if i == 0:
                last_average = last_mean_reward
            else:
                # average of the most recent (up to) 100 episode scores
                last_average = np.mean(np.array(all_scores[-100:]))

            all_scores.append(last_mean_reward)
            averages.append(last_average)
            if last_average > last_max:
                torch.save(
                    policy.state_dict(),
                    f"reacher-ppo/models/ppo-max-hiddensize-{config['hyperparameters']['hidden_size']}.pth"
                )
                last_max = last_average
            clear_output(True)
            print(
                'Episode: {} Total score this episode: {} Last {} average: {}'.
                format(i + 1, last_mean_reward, min(i + 1, 100), last_average))
        return all_scores, averages
    else:
        all_scores = []
        for i in range(20):
            score = play_round(env, brain_name, policy, config, train)
            all_scores.append(score)
        return all_scores, [np.mean(all_scores)]
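For orientation, here is a minimal sketch of how this ppo() helper might be driven. The unityagents import, the PolicyNetwork name, and the concrete hyperparameter values are illustrative assumptions; only the config keys that ppo() actually reads are taken from the code above.

# Hypothetical driver for ppo() above; PolicyNetwork is a placeholder for the
# repository's actual actor-critic module, and all values are illustrative.
from unityagents import UnityEnvironment

env = UnityEnvironment(file_name='Reacher.x86_64')
brain_name = env.brain_names[0]

config = {
    'hyperparameters': {
        'adam_learning_rate': 3e-4,
        'adam_epsilon': 1e-5,
        'episode_count': 500,
        'hidden_size': 512,
    }
}

# the Udacity Reacher environment exposes 33-dimensional observations
# and 4 continuous actions per agent
policy = PolicyNetwork(state_size=33, action_size=4,
                       hidden_size=config['hyperparameters']['hidden_size'])
all_scores, averages = ppo(env, brain_name, policy, config, train=True)
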
Example #2
                     n_hidden=args.n_hidden,
                     n_outs=n_outs,
                     td_n=args.td_n,
                     ppo_epochs=args.ppo_epochs,
                     mini_batch_size=args.mini_batch_size)
    if args.load_best_pretrained_model:
        agent.load_model('../models/ppo/model.pt')
        print('Loaded pretrained model')

    if args.test_env:
        state = env.reset()
        done = False
        score = 0
        while not done:
            env.render()
            dist, value = agent.step(state)

            action = dist.sample()
            state, reward, done, _ = env.step(action.cpu().numpy())
            score += reward
        print(score)
    else:
        scores = []
        state = envs.reset()
        next_state = None
        early_stop = False

        best_avg_score = args.best_avg_reward

        idx = 0
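        # interact with the vectorized environments until max_frames is reached or early stopping triggers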
        while idx < args.max_frames and not early_stop:
Example #3
def experiment(hidden_size=64,
               lr=3e-4,
               num_steps=2048,
               mini_batch_size=32,
               ppo_epochs=10,
               threshold_reward=10,
               max_episodes=15,
               nrmlz_adv=True,
               gamma=0.99,
               tau=0.95,
               clip_gradients=True):
    '''
    Train a PPO agent on the Unity Reacher environment.

    :param hidden_size: number of neurons in the hidden layers of the model
    :param lr: learning rate
    :param num_steps: maximum number of environment steps per rollout
    :param mini_batch_size: mini batch size for PPO updates
    :param ppo_epochs: number of optimization epochs per PPO update
    :param threshold_reward: average score at which the environment counts as solved
    :param max_episodes: maximum number of training episodes
    :param nrmlz_adv: True if advantages should be normalized before PPO
    :param gamma: discount factor
    :param tau: GAE parameter
    :param clip_gradients: True if gradients should be clipped after PPO
    :return: scores_window, test_rewards, moving_averages
    '''

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    scores_window = deque(maxlen=100)
    test_rewards = []
    moving_averages = []

    env = UnityEnvironment(file_name='reacher20/reacher', base_port=64739)
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    action_size = brain.vector_action_space_size
    num_agents = len(env_info.agents)
    states = env_info.vector_observations
    state_size = states.shape[1]

    agent = PPOAgent(learning_rate=lr,
                     state_size=state_size,
                     action_size=action_size,
                     hidden_size=hidden_size,
                     num_agents=num_agents,
                     random_seed=0,
                     ppo_epochs=ppo_epochs,
                     mini_batch_size=mini_batch_size,
                     normalize_advantages=nrmlz_adv,
                     clip_gradients=clip_gradients,
                     gamma=gamma,
                     tau=tau,
                     device=device)

    #    while episode < max_episodes and not early_stop:
    for episode in tqdm(range(max_episodes)):
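        # per-episode rollout buffers consumed by agent.step() below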
        log_probs = []
        values = []
        states_list = []
        actions_list = []
        rewards = []
        masks = []
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations
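        # collect up to num_steps transitions from all parallel agents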
        for duration in range(num_steps):

            state = torch.FloatTensor(state).to(device)
            action, value, log_prob = agent.act(state)
            env_info = env.step(action.cpu().data.numpy())[
                brain_name]  # send all actions to the environment

            next_state = env_info.vector_observations  # get next state (for each agent)
            reward = env_info.rewards  # get reward (for each agent)
            dones = np.array(env_info.local_done)  # see if episode finished

            log_probs.append(log_prob)
            values.append(value)
            reward_t = torch.FloatTensor(reward).unsqueeze(1).to(device)
            masks_t = torch.FloatTensor(1 - dones)
            rewards.append(reward_t)
            masks.append(masks_t)
            states_list.append(state)
            actions_list.append(action)

            state = next_state

            if np.any(dones):
                break

        # bootstrap the value of the final state, then hand the rollout to the agent for the PPO update
        next_state = torch.FloatTensor(state).to(device)
        _, next_value, _ = agent.act(next_state)
        agent.step(states=states_list,
                   actions=actions_list,
                   values=values,
                   log_probs=log_probs,
                   rewards=rewards,
                   masks=masks,
                   next_value=next_value)

        test_mean_reward = test_agent(env, brain_name, agent, device)
        test_rewards.append(test_mean_reward)
        scores_window.append(test_mean_reward)
        moving_averages.append(np.mean(scores_window))
        print('Episode {}, Total score this episode: {}, Last {} average: {}'.
              format(episode + 1, test_mean_reward, min(episode + 1, 100),
                     np.mean(scores_window)))
        if np.mean(scores_window) > threshold_reward:
            agent.save_model(
                f"ppo_checkpoint_{test_mean_reward}_e{episode}_hs{hidden_size}_lr{lr}_st{num_steps}_b{mini_batch_size}_ppo{ppo_epochs}_r{threshold_reward}_e{episode}_adv{nrmlz_adv}_{test_mean_reward}.pth"
            )
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(episode + 1, np.mean(scores_window)))
            break

    env.close()
    return scores_window, test_rewards, moving_averages
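A short usage sketch for experiment(): the call simply spells out the defaults documented in the docstring, and the matplotlib plot of the returned curves is an illustrative addition, not part of the original code.

import matplotlib.pyplot as plt

# Hypothetical invocation with the documented defaults made explicit.
scores_window, test_rewards, moving_averages = experiment(
    hidden_size=64, lr=3e-4, num_steps=2048, mini_batch_size=32,
    ppo_epochs=10, threshold_reward=10, max_episodes=15,
    nrmlz_adv=True, gamma=0.99, tau=0.95, clip_gradients=True)

plt.plot(test_rewards, label='test reward per episode')
plt.plot(moving_averages, label='100-episode moving average')
plt.xlabel('episode')
plt.ylabel('score')
plt.legend()
plt.show()
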
Example #4
def ppo():
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    env = UnityEnvironment(file_name="../Reacher_Linux/Reacher.x86_64",
                           no_graphics=True)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents in the environment
    print('Number of agents:', len(env_info.agents))

    # number of actions
    action_size = brain.vector_action_space_size
    print('Number of actions:', action_size)

    # examine the state space
    state = env_info.vector_observations[0]
    print('States look like:', state)
    state_size = len(state)
    print('States have length:', state_size)

    config = Config()
    config.env = env

    config.actor_critic_fn = lambda: ActorCritic(
        actor=Actor(state_size, action_size), critic=Critic(state_size))

    config.discount = 0.99
    config.use_gae = True
    config.gae_tau = 0.95
    config.gradient_clip = 5
    config.rollout_length = 2048
    config.optimization_epochs = 5
    config.num_mini_batches = 512
    config.ppo_ratio_clip = 0.2
    config.log_interval = 10 * 2048
    config.max_steps = 2e7
    config.eval_episodes = 10
    # config.logger = get_logger()

    print("GPU available: {}".format(torch.cuda.is_available()))
    print("GPU tensor test: {}".format(torch.rand(3, 3).cuda()))

    agent = PPOAgent(config)

    random_seed()
    config = agent.config
    t0 = time.time()
    scores = []
    scores_window = deque(maxlen=100)  # last 100 scores

    # training loop: step the agent until the 100-episode average clears the threshold
    while True:
        if config.log_interval and not agent.total_steps % config.log_interval and len(
                agent.episode_rewards):
            rewards = agent.episode_rewards
            for reward in rewards:
                scores.append(reward)
                scores_window.append(reward)
            agent.episode_rewards = []

            print('\r===> {:d} episodes, Average Score: {:.2f}'.format(
                len(scores), np.mean(scores_window)))
            if np.mean(scores_window) >= 1.0:
                print(
                    '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                    .format(len(scores), np.mean(scores_window)))
                torch.save(agent.actor_critic.state_dict(),
                           '../checkpoints/ppo_checkpoint.pth')
                break

            print(
                'Total steps %d, returns %d/%.2f/%.2f/%.2f/%.2f (count/mean/median/min/max), %.2f steps/s'
                % (agent.total_steps, len(rewards), np.mean(rewards),
                   np.median(rewards), np.min(rewards), np.max(rewards),
                   config.log_interval / (time.time() - t0)))

            t0 = time.time()

        agent.step()

    return scores
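Example #4 depends on a Config class and a PPOAgent that are not shown; the field names suggest they follow the conventions of Shangtong Zhang's DeepRL library. A minimal stand-in for Config, assuming it is nothing more than an attribute container with the fields this snippet sets, might look like the sketch below.

# Assumed minimal Config container; the real class is not included in the snippet.
class Config:
    def __init__(self):
        # environment and network factory, filled in by ppo() above
        self.env = None
        self.actor_critic_fn = None
        # PPO hyperparameters, defaulting to the values used in Example #4
        self.discount = 0.99
        self.use_gae = True
        self.gae_tau = 0.95
        self.gradient_clip = 5
        self.rollout_length = 2048
        self.optimization_epochs = 5
        self.num_mini_batches = 512
        self.ppo_ratio_clip = 0.2
        self.log_interval = 10 * 2048
        self.max_steps = 2e7
        self.eval_episodes = 10
        self.logger = None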