Пример #1
0
    def warmup(
        self,
        brain_set: BrainSet,
        n_episodes: int,
        max_t: int,
        step_agents_fn: Callable = default_step_agents_fn,
        preprocess_brain_actions_for_env_fn:
        Callable = default_preprocess_brain_actions_for_env_fn,
        end_episode_criteria=np.all,
    ) -> None:
        """
        Act randomly in the environment, storing experience tuples in trajectory/memory buffers.
        Used to initialize memory objects such as prioritized experience replay
        :param brain_set: The agent brains to undergo training
        :param n_episodes: The number of episodes to train over
        :param step_agents_fn: Function used to update the agents with a new experience sampled from the environment
        :param preprocess_brain_actions_for_env_fn: Function used to preprocess actions from the agents before
         passing to the environment
        :param max_t: The maximum number of time steps allowed in each episode
        :param end_episode_criteria: Function acting on a list of booleans
            (identifying whether that agent's episode has terminated) to determine whether the episode is finished
        :return: None
        """
        print("Performing warmup with {} episodes and max_t={}".format(
            n_episodes, max_t))
        for brain in brain_set.brains():
            for agent in brain.agents:
                agent.set_mode('train')
                agent.set_warmup(True)

        t1 = time.time()
        for i_episode in range(1, n_episodes + 1):
            self.reset_env(train_mode=True)
            brain_states = self.get_next_states(brain_set)
            for t in range(max_t):
                next_brain_environment = self.step(
                    brain_set=brain_set,
                    brain_states=brain_states,
                    random_actions=True,
                    preprocess_brain_actions_for_env_fn=
                    preprocess_brain_actions_for_env_fn)
                step_agents_fn(brain_set, next_brain_environment, t)
                brain_states = {
                    brain_name:
                    next_brain_environment[brain_name]['next_states']
                    for brain_name in brain_states
                }

                all_dones = []
                for brain_name in brain_set.names():
                    all_dones.extend(
                        next_brain_environment[brain_name]['dones'])

                if end_episode_criteria(all_dones):
                    break

                print('\rEpisode {}\tTimestep: {:.2f}'.format(i_episode, t),
                      end="")
        print("Finished warmup in {}s".format(round(time.time() - t1)))
Пример #2
0
    def evaluate(
            self,
            brain_set: BrainSet,
            n_episodes: int = 5,
            max_t: int = 1000,
            brain_reward_accumulation_fn: Callable = lambda rewards: np.array(
                rewards),
            episode_reward_accumulation_fn: Callable = lambda
        brain_episode_scores: float(
            np.mean([
                np.mean(brain_episode_scores[brain_name])
                for brain_name in brain_episode_scores
            ])),
            end_of_episode_score_display_fn: Callable = lambda i_episode,
        episode_aggregated_score, training_scores:
        '\rEpisode {}\tScore: {:.2f}\tAverage Score: {:.2f}'.format(
            i_episode, episode_aggregated_score,
            training_scores.get_mean_sliding_scores()),
            sliding_window_size: int = 100,
            end_episode_criteria: Callable = np.all) -> Tuple[BrainSet, float]:
        """
        Evaluate the agent in the environment
        :param brain_set: The agent brains to undergo training
        :param n_episodes: The number of episodes to train over
        :param max_t: The maximum number of time steps allowed in each episode
        :param brain_reward_accumulation_fn:Function used to accumulate rewards for each brain
        :param episode_reward_accumulation_fn: Function used to aggregate rewards across brains
        :param end_of_episode_score_display_fn: Function used to print out end-of-episode scalar score
        :param sliding_window_size: Size of the sliding window to average episode scores over
        :param end_episode_criteria: Function acting on a list of booleans
            (identifying whether that agent's episode has terminated) to determine whether the episode is finished
        :return: Tuple of  (brain_set, average_score)
        """
        for brain in brain_set.brains():
            for agent in brain.agents:
                agent.set_mode('eval')
                agent.set_warmup(False)

        self.evaluation_scores = Scores(window_size=sliding_window_size)

        for i_episode in range(1, n_episodes + 1):
            self.reset_env(train_mode=False)
            brain_states = self.get_next_states(brain_set)

            brain_episode_scores = {
                brain_name: None
                for brain_name, brain in brain_set
            }

            for t in range(max_t):
                next_brain_environment = self.step(brain_set=brain_set,
                                                   brain_states=brain_states)

                brain_states = {
                    brain_name:
                    next_brain_environment[brain_name]['next_states']
                    for brain_name in brain_states
                }
                for brain_name in brain_episode_scores:
                    scores = brain_reward_accumulation_fn(
                        next_brain_environment[brain_name]['rewards'])
                    if brain_episode_scores[brain_name] is None:
                        brain_episode_scores[brain_name] = scores
                    else:
                        brain_episode_scores[brain_name] += scores

                all_dones = []
                for brain_name in brain_set.names():
                    all_dones.extend(
                        next_brain_environment[brain_name]['dones'])

                if end_episode_criteria(all_dones):
                    break

            episode_aggregated_score = episode_reward_accumulation_fn(
                brain_episode_scores)
            self.evaluation_scores.add(episode_aggregated_score)
            print(end_of_episode_score_display_fn(i_episode,
                                                  episode_aggregated_score,
                                                  self.evaluation_scores),
                  end='\n')
        average_score = self.evaluation_scores.get_mean_sliding_scores()
        return brain_set, average_score
Пример #3
0
    def train(
        self,
        brain_set: BrainSet,
        solved_score: Optional[float] = None,
        n_episodes=2000,
        max_t=1000,
        sliding_window_size: int = 100,
        step_agents_fn: Callable = default_step_agents_fn,
        step_episode_agents_fn: Callable = default_step_episode_agents_fn,
        brain_reward_accumulation_fn: Callable = lambda rewards: np.array(
            rewards),
        episode_reward_accumulation_fn: Callable = lambda brain_episode_scores:
        float(
            np.mean([
                np.mean(brain_episode_scores[brain_name])
                for brain_name in brain_episode_scores
            ])),
        preprocess_brain_actions_for_env_fn:
        Callable = default_preprocess_brain_actions_for_env_fn,
        end_episode_criteria: Callable = np.all,
        end_of_episode_score_display_fn: Callable = lambda i_episode,
        episode_aggregated_score, training_scores:
        '\rEpisode {}\tScore: {:.2f}\tAverage Score: {:.2f}'.format(
            i_episode, episode_aggregated_score,
            training_scores.get_mean_sliding_scores()),
        aggregate_end_of_episode_score_fn: Callable = lambda training_scores:
        training_scores.get_mean_sliding_scores()
    ) -> Tuple[BrainSet, Scores, int, float]:
        """
        Train a set of agents (brain-set) in an environment
        :param brain_set: The agent brains to undergo training
        :param solved_score: The score (averaged over sliding_window_size episodes) required to consider the task solved
        :param n_episodes: The number of episodes to train over
        :param max_t: The maximum number of time steps allowed in each episode
        :param sliding_window_size: Size of the sliding window to average episode scores over
        :param step_agents_fn: Function used to update the agents with a new experience sampled from the environment
        :param step_episode_agents_fn: Function used to step the agents at the end of each episode
        :param preprocess_brain_actions_for_env_fn: Function used to preprocess actions from the agents before
         passing to the environment
        :param brain_reward_accumulation_fn:Function used to accumulate rewards for each brain
        :param episode_reward_accumulation_fn: Function used to aggregate rewards across brains
        :param end_of_episode_score_display_fn: Function used to print out end-of-episode scalar score
        :param end_episode_criteria: Function acting on a list of booleans
            (identifying whether that agent's episode has terminated) to determine whether the episode is finished
        :param aggregate_end_of_episode_score_fn: Function used to aggregate the end-of-episode score function.
            Defaults to averaging over the past sliding_window_size episode scores
        :return: Tuple of  (brain_set, Scores, i_episode, average_score)
            brain_set (BrainSet): The trained BrainSet
            Scores (Scores): Scores object containing all historic and sliding-window scores
            i_episode (int): The number of episodes required to solve the task
            average_score (float): The final averaged score
        """

        for brain in brain_set.brains():
            for agent in brain.agents:
                agent.set_mode('train')
                agent.set_warmup(False)

        self.training_scores = Scores(window_size=sliding_window_size)

        t_start = time.time()
        for i_episode in range(1, n_episodes + 1):
            self.reset_env(train_mode=True)
            brain_states = self.get_next_states(brain_set)

            brain_episode_scores = OrderedDict([
                (brain_name, None) for brain_name, brain in brain_set
            ])

            for t in range(max_t):
                next_brain_environment = self.step(
                    brain_set=brain_set,
                    brain_states=brain_states,
                    preprocess_brain_actions_for_env_fn=
                    preprocess_brain_actions_for_env_fn)
                step_agents_fn(brain_set, next_brain_environment, t)

                brain_states = {
                    brain_name:
                    next_brain_environment[brain_name]['next_states']
                    for brain_name in brain_states
                }

                for brain_name in brain_episode_scores:
                    # Brain rewards are a scalar for each agent,
                    # of form next_brain_environment[brain_name]['rewards']=[0.0, 0.0]
                    brain_rewards = brain_reward_accumulation_fn(
                        next_brain_environment[brain_name]['rewards'])
                    if brain_episode_scores[brain_name] is None:
                        brain_episode_scores[brain_name] = brain_rewards
                    else:
                        brain_episode_scores[brain_name] += brain_rewards

                all_dones = []
                for brain_name in brain_set.names():
                    all_dones.extend(
                        next_brain_environment[brain_name]['dones'])

                if end_episode_criteria(all_dones):
                    break

            # Step episode for agents
            step_episode_agents_fn(brain_set, i_episode)

            # Brain episode scores are of form: {'<brain_name>', <output_of_brain_reward_accumulation_fn>]}
            episode_aggregated_score = episode_reward_accumulation_fn(
                brain_episode_scores)
            self.training_scores.add(episode_aggregated_score)

            if i_episode % 100 == 0:
                end = '\n'
            else:
                end = ""

            print(end_of_episode_score_display_fn(i_episode,
                                                  episode_aggregated_score,
                                                  self.training_scores),
                  end=end)
            if solved_score and aggregate_end_of_episode_score_fn(
                    self.training_scores) >= solved_score:
                print("\nTotal Training time = {:.1f} min".format(
                    (time.time() - t_start) / 60))
                print(
                    '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                    .format(i_episode,
                            self.training_scores.get_mean_sliding_scores()))
                break
        training_time = round(time.time() - t_start)

        return brain_set, self.training_scores, i_episode, training_time