Example #1
 def advance(self, env: EnvManager) -> int:
     with hierarchical_timer("env_step"):
         new_step_infos = env.step()
     for step_info in new_step_infos:
         for brain_name, trainer in self.trainers.items():
             if step_info.has_actions_for_brain(brain_name):
                 _processor = self.managers[brain_name].processor
                 _processor.add_experiences(
                     step_info.previous_all_brain_info[brain_name],
                     step_info.current_all_brain_info[brain_name],
                     step_info.brain_name_to_action_info[brain_name].outputs,
                 )
     for brain_name, trainer in self.trainers.items():
         if self.train_model and trainer.get_step <= trainer.get_max_steps:
             trainer.increment_step(len(new_step_infos))
             if trainer.is_ready_update():
                 # Perform gradient descent with experience buffer
                 with hierarchical_timer("update_policy"):
                     trainer.update_policy()
                 env.set_policy(brain_name, trainer.policy)
         else:
             # Avoid memory leak during inference
             # Eventually this whole block will take place in advance()
             # But currently this only calls clear_update_buffer() in RLTrainer
             # and nothing in the base class
             trainer.advance()
     return len(new_step_infos)
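
The advance() implementations in these examples wrap environment stepping and policy updates in hierarchical_timer("...") blocks so that the time spent in each named section can be aggregated and written out later (see the _write_timing_tree() call in Example #19). The snippet below is a minimal, self-contained re-implementation of such a timing context manager, for illustration only; it is not the ML-Agents timers API, and the _timings dictionary is a made-up stand-in for the library's timer tree.

import time
from contextlib import contextmanager

_timings = {}  # name -> accumulated seconds; illustrative stand-in for a timer tree


@contextmanager
def hierarchical_timer(name):
    start = time.perf_counter()
    try:
        yield
    finally:
        _timings[name] = _timings.get(name, 0.0) + (time.perf_counter() - start)


with hierarchical_timer("env_step"):
    time.sleep(0.01)  # stands in for env.step()

print(_timings)  # e.g. {'env_step': 0.010...}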
Example #2
    def _create_trainer_and_manager(self, env_manager: EnvManager,
                                    name_behavior_id: str) -> None:
        brain_name = BehaviorIdentifiers.from_name_behavior_id(
            name_behavior_id).brain_name
        try:
            trainer = self.trainers[brain_name]
        except KeyError:
            trainer = self.trainer_factory.generate(brain_name)
            self.trainers[brain_name] = trainer
            self.logger.info(trainer)
            if self.train_model:
                trainer.write_tensorboard_text("Hyperparameters",
                                               trainer.parameters)

        policy = trainer.create_policy(
            env_manager.external_brains[name_behavior_id])
        trainer.add_policy(name_behavior_id, policy)
        agent_manager = AgentManager(
            policy,
            name_behavior_id,
            trainer.stats_reporter,
            trainer.parameters.get("time_horizon", sys.maxsize),
        )

        env_manager.set_agent_manager(name_behavior_id, agent_manager)
        env_manager.set_policy(name_behavior_id, policy)
        self.brain_name_to_identifier[brain_name].add(name_behavior_id)

        trainer.publish_policy_queue(agent_manager.policy_queue)
        trainer.subscribe_trajectory_queue(agent_manager.trajectory_queue)
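
The _create_trainer_and_manager variants above and below key trainers by brain name while keeping policies, queues, and managers keyed by the full behavior id. The sketch below shows the "?"-based split that Example #19 performs inline with name_behavior_id.split("?"); the SimpleBehaviorIdentifiers class and the "3DBall?team=0" id are illustrative stand-ins, not the real BehaviorIdentifiers implementation or a guaranteed id format.

from dataclasses import dataclass


@dataclass
class SimpleBehaviorIdentifiers:
    # Simplified stand-in: a fully qualified behavior id such as
    # "3DBall?team=0" is reduced to the brain name "3DBall".
    name_behavior_id: str
    brain_name: str

    @staticmethod
    def from_name_behavior_id(name_behavior_id):
        return SimpleBehaviorIdentifiers(
            name_behavior_id, name_behavior_id.split("?")[0]
        )


parsed = SimpleBehaviorIdentifiers.from_name_behavior_id("3DBall?team=0")
print(parsed.brain_name)  # 3DBall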
Example #3
 def start_trainer(self, trainer: Trainer, env_manager: EnvManager) -> None:
     self.trainers[trainer.brain_name] = trainer
     self.logger.info(trainer)
     if self.train_model:
         trainer.write_tensorboard_text("Hyperparameters",
                                        trainer.parameters)
     env_manager.set_policy(trainer.brain_name, trainer.policy)
Example #4
    def _create_trainer_and_manager(self, env_manager: EnvManager,
                                    name_behavior_id: str) -> None:

        parsed_behavior_id = BehaviorIdentifiers.from_name_behavior_id(
            name_behavior_id)
        brain_name = parsed_behavior_id.brain_name
        try:
            trainer = self.trainers[brain_name]
        except KeyError:
            trainer = self.trainer_factory.generate(brain_name)
            self.trainers[brain_name] = trainer

        policy = trainer.create_policy(
            parsed_behavior_id, env_manager.external_brains[name_behavior_id])
        trainer.add_policy(parsed_behavior_id, policy)

        agent_manager = AgentManager(
            policy,
            name_behavior_id,
            trainer.stats_reporter,
            trainer.parameters.get("time_horizon", sys.maxsize),
        )
        env_manager.set_agent_manager(name_behavior_id, agent_manager)
        env_manager.set_policy(name_behavior_id, policy)
        self.brain_name_to_identifier[brain_name].add(name_behavior_id)

        trainer.publish_policy_queue(agent_manager.policy_queue)
        trainer.subscribe_trajectory_queue(agent_manager.trajectory_queue)
        if trainer.threaded:
            # Start trainer thread
            trainerthread = threading.Thread(target=self.trainer_update_func,
                                             args=(trainer, ),
                                             daemon=True)
            trainerthread.start()
            self.trainer_threads.append(trainerthread)
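
Example #4 starts a daemon thread per newly created trainer, with self.trainer_update_func as the thread target. The snippet below is a minimal, self-contained sketch of that pattern; DummyTrainer, the stop event, and the body of trainer_update_func are assumptions made for illustration and do not reproduce the controller's actual update function.

import threading
import time


class DummyTrainer:
    # Placeholder trainer: a real trainer's advance() would consume queued
    # trajectories and, when ready, update its policy.
    def __init__(self):
        self.advance_calls = 0

    def advance(self):
        self.advance_calls += 1


def trainer_update_func(trainer, stop_event):
    # Keep advancing the trainer until the main thread signals shutdown.
    while not stop_event.is_set():
        trainer.advance()
        time.sleep(0.001)


trainer = DummyTrainer()
stop_event = threading.Event()
trainer_thread = threading.Thread(
    target=trainer_update_func, args=(trainer, stop_event), daemon=True
)
trainer_thread.start()
time.sleep(0.05)  # the main thread would be stepping the environment here
stop_event.set()
trainer_thread.join()
print(trainer.advance_calls)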
Example #5
 def _get_and_process_experiences(self, env: EnvManager) -> int:
     with hierarchical_timer("env_step"):
         # Get new policies if found
         for brain_name in self.trainers.keys():
             for name_behavior_id in self.brain_name_to_identifier[
                     brain_name]:
                 try:
                     _policy = self.managers[
                         name_behavior_id].policy_queue.get_nowait()
                     env.set_policy(name_behavior_id, _policy)
                 except AgentManagerQueue.Empty:
                     pass
         # Step the environment
         new_step_infos = env.step()
     # Add to AgentProcessor
     for step_info in new_step_infos:
         for name_behavior_id in step_info.name_behavior_ids:
             if name_behavior_id not in self.managers:
                 self.logger.warning(
                     "Agent manager was not created for behavior id {}.".
                     format(name_behavior_id))
                 continue
             self.managers[name_behavior_id].add_experiences(
                 step_info.previous_all_brain_info[name_behavior_id],
                 step_info.current_all_brain_info[name_behavior_id],
                 step_info.brain_name_to_action_info[name_behavior_id].outputs,
             )
     return len(new_step_infos)
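
The policy-queue check in Example #5 is non-blocking: get_nowait() either returns a freshly published policy or raises Empty, in which case the environment keeps the policy it already has. The snippet below reproduces that pattern with the standard library's queue.Queue standing in for AgentManagerQueue; the behavior ids and the "policy" strings are placeholders.

import queue

policy_queues = {"BrainA?team=0": queue.Queue(), "BrainB?team=0": queue.Queue()}
current_policies = {"BrainA?team=0": "policy-v1", "BrainB?team=0": "policy-v1"}

# A trainer would normally publish an updated policy here after update_policy().
policy_queues["BrainA?team=0"].put("policy-v2")

for name_behavior_id, policy_queue in policy_queues.items():
    try:
        # Swap in the newest policy if one is waiting; otherwise keep the old one.
        current_policies[name_behavior_id] = policy_queue.get_nowait()
    except queue.Empty:
        pass

print(current_policies)  # BrainA?team=0 -> policy-v2, BrainB?team=0 -> policy-v1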
Example #6
    def _create_trainer_and_manager(
        self, env_manager: EnvManager, name_behavior_id: str
    ) -> None:

        parsed_behavior_id = BehaviorIdentifiers.from_name_behavior_id(name_behavior_id)
        brain_name = parsed_behavior_id.brain_name
        try:
            trainer = self.trainers[brain_name]
        except KeyError:
            trainer = self.trainer_factory.generate(brain_name)
            self.trainers[brain_name] = trainer

        policy = trainer.create_policy(
            parsed_behavior_id, env_manager.external_brains[name_behavior_id]
        )
        trainer.add_policy(parsed_behavior_id, policy)

        agent_manager = AgentManager(
            policy,
            name_behavior_id,
            trainer.stats_reporter,
            trainer.parameters.get("time_horizon", sys.maxsize),
        )
        env_manager.set_agent_manager(name_behavior_id, agent_manager)
        env_manager.set_policy(name_behavior_id, policy)
        self.brain_name_to_identifier[brain_name].add(name_behavior_id)

        trainer.publish_policy_queue(agent_manager.policy_queue)
        trainer.subscribe_trajectory_queue(agent_manager.trajectory_queue)
Example #7
 def reset_env_if_ready(self, env: EnvManager) -> None:
     # Get the sizes of the reward buffers.
     reward_buff = {
         k: list(t.reward_buffer)
         for (k, t) in self.trainers.items()
     }
     curr_step = {k: int(t.step) for (k, t) in self.trainers.items()}
     max_step = {
         k: int(t.get_max_steps)
         for (k, t) in self.trainers.items()
     }
     # Attempt to increment the lessons of the brains who
     # were ready.
     updated, param_must_reset = self.param_manager.update_lessons(
         curr_step, max_step, reward_buff)
     if updated:
         for trainer in self.trainers.values():
             trainer.reward_buffer.clear()
     # If ghost trainer swapped teams
     ghost_controller_reset = self.ghost_controller.should_reset()
     if param_must_reset or ghost_controller_reset:
         self._reset_env(env)  # This reset also sends the new config to env
         self.end_trainer_episodes()
     elif updated:
         env.set_env_parameters(self.param_manager.get_current_samplers())
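
In Example #7 a required reset (because a lesson change demands one, or because the ghost trainer swapped teams) takes priority over merely pushing updated parameters into the running environment. The helper below restates that branch order; the function name and its return values are illustrative only.

def decide_env_action(updated, param_must_reset, ghost_controller_reset):
    # Mirrors the branch order above: a full reset, which also re-sends the
    # new config to the environment, wins over an in-place parameter update.
    if param_must_reset or ghost_controller_reset:
        return "reset_env_and_end_episodes"
    if updated:
        return "set_env_parameters"
    return "no_change"


print(decide_env_action(updated=True, param_must_reset=False, ghost_controller_reset=False))
# -> set_env_parameters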
Example #8
    def _reset_env(self, env: EnvManager) -> None:
        """Resets the environment.

        Returns:
            A Data structure corresponding to the initial reset state of the
            environment.
        """
        new_config = self.param_manager.get_current_samplers()
        env.reset(config=new_config)
Example #9
    def _reset_env(self, env: EnvManager) -> None:
        """Resets the environment.

        Returns:
            A Data structure corresponding to the initial reset state of the
            environment.
        """
        new_meta_curriculum_config = (self.meta_curriculum.get_config()
                                      if self.meta_curriculum else None)
        env.reset(config=new_meta_curriculum_config)
Example #10
    def _reset_env(self, env_manager: EnvManager) -> None:
        """Resets the environment.

        Returns:
            A Data structure corresponding to the initial reset state of the
            environment.
        """
        new_config = self.param_manager.get_current_samplers()
        env_manager.reset(config=new_config)
        # Register any new behavior ids that were generated on the reset.
        self._register_new_behaviors(env_manager, env_manager.first_step_infos)
Example #11
    def _reset_env(self, env_manager: EnvManager) -> None:
        """Resets the environment.

        Returns:
            A Data structure corresponding to the initial reset state of the
            environment.
        """
        sampled_reset_param = self.sampler_manager.sample_all()
        new_meta_curriculum_config = (self.meta_curriculum.get_config()
                                      if self.meta_curriculum else {})
        sampled_reset_param.update(new_meta_curriculum_config)
        env_manager.reset(config=sampled_reset_param)
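
The final _reset_env variant above builds the reset config by sampling reset parameters first and then overlaying the meta-curriculum config, so curriculum values win whenever both define the same key. A tiny sketch of that merge order, using made-up parameter names:

sampled_reset_param = {"gravity": 9.8, "scale": 1.2}   # made-up sampler output
new_meta_curriculum_config = {"scale": 2.0}            # made-up curriculum output

sampled_reset_param.update(new_meta_curriculum_config)
print(sampled_reset_param)  # {'gravity': 9.8, 'scale': 2.0}; curriculum overrides the sampler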
Example #12
 def _get_and_process_experiences(self, env: EnvManager) -> int:
     with hierarchical_timer("env_step"):
         # Get new policies if found
         for brain_name in self.trainers.keys():
             for name_behavior_id in self.brain_name_to_identifier[
                     brain_name]:
                 try:
                     _policy = self.managers[
                         name_behavior_id].policy_queue.get_nowait()
                     env.set_policy(name_behavior_id, _policy)
                 except AgentManagerQueue.Empty:
                     pass
         # Step the environment
         new_step_infos = env.step()
     # Add to AgentProcessor
     num_step_infos = self._process_step_infos(new_step_infos)
     return num_step_infos
Example #13
    def _create_trainer_and_manager(self, env_manager: EnvManager,
                                    name_behavior_id: str) -> None:

        parsed_behavior_id = BehaviorIdentifiers.from_name_behavior_id(
            name_behavior_id)
        brain_name = parsed_behavior_id.brain_name
        trainerthread = None
        if brain_name in self.trainers:
            trainer = self.trainers[brain_name]
        else:
            trainer = self.trainer_factory.generate(brain_name)
            self.trainers[brain_name] = trainer
            if trainer.threaded:
                # Only create trainer thread for new trainers
                trainerthread = threading.Thread(
                    target=self.trainer_update_func,
                    args=(trainer, ),
                    daemon=True)
                self.trainer_threads.append(trainerthread)

        policy = trainer.create_policy(
            parsed_behavior_id,
            env_manager.training_behaviors[name_behavior_id],
            create_graph=True,
        )
        trainer.add_policy(parsed_behavior_id, policy)

        agent_manager = AgentManager(
            policy,
            name_behavior_id,
            trainer.stats_reporter,
            trainer.parameters.time_horizon,
            threaded=trainer.threaded,
        )
        env_manager.set_agent_manager(name_behavior_id, agent_manager)
        env_manager.set_policy(name_behavior_id, policy)
        self.brain_name_to_identifier[brain_name].add(name_behavior_id)

        trainer.publish_policy_queue(agent_manager.policy_queue)
        trainer.subscribe_trajectory_queue(agent_manager.trajectory_queue)

        # Only start new trainers
        if trainerthread is not None:
            trainerthread.start()
Example #14
    def advance(self, env: EnvManager) -> int:
        with hierarchical_timer("env_step"):
            new_step_infos = env.step()
        for step_info in new_step_infos:
            for name_behavior_id in step_info.name_behavior_ids:
                if name_behavior_id not in self.managers:
                    self.logger.warning(
                        "Agent manager was not created for behavior id {}.".
                        format(name_behavior_id))
                    continue
                _processor = self.managers[name_behavior_id].processor
                _processor.add_experiences(
                    step_info.previous_all_brain_info[name_behavior_id],
                    step_info.current_all_brain_info[name_behavior_id],
                    step_info.brain_name_to_action_info[name_behavior_id].outputs,
                )

        for brain_name, trainer in self.trainers.items():
            if self.train_model and trainer.get_step <= trainer.get_max_steps:
                n_steps = len(new_step_infos)
                trainer.increment_step(n_steps)
                for name_behavior_id in self.brain_name_to_identifier[
                        brain_name]:
                    trainer.get_policy(name_behavior_id).increment_step(
                        n_steps)
                if trainer.is_ready_update():
                    # Perform gradient descent with experience buffer
                    with hierarchical_timer("update_policy"):
                        trainer.update_policy()
                    for name_behavior_id in self.brain_name_to_identifier[
                            brain_name]:
                        env.set_policy(name_behavior_id,
                                       trainer.get_policy(name_behavior_id))
            else:
                # Avoid memory leak during inference
                # Eventually this whole block will take place in advance()
                # But currently this only calls clear_update_buffer() in RLTrainer
                # and nothing in the base class
                trainer.advance()
        return len(new_step_infos)
Example #15
    def advance(self, env_manager: EnvManager) -> int:
        # Get steps
        with hierarchical_timer("env_step"):
            new_step_infos = env_manager.get_steps()
            self._register_new_behaviors(env_manager, new_step_infos)
            num_steps = env_manager.process_steps(new_step_infos)

        # Report current lesson for each environment parameter
        for (
                param_name,
                lesson_number,
        ) in self.param_manager.get_current_lesson_number().items():
            for trainer in self.trainers.values():
                trainer.stats_reporter.set_stat(
                    f"Environment/Lesson Number/{param_name}", lesson_number)

        for trainer in self.trainers.values():
            if not trainer.threaded:
                with hierarchical_timer("trainer_advance"):
                    trainer.advance()

        return num_steps
Example #16
 def advance(self, env: EnvManager) -> int:
     with hierarchical_timer("env_step"):
         time_start_step = time()
         new_step_infos = env.step()
         delta_time_step = time() - time_start_step
     for step_info in new_step_infos:
         for brain_name, trainer in self.trainers.items():
             if brain_name in self.trainer_metrics:
                 self.trainer_metrics[brain_name].add_delta_step(
                     delta_time_step)
             if step_info.has_actions_for_brain(brain_name):
                 trainer.add_experiences(
                     step_info.previous_all_brain_info[brain_name],
                     step_info.current_all_brain_info[brain_name],
                     step_info.brain_name_to_action_info[brain_name].outputs,
                 )
                 trainer.process_experiences(
                     step_info.previous_all_brain_info[brain_name],
                     step_info.current_all_brain_info[brain_name],
                 )
     for brain_name, trainer in self.trainers.items():
         if brain_name in self.trainer_metrics:
             self.trainer_metrics[brain_name].add_delta_step(
                 delta_time_step)
         if self.train_model and trainer.get_step <= trainer.get_max_steps:
             trainer.increment_step(len(new_step_infos))
             if trainer.is_ready_update():
                 # Perform gradient descent with experience buffer
                 with hierarchical_timer("update_policy"):
                     trainer.update_policy()
                 env.set_policy(brain_name, trainer.policy)
         else:
             # Avoid memory leak during inference
             trainer.clear_update_buffer()
     return len(new_step_infos)
Example #17
    def advance(self, env: EnvManager) -> int:
        # Get steps
        with hierarchical_timer("env_step"):
            num_steps = env.advance()

        # Report current lesson
        if self.meta_curriculum:
            for brain_name, curr in self.meta_curriculum.brains_to_curricula.items(
            ):
                if brain_name in self.trainers:
                    self.trainers[brain_name].stats_reporter.set_stat(
                        "Environment/Lesson", curr.lesson_num)

        # Advance trainers. This can be done in a separate loop in the future.
        with hierarchical_timer("trainer_advance"):
            for trainer in self.trainers.values():
                trainer.advance()

        return num_steps
Example #18
    def advance(self, env: EnvManager) -> int:
        # Get steps
        with hierarchical_timer("env_step"):
            num_steps = env.advance()

        # Report current lesson
        if self.meta_curriculum:
            for brain_name, curr in self.meta_curriculum.brains_to_curricula.items(
            ):
                if brain_name in self.trainers:
                    self.trainers[brain_name].stats_reporter.set_stat(
                        "Environment/Lesson", curr.lesson_num)

        for trainer in self.trainers.values():
            if not trainer.threaded:
                with hierarchical_timer("trainer_advance"):
                    trainer.advance()

        return num_steps
Example #19
    def start_learning(self, env_manager: EnvManager) -> None:
        self._create_model_path(self.model_path)
        tf.reset_default_graph()
        global_step = 0
        last_brain_behavior_ids: Set[str] = set()
        try:
            self._reset_env(env_manager)
            while self._not_done_training():
                external_brain_behavior_ids = set(
                    env_manager.external_brains.keys())
                new_behavior_ids = external_brain_behavior_ids - last_brain_behavior_ids
                for name_behavior_id in new_behavior_ids:
                    try:
                        brain_name, _ = name_behavior_id.split("?")
                    except ValueError:
                        brain_name = name_behavior_id

                    try:
                        trainer = self.trainers[brain_name]
                    except KeyError:
                        trainer = self.trainer_factory.generate(brain_name)
                        self.trainers[brain_name] = trainer
                        self.logger.info(trainer)
                        if self.train_model:
                            trainer.write_tensorboard_text(
                                "Hyperparameters", trainer.parameters)

                    policy = trainer.create_policy(
                        env_manager.external_brains[name_behavior_id])
                    trainer.add_policy(name_behavior_id, policy)

                    env_manager.set_policy(name_behavior_id, policy)

                    self.brain_name_to_identifier[brain_name].add(
                        name_behavior_id)

                    agent_manager = AgentManager(processor=AgentProcessor(
                        trainer,
                        policy,
                        name_behavior_id,
                        trainer.stats_reporter,
                        trainer.parameters.get("time_horizon", sys.maxsize),
                    ))
                    self.managers[name_behavior_id] = agent_manager

                last_brain_behavior_ids = external_brain_behavior_ids

                n_steps = self.advance(env_manager)
                for _ in range(n_steps):
                    global_step += 1
                    self.reset_env_if_ready(env_manager, global_step)
                    if self._should_save_model(global_step):
                        # Save Tensorflow model
                        self._save_model()
                    self.write_to_tensorboard(global_step)
            # Final save Tensorflow model
            if global_step != 0 and self.train_model:
                self._save_model()
        except (KeyboardInterrupt, UnityCommunicationException):
            if self.train_model:
                self._save_model_when_interrupted()
            pass
        if self.train_model:
            self._export_graph()
        self._write_timing_tree()