def test_step_takes_steps_for_all_non_waiting_envs(self):
    # Patch worker creation so the manager spawns mocks instead of real subprocesses.
    SubprocessEnvManager.create_worker = lambda em, worker_id, step_queue, env_factory: MockEnvWorker(
        worker_id, EnvironmentResponse("step", worker_id, worker_id)
    )
    manager = SubprocessEnvManager(mock_env_factory, 3)
    manager.step_queue = Mock()
    # Workers 0 and 1 have results queued; the third get_nowait() finds the queue empty.
    manager.step_queue.get_nowait.side_effect = [
        EnvironmentResponse("step", 0, StepResponse(0, None)),
        EnvironmentResponse("step", 1, StepResponse(1, None)),
        EmptyQueue(),
    ]
    step_mock = Mock()
    last_steps = [Mock(), Mock(), Mock()]
    manager.env_workers[0].previous_step = last_steps[0]
    manager.env_workers[1].previous_step = last_steps[1]
    manager.env_workers[2].previous_step = last_steps[2]
    # Worker 2 is still waiting on an earlier step and must not be stepped again.
    manager.env_workers[2].waiting = True
    manager._take_step = Mock(return_value=step_mock)
    res = manager.step()
    for i, env in enumerate(manager.env_workers):
        if i < 2:
            env.send.assert_called_with("step", step_mock)
            manager.step_queue.get_nowait.assert_called()
            # Check that the "last steps" are set to the value returned for each step
            self.assertEqual(
                manager.env_workers[i].previous_step.current_all_brain_info, i
            )
            self.assertEqual(
                manager.env_workers[i].previous_step.previous_all_brain_info,
                last_steps[i].current_all_brain_info,
            )
    # Only the two non-waiting workers should appear in the result.
    assert res == [
        manager.env_workers[0].previous_step,
        manager.env_workers[1].previous_step,
    ]
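# For context, a hedged sketch of the step() logic the test above exercises.
# This is a reconstruction under assumptions, not the library's exact code:
# `EnvironmentStep`, `sketch_step`, and the `payload`/`all_brain_info` fields
# are illustrative stand-ins. Non-waiting workers get a fresh "step" command;
# the shared queue is then drained until Empty, rolling each responding
# worker's previous_step forward.
from queue import Empty as EmptyQueue
from typing import Any, List, NamedTuple

class EnvironmentStep(NamedTuple):  # hypothetical result record
    previous_all_brain_info: Any
    current_all_brain_info: Any

def sketch_step(manager) -> List[EnvironmentStep]:
    # Dispatch a new step only to workers that are not already mid-step.
    for env_worker in manager.env_workers:
        if not env_worker.waiting:
            env_worker.send("step", manager._take_step(env_worker.previous_step))
            env_worker.waiting = True
    stepped = []
    while True:
        try:
            resp = manager.step_queue.get_nowait()
        except EmptyQueue:
            break
        worker = manager.env_workers[resp.worker_id]
        # Old "current" becomes the new "previous", matching the assertions above.
        worker.previous_step = EnvironmentStep(
            worker.previous_step.current_all_brain_info,
            resp.payload.all_brain_info,  # assumed field on StepResponse
        )
        worker.waiting = False
        stepped.append(worker)
    return [w.previous_step for w in stepped]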
def advance(self, env: SubprocessEnvManager) -> int:
    # Take one batched step across all environment workers and time it.
    with hierarchical_timer("env_step"):
        time_start_step = time()
        new_step_infos = env.step()
        delta_time_step = time() - time_start_step

    # Feed each new step into every trainer's experience buffer.
    for step_info in new_step_infos:
        for brain_name, trainer in self.trainers.items():
            if brain_name in self.trainer_metrics:
                self.trainer_metrics[brain_name].add_delta_step(delta_time_step)
            trainer.add_experiences(
                step_info.previous_all_brain_info,
                step_info.current_all_brain_info,
                step_info.brain_name_to_action_info[brain_name].outputs,
            )
            trainer.process_experiences(
                step_info.previous_all_brain_info, step_info.current_all_brain_info
            )

    for brain_name, trainer in self.trainers.items():
        if brain_name in self.trainer_metrics:
            self.trainer_metrics[brain_name].add_delta_step(delta_time_step)
        if self.train_model and trainer.get_step <= trainer.get_max_steps:
            trainer.increment_step(len(new_step_infos))
            if trainer.is_ready_update():
                # Perform gradient descent with experience buffer
                with hierarchical_timer("update_policy"):
                    trainer.update_policy()
                # Push the freshly updated policy back to the env workers.
                env.set_policy(brain_name, trainer.policy)
    return len(new_step_infos)
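# For context, a hedged sketch of the outer loop that typically drives
# advance() (hypothetical helper; the real controller also handles resets,
# checkpointing, and shutdown, which are omitted here):
def sketch_training_loop(controller, env_manager) -> int:
    total_env_steps = 0
    # Keep stepping while any trainer still has steps left in its budget.
    while any(
        t.get_step <= t.get_max_steps for t in controller.trainers.values()
    ):
        total_env_steps += controller.advance(env_manager)
    return total_env_steps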