def external_brains(self) -> Dict[str, BrainParameters]:
    result = {}
    for brain_name in self.env.get_agent_groups():
        result[brain_name] = group_spec_to_brain_parameters(
            brain_name, self.env.get_agent_group_spec(brain_name)
        )
    return result
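These snippets lean on ml-agents' brain-conversion utilities. For reference, here is a sketch of the imports they assume (module paths taken from ml-agents around release 0.12; verify them against your installed version):

from typing import Dict

from mlagents.envs.base_env import ActionType, AgentGroupSpec
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.brain_conversion_utils import (
    group_spec_to_brain_parameters,
    step_result_to_brain_info,
)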
def external_brains():
    # Same mapping as the method above, written as a free function;
    # `env` is expected to be available in the enclosing scope.
    result = {}
    for brain_name in env.get_agent_groups():
        result[brain_name] = group_spec_to_brain_parameters(
            brain_name, env.get_agent_group_spec(brain_name)
        )
    return result
def external_brains():
    result = {}
    # A Gym environment exposes a single agent group, so use the env id
    # as the brain name.
    brain_name = env.spec.id  # was: for brain_name in env.get_agent_groups():
    # group_spec_to_brain_parameters reads attributes (and calls
    # is_action_discrete()), so it needs an AgentGroupSpec rather than a
    # plain dict. This assumes a 1-D (vector) observation space.
    result[brain_name] = group_spec_to_brain_parameters(
        brain_name,
        AgentGroupSpec(
            observation_shapes=[env.observation_space.shape],
            action_type=ActionType.DISCRETE,
            action_shape=(env.action_scheme.action_space.n,),
        ),
    )
    return result
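As a quick sanity check, the adapted function should return a single entry keyed by the Gym env id. A minimal sketch, assuming a gym-registered TensorTrade environment named "TradingEnv-v0" (the id is illustrative) and the standard BrainParameters attributes:

import gym

env = gym.make("TradingEnv-v0")  # illustrative id; any discrete-action Gym env works
brains = external_brains()

params = brains[env.spec.id]
print(params.brain_name)                # same as env.spec.id
print(params.vector_action_space_size)  # [env.action_scheme.action_space.n]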
def demo_to_buffer(
    file_path: str, sequence_length: int
) -> Tuple[BrainParameters, AgentBuffer]:
    """
    Loads demonstration file and uses it to fill training buffer.
    :param file_path: Location of demonstration file (.demo).
    :param sequence_length: Length of trajectories to fill buffer.
    :return: BrainParameters for the demo brain and the filled AgentBuffer.
    """
    group_spec, info_action_pair, _ = load_demonstration(file_path)
    demo_buffer = make_demo_buffer(info_action_pair, group_spec, sequence_length)
    brain_params = group_spec_to_brain_parameters("DemoBrain", group_spec)
    return brain_params, demo_buffer
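Usage is a one-liner. A sketch with an illustrative file path (the buffer field name "actions" is assumed from ml-agents' demo loader):

# Point this at any demonstration recorded by the Unity Demonstration Recorder.
brain_params, demo_buffer = demo_to_buffer("Demos/ExpertPyramid.demo", sequence_length=64)
print(brain_params.brain_name)      # "DemoBrain"
print(len(demo_buffer["actions"]))  # number of recorded steps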
def test_ppo_policy_evaluate(mock_communicator, mock_launcher, dummy_config):
    # mock_communicator / mock_launcher are injected by mock.patch decorators
    # (omitted here) so UnityEnvironment never launches a real executable.
    tf.reset_default_graph()
    mock_communicator.return_value = MockCommunicator(
        discrete_action=False, visual_inputs=0
    )
    env = UnityEnvironment(" ")
    env.reset()
    brain_name = env.get_agent_groups()[0]
    batched_step = env.get_step_result(brain_name)
    brain_params = group_spec_to_brain_parameters(
        brain_name, env.get_agent_group_spec(brain_name)
    )

    trainer_parameters = dummy_config
    model_path = brain_name
    trainer_parameters["model_path"] = model_path
    trainer_parameters["keep_checkpoints"] = 3
    policy = PPOPolicy(0, brain_params, trainer_parameters, False, False)
    run_out = policy.evaluate(batched_step, list(batched_step.agent_id))
    assert run_out["action"].shape == (3, 2)
    env.close()
def test_ppo_get_value_estimates(mock_communicator, mock_launcher, dummy_config):
    tf.reset_default_graph()
    mock_communicator.return_value = MockCommunicator(
        discrete_action=False, visual_inputs=0
    )
    env = UnityEnvironment(" ")
    env.reset()
    brain_name = env.get_agent_groups()[0]
    brain_info = step_result_to_brain_info(
        env.get_step_result(brain_name), env.get_agent_group_spec(brain_name)
    )
    brain_params = group_spec_to_brain_parameters(
        brain_name, env.get_agent_group_spec(brain_name)
    )

    trainer_parameters = dummy_config
    model_path = brain_name
    trainer_parameters["model_path"] = model_path
    trainer_parameters["keep_checkpoints"] = 3
    policy = PPOPolicy(0, brain_params, trainer_parameters, False, False)

    run_out = policy.get_value_estimates(brain_info, 0, done=False)
    for key, val in run_out.items():
        assert type(key) is str
        assert type(val) is float

    run_out = policy.get_value_estimates(brain_info, 0, done=True)
    for key, val in run_out.items():
        assert type(key) is str
        assert val == 0.0

    # Check if we ignore terminal states properly
    policy.reward_signals["extrinsic"].use_terminal_states = False
    run_out = policy.get_value_estimates(brain_info, 0, done=True)
    for key, val in run_out.items():
        assert type(key) is str
        assert val != 0.0

    env.close()
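Both tests consume a dummy_config pytest fixture holding PPO hyperparameters. A minimal sketch of such a fixture (keys assumed from ml-agents' PPO defaults; the fixture in the real test suite is more complete):

import pytest
import yaml

@pytest.fixture
def dummy_config():
    # Hyperparameter keys assumed from ml-agents' PPO defaults; adjust to
    # match the trainer version you are testing against.
    return yaml.safe_load(
        """
        trainer: ppo
        batch_size: 32
        beta: 5.0e-3
        buffer_size: 512
        epsilon: 0.2
        hidden_units: 128
        lambd: 0.95
        learning_rate: 3.0e-4
        max_steps: 5.0e4
        memory_size: 8
        normalize: true
        num_epoch: 5
        num_layers: 2
        time_horizon: 64
        sequence_length: 64
        summary_freq: 1000
        use_recurrent: false
        reward_signals:
          extrinsic:
            strength: 1.0
            gamma: 0.99
        """
    )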