def start_learning(self, env_manager: EnvManager) -> None:
    self._create_output_path(self.output_path)
    tf.reset_default_graph()
    try:
        # Initial reset
        self._reset_env(env_manager)
        while self._not_done_training():
            n_steps = self.advance(env_manager)
            for _ in range(n_steps):
                self.reset_env_if_ready(env_manager)
        # Stop advancing trainers
        self.join_threads()
    except (
        KeyboardInterrupt,
        UnityCommunicationException,
        UnityEnvironmentException,
        UnityCommunicatorStoppedException,
    ) as ex:
        self.join_threads()
        self.logger.info(
            "Learning was interrupted. Please wait while the graph is generated."
        )
        if isinstance(ex, (KeyboardInterrupt, UnityCommunicatorStoppedException)):
            pass
        else:
            # If the environment failed, we want to make sure to raise
            # the exception so we exit the process with a return code of 1.
            raise ex
    finally:
        if self.train_model:
            self._save_models()
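# The try/except/finally structure above separates user-initiated stops
# (KeyboardInterrupt, UnityCommunicatorStoppedException), which are swallowed,
# from environment failures, which are re-raised so the process exits with a
# nonzero return code. A minimal standalone sketch of the same pattern
# (illustrative only; the helper names here are hypothetical, not ML-Agents API):
def _run_with_cleanup(step_fn, save_fn):
    try:
        while True:
            step_fn()
    except KeyboardInterrupt:
        pass  # graceful stop: fall through to cleanup without re-raising
    except Exception:
        raise  # environment failure: propagate so the caller exits nonzero
    finally:
        save_fn()  # cleanup runs on every path, mirroring _save_models() above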
def test_ppo_optimizer_update_curiosity(
    dummy_config, curiosity_dummy_config, rnn, visual, discrete  # noqa: F811
):
    # Test evaluate
    tf.reset_default_graph()
    dummy_config.reward_signals = curiosity_dummy_config
    optimizer = create_test_ppo_optimizer(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES,
        optimizer.policy.behavior_spec,
        memory_size=optimizer.policy.m_size,
    )
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
    update_buffer["curiosity_returns"] = update_buffer["environment_rewards"]
    update_buffer["curiosity_value_estimates"] = update_buffer["environment_rewards"]
    # NOTE: In TensorFlow, the log_probs are saved as one for every discrete action,
    # whereas in PyTorch it is saved as the total probability per branch. So we need
    # to modify the log prob in the fake buffer here.
    update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
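# Nearly every test in this file begins with tf.reset_default_graph(). A sketch
# of a pytest fixture that could factor that out (hypothetical helper, not part
# of the original suite; assumes the TF1-style API used throughout this file):
import pytest
import tensorflow as tf

@pytest.fixture(autouse=True)
def _fresh_tf_graph():
    # Clear the default graph before each test so ops created by one test
    # cannot leak into the next.
    tf.reset_default_graph()
    yield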
def test_sac_rnn_policy(dummy_config):
    # Test evaluate
    tf.reset_default_graph()
    policy = create_sac_policy_mock(
        dummy_config, use_rnn=True, use_discrete=True, use_visual=False
    )
    step = mb.create_batchedstep_from_brainparams(policy.brain, num_agents=NUM_AGENTS)
    run_out = policy.evaluate(step, list(step.agent_id))
    assert run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))

    # Test update
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain, memory_size=8)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    update_buffer = AgentBuffer()
    buffer.resequence_and_append(update_buffer, training_length=policy.sequence_length)
    run_out = policy.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // policy.sequence_length,
    )
def test_ppo_optimizer_update(dummy_config, rnn, visual, discrete):
    # Test evaluate
    tf.reset_default_graph()
    optimizer = create_test_ppo_optimizer(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES,
        optimizer.policy.behavior_spec,
        memory_size=optimizer.policy.m_size,
    )
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
    return_stats = optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
    # Make sure we have the right stats
    required_stats = [
        "Losses/Policy Loss",
        "Losses/Value Loss",
        "Policy/Learning Rate",
        "Policy/Epsilon",
        "Policy/Beta",
    ]
    for stat in required_stats:
        assert stat in return_stats.keys()
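# The rnn/visual/discrete arguments taken by tests like the one above are
# typically filled in by pytest parametrization, so each flag combination runs
# as its own test case. A sketch of what those decorators might look like
# (assumed for illustration; the exact ids/values are not taken from the
# original files):
import pytest

@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
def test_flag_matrix(rnn, visual, discrete):
    # 2 x 2 x 2 = 8 generated test cases, one per flag combination.
    assert isinstance(rnn, bool)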
def test_update(
    mock_get_devices, mock_construct_feed_dict, mock_execute_model, dummy_config
):
    tf.reset_default_graph()
    mock_get_devices.return_value = ["/device:GPU:0", "/device:GPU:1"]
    mock_construct_feed_dict.return_value = {}
    mock_execute_model.return_value = {
        "value_loss": 0.1,
        "policy_loss": 0.3,
        "update_batch": None,
    }

    trainer_parameters = dummy_config
    trainer_parameters["model_path"] = ""
    trainer_parameters["keep_checkpoints"] = 3
    brain = create_mock_brainparams()
    policy = MultiGpuPPOPolicy(0, brain, trainer_parameters, False, False)
    mock_mini_batch = mock.Mock()
    mock_mini_batch.items.return_value = [("action", [1, 2]), ("value", [3, 4])]
    run_out = policy.update(mock_mini_batch, 1)

    assert mock_mini_batch.items.call_count == len(mock_get_devices.return_value)
    assert mock_construct_feed_dict.call_count == len(mock_get_devices.return_value)
    assert run_out["Losses/Value Loss"] == 0.1
    assert run_out["Losses/Policy Loss"] == 0.3
def test_ppo_optimizer_update_curiosity(
    curiosity_dummy_config, dummy_config, rnn, visual, discrete  # noqa: F811
):
    # Test evaluate
    tf.reset_default_graph()
    dummy_config["reward_signals"].update(curiosity_dummy_config)
    optimizer = _create_ppo_optimizer_ops_mock(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
    update_buffer["curiosity_returns"] = update_buffer["environment_rewards"]
    update_buffer["curiosity_value_estimates"] = update_buffer["environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
def test_ppo_optimizer_update(dummy_config, rnn, visual, discrete):
    # Test evaluate
    tf.reset_default_graph()
    optimizer = _create_ppo_optimizer_ops_mock(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    behavior_spec = optimizer.policy.behavior_spec
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
    # NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
    if discrete:
        n_agents = len(update_buffer["discrete_log_probs"])
        update_buffer["discrete_log_probs"] = np.ones(
            (n_agents, int(sum(behavior_spec.action_spec.discrete_branches))),
            dtype=np.float32,
        )
    else:
        n_agents = len(update_buffer["continuous_log_probs"])
        update_buffer["continuous_log_probs"] = np.ones(
            (n_agents, behavior_spec.action_spec.continuous_size), dtype=np.float32
        )
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
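# Shape logic used above, worked through: with hypothetical discrete branches
# (3, 2), TF stores one log prob per individual action, so the per-agent row
# width is sum(branches) = 5; the continuous case instead uses continuous_size.
branches = (3, 2)  # assumed example, not from the original test
assert int(sum(branches)) == 5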
def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config):  # noqa: F811
    # Test evaluate
    tf.reset_default_graph()
    dummy_config.reward_signals = gail_dummy_config
    optimizer = _create_ppo_optimizer_ops_mock(
        PPO_CONFIG, use_rnn=False, use_discrete=False, use_visual=False
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
    )
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
    # Check if buffer size is too big
    update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
def test_sac_update_reward_signals(mock_env, dummy_config, discrete):
    # Test evaluate
    tf.reset_default_graph()
    # Add a Curiosity module
    dummy_config["reward_signals"]["curiosity"] = {}
    dummy_config["reward_signals"]["curiosity"]["strength"] = 1.0
    dummy_config["reward_signals"]["curiosity"]["gamma"] = 0.99
    dummy_config["reward_signals"]["curiosity"]["encoding_size"] = 128
    env, policy = create_sac_policy_mock(
        mock_env, dummy_config, use_rnn=False, use_discrete=discrete, use_visual=False
    )
    # Test update, while removing PPO-specific buffer elements.
    update_buffer = mb.simulate_rollout(
        env, policy, BUFFER_INIT_SAMPLES, exclude_key_list=["advantages", "actions_pre"]
    )
    # Mock out reward signal eval
    update_buffer["extrinsic_rewards"] = update_buffer["rewards"]
    update_buffer["curiosity_rewards"] = update_buffer["rewards"]
    policy.update_reward_signals(
        {"curiosity": update_buffer}, num_sequences=update_buffer.num_experiences
    )
    env.close()
def start_learning(self, env_manager: EnvManager) -> None:
    self._create_model_path(self.model_path)
    tf.reset_default_graph()
    global_step = 0
    last_brain_names: Set[str] = set()
    try:
        self._reset_env(env_manager)
        while self._not_done_training():
            external_brains = set(env_manager.external_brains.keys())
            new_brains = external_brains - last_brain_names
            if last_brain_names != env_manager.external_brains.keys():
                for name in new_brains:
                    trainer = self.trainer_factory.generate(
                        env_manager.external_brains[name]
                    )
                    self.start_trainer(trainer, env_manager)
                last_brain_names = external_brains
            n_steps = self.advance(env_manager)
            for i in range(n_steps):
                global_step += 1
                self.reset_env_if_ready(env_manager, global_step)
                if self._should_save_model(global_step):
                    # Save Tensorflow model
                    self._save_model()
                self.write_to_tensorboard(global_step)
        # Final save Tensorflow model
        if global_step != 0 and self.train_model:
            self._save_model()
    except (KeyboardInterrupt, UnityCommunicationException):
        if self.train_model:
            self._save_model_when_interrupted()
        pass
    if self.train_model:
        self._write_training_metrics()
        self._export_graph()
    self._write_timing_tree()
def test_ppo_model_dc_visual():
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            model = PPOModel(
                make_brain_parameters(discrete_action=True, visual_inputs=2)
            )
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [
                model.output,
                model.all_log_probs,
                model.value,
                model.entropy,
                model.learning_rate,
            ]
            feed_dict = {
                model.batch_size: 2,
                model.sequence_length: 1,
                model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
                model.visual_in[0]: np.ones([2, 40, 30, 3], dtype=np.float32),
                model.visual_in[1]: np.ones([2, 40, 30, 3], dtype=np.float32),
                model.action_masks: np.ones([2, 2], dtype=np.float32),
            }
            sess.run(run_list, feed_dict=feed_dict)
def test_ppo_get_value_estimates(mock_communicator, mock_launcher, dummy_config):
    tf.reset_default_graph()
    mock_communicator.return_value = MockCommunicator(
        discrete_action=False, visual_inputs=0
    )
    env = UnityEnvironment(" ")
    brain_infos = env.reset()
    brain_info = brain_infos[env.external_brain_names[0]]

    trainer_parameters = dummy_config
    model_path = env.external_brain_names[0]
    trainer_parameters["model_path"] = model_path
    trainer_parameters["keep_checkpoints"] = 3
    policy = PPOPolicy(
        0, env.brains[env.external_brain_names[0]], trainer_parameters, False, False
    )
    run_out = policy.get_value_estimates(brain_info, 0, done=False)
    for key, val in run_out.items():
        assert type(key) is str
        assert type(val) is float

    run_out = policy.get_value_estimates(brain_info, 0, done=True)
    for key, val in run_out.items():
        assert type(key) is str
        assert val == 0.0

    # Check if we ignore terminal states properly
    policy.reward_signals["extrinsic"].use_terminal_states = False
    run_out = policy.get_value_estimates(brain_info, 0, done=True)
    for key, val in run_out.items():
        assert type(key) is str
        assert val != 0.0

    env.close()
def test_ppo_get_value_estimates(dummy_config, rnn, visual, discrete):
    tf.reset_default_graph()
    optimizer = _create_ppo_optimizer_ops_mock(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        observation_shapes=optimizer.policy.behavior_spec.observation_shapes,
        max_step_complete=True,
        action_spec=DISCRETE_ACTION_SPEC if discrete else CONTINUOUS_ACTION_SPEC,
    )
    run_out, final_value_out = optimizer.get_trajectory_value_estimates(
        trajectory.to_agentbuffer(), trajectory.next_obs, done=False
    )
    for key, val in run_out.items():
        assert type(key) is str
        assert len(val) == 15

    run_out, final_value_out = optimizer.get_trajectory_value_estimates(
        trajectory.to_agentbuffer(), trajectory.next_obs, done=True
    )
    for key, val in final_value_out.items():
        assert type(key) is str
        assert val == 0.0

    # Check if we ignore terminal states properly
    optimizer.reward_signals["extrinsic"].use_terminal_states = False
    run_out, final_value_out = optimizer.get_trajectory_value_estimates(
        trajectory.to_agentbuffer(), trajectory.next_obs, done=False
    )
    for key, val in final_value_out.items():
        assert type(key) is str
        assert val != 0.0
def start_learning(self, env_manager: EnvManager) -> None:
    self._create_model_path(self.model_path)
    tf.reset_default_graph()
    global_step = 0
    last_brain_behavior_ids: Set[str] = set()
    try:
        self._reset_env(env_manager)
        while self._not_done_training():
            external_brain_behavior_ids = set(env_manager.external_brains.keys())
            new_behavior_ids = external_brain_behavior_ids - last_brain_behavior_ids
            for name_behavior_id in new_behavior_ids:
                self._create_trainer_and_manager(env_manager, name_behavior_id)
            last_brain_behavior_ids = external_brain_behavior_ids
            n_steps = self.advance(env_manager)
            for _ in range(n_steps):
                global_step += 1
                self.reset_env_if_ready(env_manager, global_step)
                if self._should_save_model(global_step):
                    # Save Tensorflow model
                    self._save_model()
        # Final save Tensorflow model
        if global_step != 0 and self.train_model:
            self._save_model()
    except (KeyboardInterrupt, UnityCommunicationException):
        if self.train_model:
            self._save_model_when_interrupted()
        pass
    if self.train_model:
        self._export_graph()
    self._write_timing_tree()
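# The set difference above is how newly appearing behavior names are detected on
# each loop iteration; only those get a fresh trainer/manager. Minimal
# illustration with hypothetical behavior ids:
known_ids = {"BrainA"}
current_ids = {"BrainA", "BrainB"}
new_ids = current_ids - known_ids
assert new_ids == {"BrainB"}  # only the unseen behavior triggers trainer creation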
def test_ppo_model_cc_vector_rnn():
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            memory_size = 128
            model = PPOModel(
                make_brain_parameters(discrete_action=False, visual_inputs=0),
                use_recurrent=True,
                m_size=memory_size,
            )
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [
                model.output,
                model.all_log_probs,
                model.value,
                model.entropy,
                model.learning_rate,
                model.memory_out,
            ]
            feed_dict = {
                model.batch_size: 1,
                model.sequence_length: 2,
                model.memory_in: np.zeros((1, memory_size), dtype=np.float32),
                model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
                model.epsilon: np.array([[0, 1]]),
            }
            sess.run(run_list, feed_dict=feed_dict)
def test_average_gradients(mock_get_devices, dummy_config):
    tf.reset_default_graph()
    mock_get_devices.return_value = [
        "/device:GPU:0",
        "/device:GPU:1",
        "/device:GPU:2",
        "/device:GPU:3",
    ]

    trainer_parameters = dummy_config
    trainer_parameters["model_path"] = ""
    trainer_parameters["keep_checkpoints"] = 3
    brain = create_mock_brainparams()
    with tf.Session() as sess:
        policy = MultiGpuPPOPolicy(0, brain, trainer_parameters, False, False)
        var = tf.Variable(0)
        tower_grads = [
            [(tf.constant(0.1), var)],
            [(tf.constant(0.2), var)],
            [(tf.constant(0.3), var)],
            [(tf.constant(0.4), var)],
        ]
        avg_grads = policy.average_gradients(tower_grads)

        init = tf.global_variables_initializer()
        sess.run(init)
        run_out = sess.run(avg_grads)
        assert run_out == [(0.25, 0)]
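# Sanity check for the expected result above: averaging the four tower gradients
# 0.1, 0.2, 0.3, 0.4 by hand gives 0.25, matching assert run_out == [(0.25, 0)].
# (Plain-Python arithmetic only; no TF session needed.)
tower_values = [0.1, 0.2, 0.3, 0.4]
assert abs(sum(tower_values) / len(tower_values) - 0.25) < 1e-6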
def test_ppo_get_value_estimates(dummy_config, rnn, visual, discrete):
    tf.reset_default_graph()
    optimizer = _create_ppo_optimizer_ops_mock(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    time_horizon = 15
    trajectory = _create_fake_trajectory(discrete, visual, time_horizon)
    run_out, final_value_out = optimizer.get_trajectory_value_estimates(
        trajectory.to_agentbuffer(), trajectory.next_obs, done=False
    )
    for key, val in run_out.items():
        assert type(key) is str
        assert len(val) == 15

    run_out, final_value_out = optimizer.get_trajectory_value_estimates(
        trajectory.to_agentbuffer(), trajectory.next_obs, done=True
    )
    for key, val in final_value_out.items():
        assert type(key) is str
        assert val == 0.0

    # Check if we ignore terminal states properly
    optimizer.reward_signals["extrinsic"].use_terminal_states = False
    run_out, final_value_out = optimizer.get_trajectory_value_estimates(
        trajectory.to_agentbuffer(), trajectory.next_obs, done=False
    )
    for key, val in final_value_out.items():
        assert type(key) is str
        assert val != 0.0
def test_ppo_optimizer_update(dummy_config, rnn, visual, discrete):
    # Test evaluate
    tf.reset_default_graph()
    optimizer = create_test_ppo_optimizer(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES,
        optimizer.policy.behavior_spec,
        memory_size=optimizer.policy.m_size,
    )
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
    # NOTE: In TensorFlow, the log_probs are saved as one for every discrete action,
    # whereas in PyTorch it is saved as the total probability per branch. So we need
    # to modify the log prob in the fake buffer here.
    update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
    return_stats = optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
    # Make sure we have the right stats
    required_stats = [
        "Losses/Policy Loss",
        "Losses/Value Loss",
        "Policy/Learning Rate",
        "Policy/Epsilon",
        "Policy/Beta",
    ]
    for stat in required_stats:
        assert stat in return_stats.keys()
def init_weights(self, env_manager):
    self._reset_env(env_manager)
    tf.reset_default_graph()
    last_brain_behavior_ids: Set[str] = set()
    external_brain_behavior_ids = set(env_manager.external_brains.keys())
    new_behavior_ids = external_brain_behavior_ids - last_brain_behavior_ids
    self._create_trainers_and_managers(env_manager, new_behavior_ids)
    self.weights = deepcopy(self.trainers["Brain"].get_policy(0).get_weights())
def test_checkpoint_conversion(tmpdir, rnn, visual, discrete):
    tf.reset_default_graph()
    dummy_config = TrainerSettings()
    model_path = os.path.join(tmpdir, "Mock_Brain")
    policy = create_policy_mock(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    trainer_params = TrainerSettings()
    model_saver = TFModelSaver(trainer_params, model_path)
    model_saver.register(policy)
    model_saver.save_checkpoint("Mock_Brain", 100)
    assert os.path.isfile(model_path + "/Mock_Brain-100.nn")
def test_policy_evaluate(dummy_config, rnn, visual, discrete):
    # Test evaluate
    tf.reset_default_graph()
    policy = create_policy_mock(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    step = mb.create_batchedstep_from_brainparams(policy.brain, num_agents=NUM_AGENTS)

    run_out = policy.evaluate(step, list(step.agent_id))
    if discrete:
        # The original comparison here was missing its assert, making it a no-op.
        assert run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
    else:
        assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE[0])
def test_ppo_get_value_estimates(mock_communicator, mock_launcher, dummy_config):
    tf.reset_default_graph()
    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    policy = PPOPolicy(0, brain_params, dummy_config, False, False)
    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )
    run_out = policy.get_value_estimates(trajectory.next_obs, "test_agent", done=False)
    for key, val in run_out.items():
        assert type(key) is str
        assert type(val) is float

    run_out = policy.get_value_estimates(trajectory.next_obs, "test_agent", done=True)
    for key, val in run_out.items():
        assert type(key) is str
        assert val == 0.0

    # Check if we ignore terminal states properly
    policy.reward_signals["extrinsic"].use_terminal_states = False
    run_out = policy.get_value_estimates(trajectory.next_obs, "test_agent", done=True)
    for key, val in run_out.items():
        assert type(key) is str
        assert val != 0.0

    agentbuffer = trajectory.to_agentbuffer()
    batched_values = policy.get_batched_value_estimates(agentbuffer)
    for values in batched_values.values():
        assert len(values) == 15
def test_sac_optimizer_update(dummy_config, rnn, visual, discrete):
    # Test evaluate
    tf.reset_default_graph()
    optimizer = create_sac_optimizer_mock(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
    # Mock out reward signal eval
    update_buffer["extrinsic_rewards"] = update_buffer["environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
def test_policy_evaluate(rnn, visual, discrete):
    # Test evaluate
    tf.reset_default_graph()
    policy = create_policy_mock(
        TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    decision_step, terminal_step = mb.create_steps_from_behavior_spec(
        policy.behavior_spec, num_agents=NUM_AGENTS
    )
    run_out = policy.evaluate(decision_step, list(decision_step.agent_id))
    if discrete:
        # The original comparison here was missing its assert, making it a no-op.
        assert run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
    else:
        assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE)
def test_policy_conversion(dummy_config, tmpdir, rnn, visual, discrete):
    tf.reset_default_graph()
    dummy_config["output_path"] = os.path.join(tmpdir, "test")
    policy = create_policy_mock(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    policy.save_model(1000)
    settings = SerializationSettings(
        policy.model_path, os.path.join(tmpdir, policy.brain.brain_name)
    )
    export_policy_model(settings, policy.graph, policy.sess)

    # These checks taken from test_barracuda_converter
    assert os.path.isfile(os.path.join(tmpdir, "test.nn"))
    assert os.path.getsize(os.path.join(tmpdir, "test.nn")) > 100
def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config):  # noqa: F811
    # Test evaluate
    tf.reset_default_graph()
    dummy_config.reward_signals = gail_dummy_config
    optimizer = _create_ppo_optimizer_ops_mock(
        attr.evolve(ppo_dummy_config(), framework=FrameworkType.TENSORFLOW),
        use_rnn=False,
        use_discrete=False,
        use_visual=False,
    )
    # Test update
    behavior_spec = optimizer.policy.behavior_spec
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
    # NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
    n_agents = len(update_buffer["continuous_log_probs"])
    update_buffer["continuous_log_probs"] = np.ones(
        (n_agents, behavior_spec.action_spec.continuous_size), dtype=np.float32
    )
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
    # Check if buffer size is too big
    update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
def test_create_model(mock_get_devices, dummy_config):
    tf.reset_default_graph()
    mock_get_devices.return_value = [
        "/device:GPU:0",
        "/device:GPU:1",
        "/device:GPU:2",
        "/device:GPU:3",
    ]

    trainer_parameters = dummy_config
    trainer_parameters["model_path"] = ""
    trainer_parameters["keep_checkpoints"] = 3
    brain = create_mock_brainparams()
    policy = MultiGpuPPOPolicy(0, brain, trainer_parameters, False, False)
    assert len(policy.towers) == len(mock_get_devices.return_value)
def start_learning(self, env_manager: EnvManager) -> None:
    self._create_output_path(self.output_path)
    tf.reset_default_graph()
    global_step = 0
    last_brain_behavior_ids: Set[str] = set()
    try:
        # Initial reset
        self._reset_env(env_manager)
        while self._not_done_training():
            external_brain_behavior_ids = set(env_manager.external_brains.keys())
            new_behavior_ids = external_brain_behavior_ids - last_brain_behavior_ids
            self._create_trainers_and_managers(env_manager, new_behavior_ids)
            last_brain_behavior_ids = external_brain_behavior_ids
            n_steps = self.advance(env_manager)
            for _ in range(n_steps):
                global_step += 1
                self.reset_env_if_ready(env_manager, global_step)
                if self._should_save_model(global_step):
                    self._save_model()
        # Stop advancing trainers
        self.join_threads()
        # Final save Tensorflow model
        if global_step != 0 and self.train_model:
            self._save_model()
    except (
        KeyboardInterrupt,
        UnityCommunicationException,
        UnityEnvironmentException,
        UnityCommunicatorStoppedException,
    ) as ex:
        self.join_threads()
        if self.train_model:
            self._save_model_when_interrupted()
        if isinstance(ex, (KeyboardInterrupt, UnityCommunicatorStoppedException)):
            pass
        else:
            # If the environment failed, we want to make sure to raise
            # the exception so we exit the process with a return code of 1.
            raise ex
    finally:
        if self.train_model:
            self._export_graph()
def test_policy_conversion(tmpdir, rnn, visual, discrete):
    tf.reset_default_graph()
    dummy_config = TrainerSettings()
    policy = create_policy_mock(
        dummy_config,
        use_rnn=rnn,
        model_path=os.path.join(tmpdir, "test"),
        use_discrete=discrete,
        use_visual=visual,
    )
    settings = SerializationSettings(policy.model_path, "MockBrain")
    checkpoint_path = f"{tmpdir}/MockBrain-1"
    policy.checkpoint(checkpoint_path, settings)

    # These checks taken from test_barracuda_converter
    assert os.path.isfile(checkpoint_path + ".nn")
    assert os.path.getsize(checkpoint_path + ".nn") > 100
def test_ppo_policy_evaluate(mock_communicator, mock_launcher, dummy_config):
    tf.reset_default_graph()
    mock_communicator.return_value = MockCommunicator(
        discrete_action=False, visual_inputs=0
    )
    env = UnityEnvironment(" ")
    brain_infos = env.reset()
    brain_info = brain_infos[env.external_brain_names[0]]

    trainer_parameters = dummy_config
    model_path = env.external_brain_names[0]
    trainer_parameters["model_path"] = model_path
    trainer_parameters["keep_checkpoints"] = 3
    policy = PPOPolicy(
        0, env.brains[env.external_brain_names[0]], trainer_parameters, False, False
    )
    run_out = policy.evaluate(brain_info)
    assert run_out["action"].shape == (3, 2)
    env.close()