def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
    """
    Build the PPO policy for this trainer and register its reward signals.

    A MultiGpuPPOPolicy is built when ``self.multi_gpu`` is truthy and
    ``get_devices()`` reports more than one device; otherwise a PPOPolicy is
    built from the same arguments.
    :param brain_parameters: specifications for policy construction
    :return: the newly created policy
    """
    use_multi_gpu = self.multi_gpu and len(get_devices()) > 1
    policy_cls = MultiGpuPPOPolicy if use_multi_gpu else PPOPolicy
    policy: PPOPolicy = policy_cls(
        self.seed,
        brain_parameters,
        self.trainer_parameters,
        self.is_training,
        self.load,
    )

    # Give every reward signal its own table whose missing entries read as 0.
    for signal_name in policy.reward_signals.keys():
        self.collected_rewards[signal_name] = defaultdict(lambda: 0)

    return policy
def __init__(self, sess, brain, reward_buff_cap, trainer_parameters, training, seed, run_id):
    """
    Responsible for collecting experiences and training PPO model.
    :param sess: Tensorflow session.
    :param brain: Brain to train; its ``brain_name`` is passed to the base
        trainer and used in error messages, and the brain itself is passed
        to the policy.
    :param reward_buff_cap: Maximum length of the reward buffer deque.
    :param trainer_parameters: The parameters for the trainer (dictionary).
    :param training: Whether the trainer is set for training.
    :param seed: Seed forwarded to the PPO policy.
    :param run_id: Run identifier forwarded to the base trainer.
    :raises UnityTrainerException: If a required hyperparameter is missing
        from ``trainer_parameters``.
    """
    super(PPOTrainer, self).__init__(sess, brain.brain_name, trainer_parameters,
                                     training, run_id)

    self.param_keys = [
        'batch_size', 'beta', 'buffer_size', 'epsilon', 'gamma', 'hidden_units',
        'lambd', 'learning_rate', 'max_steps', 'normalize', 'num_epoch',
        'num_layers', 'time_horizon', 'sequence_length', 'summary_freq',
        'use_recurrent', 'graph_scope', 'summary_path', 'memory_size',
        'use_curiosity', 'curiosity_strength', 'curiosity_enc_size'
    ]

    # Fail early, naming the first missing hyperparameter.
    for k in self.param_keys:
        if k not in trainer_parameters:
            raise UnityTrainerException(
                "The hyperparameter {0} could not be found for the PPO trainer of "
                "brain {1}.".format(k, brain.brain_name))

    self.use_curiosity = bool(trainer_parameters['use_curiosity'])

    self.step = 0

    self.policy = PPOPolicy(seed, brain, trainer_parameters, sess, self.is_training)

    stats = {
        'cumulative_reward': [],
        'episode_length': [],
        'value_estimate': [],
        'entropy': [],
        'value_loss': [],
        'policy_loss': [],
        'learning_rate': []
    }
    # Curiosity adds its own statistics and a separate intrinsic reward table.
    if self.use_curiosity:
        stats['forward_loss'] = []
        stats['inverse_loss'] = []
        stats['intrinsic_reward'] = []
        self.intrinsic_rewards = {}
    self.stats = stats

    self.training_buffer = Buffer()
    # NOTE(review): training_buffer2 is not used inside this constructor;
    # confirm it is still needed by the rest of the class.
    self.training_buffer2 = Buffer()
    self.cumulative_rewards = {}
    self._reward_buffer = deque(maxlen=reward_buff_cap)
    self.episode_steps = {}
    self.summary_path = trainer_parameters['summary_path']
    # exist_ok=True replaces the exists()/makedirs() pair, which could raise
    # if another process created the directory between the two calls.
    os.makedirs(self.summary_path, exist_ok=True)

    self.summary_writer = tf.summary.FileWriter(self.summary_path)
def test_ppo_get_value_estimates(mock_communicator, mock_launcher, dummy_config):
    """
    Check PPOPolicy.get_value_estimates against a mocked environment.

    Covers three cases: a non-terminal step (float estimates), a terminal
    step (estimates equal to 0.0), and a terminal step after
    ``use_terminal_states`` is switched off on the extrinsic reward signal
    (estimates not equal to 0.0).
    """
    tf.reset_default_graph()
    mock_communicator.return_value = MockCommunicator(
        discrete_action=False, visual_inputs=0
    )
    env = UnityEnvironment(" ")
    # Close the environment even when an assertion below fails.
    try:
        brain_infos = env.reset()
        brain_info = brain_infos[env.external_brain_names[0]]

        trainer_parameters = dummy_config
        model_path = env.external_brain_names[0]
        trainer_parameters["model_path"] = model_path
        trainer_parameters["keep_checkpoints"] = 3
        policy = PPOPolicy(
            0, env.brains[env.external_brain_names[0]], trainer_parameters, False, False
        )

        run_out = policy.get_value_estimates(brain_info, 0, done=False)
        for key, val in run_out.items():
            assert type(key) is str
            assert type(val) is float

        run_out = policy.get_value_estimates(brain_info, 0, done=True)
        for key, val in run_out.items():
            assert type(key) is str
            assert val == 0.0

        # Check if we ignore terminal states properly
        policy.reward_signals["extrinsic"].use_terminal_states = False
        run_out = policy.get_value_estimates(brain_info, 0, done=True)
        for key, val in run_out.items():
            assert type(key) is str
            assert val != 0.0
    finally:
        env.close()
def __init__(self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id):
    """
    Set up a PPO trainer: validate hyperparameters, build the policy and
    create the bookkeeping containers.
    :param brain: Brain forwarded to the base trainer and to the policy.
    :param reward_buff_cap: Max reward history to track in the reward buffer
    :param trainer_parameters: The parameters for the trainer (dictionary).
    :param training: Whether the trainer is set for training.
    :param load: Whether the model should be loaded.
    :param seed: The seed the model will be initialized with
    :param run_id: The identifier of the current run
    :raises UnityTrainerException: If the "reward_signals" entry is empty.
    """
    super().__init__(brain, trainer_parameters, training, run_id, reward_buff_cap)
    self.param_keys = [
        "batch_size",
        "beta",
        "buffer_size",
        "epsilon",
        "hidden_units",
        "lambd",
        "learning_rate",
        "max_steps",
        "normalize",
        "num_epoch",
        "num_layers",
        "time_horizon",
        "sequence_length",
        "summary_freq",
        "use_recurrent",
        "summary_path",
        "memory_size",
        "model_path",
        "reward_signals",
    ]
    self.check_param_keys()

    # At least one reward signal has to be configured.
    reward_signal_config = self.trainer_parameters["reward_signals"]
    if not reward_signal_config:
        raise UnityTrainerException(
            "No reward signals were defined. At least one must be used with {}.".format(
                self.__class__.__name__
            )
        )

    self.step = 0
    self.policy = PPOPolicy(seed, brain, trainer_parameters, self.is_training, load)
    self.stats = defaultdict(list)

    # Maps reward signal name -> {agent_id: cumulative reward}; reporting only.
    # The environment reward is always reported to Tensorboard, whatever
    # reward signals are actually present.
    self.collected_rewards = {"environment": {}}
    self.collected_rewards.update(
        {signal_name: {} for signal_name in self.policy.reward_signals.keys()}
    )

    self.training_buffer = Buffer()
    self.episode_steps = {}
def __init__(
    self,
    brain,
    reward_buff_cap,
    trainer_parameters,
    training,
    load,
    seed,
    run_id,
    multi_gpu,
):
    """
    Set up a PPO trainer and pick a single- or multi-GPU policy.
    :param brain: Brain forwarded to the base trainer and to the policy.
    :param reward_buff_cap: Max reward history to track in the reward buffer
    :param trainer_parameters: The parameters for the trainer (dictionary).
    :param training: Whether the trainer is set for training.
    :param load: Whether the model should be loaded.
    :param seed: The seed the model will be initialized with
    :param run_id: The identifier of the current run
    :param multi_gpu: When truthy and get_devices() reports more than one
        device, a MultiGpuPPOPolicy is built instead of a PPOPolicy.
    """
    super(PPOTrainer, self).__init__(
        brain, trainer_parameters, training, run_id, reward_buff_cap
    )
    self.param_keys = [
        "batch_size",
        "beta",
        "buffer_size",
        "epsilon",
        "hidden_units",
        "lambd",
        "learning_rate",
        "max_steps",
        "normalize",
        "num_epoch",
        "num_layers",
        "time_horizon",
        "sequence_length",
        "summary_freq",
        "use_recurrent",
        "summary_path",
        "memory_size",
        "model_path",
        "reward_signals",
    ]
    self.check_param_keys()

    # Both policy flavours take the same constructor arguments.
    policy_args = (seed, brain, trainer_parameters, self.is_training, load)
    if multi_gpu and len(get_devices()) > 1:
        self.ppo_policy = MultiGpuPPOPolicy(*policy_args)
    else:
        self.ppo_policy = PPOPolicy(*policy_args)
    self.policy = self.ppo_policy

    # Start an empty reward table for each reward signal of the policy.
    for signal_name in self.policy.reward_signals.keys():
        self.collected_rewards[signal_name] = {}
def create_policy_with_bc_mock(mock_brain, trainer_config, use_rnn, demo_file):
    """
    Build a PPO or SAC policy configured for behavioral cloning.

    ``trainer_config`` is modified in place: model path, checkpoint count,
    recurrence flag and the demo path (resolved next to this file) are set
    before the policy is created. PPOPolicy is used when
    ``trainer_config["trainer"]`` is "ppo", SACPolicy otherwise.
    """
    this_dir = os.path.dirname(os.path.abspath(__file__))

    trainer_config["model_path"] = "testpath"
    trainer_config["keep_checkpoints"] = 3
    trainer_config["use_recurrent"] = use_rnn
    trainer_config["behavioral_cloning"]["demo_path"] = this_dir + "/" + demo_file

    if trainer_config["trainer"] == "ppo":
        policy_cls = PPOPolicy
    else:
        policy_cls = SACPolicy
    return policy_cls(0, mock_brain, trainer_config, False, False)
def __init__(self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id):
    """
    Responsible for collecting experiences and training PPO model.
    :param brain: Brain forwarded to the base trainer and to the policy.
    :param reward_buff_cap: Maximum length of the reward buffer deque.
    :param trainer_parameters: The parameters for the trainer (dictionary).
    :param training: Whether the trainer is set for training.
    :param load: Whether the model should be loaded.
    :param seed: The seed the model will be initialized with
    :param run_id: The identifier of the current run
    """
    super(PPOTrainer, self).__init__(brain, trainer_parameters, training, run_id)

    self.param_keys = [
        'batch_size', 'beta', 'buffer_size', 'epsilon', 'gamma', 'hidden_units',
        'lambd', 'learning_rate', 'max_steps', 'normalize', 'num_epoch',
        'num_layers', 'time_horizon', 'sequence_length', 'summary_freq',
        'use_recurrent', 'summary_path', 'memory_size', 'use_curiosity',
        'curiosity_strength', 'curiosity_enc_size', 'model_path'
    ]
    self.check_param_keys()

    self.use_curiosity = bool(trainer_parameters['use_curiosity'])
    self.step = 0
    self.policy = PPOPolicy(seed, brain, trainer_parameters, self.is_training, load)

    stats = {
        'Environment/Cumulative Reward': [],
        'Environment/Episode Length': [],
        'Policy/Value Estimate': [],
        'Policy/Entropy': [],
        'Losses/Value Loss': [],
        'Losses/Policy Loss': [],
        'Policy/Learning Rate': []
    }
    # Curiosity adds its own statistics and a separate intrinsic reward table.
    if self.use_curiosity:
        stats['Losses/Forward Loss'] = []
        stats['Losses/Inverse Loss'] = []
        stats['Policy/Curiosity Reward'] = []
        self.intrinsic_rewards = {}
    self.stats = stats

    self.training_buffer = Buffer()
    self.cumulative_rewards = {}
    self._reward_buffer = deque(maxlen=reward_buff_cap)
    self.episode_steps = {}
    self.summary_path = trainer_parameters['summary_path']
    # exist_ok=True replaces the exists()/makedirs() pair, which could raise
    # if another process created the directory between the two calls.
    os.makedirs(self.summary_path, exist_ok=True)

    self.summary_writer = tf.summary.FileWriter(self.summary_path)
def test_ppo_get_value_estimates(mock_communicator, mock_launcher, dummy_config):
    """
    Check single-step and batched value estimates of PPOPolicy on a fake
    trajectory: float values for a non-terminal step, 0.0 for a terminal
    step, non-zero once terminal states are ignored by the extrinsic signal,
    and one batched value per trajectory step.
    """
    tf.reset_default_graph()

    action_space = [2]
    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=action_space,
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    policy = PPOPolicy(0, brain_params, dummy_config, False, False)

    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )

    # Non-terminal step: every estimate is a float keyed by a string.
    estimates = policy.get_value_estimates(trajectory.next_obs, "test_agent", done=False)
    for name, estimate in estimates.items():
        assert type(name) is str
        assert type(estimate) is float

    # Terminal step: every estimate is zero.
    estimates = policy.get_value_estimates(trajectory.next_obs, "test_agent", done=True)
    for name, estimate in estimates.items():
        assert type(name) is str
        assert estimate == 0.0

    # Check if we ignore terminal states properly
    policy.reward_signals["extrinsic"].use_terminal_states = False
    estimates = policy.get_value_estimates(trajectory.next_obs, "test_agent", done=True)
    for name, estimate in estimates.items():
        assert type(name) is str
        assert estimate != 0.0

    # Batched estimates: one value per step of the trajectory.
    batched_values = policy.get_batched_value_estimates(trajectory.to_agentbuffer())
    for per_signal_values in batched_values.values():
        assert len(per_signal_values) == time_horizon
def create_ppo_policy_mock(
    mock_env, dummy_config, reward_signal_config, use_rnn, use_discrete, use_visual
):
    """
    Create a mocked environment together with a PPOPolicy.

    The mock brain and brain info use either vector observations or a single
    visual observation (``use_visual``), with a discrete or continuous action
    space (``use_discrete``). ``dummy_config`` is modified in place and
    handed to the policy.
    :return: tuple of (env, policy)
    """
    action_type = "discrete" if use_discrete else "continuous"
    action_size = DISCRETE_ACTION_SPACE if use_discrete else VECTOR_ACTION_SPACE

    # Only the observation-related arguments differ between the two modes.
    if use_visual:
        brain_obs_kwargs = {
            "vector_observation_space_size": 0,
            "number_visual_observations": 1,
        }
        info_obs_kwargs = {"num_vis_observations": 1}
    else:
        brain_obs_kwargs = {"vector_observation_space_size": VECTOR_OBS_SPACE}
        info_obs_kwargs = {"num_vector_observations": VECTOR_OBS_SPACE}

    mock_brain = mb.create_mock_brainparams(
        vector_action_space_type=action_type,
        vector_action_space_size=action_size,
        **brain_obs_kwargs,
    )
    mock_braininfo = mb.create_mock_braininfo(
        num_agents=NUM_AGENTS,
        num_vector_acts=sum(action_size),
        discrete=use_discrete,
        **info_obs_kwargs,
    )
    mb.setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)
    env = mock_env()

    dummy_config["model_path"] = env.brain_names[0]
    dummy_config["keep_checkpoints"] = 3
    dummy_config["reward_signals"].update(reward_signal_config)
    dummy_config["use_recurrent"] = use_rnn

    return env, PPOPolicy(0, mock_brain, dummy_config, False, False)
def create_ppo_policy_with_bc_mock(mock_env, mock_brain, dummy_config, use_rnn, demo_file):
    """
    Create a mocked environment and a PPOPolicy set up for pretraining.

    ``dummy_config`` is modified in place; the demo path is resolved next to
    this file.
    :return: tuple of (env, policy)
    """
    mock_braininfo = mb.create_mock_braininfo(num_agents=12, num_vector_observations=8)
    mb.setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)
    env = mock_env()

    this_dir = os.path.dirname(os.path.abspath(__file__))
    dummy_config["model_path"] = env.brain_names[0]
    dummy_config["keep_checkpoints"] = 3
    dummy_config["use_recurrent"] = use_rnn
    dummy_config["pretraining"]["demo_path"] = this_dir + "/" + demo_file

    return env, PPOPolicy(0, mock_brain, dummy_config, False, False)
def test_ppo_policy_evaluate(mock_communicator, mock_launcher, dummy_config):
    """
    Check that PPOPolicy.evaluate returns an "action" array of shape (3, 2)
    for the brain info produced by the mocked environment.
    """
    tf.reset_default_graph()
    mock_communicator.return_value = MockCommunicator(
        discrete_action=False, visual_inputs=0
    )
    env = UnityEnvironment(" ")
    # Close the environment even when the assertion below fails.
    try:
        brain_infos = env.reset()
        brain_info = brain_infos[env.external_brain_names[0]]

        trainer_parameters = dummy_config
        model_path = env.external_brain_names[0]
        trainer_parameters["model_path"] = model_path
        trainer_parameters["keep_checkpoints"] = 3
        policy = PPOPolicy(
            0, env.brains[env.external_brain_names[0]], trainer_parameters, False, False
        )
        run_out = policy.evaluate(brain_info)
        assert run_out["action"].shape == (3, 2)
    finally:
        env.close()
def create_policy_with_bc_mock(mock_env, mock_brain, trainer_config, use_rnn, demo_file):
    """
    Create a mocked environment and a PPO or SAC policy configured for
    behavioral cloning.

    ``trainer_config`` is modified in place; the demo path is resolved next
    to this file. PPOPolicy is used when ``trainer_config["trainer"]`` is
    "ppo", SACPolicy otherwise.
    :return: tuple of (env, policy)
    """
    mock_braininfo = mb.create_mock_braininfo(num_agents=12, num_vector_observations=8)
    mb.setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)
    env = mock_env()

    this_dir = os.path.dirname(os.path.abspath(__file__))
    trainer_config["model_path"] = env.external_brain_names[0]
    trainer_config["keep_checkpoints"] = 3
    trainer_config["use_recurrent"] = use_rnn
    trainer_config["behavioral_cloning"]["demo_path"] = this_dir + "/" + demo_file

    if trainer_config["trainer"] == "ppo":
        policy_cls = PPOPolicy
    else:
        policy_cls = SACPolicy
    return env, policy_cls(0, mock_brain, trainer_config, False, False)
def test_ppo_policy_evaluate(mock_communicator, mock_launcher, dummy_config):
    """
    Check that PPOPolicy.evaluate returns an "action" array of shape (3, 2)
    for the batched step result of the first agent group in the mocked
    environment.
    """
    tf.reset_default_graph()
    mock_communicator.return_value = MockCommunicator(
        discrete_action=False, visual_inputs=0
    )
    env = UnityEnvironment(" ")
    # Close the environment even when the assertion below fails.
    try:
        env.reset()
        brain_name = env.get_agent_groups()[0]
        batched_step = env.get_step_result(brain_name)
        brain_params = group_spec_to_brain_parameters(
            brain_name, env.get_agent_group_spec(brain_name)
        )

        trainer_parameters = dummy_config
        model_path = brain_name
        trainer_parameters["model_path"] = model_path
        trainer_parameters["keep_checkpoints"] = 3
        policy = PPOPolicy(0, brain_params, trainer_parameters, False, False)
        run_out = policy.evaluate(batched_step, list(batched_step.agent_id))
        assert run_out["action"].shape == (3, 2)
    finally:
        env.close()
def test_ppo_policy_evaluate(mock_communicator, mock_launcher):
    """
    Check that PPOPolicy.evaluate, run inside an explicit TensorFlow session,
    returns an 'action' array of shape (3, 2) for the mocked environment.
    """
    tf.reset_default_graph()
    with tf.Session() as sess:
        mock_communicator.return_value = MockCommunicator(
            discrete_action=False, visual_inputs=0)
        env = UnityEnvironment(' ')
        # Close the environment even when the assertion below fails.
        try:
            brain_infos = env.reset()
            brain_info = brain_infos[env.brain_names[0]]

            trainer_parameters = dummy_config()
            graph_scope = env.brain_names[0]
            trainer_parameters['graph_scope'] = graph_scope
            policy = PPOPolicy(0, env.brains[env.brain_names[0]], trainer_parameters,
                               sess, False)
            # Variables must be initialised before the policy can be run.
            init = tf.global_variables_initializer()
            sess.run(init)
            run_out = policy.evaluate(brain_info)
            assert run_out['action'].shape == (3, 2)
        finally:
            env.close()
def create_policy_mock(trainer_config, reward_signal_config, use_rnn, use_discrete, use_visual):
    """
    Build a PPO or SAC policy on top of a mock brain.

    ``trainer_config`` is modified in place (model path, checkpoint count,
    reward signals, recurrence flag). PPOPolicy is used when
    ``trainer_config["trainer"]`` is "ppo", SACPolicy otherwise.
    """
    mock_brain = mb.setup_mock_brain(
        use_discrete,
        use_visual,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )

    trainer_config["model_path"] = "testpath"
    trainer_config["keep_checkpoints"] = 3
    trainer_config["reward_signals"].update(reward_signal_config)
    trainer_config["use_recurrent"] = use_rnn

    policy_cls = PPOPolicy if trainer_config["trainer"] == "ppo" else SACPolicy
    return policy_cls(0, mock_brain, trainer_config, False, False)
def create_policy_mock(mock_env, trainer_config, reward_signal_config, use_rnn, use_discrete, use_visual):
    """
    Create a mocked environment and a PPO or SAC policy for its mock brain.

    ``trainer_config`` is modified in place (model path, checkpoint count,
    reward signals, recurrence flag). PPOPolicy is used when
    ``trainer_config["trainer"]`` is "ppo", SACPolicy otherwise.
    :return: tuple of (env, policy)
    """
    env, mock_brain, _ = mb.setup_mock_env_and_brains(
        mock_env,
        use_discrete,
        use_visual,
        num_agents=NUM_AGENTS,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )

    trainer_config["model_path"] = env.external_brain_names[0]
    trainer_config["keep_checkpoints"] = 3
    trainer_config["reward_signals"].update(reward_signal_config)
    trainer_config["use_recurrent"] = use_rnn

    policy_cls = PPOPolicy if trainer_config["trainer"] == "ppo" else SACPolicy
    return env, policy_cls(0, mock_brain, trainer_config, False, False)
def __init__(self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id):
    """
    Set up a PPO trainer: validate hyperparameters, build the policy and
    create the statistics and bookkeeping containers.
    :param brain: Brain forwarded to the base trainer and to the policy.
    :param reward_buff_cap: Maximum length of the reward buffer deque.
    :param trainer_parameters: The parameters for the trainer (dictionary).
    :param training: Whether the trainer is set for training.
    :param load: Whether the model should be loaded.
    :param seed: The seed the model will be initialized with
    :param run_id: The identifier of the current run
    """
    super(PPOTrainer, self).__init__(brain, trainer_parameters, training, run_id)
    self.param_keys = [
        "batch_size",
        "beta",
        "buffer_size",
        "epsilon",
        "gamma",
        "hidden_units",
        "lambd",
        "learning_rate",
        "max_steps",
        "normalize",
        "num_epoch",
        "num_layers",
        "time_horizon",
        "sequence_length",
        "summary_freq",
        "use_recurrent",
        "summary_path",
        "memory_size",
        "use_curiosity",
        "curiosity_strength",
        "curiosity_enc_size",
        "model_path",
    ]
    self.check_param_keys()

    self.use_curiosity = bool(trainer_parameters["use_curiosity"])
    self.step = 0
    self.policy = PPOPolicy(seed, brain, trainer_parameters, self.is_training, load)

    # Every statistic starts out as its own empty list.
    stat_names = [
        "Environment/Cumulative Reward",
        "Environment/Episode Length",
        "Policy/Value Estimate",
        "Policy/Entropy",
        "Losses/Value Loss",
        "Losses/Policy Loss",
        "Policy/Learning Rate",
    ]
    if self.use_curiosity:
        # Curiosity adds its own statistics and an intrinsic reward table.
        stat_names += [
            "Losses/Forward Loss",
            "Losses/Inverse Loss",
            "Policy/Curiosity Reward",
        ]
        self.intrinsic_rewards = {}
    self.stats = {stat_name: [] for stat_name in stat_names}

    self.training_buffer = Buffer()
    self.cumulative_rewards = {}
    self._reward_buffer = deque(maxlen=reward_buff_cap)
    self.episode_steps = {}