def test_reinforce_agent_learning(env_name):
    """
    Extension of the test for an agent playing in the environment to include training.
    Note: This does not test that training improves the policy. It simply tests that the
    training loop runs effectively.
    """
    # Set up environment using default parameters.
    # Environment parameters do not affect the test result here.
    tf_env, _ = rl_env_from_snc_env(
        load_scenario(env_name,
                      job_gen_seed=10,
                      override_env_params={'max_episode_length': 25})[1],
        discount_factor=0.99)

    # Set up a training step counter.
    global_step = tf.compat.v1.train.get_or_create_global_step()

    # Instantiate a REINFORCE agent.
    reinforce_agent = create_reinforce_agent(tf_env, training_step_counter=global_step)

    # Instantiate a replay buffer.
    replay_buffer = TFUniformReplayBuffer(data_spec=reinforce_agent.collect_data_spec,
                                          batch_size=tf_env.batch_size,
                                          max_length=1000)

    # Initialise the action network weights etc.
    reinforce_agent.initialize()

    # Use a driver to handle data collection for the agent. This handles a lot of the backend
    # TensorFlow set up and solves previous errors with episodes of differing lengths.
    collect_driver = DynamicEpisodeDriver(tf_env,
                                          reinforce_agent.collect_policy,
                                          observers=[replay_buffer.add_batch],
                                          num_episodes=2)

    # Get the initial states of the agent and environment before training.
    time_step = tf_env.reset()
    policy_state = reinforce_agent.collect_policy.get_initial_state(tf_env.batch_size)

    # Take a copy of the variables in order to ensure that training does lead to parameter
    # changes.
    initial_vars = deepcopy(reinforce_agent.trainable_variables)
    assert len(initial_vars) > 0, "Agent has no trainable variables."

    # Set up a minimal training loop to simply test that the training mechanics work.
    for _ in range(5):
        # Collect experience.
        time_step, policy_state = collect_driver.run(time_step=time_step,
                                                     policy_state=policy_state)
        # Now the replay buffer should have data in it so we can collect the data and train
        # the agent.
        experience = replay_buffer.gather_all()
        reinforce_agent.train(experience)
        # Clear the replay buffer and return to play.
        replay_buffer.clear()

    # Check that training has had some effect.
    for v1, v2 in zip(initial_vars, reinforce_agent.trainable_variables):
        assert not np.allclose(v1.numpy(), v2.numpy())
class SyncUniformExperienceReplayer(ExperienceReplayer):
    """
    For synchronous off-policy training.

    Example algorithms: DDPG, SAC
    """

    def __init__(self, experience_spec, batch_size):
        self._buffer = TFUniformReplayBuffer(experience_spec, batch_size)
        self._data_iter = None

    def observe(self, exp, env_ids=None):
        """
        For the sync driver, `exp` has the shape (`env_batch_size`, ...)
        with `num_envs`==1 and `unroll_length`==1. This function always
        ignores `env_ids`.
        """
        self._buffer.add_batch(exp)

    def replay(self, sample_batch_size, mini_batch_length):
        if self._data_iter is None:
            dataset = self._buffer.as_dataset(
                num_parallel_calls=3,
                sample_batch_size=sample_batch_size,
                num_steps=mini_batch_length).prefetch(3)
            self._data_iter = iter(dataset)
        return next(self._data_iter)

    def replay_all(self):
        return self._buffer.gather_all()

    def clear(self):
        self._buffer.clear()

    @property
    def batch_size(self):
        return self._buffer._batch_size
class SyncUniformExperienceReplayer(ExperienceReplayer):
    """
    For synchronous off-policy training.

    Example algorithms: DDPG, SAC
    """

    def __init__(self, experience_spec, batch_size):
        # TFUniformReplayBuffer does not support lists in the spec, so we have
        # to do some conversion.
        self._experience_spec = experience_spec
        self._exp_has_list = nest_utils.nest_contains_list(experience_spec)
        tuple_experience_spec = nest_utils.nest_list_to_tuple(experience_spec)
        self._buffer = TFUniformReplayBuffer(tuple_experience_spec, batch_size)
        self._data_iter = None

    def _list_to_tuple(self, exp):
        if self._exp_has_list:
            return nest_utils.nest_list_to_tuple(exp)
        else:
            return exp

    def _tuple_to_list(self, exp):
        if self._exp_has_list:
            return nest_utils.nest_tuple_to_list(exp, self._experience_spec)
        else:
            return exp

    def observe(self, exp, env_ids=None):
        """
        For the sync driver, `exp` has the shape (`env_batch_size`, ...)
        with `num_envs`==1 and `unroll_length`==1. This function always
        ignores `env_ids`.
        """
        self._buffer.add_batch(self._list_to_tuple(exp))

    def replay(self, sample_batch_size, mini_batch_length):
        """Get a random batch.

        Args:
            sample_batch_size (int): number of sequences
            mini_batch_length (int): the length of each sequence

        Returns:
            Experience: experience batch in batch major (B, T, ...)
            tf_uniform_replay_buffer.BufferInfo: information about the batch
        """
        if self._data_iter is None:
            dataset = self._buffer.as_dataset(
                num_parallel_calls=3,
                sample_batch_size=sample_batch_size,
                num_steps=mini_batch_length).prefetch(3)
            self._data_iter = iter(dataset)
        exp, info = next(self._data_iter)
        return self._tuple_to_list(exp), info

    def replay_all(self):
        return self._tuple_to_list(self._buffer.gather_all())

    def clear(self):
        self._buffer.clear()

    @property
    def batch_size(self):
        return self._buffer._batch_size
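# Hedged usage sketch, not part of the original code: shows how the replayer above
# might be driven. The experience spec, batch size and dummy tensors are made up for
# illustration; in practice the spec comes from the agent/driver that produces the
# experience, and `observe` is called by the synchronous driver.
def _sync_replayer_usage_sketch():
    import tensorflow as tf

    # Hypothetical per-frame experience spec (no list entries, so no conversion).
    experience_spec = {
        'observation': tf.TensorSpec((4,), tf.float32),
        'action': tf.TensorSpec((), tf.int64),
        'reward': tf.TensorSpec((), tf.float32),
    }
    replayer = SyncUniformExperienceReplayer(experience_spec, batch_size=8)

    # `observe` takes one frame per environment, shaped (env_batch_size, ...).
    for _ in range(4):
        replayer.observe({
            'observation': tf.zeros((8, 4)),
            'action': tf.zeros((8,), dtype=tf.int64),
            'reward': tf.zeros((8,)),
        })

    # Sample 32 sequences of length 2 (batch major: (B, T, ...)) for an update.
    experience, buffer_info = replayer.replay(sample_batch_size=32, mini_batch_length=2)
    return experience, buffer_info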
def train_agent(
        env: TFPyEnvironment,
        agent: Union[ReinforceAgent, PPOAgent],
        data_collection_driver: DynamicEpisodeDriver,
        replay_buffer: TFUniformReplayBuffer,
        num_iters: int,
        global_step=None,
        metrics: Optional[Sequence[tf_metric.TFStepMetric]] = None,
        policy_metrics: Optional[Sequence[tf_metric.TFStepMetric]] = None,
        policy_summary_writers: Optional[Sequence[tf.summary.SummaryWriter]] = None,
        eval_env: Optional[TFPyEnvironment] = None,
        eval_summary_writer: Optional[tf.summary.SummaryWriter] = None,
        num_eval_episodes: int = 1,
        eval_metrics: Optional[List[tf_metric.TFStepMetric]] = None,
        per_step_eval_metrics: Optional[List[Any]] = None,
        eval_freq: int = 10,
        log_freq: int = 5,
        save_freq: int = 5,
        model_save_path: Optional[str] = None,
        tf_log_stream_path: Optional[str] = None) -> None:
    """
    Function for putting the pieces together to train and evaluate an agent.

    :param env: The environment for which the agent will be trained.
    :param agent: The agent to train.
    :param data_collection_driver: The driver used for data collection and metric tracking.
    :param replay_buffer: Replay buffer in which to store experience.
    :param num_iters: The number of training iterations to perform.
    :param global_step: A counter of the number of training iterations.
    :param metrics: A list of the metrics to track during training.
    :param policy_metrics: A list of metrics related to the policy distribution to track during
        training.
    :param policy_summary_writers: A list of summary writers to facilitate overlaying plots of
        policy metrics in TensorBoard.
    :param eval_env: The environment in which to play out evaluations of the policy.
    :param eval_summary_writer: The summary writer used for evaluation metrics.
    :param num_eval_episodes: The number of evaluation episodes to run at each evaluation point.
    :param eval_metrics: The metrics to track when evaluating the policy (with episodic
        resolution).
    :param per_step_eval_metrics: The metrics to track when evaluating the policy (with time
        step resolution).
    :param eval_freq: The number of training iterations between runs of policy evaluation
        logging.
    :param log_freq: The frequency with which to log values to TensorBoard.
    :param save_freq: The number of training iterations between model saves.
    :param model_save_path: Directory in which to save model checkpoints (weights etc). If None,
        the model will not be saved.
    :param tf_log_stream_path: Optional path passed through to `evaluate_policy` for logging
        evaluation results.
    """
    # Get the initial states of the agent and environment before training.
    time_step = env.reset()
    policy_state = agent.collect_policy.get_initial_state(env.batch_size)

    # Set up the model saving infrastructure if a path to save to is provided.
    save_model = bool(model_save_path)
    if save_model:
        # Ensure that we save all trackable values (i.e. variables) from the TensorFlow Agent.
        checkpoint = tf.train.Checkpoint(agent=agent)
        # The checkpoint manager enables us to save multiple versions of the checkpoint at
        # different training steps. We keep the 20 most recent saves to span a wide section of
        # training.
        checkpoint_manager = tf.train.CheckpointManager(checkpoint, model_save_path,
                                                        max_to_keep=20)
    else:
        # Warn the user that training will continue but models will not be saved.
        warn("No save directory provided. Model will not be saved.")

    # Normalise the optional metric arguments so the loop below can iterate over them safely.
    if metrics is None:
        metrics = []
    if per_step_eval_metrics is None:
        per_step_eval_metrics = []
    if policy_metrics is None:
        policy_metrics = []
    if policy_summary_writers is None:
        policy_summary_writers = []

    # Main training loop.
    for i in range(num_iters):
        with tf.summary.record_if(lambda: tf.math.equal(global_step % log_freq, 0)):
            # Collect experience.
            time_step, policy_state = data_collection_driver.run(
                time_step=time_step,
                policy_state=policy_state
            )
            # Now the replay buffer should have data in it so we can collect the data and
            # train the agent.
            experience = replay_buffer.gather_all()
            agent.train(experience)
            # Clear the replay buffer and return to play.
            replay_buffer.clear()

            for metric in metrics:
                metric.tf_summaries(
                    train_step=global_step,
                    step_metrics=metrics[:2]
                )

            # Run the policy tracking metrics one at a time each on their own summary writer
            # to enable shared axes on TensorBoard.
            for metric, summary_writer in zip(policy_metrics, policy_summary_writers):
                with summary_writer.as_default():
                    tf.summary.scalar(name=metric.name, data=metric.result(), step=global_step)

        if eval_summary_writer and eval_metrics and eval_env:
            if i > 0 and global_step % eval_freq == 0:
                evaluate_policy(
                    eval_metrics,
                    eval_env,
                    agent.policy,
                    per_step_metrics=per_step_eval_metrics,
                    num_episodes=num_eval_episodes,
                    train_step=global_step,
                    summary_writer=eval_summary_writer,
                    summary_prefix="Metrics",
                    logging=True,
                    tf_log_stream_path=tf_log_stream_path
                )

        # Periodically save the model provided that we have the infrastructure in place.
        if save_model and i > 0 and (i + 1) % save_freq == 0:
            checkpoint_manager.save(i + 1)

        # Print coarse progress; guard the divisor so short runs (num_iters < 100) don't
        # raise a ZeroDivisionError.
        if i % max(num_iters // 100, 1) == 0:
            print(f"\tCompleted: {i / num_iters * 100} %")

    # Save the final model state, provided that saving was set up.
    if save_model:
        checkpoint_manager.save(num_iters)
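# Hedged wiring sketch, not part of the original code: one way the pieces above could be
# assembled on a standard Gym task. It assumes the module-level names already used by
# `train_agent` (tf, TFUniformReplayBuffer, DynamicEpisodeDriver). 'CartPole-v0', the
# layer width, learning rate and iteration count are illustrative only.
def _train_agent_wiring_sketch():
    from tf_agents.environments import suite_gym, tf_py_environment
    from tf_agents.networks import actor_distribution_network
    from tf_agents.agents.reinforce import reinforce_agent

    # Wrap a simple Gym environment for TensorFlow execution.
    env = tf_py_environment.TFPyEnvironment(suite_gym.load('CartPole-v0'))
    global_step = tf.compat.v1.train.get_or_create_global_step()

    # A small policy network and a REINFORCE agent that counts training steps.
    actor_net = actor_distribution_network.ActorDistributionNetwork(
        env.observation_spec(), env.action_spec(), fc_layer_params=(32,))
    agent = reinforce_agent.ReinforceAgent(
        env.time_step_spec(), env.action_spec(),
        actor_network=actor_net,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3),
        train_step_counter=global_step)
    agent.initialize()

    # Buffer and episode driver feeding the training loop.
    replay_buffer = TFUniformReplayBuffer(agent.collect_data_spec,
                                          batch_size=env.batch_size,
                                          max_length=1000)
    driver = DynamicEpisodeDriver(env, agent.collect_policy,
                                  observers=[replay_buffer.add_batch],
                                  num_episodes=2)

    # Run a short training session without evaluation or checkpointing.
    train_agent(env, agent, driver, replay_buffer,
                num_iters=10, global_step=global_step)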
def train_implementation(self, train_context: core.TrainContext):
    """Tf-Agents Reinforce Implementation of the train loop."""
    assert isinstance(train_context, core.EpisodesTrainContext)
    tc: core.EpisodesTrainContext = train_context
    self.log('Creating environment...')
    train_env = self._create_env(discount=tc.reward_discount_gamma)
    observation_spec = train_env.observation_spec()
    action_spec = train_env.action_spec()
    timestep_spec = train_env.time_step_spec()

    # SetUp Optimizer, Network and ReinforceAgent
    self.log_api('AdamOptimizer', 'create')
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=tc.learning_rate)

    self.log_api('ActorDistributionNetwork', 'create')
    actor_net = actor_distribution_network.ActorDistributionNetwork(
        observation_spec, action_spec, fc_layer_params=self.model_config.fc_layers)

    self.log_api('ReinforceAgent', 'create')
    tf_agent = reinforce_agent.ReinforceAgent(timestep_spec, action_spec,
                                              actor_network=actor_net,
                                              optimizer=optimizer)
    self.log_api('tf_agent.initialize()')
    tf_agent.initialize()
    self._trained_policy = tf_agent.policy

    # SetUp Data collection & Buffering
    collect_data_spec = tf_agent.collect_data_spec
    self.log_api('TFUniformReplayBuffer', 'create')
    replay_buffer = TFUniformReplayBuffer(collect_data_spec,
                                          batch_size=1,
                                          max_length=tc.max_steps_in_buffer)

    self.log_api('DynamicEpisodeDriver', 'create')
    collect_driver = DynamicEpisodeDriver(train_env,
                                          tf_agent.collect_policy,
                                          observers=[replay_buffer.add_batch],
                                          num_episodes=tc.num_episodes_per_iteration)

    # Train
    collect_driver.run = common.function(collect_driver.run, autograph=False)
    tf_agent.train = common.function(tf_agent.train, autograph=False)

    self.log('Starting training...')
    while True:
        self.on_train_iteration_begin()
        msg = f'iteration {tc.iterations_done_in_training:4} of {tc.num_iterations:<4}'
        self.log_api('collect_driver.run', msg)
        collect_driver.run()

        self.log_api('replay_buffer.gather_all', msg)
        trajectories = replay_buffer.gather_all()

        self.log_api('tf_agent.train', msg)
        loss_info = tf_agent.train(experience=trajectories)
        total_loss = loss_info.loss.numpy()
        self.log_api('', f'loss={total_loss:<7.1f}')

        self.log_api('replay_buffer.clear', msg)
        replay_buffer.clear()

        self.on_train_iteration_end(loss=total_loss)
        if tc.training_done:
            break
    return
def train_implementation(self, train_context: core.TrainContext):
    """Tf-Agents Ppo Implementation of the train loop."""
    assert isinstance(train_context, core.PpoTrainContext)
    tc: core.PpoTrainContext = train_context
    train_env = self._create_env(discount=tc.reward_discount_gamma)
    observation_spec = train_env.observation_spec()
    action_spec = train_env.action_spec()
    timestep_spec = train_env.time_step_spec()

    # SetUp Optimizer, Networks and PpoAgent
    self.log_api('AdamOptimizer', '()')
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=tc.learning_rate)

    self.log_api('ActorDistributionNetwork', '()')
    actor_net = actor_distribution_network.ActorDistributionNetwork(
        observation_spec, action_spec, fc_layer_params=self.model_config.fc_layers)

    self.log_api('ValueNetwork', '()')
    value_net = value_network.ValueNetwork(
        observation_spec, fc_layer_params=self.model_config.fc_layers)

    self.log_api('PpoAgent', '()')
    tf_agent = ppo_agent.PPOAgent(timestep_spec, action_spec, optimizer,
                                  actor_net=actor_net,
                                  value_net=value_net,
                                  num_epochs=tc.num_epochs_per_iteration)
    self.log_api('tf_agent.initialize', '()')
    tf_agent.initialize()
    self._trained_policy = tf_agent.policy

    # SetUp Data collection & Buffering
    collect_data_spec = tf_agent.collect_data_spec
    self.log_api('TFUniformReplayBuffer', '()')
    replay_buffer = TFUniformReplayBuffer(collect_data_spec,
                                          batch_size=1,
                                          max_length=tc.max_steps_in_buffer)

    collect_policy = tf_agent.collect_policy
    self.log_api('DynamicEpisodeDriver', '()')
    collect_driver = DynamicEpisodeDriver(train_env,
                                          collect_policy,
                                          observers=[replay_buffer.add_batch],
                                          num_episodes=tc.num_episodes_per_iteration)

    # Train
    collect_driver.run = common.function(collect_driver.run, autograph=False)
    tf_agent.train = common.function(tf_agent.train, autograph=False)

    while True:
        self.on_train_iteration_begin()
        self.log_api(
            '-----',
            f'iteration {tc.iterations_done_in_training:4} of {tc.num_iterations:<4} -----')
        self.log_api('collect_driver.run', '()')
        collect_driver.run()

        self.log_api('replay_buffer.gather_all', '()')
        trajectories = replay_buffer.gather_all()

        self.log_api('tf_agent.train', '(experience=...)')
        loss_info = tf_agent.train(experience=trajectories)
        total_loss = loss_info.loss.numpy()
        actor_loss = loss_info.extra.policy_gradient_loss.numpy()
        critic_loss = loss_info.extra.value_estimation_loss.numpy()
        self.log_api(
            '',
            f'loss={total_loss:<7.1f} [actor={actor_loss:<7.1f} critic={critic_loss:<7.1f}]')

        self.log_api('replay_buffer.clear', '()')
        replay_buffer.clear()

        self.on_train_iteration_end(loss=total_loss,
                                    actor_loss=actor_loss,
                                    critic_loss=critic_loss)
        if tc.training_done:
            break
    return
class TFAgentsPPOAgent(RLAgent):
    def __init__(self, name=None, actor_net=None, value_net=None,
                 predictor=None, keep_models_fixed=False, featurizer=None):
        super().__init__(name, predictor, keep_models_fixed, featurizer)

        action_spec = BoundedTensorSpec((1,), tf.int64, 0, ACTION_DIMENSIONS - 1)

        # We store both the mask and the actual observation in the observation
        # given to the agent in order to get an association between these two.
        # See also https://github.com/tensorflow/agents/issues/125#issuecomment-496583325
        observation_spec = {
            'state': TensorSpec((self.featurizer.state_dimension(),), tf.float32),
            'mask': TensorSpec((ACTION_DIMENSIONS,), tf.float32)
        }

        layers = equal_spacing_fc(5, self.featurizer.state_dimension())

        if actor_net is None:
            self.actor_net = MaskedActorNetwork(observation_spec, action_spec, layers)
        else:
            self.actor_net = actor_net

        if value_net is None:
            self.value_net = DummyMaskedValueNetwork(observation_spec,
                                                     fc_layer_params=layers)
        else:
            self.value_net = value_net

        self.agent = tf_agents.agents.ppo.ppo_agent.PPOAgent(
            time_step_spec=ts.time_step_spec(observation_spec),
            action_spec=action_spec,

            actor_net=self.actor_net,
            value_net=self.value_net,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-5),

            discount_factor=1,

            use_gae=True,
            use_td_lambda_return=True,
            lambda_value=0.85,

            num_epochs=30,

            # The observations are dicts {'state': ..., 'mask': ...};
            # normalization does not make any sense for the mask.
            normalize_observations=False,
        )

        if actor_net is not None or value_net is not None:
            self.agent.initialize()
        else:
            self._create_train_checkpointer()

            # All the variables are in fact successfully restored, but this is
            # not done immediately, only once some shapes are known.
            # Therefore, if the shapes are never known, the variables are not
            # restored. This is no problem in self-play, where all of the shapes
            # are known after the first training, but it is a problem when playing
            # against old versions, because often some of the old versions aren't
            # used (and also the value net is never used because the old versions
            # aren't trained). It isn't an error, but TensorFlow gives warnings at
            # the end which are confusing if one doesn't know this.
            # Therefore we silence those warnings with .expect_partial().
            # For more information see
            # https://github.com/tensorflow/tensorflow/issues/27937#issuecomment-484683443
            # https://github.com/tensorflow/tensorflow/issues/27937#issuecomment-488356053
            self.train_checkpointer.initialize_or_restore().expect_partial()

        # tf_agents provides both agent.policy (intended for evaluation and
        # deployment) and agent.collect_policy (used to gather training data).
        # Since this agent also generates its own training experience, we use
        # the collect policy here.
        self.policy = self.agent.collect_policy

        # tf_agents wants the data as trajectories
        # (prev_time_step, action, new_time_step), so we have to store the
        # prev_time_step until we have the new_time_step to build the trajectory,
        # at which point the new prev_time_step is the new_time_step.
        # This variable keeps track of the prev_time_step.
        self.last_time_step = None

        # Even though PPO is on-policy, buffering the experience for a while is
        # fine, and the examples in the tf_agents repo also use a replay buffer.
        self.replay_buffer = TFUniformReplayBuffer(self.agent.collect_data_spec,
                                                   batch_size=1,
                                                   max_length=REPLAY_BUFFER_SIZE)
        self.replay_buffer_position = 0

        self.clone_counter = 0

    def _create_train_checkpointer(self):
        self.train_checkpointer = tf_agents.utils.common.Checkpointer(
            ckpt_dir=os.path.join(MODELS_PATH, self.name, 'Agent'), agent=self.agent)

    def _add_trajectory(self, prev_time_step, action, new_time_step):
        """Add a trajectory (prev_time_step, action, new_time_step) to the replay buffer.

        Also train the agent on the whole buffer if it is full.
        """
        traj = tf_agents.trajectories.trajectory.from_transition(
            prev_time_step, action, new_time_step)

        self.replay_buffer.add_batch(traj)
        self.replay_buffer_position += 1

        if self.replay_buffer_position == REPLAY_BUFFER_SIZE + 1:
            if not self.keep_models_fixed:
                self.agent.train(self.replay_buffer.gather_all())
            self.replay_buffer_position = 0
            self.replay_buffer.clear()

    def act(self, observation, valid_action_mask):
        observation = {
            'state': np.array(observation, dtype=np.float32),
            'mask': valid_action_mask
        }

        if self.last_time_step is None:
            # A new episode started.
            self.last_time_step = _to_tf_timestep(ts.restart(observation))
            self.last_action_step = self.policy.action(self.last_time_step)
            return self.last_action_step.action.numpy()[0, 0]

        new_time_step = _to_tf_timestep(ts.transition(observation, self.prev_reward))
        self._add_trajectory(self.last_time_step, self.last_action_step, new_time_step)

        self.last_time_step = new_time_step
        self.last_action_step = self.policy.action(new_time_step)
        self.prev_reward = None

        return self.last_action_step.action.numpy()[0, 0]

    def observe(self, reward, terminal):
        if not terminal:
            self.prev_reward = reward
            return

        # Even when the episode ends, tf_agents expects some observation in
        # addition to the reward. Because that makes no sense for us, we just
        # give it an observation consisting of all zeros.
        new_time_step = _to_tf_timestep(ts.termination({
            'state': np.zeros(self.featurizer.state_dimension()),
            'mask': np.zeros(ACTION_DIMENSIONS)
        }, reward))

        self._add_trajectory(self.last_time_step, self.last_action_step, new_time_step)

        self.last_time_step = None
        self.last_action_step = None
        self.prev_reward = None

    def clone(self, name=None):
        """Return a clone of this agent with networks & predictor shared."""
        if name is None:
            self.clone_counter += 1
            name = self.name + 'Clone' + str(self.clone_counter)

        return TFAgentsPPOAgent(name=name,
                                actor_net=self.actor_net,
                                value_net=self.value_net,
                                predictor=self.predictor,
                                keep_models_fixed=self.keep_models_fixed,
                                featurizer=self.featurizer)

    def save_models(self):
        """Save the actor, critic and predictor models."""
        if self.keep_models_fixed:
            return
        super().save_models(os.path.join(MODELS_PATH, self.name))
        if not hasattr(self, 'train_checkpointer'):
            self._create_train_checkpointer()
        self.train_checkpointer.save(0)
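# Hedged interaction sketch, not part of the original code: shows how a game loop might
# drive the act/observe interface above. The `env` object and its reset/step API are
# hypothetical; the real project wires the agent into its own game loop and featurizer.
def _ppo_agent_interaction_sketch(agent: 'TFAgentsPPOAgent', env):
    """Play one episode, feeding rewards back so trajectories are buffered."""
    # Hypothetical environment API: reset() -> (observation, valid_action_mask),
    # step(action) -> (observation, valid_action_mask, reward, done).
    observation, valid_action_mask = env.reset()
    done = False
    while not done:
        action = agent.act(observation, valid_action_mask)
        observation, valid_action_mask, reward, done = env.step(action)
        # observe() stores the reward for the next act() call; on the terminal step
        # it also closes the episode and writes the final trajectory to the buffer.
        agent.observe(reward, terminal=done)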