def test_reinforce_agent_learning(env_name):
    """
    Extension of the test for an agent playing in the environment to include training.
    Note: This does not test that training improves the policy. It simply tests that the training
    loop runs effectively.
    """
    # Set up environment using default parameters.
    # Environment parameters do not affect the test result here.
    tf_env, _ = rl_env_from_snc_env(
        load_scenario(
            env_name,
            job_gen_seed=10,
            override_env_params={'max_episode_length': 25})[1],
        discount_factor=0.99)

    # Set up a training step counter.
    global_step = tf.compat.v1.train.get_or_create_global_step()

    # Instantiate a REINFORCE agent.
    reinforce_agent = create_reinforce_agent(tf_env, training_step_counter=global_step)

    # Instantiate a replay buffer.
    replay_buffer = TFUniformReplayBuffer(
        data_spec=reinforce_agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=1000)

    # Initialise the action network weights etc.
    reinforce_agent.initialize()

    # Use a driver to handle data collection for the agent. This handles a lot of the backend
    # TensorFlow set up and solves previous errors with episodes of differing lengths.
    collect_driver = DynamicEpisodeDriver(
        tf_env,
        reinforce_agent.collect_policy,
        observers=[replay_buffer.add_batch],
        num_episodes=2)

    # Get the initial states of the agent and environment before training.
    time_step = tf_env.reset()
    policy_state = reinforce_agent.collect_policy.get_initial_state(tf_env.batch_size)

    # Take a copy of the variables in order to ensure that training does lead to parameter changes.
    initial_vars = deepcopy(reinforce_agent.trainable_variables)
    assert len(initial_vars) > 0, "Agent has no trainable variables."

    # Set up a minimal training loop to simply test that the training mechanics work.
    for _ in range(5):
        # Collect experience.
        time_step, policy_state = collect_driver.run(
            time_step=time_step, policy_state=policy_state)
        # Now the replay buffer should have data in it so we can collect the data and train the
        # agent.
        experience = replay_buffer.gather_all()
        reinforce_agent.train(experience)
        # Clear the replay buffer and return to play.
        replay_buffer.clear()

    # Check that training has had some effect.
    for v1, v2 in zip(initial_vars, reinforce_agent.trainable_variables):
        assert not np.allclose(v1.numpy(), v2.numpy())
def get_collection_driver(
        env: TFPyEnvironment,
        agent: Union[ReinforceAgent, PPOAgent],
        observers: List[Any],
        policy_observers: Optional[List[tf_metric.TFStepMetric]],
        num_episodes: int) -> DynamicEpisodeDriver:
    """
    Sets up a driver which will run data collection and in-training metric tracking. The driver
    is defined in tf_agents and handles agent play and monitoring as well as data storage for a
    fixed number of episodes at a time. This driver will be run to collect data once per training
    iteration.

    :param env: The TensorFlow environment object which will be run.
    :param agent: The agent to play in the environment.
    :param observers: A list of operations (including metrics to track) which will be executed
        during play to collect data and perform logging.
    :param policy_observers: A list of metrics to track which are executed during play throughout
        training.
    :param num_episodes: The number of episodes to play out in each driver run.
    :return: A driver to use for data collection (and in-play performance tracking).
    """
    collection_driver = DynamicEpisodeDriver(
        env,
        agent.collect_policy,
        # Guard against policy_observers being passed as None.
        observers=observers + (policy_observers or []),
        num_episodes=num_episodes
    )
    # Wrap the run function for faster execution.
    collection_driver.run = tf.function(collection_driver.run)
    return collection_driver
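# Illustrative usage sketch (not part of the original source): one way get_collection_driver
# could be wired up with standard tf_agents metrics. The `tf_env`, `agent` and `replay_buffer`
# arguments and the example_* function name are assumptions for illustration only.
def example_build_collection_driver(tf_env, agent, replay_buffer):
    """Hypothetical helper showing a typical call to get_collection_driver above."""
    from tf_agents.metrics import tf_metrics

    # Step-counting metrics are conventionally listed first so that they can serve as the
    # x-axis (step_metrics) when the remaining metrics are written to TensorBoard.
    step_metrics = [tf_metrics.NumberOfEpisodes(), tf_metrics.EnvironmentSteps()]
    train_metrics = step_metrics + [
        tf_metrics.AverageReturnMetric(),
        tf_metrics.AverageEpisodeLengthMetric(),
    ]
    # The replay buffer's add_batch observer stores experience; the metrics observe play.
    return get_collection_driver(
        env=tf_env,
        agent=agent,
        observers=[replay_buffer.add_batch] + train_metrics,
        policy_observers=[],
        num_episodes=2)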
def test_planning_policy_batch_environment_model():
    """
    Ensure that planning policy is operational.
    """
    # Number of trajectories for planning and planning horizon.
    population_size = 3
    planner_horizon = 5
    number_of_particles = 1

    # Set up the environment and a model of it.
    py_env = suite_gym.load("MountainCar-v0")
    tf_env = TFPyEnvironment(py_env)
    reward = MountainCarReward(tf_env.observation_spec(), tf_env.action_spec())
    terminates = MountainCarTermination(tf_env.observation_spec())
    network = LinearTransitionNetwork(tf_env.observation_spec())
    transition_model = KerasTransitionModel(
        [network],
        tf_env.observation_spec(),
        tf_env.action_spec(),
    )
    initial_state = MountainCarInitialState(tf_env.observation_spec())
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=terminates,
        initial_state_distribution_model=initial_state,
    )

    # Set up the trajectory optimiser.
    random_policy = RandomTFPolicy(tf_env.time_step_spec(), tf_env.action_spec())
    trajectory_optimiser = PolicyTrajectoryOptimiser(
        random_policy, planner_horizon, population_size, number_of_particles)
    planning_policy = PlanningPolicy(environment_model, trajectory_optimiser)

    # Test whether it runs.
    collect_driver_planning_policy = DynamicEpisodeDriver(
        tf_env, planning_policy, num_episodes=1)
    time_step = tf_env.reset()
    collect_driver_planning_policy.run(time_step)
def build_driver(self):
    """Build elements of the data pipeline."""
    observers = [self.replay_buffer.add_batch]
    driver = DynamicEpisodeDriver(
        env=self.tf_env,
        policy=self.agent.collect_policy,
        observers=observers)
    dataset = self.replay_buffer.as_dataset(
        num_parallel_calls=3,
        sample_batch_size=self.batch_size,
        num_steps=self.agent.train_sequence_length
    ).prefetch(3)
    iterator = iter(dataset)
    return driver, iterator
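# Illustrative usage sketch (not part of the original source): how the driver/iterator pair
# returned by build_driver is typically consumed in a collect-then-train loop. The method name
# and the `self.agent` / `num_iterations` attributes are assumptions for illustration only.
def example_training_loop(self, num_iterations=100):
    """Hypothetical loop consuming the driver and dataset iterator built by build_driver."""
    driver, iterator = self.build_driver()
    for _ in range(num_iterations):
        # Collect one episode of experience into the replay buffer.
        driver.run()
        # Sample a batch of sub-trajectories from the buffer and take one training step.
        experience, _ = next(iterator)
        self.agent.train(experience)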
def train_agent(
        env: TFPyEnvironment,
        agent: Union[ReinforceAgent, PPOAgent],
        data_collection_driver: DynamicEpisodeDriver,
        replay_buffer: TFUniformReplayBuffer,
        num_iters: int,
        global_step=None,
        metrics: Optional[Sequence[tf_metric.TFStepMetric]] = None,
        policy_metrics: Optional[Sequence[tf_metric.TFStepMetric]] = None,
        policy_summary_writers: Optional[Sequence[tf.summary.SummaryWriter]] = None,
        eval_env: Optional[TFPyEnvironment] = None,
        eval_summary_writer: Optional[tf.summary.SummaryWriter] = None,
        num_eval_episodes: int = 1,
        eval_metrics: Optional[List[tf_metric.TFStepMetric]] = None,
        per_step_eval_metrics: Optional[List[Any]] = None,
        eval_freq: int = 10,
        log_freq: int = 5,
        save_freq: int = 5,
        model_save_path: Optional[str] = None,
        tf_log_stream_path: Optional[str] = None) -> None:
    """
    Function for putting the pieces together to train and evaluate an agent.

    :param env: The environment for which the agent will be trained.
    :param agent: The agent to train.
    :param data_collection_driver: The driver used for data collection and metric tracking.
    :param replay_buffer: Replay buffer in which to store experience.
    :param num_iters: The number of training iterations to perform.
    :param global_step: A counter of the number of training iterations.
    :param metrics: A list of the metrics to track during training.
    :param policy_metrics: A list of metrics related to the policy distribution to track during
        training.
    :param policy_summary_writers: A list of summary writers to facilitate overlaying plots of
        policy metrics in TensorBoard.
    :param eval_env: The environment in which to play out evaluations of the policy.
    :param eval_summary_writer: The summary writer used for evaluation metrics.
    :param num_eval_episodes: The number of evaluation episodes to run at each evaluation point.
    :param eval_metrics: The metrics to track when evaluating the policy (with episodic
        resolution).
    :param per_step_eval_metrics: The metrics to track when evaluating the policy (with time step
        resolution).
    :param eval_freq: The number of training iterations between runs of policy evaluation logging.
    :param log_freq: The frequency with which to log values to TensorBoard.
    :param save_freq: The number of training iterations between model saves.
    :param model_save_path: Directory in which to save model checkpoints (weights etc). If None
        the model will not be saved.
    :param tf_log_stream_path: Path to a file to which tf.print calls are written. If None,
        tf.print statements print to sys.stdout.
    """
    # Get the initial states of the agent and environment before training.
    time_step = env.reset()
    policy_state = agent.collect_policy.get_initial_state(env.batch_size)

    # Set up the model saving infrastructure if a path to save to is provided.
    save_model = bool(model_save_path)
    if save_model:
        # Ensure that we save all trackable values (i.e. variables) from the TensorFlow Agent.
        checkpoint = tf.train.Checkpoint(agent=agent)
        # The checkpoint manager enables us to save multiple versions of the checkpoint at
        # different training steps. We keep the 20 most recent saves to span a wide section of
        # training.
        checkpoint_manager = tf.train.CheckpointManager(checkpoint, model_save_path,
                                                        max_to_keep=20)
    else:
        # Warn the user that training will continue but models will not be saved.
        warn("No save directory provided. Model will not be saved.")

    if metrics is None:
        metrics = []
    if per_step_eval_metrics is None:
        per_step_eval_metrics = []
    # Guard against the policy metric arguments being left as None.
    if policy_metrics is None:
        policy_metrics = []
    if policy_summary_writers is None:
        policy_summary_writers = []

    # Run the main training loop.
    for i in range(num_iters):
        with tf.summary.record_if(lambda: tf.math.equal(global_step % log_freq, 0)):
            # Collect experience.
            time_step, policy_state = data_collection_driver.run(
                time_step=time_step,
                policy_state=policy_state
            )
            # Now the replay buffer should have data in it so we can collect the data and train
            # the agent.
            experience = replay_buffer.gather_all()
            agent.train(experience)
            # Clear the replay buffer and return to play.
            replay_buffer.clear()
            for metric in metrics:
                metric.tf_summaries(
                    train_step=global_step,
                    step_metrics=metrics[:2]
                )
            # Run the policy tracking metrics one at a time each on their own summary writer to
            # enable shared axes on TensorBoard.
            for metric, summary_writer in zip(policy_metrics, policy_summary_writers):
                with summary_writer.as_default():
                    tf.summary.scalar(name=metric.name, data=metric.result(), step=global_step)

        if eval_summary_writer and eval_metrics and eval_env:
            if i > 0 and global_step % eval_freq == 0:
                evaluate_policy(
                    eval_metrics,
                    eval_env,
                    agent.policy,
                    per_step_metrics=per_step_eval_metrics,
                    num_episodes=num_eval_episodes,
                    train_step=global_step,
                    summary_writer=eval_summary_writer,
                    summary_prefix="Metrics",
                    logging=True,
                    tf_log_stream_path=tf_log_stream_path
                )

        # Periodically save the model provided that we have the infrastructure in place.
        if save_model and i > 0 and (i + 1) % save_freq == 0:
            checkpoint_manager.save(i + 1)

        # Print progress roughly every 1% of training (guarding against num_iters < 100).
        if i % max(num_iters // 100, 1) == 0:
            print(f"\tCompleted: {i / num_iters * 100} %")

    # Save the final state of the model, provided saving is enabled.
    if save_model:
        checkpoint_manager.save(num_iters)
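# Illustrative usage sketch (not part of the original source): a minimal way to call train_agent,
# reusing get_collection_driver defined earlier. create_reinforce_agent is the project helper
# used in the test at the top of this section; the example_* name and defaults are assumptions.
def example_train_reinforce(tf_env, num_iters=100):
    """Hypothetical wiring of agent, replay buffer and collection driver into train_agent."""
    global_step = tf.compat.v1.train.get_or_create_global_step()
    agent = create_reinforce_agent(tf_env, training_step_counter=global_step)
    agent.initialize()
    replay_buffer = TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=1000)
    driver = get_collection_driver(
        env=tf_env,
        agent=agent,
        observers=[replay_buffer.add_batch],
        policy_observers=[],
        num_episodes=2)
    # No model_save_path is given, so train_agent will warn and skip checkpointing.
    train_agent(
        env=tf_env,
        agent=agent,
        data_collection_driver=driver,
        replay_buffer=replay_buffer,
        num_iters=num_iters,
        global_step=global_step)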
def evaluate_policy(metrics: List[Any],
                    environment: TFPyEnvironment,
                    policy: tf_agents.policies.tf_policy.Base,
                    per_step_metrics: Optional[List[tf.Module]] = None,
                    num_episodes: int = 1,
                    train_step: Optional[Any] = None,
                    summary_writer: Optional[tf.summary.SummaryWriter] = None,
                    summary_prefix: str = "Eval",
                    logging: bool = False,
                    tf_log_stream_path: Optional[str] = None) -> None:
    """
    Track performance (via metrics) using the policy in the environment provided. Prints a
    dictionary of results {metric_name: metric_value}.

    *NOTE*: Because placeholders are not compatible with Eager mode this is not compatible with
    python policies.

    This function is adapted from tf_agents.eval.metric_utils.eager_compute to allow for per
    time step logging.

    :param metrics: List of metrics to compute.
    :param environment: tf_environment instance.
    :param policy: tf_policy instance used to step the environment.
    :param per_step_metrics: List of metrics to be passed as observers which run every time step
        during evaluation.
    :param num_episodes: Number of episodes to compute the metrics over.
    :param train_step: An optional step to write summaries against.
    :param summary_writer: An optional writer for generating metric summaries.
    :param summary_prefix: An optional prefix scope for metric summaries.
    :param logging: Option to enable logging to the console of standard metrics.
    :param tf_log_stream_path: Path to a file to which tf.print calls are written. If None,
        tf.print statements print to sys.stdout.
    """
    # Guard against per_step_metrics being left as None.
    if per_step_metrics is None:
        per_step_metrics = []

    # Reset the state of all metrics (e.g. running totals for averages).
    for metric in metrics + per_step_metrics:
        metric.reset()

    # Attain the initial state of the environment and policy.
    time_step = environment.reset()
    policy_state = policy.get_initial_state(environment.batch_size)

    # Set up a driver to run the evaluation episodes while logging the desired metrics.
    driver = DynamicEpisodeDriver(
        environment,
        policy,
        observers=metrics,
        transition_observers=per_step_metrics,
        num_episodes=num_episodes)

    # Run the driver, which plays out the evaluation episodes and updates the metrics.
    driver.run(time_step, policy_state)

    # If we have the required prerequisites then perform the TensorBoard logging as well as
    # logging results to the console.
    if train_step and summary_writer:
        # Utilise a (possibly) different summary writer to put the evaluation metrics to
        # TensorBoard.
        with summary_writer.as_default():
            for m in metrics:
                # Attain the full name of the metric to record.
                tag = "/".join([summary_prefix, m.name])
                # Simply calculating and forming the scalar summary in the current context with a
                # default summary writer does the logging to TensorBoard for us.
                tf.summary.scalar(name=tag, data=m.result(), step=train_step)

    # If requested to, log metrics to the console.
    if logging and train_step:
        for m in metrics:
            tf.print(f"Evaluation at step {train_step.numpy()}: {m.name}\t{m.result()}",
                     output_stream=f'file://{tf_log_stream_path}'
                     if tf_log_stream_path else sys.stdout)
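# Illustrative usage sketch (not part of the original source): evaluating a trained policy on a
# separate evaluation environment with standard tf_agents metrics and a dedicated summary writer.
# `eval_tf_env`, `agent`, `train_step` and the summary_dir default are assumptions for
# illustration only.
def example_evaluate(eval_tf_env, agent, train_step, summary_dir="/tmp/eval"):
    """Hypothetical evaluation pass that logs results to TensorBoard and the console."""
    from tf_agents.metrics import tf_metrics

    eval_metrics = [
        tf_metrics.AverageReturnMetric(buffer_size=5),
        tf_metrics.AverageEpisodeLengthMetric(buffer_size=5),
    ]
    eval_summary_writer = tf.summary.create_file_writer(summary_dir)
    evaluate_policy(
        eval_metrics,
        eval_tf_env,
        agent.policy,
        num_episodes=5,
        train_step=train_step,
        summary_writer=eval_summary_writer,
        summary_prefix="Metrics",
        logging=True)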
def train_implementation(self, train_context: core.TrainContext):
    """Tf-Agents REINFORCE implementation of the train loop."""
    assert isinstance(train_context, core.EpisodesTrainContext)
    tc: core.EpisodesTrainContext = train_context
    self.log('Creating environment...')
    train_env = self._create_env(discount=tc.reward_discount_gamma)
    observation_spec = train_env.observation_spec()
    action_spec = train_env.action_spec()
    timestep_spec = train_env.time_step_spec()

    # Set up optimizer, networks and ReinforceAgent.
    self.log_api('AdamOptimizer', 'create')
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=tc.learning_rate)

    self.log_api('ActorDistributionNetwork', 'create')
    actor_net = actor_distribution_network.ActorDistributionNetwork(
        observation_spec, action_spec, fc_layer_params=self.model_config.fc_layers)

    self.log_api('ReinforceAgent', 'create')
    tf_agent = reinforce_agent.ReinforceAgent(
        timestep_spec, action_spec, actor_network=actor_net, optimizer=optimizer)

    self.log_api('tf_agent.initialize()')
    tf_agent.initialize()
    self._trained_policy = tf_agent.policy

    # Set up data collection and buffering.
    collect_data_spec = tf_agent.collect_data_spec
    self.log_api('TFUniformReplayBuffer', 'create')
    replay_buffer = TFUniformReplayBuffer(
        collect_data_spec, batch_size=1, max_length=tc.max_steps_in_buffer)

    self.log_api('DynamicEpisodeDriver', 'create')
    collect_driver = DynamicEpisodeDriver(
        train_env,
        tf_agent.collect_policy,
        observers=[replay_buffer.add_batch],
        num_episodes=tc.num_episodes_per_iteration)

    # Train.
    collect_driver.run = common.function(collect_driver.run, autograph=False)
    tf_agent.train = common.function(tf_agent.train, autograph=False)

    self.log('Starting training...')
    while True:
        self.on_train_iteration_begin()
        msg = f'iteration {tc.iterations_done_in_training:4} of {tc.num_iterations:<4}'
        self.log_api('collect_driver.run', msg)
        collect_driver.run()

        self.log_api('replay_buffer.gather_all', msg)
        trajectories = replay_buffer.gather_all()

        self.log_api('tf_agent.train', msg)
        loss_info = tf_agent.train(experience=trajectories)
        total_loss = loss_info.loss.numpy()
        self.log_api('', f'loss={total_loss:<7.1f}')

        self.log_api('replay_buffer.clear', msg)
        replay_buffer.clear()

        self.on_train_iteration_end(loss=total_loss)
        if tc.training_done:
            break
    return
def train_implementation(self, train_context: core.TrainContext):
    """Tf-Agents PPO implementation of the train loop."""
    assert isinstance(train_context, core.PpoTrainContext)
    tc: core.PpoTrainContext = train_context
    train_env = self._create_env(discount=tc.reward_discount_gamma)
    observation_spec = train_env.observation_spec()
    action_spec = train_env.action_spec()
    timestep_spec = train_env.time_step_spec()

    # Set up optimizer, networks and PpoAgent.
    self.log_api('AdamOptimizer', '()')
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=tc.learning_rate)

    self.log_api('ActorDistributionNetwork', '()')
    actor_net = actor_distribution_network.ActorDistributionNetwork(
        observation_spec, action_spec, fc_layer_params=self.model_config.fc_layers)

    self.log_api('ValueNetwork', '()')
    value_net = value_network.ValueNetwork(
        observation_spec, fc_layer_params=self.model_config.fc_layers)

    self.log_api('PpoAgent', '()')
    tf_agent = ppo_agent.PPOAgent(
        timestep_spec, action_spec, optimizer,
        actor_net=actor_net, value_net=value_net,
        num_epochs=tc.num_epochs_per_iteration)

    self.log_api('tf_agent.initialize', '()')
    tf_agent.initialize()
    self._trained_policy = tf_agent.policy

    # Set up data collection and buffering.
    collect_data_spec = tf_agent.collect_data_spec
    self.log_api('TFUniformReplayBuffer', '()')
    replay_buffer = TFUniformReplayBuffer(
        collect_data_spec, batch_size=1, max_length=tc.max_steps_in_buffer)

    collect_policy = tf_agent.collect_policy
    self.log_api('DynamicEpisodeDriver', '()')
    collect_driver = DynamicEpisodeDriver(
        train_env,
        collect_policy,
        observers=[replay_buffer.add_batch],
        num_episodes=tc.num_episodes_per_iteration)

    # Train.
    collect_driver.run = common.function(collect_driver.run, autograph=False)
    tf_agent.train = common.function(tf_agent.train, autograph=False)

    while True:
        self.on_train_iteration_begin()
        self.log_api(
            '-----',
            f'iteration {tc.iterations_done_in_training:4} of {tc.num_iterations:<4} -----')
        self.log_api('collect_driver.run', '()')
        collect_driver.run()

        self.log_api('replay_buffer.gather_all', '()')
        trajectories = replay_buffer.gather_all()

        self.log_api('tf_agent.train', '(experience=...)')
        loss_info = tf_agent.train(experience=trajectories)
        total_loss = loss_info.loss.numpy()
        actor_loss = loss_info.extra.policy_gradient_loss.numpy()
        critic_loss = loss_info.extra.value_estimation_loss.numpy()
        self.log_api(
            '',
            f'loss={total_loss:<7.1f} [actor={actor_loss:<7.1f} critic={critic_loss:<7.1f}]')

        self.log_api('replay_buffer.clear', '()')
        replay_buffer.clear()

        self.on_train_iteration_end(
            loss=total_loss, actor_loss=actor_loss, critic_loss=critic_loss)
        if tc.training_done:
            break
    return