def test_episodic_observer_assert_sequence_length_positive(self):
  with self.assertRaises(ValueError):
    _ = reverb_utils.ReverbAddEpisodeObserver(
        self._client,
        table_name='test_table',
        max_sequence_length=-1,
        priority=3)
def test_episodic_observer_update_priority(self):
  observer = reverb_utils.ReverbAddEpisodeObserver(
      self._client,
      table_name='test_table',
      max_sequence_length=1,
      priority=3)
  self.assertEqual(observer._priority, 3)
  observer.update_priority(4)
  self.assertEqual(observer._priority, 4)
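Both tests assume a self._client connected to a running Reverb server that hosts a 'test_table' table. A minimal fixture sketch for such a setup, assuming an in-process server (the table configuration below is illustrative, not the library's actual test harness):

import reverb
import tensorflow as tf
from tf_agents.replay_buffers import reverb_utils


class ReverbAddEpisodeObserverTest(tf.test.TestCase):

  def setUp(self):
    super().setUp()
    # In-process server with a single table the observer can write to.
    self._server = reverb.Server(tables=[
        reverb.Table(
            name='test_table',
            sampler=reverb.selectors.Uniform(),
            remover=reverb.selectors.Fifo(),
            max_size=100,
            rate_limiter=reverb.rate_limiters.MinSize(1)),
    ])
    self._client = reverb.Client(f'localhost:{self._server.port}')

  def tearDown(self):
    self._server.stop()
    super().tearDown()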
def train_agent(iterations, modeldir, logdir, policydir):
  """Train and convert the model using TF Agents."""

  # TODO: add code to instantiate the training and evaluation environments

  # TODO: add code to create a reinforcement learning agent that is going to be trained

  tf_agent.initialize()

  eval_policy = tf_agent.policy
  collect_policy = tf_agent.collect_policy

  tf_policy_saver = policy_saver.PolicySaver(collect_policy)

  # Use reverb as replay buffer
  replay_buffer_signature = tensor_spec.from_spec(tf_agent.collect_data_spec)
  table = reverb.Table(
      REPLAY_BUFFER_TABLE_NAME,
      max_size=REPLAY_BUFFER_CAPACITY,
      sampler=reverb.selectors.Uniform(),
      remover=reverb.selectors.Fifo(),
      rate_limiter=reverb.rate_limiters.MinSize(1),
      signature=replay_buffer_signature,
  )  # specify signature here for validation at insertion time

  reverb_server = reverb.Server([table])

  replay_buffer = reverb_replay_buffer.ReverbReplayBuffer(
      tf_agent.collect_data_spec,
      sequence_length=None,
      table_name=REPLAY_BUFFER_TABLE_NAME,
      local_server=reverb_server,
  )

  replay_buffer_observer = reverb_utils.ReverbAddEpisodeObserver(
      replay_buffer.py_client, REPLAY_BUFFER_TABLE_NAME,
      REPLAY_BUFFER_CAPACITY)

  # Optimize by wrapping some of the code in a graph using TF function.
  tf_agent.train = common.function(tf_agent.train)

  # Evaluate the agent's policy once before training.
  avg_return, _ = compute_avg_return_and_steps(eval_env, tf_agent.policy,
                                               NUM_EVAL_EPISODES)

  summary_writer = tf.summary.create_file_writer(logdir)

  for i in range(iterations):
    # TODO: add code to collect game episodes and train the agent

    logger = tf.get_logger()
    if i % EVAL_INTERVAL == 0:
      avg_return, avg_episode_length = compute_avg_return_and_steps(
          eval_env, eval_policy, NUM_EVAL_EPISODES)
      with summary_writer.as_default():
        tf.summary.scalar("Average return", avg_return, step=i)
        tf.summary.scalar("Average episode length", avg_episode_length, step=i)
        summary_writer.flush()
      logger.info(
          "iteration = {0}: Average Return = {1}, Average Episode Length = {2}"
          .format(i, avg_return, avg_episode_length))

  summary_writer.close()

  tf_policy_saver.save(policydir)
table = reverb.Table(
    table_name,
    max_size=replay_buffer_capacity,
    sampler=reverb.selectors.Uniform(),
    remover=reverb.selectors.Fifo(),
    rate_limiter=reverb.rate_limiters.MinSize(1),
    signature=replay_buffer_signature)

reverb_server = reverb.Server([table])

replay_buffer = reverb_replay_buffer.ReverbReplayBuffer(
    tf_agent.collect_data_spec,
    table_name=table_name,
    sequence_length=None,
    local_server=reverb_server)

rb_observer = reverb_utils.ReverbAddEpisodeObserver(replay_buffer.py_client,
                                                    table_name,
                                                    replay_buffer_capacity)


def collect_episode(environment, policy, num_episodes):
  driver = py_driver.PyDriver(
      environment,
      py_tf_eager_policy.PyTFEagerPolicy(policy, use_tf_function=True),
      [rb_observer],
      max_episodes=num_episodes)
  initial_time_step = environment.reset()
  driver.run(initial_time_step)


# (Optional) Optimize by wrapping some of the code in a graph using TF function.
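A hedged usage sketch for the helper above: collect a couple of episodes with the agent's collect policy, then read one back. Because the observer writes whole episodes and sequence_length=None leaves them variable-length, episodes are sampled one at a time (train_py_env is an assumed environment name, not defined in this snippet):

# Hypothetical call site: fill the buffer, then pull one episode back out.
collect_episode(train_py_env, tf_agent.collect_policy, num_episodes=2)

# Variable-length episodes are sampled individually (sample_batch_size=1).
iterator = iter(replay_buffer.as_dataset(sample_batch_size=1))
trajectories, _ = next(iterator)  # (Trajectory, SampleInfo)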
def _create_and_yield(client):
  yield reverb_utils.ReverbAddEpisodeObserver(client, *args, **kwargs)
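This generator reads args and kwargs from its enclosing scope; wrapped with contextlib.contextmanager it becomes a reusable fixture. A minimal sketch of such a wrapper, assuming the observer's close() should be called on exit (episode_observer is a hypothetical name):

import contextlib


@contextlib.contextmanager
def episode_observer(client, *args, **kwargs):
  # Hypothetical wrapper: releases the observer's writer when the block exits.
  observer = reverb_utils.ReverbAddEpisodeObserver(client, *args, **kwargs)
  try:
    yield observer
  finally:
    observer.close()


# Usage:
# with episode_observer(client, table_name='test_table',
#                       max_sequence_length=1, priority=3) as observer:
#   ...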
def train_agent(iterations, modeldir, logdir, policydir):
  """Train and convert the model using TF Agents."""

  train_py_env = planestrike_py_environment.PlaneStrikePyEnvironment(
      board_size=BOARD_SIZE, discount=DISCOUNT, max_steps=BOARD_SIZE**2)
  eval_py_env = planestrike_py_environment.PlaneStrikePyEnvironment(
      board_size=BOARD_SIZE, discount=DISCOUNT, max_steps=BOARD_SIZE**2)

  train_env = tf_py_environment.TFPyEnvironment(train_py_env)
  eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

  # Alternatively you could use ActorDistributionNetwork as actor_net
  actor_net = tfa.networks.Sequential(
      [
          tfa.keras_layers.InnerReshape([BOARD_SIZE, BOARD_SIZE],
                                        [BOARD_SIZE**2]),
          tf.keras.layers.Dense(FC_LAYER_PARAMS, activation='relu'),
          tf.keras.layers.Dense(BOARD_SIZE**2),
          tf.keras.layers.Lambda(
              lambda t: tfp.distributions.Categorical(logits=t)),
      ],
      input_spec=train_py_env.observation_spec())

  optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

  train_step_counter = tf.Variable(0)

  tf_agent = reinforce_agent.ReinforceAgent(
      train_env.time_step_spec(),
      train_env.action_spec(),
      actor_network=actor_net,
      optimizer=optimizer,
      normalize_returns=True,
      train_step_counter=train_step_counter)

  tf_agent.initialize()

  eval_policy = tf_agent.policy
  collect_policy = tf_agent.collect_policy

  tf_policy_saver = policy_saver.PolicySaver(collect_policy)

  # Use reverb as replay buffer
  replay_buffer_signature = tensor_spec.from_spec(tf_agent.collect_data_spec)
  table = reverb.Table(
      REPLAY_BUFFER_TABLE_NAME,
      max_size=REPLAY_BUFFER_CAPACITY,
      sampler=reverb.selectors.Uniform(),
      remover=reverb.selectors.Fifo(),
      rate_limiter=reverb.rate_limiters.MinSize(1),
      signature=replay_buffer_signature
  )  # specify signature here for validation at insertion time

  reverb_server = reverb.Server([table])

  replay_buffer = reverb_replay_buffer.ReverbReplayBuffer(
      tf_agent.collect_data_spec,
      sequence_length=None,
      table_name=REPLAY_BUFFER_TABLE_NAME,
      local_server=reverb_server)

  replay_buffer_observer = reverb_utils.ReverbAddEpisodeObserver(
      replay_buffer.py_client, REPLAY_BUFFER_TABLE_NAME,
      REPLAY_BUFFER_CAPACITY)

  # Optimize by wrapping some of the code in a graph using TF function.
  tf_agent.train = common.function(tf_agent.train)

  # Evaluate the agent's policy once before training.
  avg_return, _ = compute_avg_return_and_steps(eval_env, tf_agent.policy,
                                               NUM_EVAL_EPISODES)

  summary_writer = tf.summary.create_file_writer(logdir)

  for i in range(iterations):
    # Collect a few episodes using collect_policy and save to the replay buffer.
    collect_episode(train_py_env, collect_policy,
                    COLLECT_EPISODES_PER_ITERATION, replay_buffer_observer)

    # Use data from the buffer and update the agent's network.
    iterator = iter(replay_buffer.as_dataset(sample_batch_size=1))
    trajectories, _ = next(iterator)
    tf_agent.train(experience=trajectories)
    replay_buffer.clear()

    logger = tf.get_logger()
    if i % EVAL_INTERVAL == 0:
      avg_return, avg_episode_length = compute_avg_return_and_steps(
          eval_env, eval_policy, NUM_EVAL_EPISODES)
      with summary_writer.as_default():
        tf.summary.scalar('Average return', avg_return, step=i)
        tf.summary.scalar('Average episode length', avg_episode_length, step=i)
        summary_writer.flush()
      logger.info(
          'iteration = {0}: Average Return = {1}, Average Episode Length = {2}'
          .format(i, avg_return, avg_episode_length))

  summary_writer.close()

  tf_policy_saver.save(policydir)

  # Convert to tflite model
  converter = tf.lite.TFLiteConverter.from_saved_model(
      policydir, signature_keys=['action'])
  converter.target_spec.supported_ops = [
      tf.lite.OpsSet.TFLITE_BUILTINS,  # enable TensorFlow Lite ops.
      tf.lite.OpsSet.SELECT_TF_OPS  # enable TensorFlow ops.
  ]
  tflite_policy = converter.convert()
  with open(os.path.join(modeldir, 'planestrike_tf_agents.tflite'), 'wb') as f:
    f.write(tflite_policy)
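A quick way to sanity-check the converted policy is to drive it through the TFLite interpreter's signature runner. The sketch below is assumption-laden: the '0/...' input names mirror the flattened time_step structure PolicySaver typically exports, and should be verified with interpreter.get_signature_list() before relying on them.

# Hedged smoke test of the converted .tflite policy.
interpreter = tf.lite.Interpreter(
    model_path=os.path.join(modeldir, 'planestrike_tf_agents.tflite'))
policy_runner = interpreter.get_signature_runner('action')

# Feed a dummy first time step; input names are assumptions, check
# interpreter.get_signature_list() for the actual keys.
output = policy_runner(**{
    '0/discount': tf.constant(0.0),
    '0/observation': tf.zeros([1, BOARD_SIZE, BOARD_SIZE]),
    '0/reward': tf.constant(0.0),
    '0/step_type': tf.constant(0),
})
print(output)  # dict mapping output names (e.g. the action) to arrays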