from typing import List

from tf_agents.metrics.tf_metric import TFStepMetric
from tf_agents.metrics.tf_metrics import (
    AverageEpisodeLengthMetric,
    AverageReturnMetric,
    EnvironmentSteps,
    NumberOfEpisodes,
)


def create_evaluation_metrics() -> List[TFStepMetric]:
    """
    Create a list of metrics to capture during policy evaluation.
    """
    return [
        AverageReturnMetric(buffer_size=1),
        AverageEpisodeLengthMetric(buffer_size=1),
    ]
def create_train_metrics() -> List[TFStepMetric]:
    """
    Create a list of metrics to capture during training.
    """
    return [
        NumberOfEpisodes(),
        EnvironmentSteps(),
        AverageReturnMetric(buffer_size=1),
        AverageEpisodeLengthMetric(buffer_size=1),
    ]
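# A minimal sketch of how these factories might be wired into a collection
# run. The environment, policy, and step count (`tf_env`, `collect_policy`,
# `num_steps`) are assumptions for illustration, not defined above.
from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver


def collect_with_metrics(tf_env, collect_policy, num_steps=1000):
    train_metrics = create_train_metrics()
    driver = DynamicStepDriver(
        tf_env,
        policy=collect_policy,
        observers=train_metrics,   # each metric is invoked on every trajectory batch
        num_steps=num_steps)
    driver.run()
    # Each TFStepMetric exposes its accumulated value through result().
    for metric in train_metrics:
        print(metric.name, metric.result().numpy())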
def main():
    env = suite_gym.load('Trajectory-v0', gym_kwargs={
        'num_dimensions': 2,
        'num_observables': 15,
        'max_targets': 100,
        'max_steps': 5000,
        'max_steps_without_target': 5000,
        'max_position': 100.0,
        'max_acceleration': 10.2,
        'max_velocity': 15.0,
        'collision_epsilon': 10.0,
    })
    tf_env = tf_py_environment.TFPyEnvironment(env)

    agent = RandomAgent(time_step_spec=tf_env.time_step_spec(),
                        action_spec=tf_env.action_spec())

    metric = AverageReturnMetric()
    replay_buffer = []
    # uniform_replay_buffer = PyUniformReplayBuffer(data_spec=agent.collect_data_spec, capacity=2000)
    uniform_replay_buffer = TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec, batch_size=1)

    # observers = [replay_buffer.append, metric]
    # driver = PyDriver(
    #     env,
    #     policy=RandomPyPolicy(env.time_step_spec(), env.action_spec()),
    #     observers=[replay_buffer.append, metric],
    #     max_steps=2000
    # )
    # driver = TFDriver(
    #     tf_env,
    #     # policy=RandomTFPolicy(tf_env.time_step_spec(), tf_env.action_spec()),
    #     policy=agent.policy,
    #     observers=[uniform_replay_buffer],
    #     max_steps=2000
    # )

    # Step the environment with the agent's policy; every trajectory batch is
    # written into the replay buffer via add_batch.
    driver = DynamicStepDriver(
        tf_env,
        policy=agent.policy,
        observers=[uniform_replay_buffer.add_batch],  # , metric],
        # transition_observers=None,
        num_steps=1000)

    agent.initialize()
    initial_time_step = tf_env.reset()
    final_time_step, final_policy_state = driver.run(initial_time_step)

    # Expose the collected trajectories as a tf.data.Dataset.
    dataset = uniform_replay_buffer.as_dataset()
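# A minimal sketch of how the collected data could be consumed once a learning
# agent (e.g. a DqnAgent) replaces RandomAgent. The sampling parameters
# (`sample_batch_size=64`, `num_steps=2`) and the loop length are assumptions
# for illustration, not values taken from the code above.
def train_from_buffer(agent, uniform_replay_buffer, num_iterations=100):
    dataset = uniform_replay_buffer.as_dataset(
        sample_batch_size=64,
        num_steps=2,               # pairs of adjacent steps, as TD-style agents expect
        num_parallel_calls=3).prefetch(3)
    iterator = iter(dataset)

    for _ in range(num_iterations):
        experience, _unused_info = next(iterator)
        loss_info = agent.train(experience)   # assumes a trainable agent
        print(float(loss_info.loss))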
def test_mario_env(self):
    ctor = lambda: suite_mario.load(
        'SuperMarioBros-Nes', 'Level1-1', wrap_with_process=False)
    self._env = parallel_py_environment.ParallelPyEnvironment([ctor] * 4)
    env = tf_py_environment.TFPyEnvironment(self._env)

    self.assertEqual(np.uint8, env.observation_spec().dtype)
    self.assertEqual((84, 84, 4), env.observation_spec().shape)

    random_policy = random_tf_policy.RandomTFPolicy(
        env.time_step_spec(), env.action_spec())

    metrics = [
        AverageReturnMetric(batch_size=4),
        AverageEpisodeLengthMetric(batch_size=4),
        EnvironmentSteps(),
        NumberOfEpisodes(),
    ]
    # Drive the four parallel environments with a random policy, recording the
    # metrics as observers.
    driver = dynamic_step_driver.DynamicStepDriver(
        env, random_policy, observers=metrics, num_steps=10000)
    driver.run(maximum_iterations=10000)
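# A small follow-up sketch: once driver.run() returns, each metric observer
# holds an accumulated value that can be read with result() and cleared with
# reset() before another collection pass. The loop below is illustrative and
# not part of the test above.
for metric in metrics:
    print(metric.name, metric.result().numpy())
    metric.reset()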
# 4. Constructing the Replay Memory.
memory_size = 20000
batch_size = 64

replay_buffer = TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=memory_size
)

# Initializing Observer of replay buffer to store experiences (trajectories) to memory.
replay_buffer_observer = replay_buffer.add_batch

# Defining Metrics for measuring training progress.
train_metrics = [
    AverageReturnMetric(),
    AverageEpisodeLengthMetric()
]

# 5. Defining initial policy as random to collect enough examples to fill the memory buffer (training delay).
initial_collect_policy = random_tf_policy.RandomTFPolicy(
    train_env.time_step_spec(),
    train_env.action_spec()
)
initial_collect_steps = 2000


class ShowProgress:
    """Observer that counts non-boundary steps and reports collection progress."""

    def __init__(self, total):
        self.counter = 0
        self.total = total

    def __call__(self, trajectory):
        if not trajectory.is_boundary():
            self.counter += 1
        if self.counter % 1000 == 0:
            # Print collection progress (assumed behaviour for this observer).
            print("\r{}/{}".format(self.counter, self.total), end="")
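# A minimal sketch of the warm-up collection this section prepares for: the
# random policy fills the replay buffer while ShowProgress reports progress.
# The driver class and call below are assumptions built from the names defined
# above (`train_env`, `replay_buffer_observer`, `initial_collect_steps`).
from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver

initial_driver = DynamicStepDriver(
    train_env,
    initial_collect_policy,
    observers=[replay_buffer_observer, ShowProgress(initial_collect_steps)],
    num_steps=initial_collect_steps)
final_time_step, final_policy_state = initial_driver.run()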
policy = agent.policy

replay_buffer = TFUniformReplayBuffer(data_spec=agent.collect_data_spec,
                                      batch_size=env.batch_size,
                                      max_length=replay_buffer_capacity)
agent.train_step_counter.assign(0)
replay_observer = [replay_buffer.add_batch]

with strategy.scope():
    # Collection driver writes experience into the replay buffer.
    driver = TFDriver(env, collect_policy, replay_observer, max_episodes=100)

    # Separate driver for evaluation/rendering with the agent's deployment policy.
    average = AverageReturnMetric()
    metrics_observer = [average]
    metrics_driver = TFRenderDriver(env, policy, metrics_observer, max_episodes=10)


def experience_fn():
    with strategy.scope():
        return replay_buffer.as_dataset(num_parallel_calls=3,
                                        sample_batch_size=64).prefetch(3)


learner = Learner('/tmp/models/test/behavior_cloning', train_step_counter, agent,
                  experience_fn)
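# A minimal sketch of how training might proceed from here: Learner.run()
# pulls batches from experience_fn and applies agent.train under the
# distribution strategy. The iteration counts, and the assumption that
# TFRenderDriver follows the TFDriver.run(time_step) interface, are
# illustrative only.
for _ in range(100):
    loss_info = learner.run(iterations=200)   # 200 gradient steps per outer loop
    print('loss =', float(loss_info.loss))

# Periodically evaluate with the deployment policy and read back the metric.
time_step = env.reset()
metrics_driver.run(time_step)
print('average return =', float(average.result()))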