Example #1
 def create_evaluation_metrics() -> List[TFStepMetric]:
     """
     Create a list of metrics to capture during policy evaluation.
     """
     return [
         AverageReturnMetric(buffer_size=1),
         AverageEpisodeLengthMetric(buffer_size=1),
     ]
Example #2
 def create_train_metrics() -> List[TFStepMetric]:
     """
     Create a list of metrics to capture during training.
     """
     return [
         NumberOfEpisodes(),
         EnvironmentSteps(),
         AverageReturnMetric(buffer_size=1),
         AverageEpisodeLengthMetric(buffer_size=1),
     ]
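Both helper functions above assume the typing and TF-Agents metric classes are already in scope. A minimal sketch of the imports they rely on (the module paths are the usual tf_agents locations, stated here as an assumption rather than taken from the original source):

from typing import List

from tf_agents.metrics.tf_metric import TFStepMetric
from tf_agents.metrics.tf_metrics import (
    AverageReturnMetric,
    AverageEpisodeLengthMetric,
    NumberOfEpisodes,
    EnvironmentSteps,
)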
Example #3
def main():

    env = suite_gym.load('Trajectory-v0',
                         gym_kwargs={
                             'num_dimensions': 2,
                             'num_observables': 15,
                             'max_targets': 100,
                             'max_steps': 5000,
                             'max_steps_without_target': 5000,
                             'max_position': 100.0,
                             'max_acceleration': 10.2,
                             'max_velocity': 15.0,
                             'collision_epsilon': 10.0
                         })
    tf_env = tf_py_environment.TFPyEnvironment(env)

    agent = RandomAgent(time_step_spec=tf_env.time_step_spec(),
                        action_spec=tf_env.action_spec())

    metric = AverageReturnMetric()
    replay_buffer = []
    # uniform_replay_buffer = PyUniformReplayBuffer(data_spec=agent.collect_data_spec, capacity=2000)
    uniform_replay_buffer = TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec, batch_size=1)
    # observers = [replay_buffer.append, metric]

    # driver = PyDriver(
    #     env,
    #     policy=RandomPyPolicy(env.time_step_spec(), env.action_spec()),
    #     observers=[replay_buffer.append, metric],
    #     max_steps=2000
    # )

    # driver = TFDriver(
    #     tf_env,
    #     # policy=RandomTFPolicy(tf_env.time_step_spec(), tf_env.action_spec()),
    #     policy=agent.policy,
    #     observers=[uniform_replay_buffer],
    #     max_steps=2000
    # )

    driver = DynamicStepDriver(
        tf_env,
        policy=agent.policy,
        observers=[uniform_replay_buffer.add_batch],  #, metric],
        # transition_observers=None,
        num_steps=1000)

    agent.initialize()
    initial_time_step = tf_env.reset()
    final_time_step, final_policy_state = driver.run(initial_time_step)

    dataset = uniform_replay_buffer.as_dataset()
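Example #3 stops right after building the dataset. A minimal sketch of how the collected experience could be sampled back out, assuming a sample batch size of 32 and two-step trajectories (both values are illustrative, not from the original code):

    # Illustrative only: sample mini-batches of 2-step trajectories for training.
    sample_dataset = uniform_replay_buffer.as_dataset(
        sample_batch_size=32, num_steps=2, num_parallel_calls=3).prefetch(3)
    iterator = iter(sample_dataset)
    trajectories, buffer_info = next(iterator)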
Example #4
    def test_mario_env(self):
        ctor = lambda: suite_mario.load(
            'SuperMarioBros-Nes', 'Level1-1', wrap_with_process=False)

        self._env = parallel_py_environment.ParallelPyEnvironment([ctor] * 4)
        env = tf_py_environment.TFPyEnvironment(self._env)
        self.assertEqual(np.uint8, env.observation_spec().dtype)
        self.assertEqual((84, 84, 4), env.observation_spec().shape)

        random_policy = random_tf_policy.RandomTFPolicy(
            env.time_step_spec(), env.action_spec())

        metrics = [
            AverageReturnMetric(batch_size=4),
            AverageEpisodeLengthMetric(batch_size=4),
            EnvironmentSteps(),
            NumberOfEpisodes()
        ]
        driver = dynamic_step_driver.DynamicStepDriver(
            env, random_policy, observers=metrics, num_steps=10000)
        driver.run(maximum_iterations=10000)
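The test only drives the environment; if you also want to inspect what the observers recorded, each TF-Agents metric exposes its running value through result(). A minimal sketch (the print loop is an illustration, not part of the original test):

        # Read back the values accumulated by the metric observers.
        for metric in metrics:
            print(metric.name, metric.result().numpy())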
Example #5
# 4. Constructing the Replay Memory.
memory_size = 20000
batch_size = 64

replay_buffer = TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=memory_size
)

# Observer that writes collected experiences (trajectories) into the replay buffer.
replay_buffer_observer = replay_buffer.add_batch

# Defining Metrics for measuring training progress.
train_metrics = [ AverageReturnMetric(), AverageEpisodeLengthMetric() ]

# 5. Define an initial random policy to collect enough experience to fill the replay buffer before training starts (a collection sketch follows the ShowProgress class below).
initial_collect_policy = random_tf_policy.RandomTFPolicy( train_env.time_step_spec(), train_env.action_spec() )
initial_collect_steps = 2000


class ShowProgress:
    def __init__(self, total):
        self.counter = 0
        self.total = total

    def __call__(self, trajectory):
        if not trajectory.is_boundary():
            self.counter += 1
        if self.counter % 1000 == 0:
            # The original snippet is cut off here; printing the running count
            # is the assumed intent of this progress observer.
            print("\r{}/{}".format(self.counter, self.total), end="")
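Step 5 above only defines the random collection policy; a minimal sketch of how it is typically wired into a collection driver together with the replay-buffer observer and ShowProgress (the driver class and argument values below are assumptions based on the TF-Agents API, not taken from the original code):

from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver

# Fill the replay buffer with experience from the random policy before training.
init_driver = DynamicStepDriver(
    train_env,
    initial_collect_policy,
    observers=[replay_buffer_observer, ShowProgress(initial_collect_steps)],
    num_steps=initial_collect_steps)
final_time_step, final_policy_state = init_driver.run()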
Example #6
    policy = agent.policy

    replay_buffer = TFUniformReplayBuffer(data_spec=agent.collect_data_spec,
                                          batch_size=env.batch_size,
                                          max_length=replay_buffer_capacity)

    agent.train_step_counter.assign(0)

    replay_observer = [replay_buffer.add_batch]
    with strategy.scope():
        driver = TFDriver(env,
                          collect_policy,
                          replay_observer,
                          max_episodes=100)

    average = AverageReturnMetric()
    metrics_observer = [average]
    metrics_driver = TFRenderDriver(env,
                                    policy,
                                    metrics_observer,
                                    max_episodes=10)

    def experience_fn():
        with strategy.scope():
            return replay_buffer.as_dataset(num_parallel_calls=3,
                                            sample_batch_size=64).prefetch(3)

    learner = Learner('/tmp/models/test/behavior_cloning',
                      train_step_counter,
                      agent,
                      experience_fn,