Example #1
def test_tf_environment_with_random(n_episodes=20):
  """Test tf environment through random actions."""
  print(f'Testing tf environment over {n_episodes} episodes.')
  env = LakeMonsterEnvironment(**params)
  env = TFPyEnvironment(env)
  policy = RandomTFPolicy(time_step_spec=env.time_step_spec(),
                          action_spec=env.action_spec())

  ts = env.reset()
  rewards = []
  n_steps = []

  for _ in tqdm(range(n_episodes)):
    n_step = 0
    while not ts.is_last():
      action = policy.action(ts).action
      ts = env.step(action)
      n_step += 1

    reward = ts.reward
    rewards.append(reward)
    n_steps.append(n_step)
    ts = env.reset()

  # print results
  print('average num of steps per episode:', np.mean(n_steps))
  print('average reward per episode:', np.mean(rewards))
Example #2
        def __init__(
            self,
            ts_spec: ts.TimeStep,
            action_spec: types.NestedTensorSpec,
            env_model: EnvironmentModel,
        ):
            super().__init__(ts_spec, action_spec)

            self._internal_policy = RandomTFPolicy(ts_spec, action_spec)

            self._environment_model = env_model
Example #3
    class WrappedRandomTFPolicy(TFPolicy):
        def __init__(
            self,
            ts_spec: ts.TimeStep,
            action_spec: types.NestedTensorSpec,
            env_model: EnvironmentModel,
        ):
            super().__init__(ts_spec, action_spec)

            self._internal_policy = RandomTFPolicy(ts_spec, action_spec)

            self._environment_model = env_model

        def _action(
            self,
            time_step: ts.TimeStep,
            policy_state: types.NestedTensor,
            seed: Optional[types.Seed],
        ) -> policy_step.PolicyStep:
            np.testing.assert_array_equal(
                time_step.observation,
                self._environment_model.current_time_step().observation)
            return self._internal_policy._action(time_step, policy_state, seed)

        def _distribution(
                self, time_step: ts.TimeStep,
                policy_state: types.NestedTensorSpec
        ) -> policy_step.PolicyStep:
            raise NotImplementedError()
Example #4
def test_generate_virtual_rollouts(observation_space, action_space, batch_size,
                                   horizon):
    observation = create_uniform_distribution_from_spec(
        observation_space).sample()
    network = DummyEnsembleTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = EnvironmentModel(
        transition_model=model,
        reward_model=ConstantReward(observation_space, action_space, -1.0),
        termination_model=ConstantFalseTermination(observation_space),
        initial_state_distribution_model=DeterministicInitialStateModel(
            observation),
        batch_size=batch_size,
    )
    random_policy = RandomTFPolicy(time_step_spec(observation_space),
                                   action_space)

    replay_buffer, driver, wrapped_env_model = virtual_rollouts_buffer_and_driver(
        env_model, random_policy, horizon)

    driver.run(wrapped_env_model.reset())
    trajectory = replay_buffer.gather_all()

    mid_steps = repeat(1, horizon - 1)
    expected_step_types = tf.constant(list(chain([0], mid_steps, [2])))
    batched_step_types = replicate(expected_step_types, (batch_size, ))
    np.testing.assert_array_equal(batched_step_types, trajectory.step_type)
Example #5
def test_get_batch_of_actions(observation_space, action_space, batch_size):
    policy = RandomTFPolicy(time_step_spec(observation_space), action_space)

    samples = sample_uniformly_distributed_observations_and_get_actions(policy, batch_size)

    for i in range(batch_size):
        assert action_space.is_compatible_with(samples["actions"][i, ...])
Example #6
def test_incorrect_termination_model():
    """
    The generic model-based agent should only allow a ConstantFalseTermination model.
    """

    # setup arguments for the model-based agent constructor
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    network = LinearTransitionNetwork(observation_spec)
    transition_model = KerasTransitionModel([network], observation_spec, action_spec)
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(observation_spec)
    termination_model = MountainCarTermination(observation_spec)
    policy = RandomTFPolicy(time_step_spec, action_spec)

    with pytest.raises(AssertionError) as excinfo:
        ModelBasedAgent(
            time_step_spec,
            action_spec,
            transition_model,
            reward_model,
            termination_model,
            initial_state_distribution_model,
            policy,
            policy,
        )

    assert "Only constant false termination supported" in str(excinfo.value)
Example #7
def assert_rollouts_are_close_to_actuals(model, max_steps):
    tf_env = tf_py_environment.TFPyEnvironment(
        create_pendulum_environment(max_steps))
    collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                    tf_env.action_spec())

    test_trajectory = policy_evaluation(tf_env,
                                        collect_policy,
                                        num_episodes=1,
                                        max_buffer_capacity=200,
                                        use_function=True)

    start_state = test_trajectory.observation[0, 0, :]

    env_model = TFTimeLimit(
        EnvironmentModel(
            model,
            PendulumReward(tf_env.observation_spec(), tf_env.action_spec()),
            ConstantFalseTermination(tf_env.observation_spec()),
            DeterministicInitialStateModel(start_state),
            batch_size=30,
        ),
        max_steps + 1,
    )

    replayed_trajectories = replay_actions_across_batch_transition_models(
        env_model, test_trajectory.action[0])

    prediction_mean = tf.reduce_mean(replayed_trajectories.observation, axis=0)
    np.testing.assert_allclose(prediction_mean,
                               test_trajectory.observation[0],
                               atol=1e-1,
                               rtol=2e-1)
Example #8
def test_tf_env_wrapper_is_reset_at_the_start_of_each_iteration(action_space):

    observations_array = [
        # First iteration
        [StepType.FIRST, StepType.FIRST],
        [StepType.LAST, StepType.MID],
        [StepType.FIRST, StepType.MID],
        [StepType.LAST, StepType.LAST],
        # Second iteration
        [StepType.FIRST, StepType.FIRST],
        [StepType.MID, StepType.MID],
        [StepType.MID, StepType.MID],
        [StepType.MID, StepType.LAST],
        [StepType.MID, StepType.FIRST],
    ]
    observations = [
        tf.concat(ob_array, axis=0) for ob_array in observations_array
    ]

    transition_model = TrajectoryOptimiserTransitionModel(
        action_space, iter(observations))
    reward = ConstantReward(OBSERVATION_SPACE_SPEC, action_space, -1.0)
    termination_model = TrajectoryOptimiserTerminationModel(
        OBSERVATION_SPACE_SPEC)
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=termination_model,
        initial_state_distribution_model=DeterministicInitialStateModel(
            StepType.FIRST),
        batch_size=2,
    )

    time_step_space = time_step_spec(OBSERVATION_SPACE_SPEC)

    policy = RandomTFPolicy(time_step_space,
                            action_space,
                            automatic_state_reset=False,
                            validate_args=False)
    policy_state_updater = StubPolicyStateUpdater()
    trajectory_optimiser = PolicyTrajectoryOptimiser(
        policy,
        horizon=3,
        population_size=2,
        max_iterations=2,
        policy_state_updater=policy_state_updater,
    )

    initial_time_step = restart(tf.expand_dims(tf.constant(StepType.FIRST),
                                               axis=0),
                                batch_size=1)

    trajectory_optimiser.optimise(initial_time_step, environment_model)

    for stored_trajectories in policy_state_updater.step_types:
        np.testing.assert_equal(stored_trajectories[:, 0], np.array([0, 0]))
Example #9
def test_trajectory_optimiser_pathological_trajectories(
        action_space, horizon, batch_size):
    """
    The replay buffer is a FIFO buffer of fixed capacity. Ensure that the capacity is sufficient
    such that the initial observation is still present in the buffer even in the pathological case
    where all trajectories are of length 2.
    """

    # construct the environment model
    observations = list(
        chain.from_iterable(
            repeat(
                [
                    replicate(tf.constant(StepType.FIRST), [batch_size]),
                    replicate(tf.constant(StepType.LAST), [batch_size]),
                ],
                horizon,
            )))

    transition_model = TrajectoryOptimiserTransitionModel(
        action_space, iter(observations))
    reward = ConstantReward(OBSERVATION_SPACE_SPEC, action_space, -1.0)
    termination_model = TrajectoryOptimiserTerminationModel(
        OBSERVATION_SPACE_SPEC)
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=termination_model,
        initial_state_distribution_model=DeterministicInitialStateModel(
            StepType.FIRST),
        batch_size=batch_size,
    )

    time_step_space = time_step_spec(OBSERVATION_SPACE_SPEC)
    policy = RandomTFPolicy(time_step_space, action_space)
    stub_policy_state_updater = StubPolicyStateUpdater()
    trajectory_optimiser = PolicyTrajectoryOptimiser(
        policy,
        horizon,
        population_size=batch_size,
        max_iterations=1,
        policy_state_updater=stub_policy_state_updater,
    )

    time_step = restart(tf.expand_dims(tf.constant(StepType.FIRST), axis=0),
                        batch_size=1)

    trajectory_optimiser.optimise(time_step, environment_model)

    stored_trajectory = stub_policy_state_updater.step_types[0]
    assert stored_trajectory[0][0] == StepType.FIRST
Example #10
def test_decorate_policy_with_particles_action_shapes(
    observation_space, action_space, population_size, number_of_particles
):
    time_step_space = time_step_spec(observation_space)
    policy = RandomTFPolicy(time_step_space, action_space)
    decorated_policy = decorate_policy_with_particles(policy, number_of_particles)

    observation = create_uniform_distribution_from_spec(observation_space).sample(
        sample_shape=(population_size * number_of_particles,)
    )
    initial_time_step = restart(observation, batch_size=population_size * number_of_particles)
    policy_step = decorated_policy.action(initial_time_step)
    actions = policy_step.action
    assert actions.shape == [population_size * number_of_particles] + action_space.shape.dims
Example #11
    def __init__(self, agent, environment):
        self._replay_buffer = TFUniformReplayBuffer(
            data_spec=agent.collect_data_spec,
            batch_size=environment.batch_size,
            max_length=50000)

        self._random_policy = RandomTFPolicy(environment.time_step_spec(),
                                             environment.action_spec())

        self._fill_buffer(environment, self._random_policy, steps=100)

        self.dataset = self._replay_buffer.as_dataset(
            num_parallel_calls=3, sample_batch_size=BATCH_SIZE,
            num_steps=2).prefetch(3)

        self.iterator = iter(self.dataset)
Example #12
    def __init__(self, agent, environment, batch_size):
        self._replay_buffer = TFUniformReplayBuffer(
            data_spec=agent.collect_data_spec,
            batch_size=environment.batch_size,
            max_length=50000)

        self._random_policy = RandomTFPolicy(environment.time_step_spec(),
                                             environment.action_spec())

        self._fill_buffer(environment, self._random_policy, steps=100)

        self.dataset = self._replay_buffer.as_dataset(
            num_parallel_calls=3,
            sample_batch_size=batch_size,
            num_steps=2,
            single_deterministic_pass=False).prefetch(3)

        self.iterator = iter(self.dataset)
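    # A hedged sketch (an assumption, not code from the source project) of what the
    # _fill_buffer helper referenced above might look like: step the environment
    # with the given policy and record each transition in the replay buffer.
    def _fill_buffer(self, environment, policy, steps):
        from tf_agents.trajectories import trajectory

        time_step = environment.reset()
        for _ in range(steps):
            action_step = policy.action(time_step)
            next_time_step = environment.step(action_step.action)
            traj = trajectory.from_transition(time_step, action_step, next_time_step)
            self._replay_buffer.add_batch(traj)
            time_step = next_time_step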
Example #13
def test_planning_policy_batch_environment_model():
    """
    Ensure that the planning policy is operational.
    """

    # number of trajectories for planning and planning horizon
    population_size = 3
    planner_horizon = 5
    number_of_particles = 1

    # setup the environment and a model of it
    py_env = suite_gym.load("MountainCar-v0")
    tf_env = TFPyEnvironment(py_env)
    reward = MountainCarReward(tf_env.observation_spec(), tf_env.action_spec())
    terminates = MountainCarTermination(tf_env.observation_spec())
    network = LinearTransitionNetwork(tf_env.observation_spec())
    transition_model = KerasTransitionModel(
        [network],
        tf_env.observation_spec(),
        tf_env.action_spec(),
    )
    initial_state = MountainCarInitialState(tf_env.observation_spec())
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=terminates,
        initial_state_distribution_model=initial_state,
    )

    # setup the trajectory optimiser
    random_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                   tf_env.action_spec())
    trajectory_optimiser = PolicyTrajectoryOptimiser(random_policy,
                                                     planner_horizon,
                                                     population_size,
                                                     number_of_particles)
    planning_policy = PlanningPolicy(environment_model, trajectory_optimiser)

    # test whether it runs
    collect_driver_planning_policy = DynamicEpisodeDriver(tf_env,
                                                          planning_policy,
                                                          num_episodes=1)
    time_step = tf_env.reset()
    collect_driver_planning_policy.run(time_step)
Example #14
File: conftest.py Project: adak32/bellman
def _pendulum_training_data_fixture():
    max_steps = 50
    num_episodes = 80

    tf_env = tf_py_environment.TFPyEnvironment(create_pendulum_environment(max_steps))

    collect_policy = RandomTFPolicy(tf_env.time_step_spec(), tf_env.action_spec())

    trajectories = policy_evaluation(
        tf_env,
        collect_policy,
        num_episodes=num_episodes,
        max_buffer_capacity=1000,
        use_function=True,
    )

    tf_env.close()

    return trajectories, tf_env
Example #15
def test_random_shooting_with_dynamic_step_driver(observation_space, action_space):
    """
    This test uses the environment wrapper as an adapter so that a driver from TF-Agents can be used
    to generate a rollout. This also serves as an example of how to construct "random shooting"
    rollouts from an environment model.

    The assertion in this test is that the selected action has the expected log_prob value,
    consistent with uniformly random action selection. All this really checks is that the preceding
    code has run successfully.
    """

    network = LinearTransitionNetwork(observation_space)
    environment = KerasTransitionModel([network], observation_space, action_space)
    wrapped_environment = EnvironmentModel(
        environment,
        ConstantReward(observation_space, action_space, 0.0),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
    )

    random_policy = RandomTFPolicy(
        wrapped_environment.time_step_spec(), action_space, emit_log_probability=True
    )

    transition_observer = _RecordLastLogProbTransitionObserver()

    driver = DynamicStepDriver(
        env=wrapped_environment,
        policy=random_policy,
        transition_observers=[transition_observer],
    )
    driver.run()

    last_log_prob = transition_observer.last_log_probability

    uniform_distribution = create_uniform_distribution_from_spec(action_space)
    action_log_prob = uniform_distribution.log_prob(transition_observer.action)
    expected = np.sum(action_log_prob.numpy().astype(np.float32))
    actual = np.sum(last_log_prob.numpy())

    np.testing.assert_array_almost_equal(actual, expected, decimal=4)
Example #16
def test_sample_trajectory_for_mountain_car():
    tf_env = tf_py_environment.TFPyEnvironment(
        suite_gym.load("MountainCar-v0"))

    network = LinearTransitionNetwork(tf_env.observation_spec())
    model = KerasTransitionModel(
        [network],
        tf_env.observation_spec(),
        tf_env.action_spec(),
    )
    reward = ConstantReward(tf_env.observation_spec(), tf_env.action_spec(),
                            -1.0)
    terminates = MountainCarTermination(tf_env.observation_spec())
    initial_state_sampler = MountainCarInitialState(tf_env.observation_spec())
    environment = TFTimeLimit(EnvironmentModel(model, reward, terminates,
                                               initial_state_sampler),
                              duration=200)

    collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                    tf_env.action_spec())
    replay_buffer_capacity = 1001
    policy_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        collect_policy.trajectory_spec,
        batch_size=1,
        max_length=replay_buffer_capacity)

    collect_episodes_per_iteration = 2
    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        environment,
        collect_policy,
        observers=[policy_training_buffer.add_batch],
        num_episodes=collect_episodes_per_iteration,
    )

    collect_driver.run()

    trajectory = policy_training_buffer.gather_all()

    first_batch_step_type = trajectory.step_type[0, :]
    assert (first_batch_step_type[0] == StepType.FIRST
            and first_batch_step_type[-1] == StepType.LAST)
Example #17
File: pendulum.py Project: adak32/bellman
def generate_pendulum_trajectories(
        batch_size: int, max_steps: int
) -> Tuple[Trajectory, BoundedTensorSpec, BoundedTensorSpec]:
    """
    Utility function for generating batches of trajectories from the Pendulum-v0 gym environment.

    :param batch_size: Number of trajectories to generate
    :param max_steps: Length of trajectories
    :return: A tuple consisting of
        * A `Trajectory` object containing the batch of trajectories
        * The observation spec from the Pendulum-v0 environment
        * The action spec from the Pendulum-v0 environment
    """
    tf_env = tf_py_environment.TFPyEnvironment(
        BatchedPyEnvironment([
            create_pendulum_environment(max_steps) for _ in range(batch_size)
        ]))

    collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                    tf_env.action_spec())
    replay_buffer_capacity = 1000
    model_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        collect_policy.trajectory_spec,
        batch_size=batch_size,
        max_length=replay_buffer_capacity,
    )

    collect_episodes_per_iteration = 1
    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env,
        collect_policy,
        observers=[model_training_buffer.add_batch],
        num_episodes=collect_episodes_per_iteration,
    )

    collect_driver.run()
    tf_env.close()

    training_data = model_training_buffer.gather_all()

    return training_data, tf_env.observation_spec(), tf_env.action_spec()
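# A minimal usage sketch of the utility above (the batch size and step count are
# illustrative values, not taken from the source project).
trajectories, observation_spec, action_spec = generate_pendulum_trajectories(
    batch_size=4, max_steps=50)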
Example #18
def test_tf_time_limit_wrapper_with_environment_model(observation_space,
                                                      action_space,
                                                      trajectory_length):
    """
    This test checks that the environment wrapper can in turn be wrapped by the `TimeLimit`
    environment wrapper from TF-Agents.
    """
    ts_spec = time_step_spec(observation_space)

    network = LinearTransitionNetwork(observation_space)
    environment = KerasTransitionModel([network], observation_space,
                                       action_space)
    wrapped_environment = TFTimeLimit(
        EnvironmentModel(
            environment,
            ConstantReward(observation_space, action_space, 0.0),
            ConstantFalseTermination(observation_space),
            create_uniform_initial_state_distribution(observation_space),
        ),
        trajectory_length,
    )

    collect_policy = RandomTFPolicy(ts_spec, action_space)
    replay_buffer_capacity = 1001
    policy_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        collect_policy.trajectory_spec,
        batch_size=1,
        max_length=replay_buffer_capacity)

    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        wrapped_environment,
        collect_policy,
        observers=[policy_training_buffer.add_batch],
        num_episodes=1,
    )
    collect_driver.run()

    trajectories = policy_training_buffer.gather_all()

    assert trajectories.step_type.shape == (1, trajectory_length + 1)
Example #19
File: conftest.py Project: adak32/bellman
def _mountain_car_data_fixture():
    tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load("MountainCar-v0"))

    collect_policy = RandomTFPolicy(tf_env.time_step_spec(), tf_env.action_spec())
    replay_buffer_capacity = 5000
    model_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        collect_policy.trajectory_spec,
        batch_size=1,
        max_length=replay_buffer_capacity,
    )

    collect_episodes_per_iteration = 10
    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env,
        collect_policy,
        observers=[model_training_buffer.add_batch],
        num_episodes=collect_episodes_per_iteration,
    )

    collect_driver.run()
    tf_env.close()

    return tf_env, model_training_buffer.gather_all()
Example #20
def test_trajectory_optimiser_with_particles_actions_shape(
        action_space, horizon, population_size, number_of_particles):
    observation = create_uniform_distribution_from_spec(
        OBSERVATION_SPACE_SPEC).sample(sample_shape=(population_size *
                                                     number_of_particles, ))
    transition_model = TrajectoryOptimiserTransitionModel(
        action_space, repeat(observation))
    reward = ConstantReward(OBSERVATION_SPACE_SPEC, action_space, -1.0)
    termination_model = ConstantFalseTermination(OBSERVATION_SPACE_SPEC)
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=termination_model,
        initial_state_distribution_model=DeterministicInitialStateModel(
            StepType.FIRST),
        batch_size=population_size * number_of_particles,
    )

    time_step_space = time_step_spec(OBSERVATION_SPACE_SPEC)

    policy = RandomTFPolicy(time_step_space,
                            action_space,
                            automatic_state_reset=False)
    trajectory_optimiser = PolicyTrajectoryOptimiser(
        policy,
        horizon=horizon,
        population_size=population_size,
        number_of_particles=number_of_particles,
        max_iterations=2,
    )

    initial_time_step = restart(tf.expand_dims(observation[0], axis=0))
    optimal_actions = trajectory_optimiser.optimise(initial_time_step,
                                                    environment_model)

    assert optimal_actions.shape == (horizon + 1, ) + action_space.shape
Example #21
    def create_real_drivers(
        self,
        real_replay_buffer: ReplayBuffer,
        train_metrics: List[TFStepMetric],
    ) -> Tuple[Driver, Driver]:
        """
        Create the drivers for interacting with the real environment.

        This method creates two drivers: one uses the agent's "collect" policy, the other uses a
        uniform random policy.

        Note that the random policy is defined with the same `info_spec` as the agent's "collect"
        policy. The `info_spec` of the trajectories generated by the random policy must conform to
        the expectations of the agent when the data is used for training.
        """
        agent_collect_driver = TFDriver(
            self._environment,
            self._agent.collect_policy,
            observers=[real_replay_buffer.add_batch] + train_metrics,
            max_steps=self._max_steps,
            disable_tf_function=not self._use_tf_function,
        )
        random_policy = RandomTFPolicy(
            self._environment.time_step_spec(),
            self._environment.action_spec(),
            info_spec=self._agent.collect_policy.info_spec,
        )
        random_policy_collect_driver = TFDriver(
            self._environment,
            random_policy,
            observers=[real_replay_buffer.add_batch] + train_metrics,
            max_steps=self._max_steps,
            disable_tf_function=not self._use_tf_function,
        )

        return agent_collect_driver, random_policy_collect_driver
Example #22
def random_shooting_trajectory_optimisation(
    time_step_spec: TimeStep,
    action_spec: types.NestedTensorSpec,
    horizon: int,
    population_size: int,
    number_of_particles: int,
) -> TrajectoryOptimiser:
    """
    Construct a trajectory optimiser which uses the random shooting method. This method relies
    on `RandomTFPolicy` as a uniformly random policy.

    :param time_step_spec: A `TimeStep` spec of the expected time_steps.
    :param action_spec: A nest of BoundedTensorSpec representing the actions.
    :param horizon: Number of steps taken in the environment in each virtual rollout.
    :param population_size: The number of candidate sequences of actions at each iteration.
    :param number_of_particles: Number of monte-carlo rollouts of each action trajectory.

    :return: A `TrajectoryOptimiser` object which uses the random shooting method.
    """
    policy = RandomTFPolicy(time_step_spec, action_spec)
    trajectory_optimiser = PolicyTrajectoryOptimiser(policy, horizon,
                                                     population_size,
                                                     number_of_particles)
    return trajectory_optimiser
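# A minimal usage sketch of the factory above. The specs are illustrative
# assumptions (a 3-dimensional observation and a bounded scalar action), not
# values taken from the source project.
import tensorflow as tf
from tf_agents.specs import tensor_spec
from tf_agents.trajectories.time_step import time_step_spec

example_observation_spec = tensor_spec.TensorSpec((3,), tf.float32)
example_action_spec = tensor_spec.BoundedTensorSpec((1,), tf.float32,
                                                    minimum=-1.0, maximum=1.0)
optimiser = random_shooting_trajectory_optimisation(
    time_step_spec(example_observation_spec),
    example_action_spec,
    horizon=10,
    population_size=100,
    number_of_particles=1,
)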
Example #23
logging.getLogger().setLevel(logging.INFO)

## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------

collect_driver = DynamicStepDriver(
    tf_env,  # Env to play with
    agent.collect_policy,  # Collect policy of the agent
    observers=[replay_buffer_observer] +
    train_metrics,  # pass to all observers
    num_steps=1)
# Speed up as tensorflow function
collect_driver.run = function(collect_driver.run)

initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                        tf_env.action_spec())
init_driver = DynamicStepDriver(
    tf_env,
    initial_collect_policy,
    observers=[replay_buffer.add_batch,
               ShowProgress(init_replay_buffer)],
    num_steps=init_replay_buffer)
final_time_step, final_policy_state = init_driver.run()

## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------

dataset = replay_buffer.as_dataset(
    sample_batch_size=dataset_sample_batch_size,
    num_steps=dataset_num_steps)
Example #24
def breakout_v4(seed=42):
    env = suite_gym.load("Breakout-v4")
    env.seed(seed)
    env.reset()

    repeating_env = ActionRepeat(env, times=4)
    for name in dir(tf_agents.environments.wrappers):
        obj = getattr(tf_agents.environments.wrappers, name)
        if hasattr(obj, "__base__") and issubclass(
                obj, tf_agents.environments.wrappers.PyEnvironmentBaseWrapper):
            print("{:27s} {}".format(name, obj.__doc__.split("\n")[0]))

    limited_repeating_env = suite_gym.load(
        "Breakout-v4",
        gym_env_wrappers=[partial(TimeLimit, max_episode_steps=10000)],
        env_wrappers=[partial(ActionRepeat, times=4)],
    )

    max_episode_steps = 27000  # <=> 108k ALE frames since 1 step = 4 frames
    environment_name = "BreakoutNoFrameskip-v4"

    env = suite_atari.load(
        environment_name,
        max_episode_steps=max_episode_steps,
        gym_env_wrappers=[AtariPreprocessing, FrameStack4],
    )

    env.seed(42)
    env.reset()
    time_step = env.step(np.array(1))  # FIRE
    for _ in range(4):
        time_step = env.step(np.array(3))  # LEFT

    def plot_observation(obs):
        # Since there are only 3 color channels, you cannot display 4 frames
        # with one primary color per frame. So this code computes the delta between
        # the current frame and the mean of the other frames, and it adds this delta
        # to the red and blue channels to get a pink color for the current frame.
        obs = obs.astype(np.float32)
        img_ = obs[..., :3]
        current_frame_delta = np.maximum(
            obs[..., 3] - obs[..., :3].mean(axis=-1), 0.0)
        img_[..., 0] += current_frame_delta
        img_[..., 2] += current_frame_delta
        img_ = np.clip(img_ / 150, 0, 1)
        plt.imshow(img_)
        plt.axis("off")

    plt.figure(figsize=(6, 6))
    plot_observation(time_step.observation)
    plt.tight_layout()
    plt.savefig("./images/preprocessed_breakout_plot.png",
                format="png",
                dpi=300)
    plt.show()

    tf_env = TFPyEnvironment(env)

    preprocessing_layer = keras.layers.Lambda(
        lambda obs: tf.cast(obs, np.float32) / 255.0)
    conv_layer_params = [(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]
    fc_layer_params = [512]

    q_net = QNetwork(
        tf_env.observation_spec(),
        tf_env.action_spec(),
        preprocessing_layers=preprocessing_layer,
        conv_layer_params=conv_layer_params,
        fc_layer_params=fc_layer_params,
    )

    # see TF-agents issue #113
    # optimizer = keras.optimizers.RMSprop(lr=2.5e-4, rho=0.95, momentum=0.0,
    #                                     epsilon=0.00001, centered=True)

    train_step = tf.Variable(0)
    update_period = 4  # run a training step every 4 collect steps
    optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate=2.5e-4,
                                                    decay=0.95,
                                                    momentum=0.0,
                                                    epsilon=0.00001,
                                                    centered=True)
    epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=1.0,  # initial ε
        decay_steps=250000 // update_period,  # <=> 1,000,000 ALE frames
        end_learning_rate=0.01,
    )  # final ε
    agent = DqnAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        target_update_period=2000,  # <=> 32,000 ALE frames
        td_errors_loss_fn=keras.losses.Huber(reduction="none"),
        gamma=0.99,  # discount factor
        train_step_counter=train_step,
        epsilon_greedy=lambda: epsilon_fn(train_step),
    )
    agent.initialize()

    from tf_agents.replay_buffers import tf_uniform_replay_buffer

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=1000000)

    replay_buffer_observer = replay_buffer.add_batch

    class ShowProgress:
        def __init__(self, total):
            self.counter = 0
            self.total = total

        def __call__(self, trajectory):
            if not trajectory.is_boundary():
                self.counter += 1
            if self.counter % 100 == 0:
                print("\r{}/{}".format(self.counter, self.total), end="")

    from tf_agents.metrics import tf_metrics

    train_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(),
        tf_metrics.AverageEpisodeLengthMetric(),
    ]

    from tf_agents.eval.metric_utils import log_metrics
    import logging

    logging.getLogger().setLevel(logging.INFO)
    log_metrics(train_metrics)

    from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver

    collect_driver = DynamicStepDriver(
        tf_env,
        agent.collect_policy,
        observers=[replay_buffer_observer] + train_metrics,
        num_steps=update_period,
    )  # collect 4 steps for each training iteration

    from tf_agents.policies.random_tf_policy import RandomTFPolicy

    initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                            tf_env.action_spec())
    init_driver = DynamicStepDriver(
        tf_env,
        initial_collect_policy,
        observers=[replay_buffer.add_batch,
                   ShowProgress(20000)],
        num_steps=20000,
    )  # <=> 80,000 ALE frames
    final_time_step, final_policy_state = init_driver.run()
Example #25
def test_all_mepo_variants_work(transition_model, trajectory_sampler,
                                model_free_agent_type):
    """
    Mepo Agent has prespecified transition model, trajectory sampler and model-free agent
    types. Here we check that all combinations execute without errors.
    """

    # set up the environment and prespecified model components
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(
        observation_spec)

    # some parameters need to be set correctly
    ensemble_size = 2
    num_elites = 10
    population_size = num_elites + 10
    horizon = 1

    # define the agent; many transition model and trajectory optimiser parameters
    # can be arbitrary
    agent = MepoAgent(
        time_step_spec,
        action_spec,
        transition_model,
        1,
        10,
        tf.nn.relu,
        ensemble_size,
        False,
        1,
        1,
        [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
        reward_model,
        initial_state_distribution_model,
        trajectory_sampler,
        horizon,
        population_size,
        model_free_agent_type,
        1,
        10,
        tf.nn.relu,
        2,
    )

    # we need some training data
    random_policy = RandomTFPolicy(
        time_step_spec,
        action_spec,
        info_spec=agent.collect_policy.info_spec,
    )
    model_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        random_policy.trajectory_spec, batch_size=1, max_length=1000)
    collect_driver_random_policy = TFDriver(
        tf_env,
        random_policy,
        observers=[model_training_buffer.add_batch],
        max_steps=10,
        disable_tf_function=True,
    )
    initial_time_step = tf_env.reset()
    collect_driver_random_policy.run(initial_time_step)
    pets_agent_trainer = BackgroundPlanningAgentTrainer(10, 10)
    tf_training_scheduler = pets_agent_trainer.create_training_scheduler(
        agent, model_training_buffer)
    training_losses = tf_training_scheduler.maybe_train(
        tf.constant(10, dtype=tf.int64))
    assert EnvironmentModelComponents.TRANSITION in training_losses

    # test the agent
    collect_driver_planning_policy = TFDriver(
        tf_env,
        agent.collect_policy,
        observers=[model_training_buffer.add_batch],
        max_steps=10,
        disable_tf_function=True,
    )
    time_step = tf_env.reset()
    collect_driver_planning_policy.run(time_step)
Example #26
    def compile_model(self,
                      X_train,
                      y_train,
                      layers: list = [],
                      imb_ratio: float = None,
                      loss_fn=common.element_wise_squared_loss) -> None:
        """Initializes the neural networks, DDQN-agent, collect policies and replay buffer.

        :param X_train: Training data for the model.
        :type  X_train: np.ndarray
        :param y_train: Labels corresponding to `X_train`.  1 for the positive class, 0 for the negative class.
        :type  y_train: np.ndarray
        :param layers: List of layers to feed into the TF-agents custom Sequential(!) layer.
        :type  layers: list
        :param imb_ratio: The imbalance ratio of the data.
        :type  imb_ratio: float
        :param loss_fn: Callable loss function
        :type  loss_fn: tf.compat.v1.losses

        :return: None
        :rtype: NoneType
        """
        if imb_ratio is None:
            imb_ratio = imbalance_ratio(y_train)

        self.train_env = TFPyEnvironment(
            ClassifierEnv(X_train, y_train, imb_ratio))
        self.global_episode = tf.Variable(
            0, name="global_episode", dtype=np.int64,
            trainable=False)  # Global train episode counter

        # Custom epsilon decay: https://github.com/tensorflow/agents/issues/339
        epsilon_decay = tf.compat.v1.train.polynomial_decay(
            1.0,
            self.global_episode,
            self.decay_episodes,
            end_learning_rate=self.min_epsilon)

        self.q_net = Sequential(layers, self.train_env.observation_spec())

        self.agent = DdqnAgent(
            self.train_env.time_step_spec(),
            self.train_env.action_spec(),
            q_network=self.q_net,
            optimizer=Adam(learning_rate=self.learning_rate),
            td_errors_loss_fn=loss_fn,
            train_step_counter=self.global_episode,
            target_update_period=self.target_update_period,
            target_update_tau=self.target_update_tau,
            gamma=self.gamma,
            epsilon_greedy=epsilon_decay,
            n_step_update=self.n_step_update,
            gradient_clipping=self.gradient_clipping)
        self.agent.initialize()

        self.random_policy = RandomTFPolicy(self.train_env.time_step_spec(),
                                            self.train_env.action_spec())
        self.replay_buffer = TFUniformReplayBuffer(
            data_spec=self.agent.collect_data_spec,
            batch_size=self.train_env.batch_size,
            max_length=self.memory_length)

        self.warmup_driver = DynamicStepDriver(
            self.train_env,
            self.random_policy,
            observers=[self.replay_buffer.add_batch],
            num_steps=self.warmup_steps)  # Uses a random policy

        self.collect_driver = DynamicStepDriver(
            self.train_env,
            self.agent.collect_policy,
            observers=[self.replay_buffer.add_batch],
            num_steps=self.collect_steps_per_episode
        )  # Uses the epsilon-greedy policy of the agent

        self.agent.train = common.function(self.agent.train)  # Optimization
        self.warmup_driver.run = common.function(self.warmup_driver.run)
        self.collect_driver.run = common.function(self.collect_driver.run)

        self.compiled = True
Example #27
class TrainDDQN():
    """Wrapper for DDQN training, validation, saving etc."""
    def __init__(self,
                 episodes: int,
                 warmup_steps: int,
                 learning_rate: float,
                 gamma: float,
                 min_epsilon: float,
                 decay_episodes: int,
                 model_path: str = None,
                 log_dir: str = None,
                 batch_size: int = 64,
                 memory_length: int = None,
                 collect_steps_per_episode: int = 1,
                 val_every: int = None,
                 target_update_period: int = 1,
                 target_update_tau: float = 1.0,
                 progressbar: bool = True,
                 n_step_update: int = 1,
                 gradient_clipping: float = 1.0,
                 collect_every: int = 1) -> None:
        """
        Wrapper to make training easier.
        Code is partly based on https://www.tensorflow.org/agents/tutorials/1_dqn_tutorial

        :param episodes: Number of training episodes
        :type  episodes: int
        :param warmup_steps: Number of steps to fill the Replay Buffer with random state-action pairs before training starts
        :type  warmup_steps: int
        :param learning_rate: Learning Rate for the Adam Optimizer
        :type  learning_rate: float
        :param gamma: Discount factor for the Q-values
        :type  gamma: float
        :param min_epsilon: Lowest and final value for epsilon
        :type  min_epsilon: float
        :param decay_episodes: Amount of episodes to decay from 1 to `min_epsilon`
        :type  decay_episodes: int
        :param model_path: Location to save the trained model
        :type  model_path: str
        :param log_dir: Location to save the logs, useful for TensorBoard
        :type  log_dir: str
        :param batch_size: Number of samples in minibatch to train on each step
        :type  batch_size: int
        :param memory_length: Maximum size of the Replay Buffer
        :type  memory_length: int
        :param collect_steps_per_episode: Amount of data to collect for Replay Buffer each episode
        :type  collect_steps_per_episode: int
        :param collect_every: Step interval to collect data during training
        :type  collect_every: int
        :param val_every: Validate the model every X episodes using the `collect_metrics()` function
        :type  val_every: int
        :param target_update_period: Update the target Q-network every X episodes
        :type  target_update_period: int
        :param target_update_tau: Parameter for softening the `target_update_period`
        :type  target_update_tau: float
        :param progressbar: Enable or disable the progressbar for collecting data and training
        :type  progressbar: bool

        :return: None
        :rtype: NoneType
        """
        self.episodes = episodes  # Total episodes
        self.warmup_steps = warmup_steps  # Amount of warmup steps before training
        self.batch_size = batch_size  # Batch size of Replay Memory
        self.collect_steps_per_episode = collect_steps_per_episode  # Amount of steps to collect data each episode
        self.collect_every = collect_every  # Step interval to collect data during training
        self.learning_rate = learning_rate  # Learning Rate
        self.gamma = gamma  # Discount factor
        self.min_epsilon = min_epsilon  # Minimal chance of choosing random action
        self.decay_episodes = decay_episodes  # Number of episodes to decay from 1.0 to `EPSILON`
        self.target_update_period = target_update_period  # Period for soft updates
        self.target_update_tau = target_update_tau
        self.progressbar = progressbar  # Enable or disable the progressbar for collecting data and training
        self.n_step_update = n_step_update
        self.gradient_clipping = gradient_clipping  # Clip the loss
        self.compiled = False
        NOW = datetime.now().strftime("%Y%m%d_%H%M%S")

        if memory_length is not None:
            self.memory_length = memory_length  # Max Replay Memory length
        else:
            self.memory_length = warmup_steps

        if val_every is not None:
            self.val_every = val_every  # Validate the policy every `val_every` episodes
        else:
            self.val_every = self.episodes // min(
                50, self.episodes
            )  # Can't validate the model 50 times if self.episodes < 50

        if model_path is not None:
            self.model_path = model_path
        else:
            self.model_path = "./models/" + NOW + ".pkl"

        if log_dir is None:
            log_dir = "./logs/" + NOW
        self.writer = tf.summary.create_file_writer(log_dir)

    def compile_model(self,
                      X_train,
                      y_train,
                      layers: list = [],
                      imb_ratio: float = None,
                      loss_fn=common.element_wise_squared_loss) -> None:
        """Initializes the neural networks, DDQN-agent, collect policies and replay buffer.

        :param X_train: Training data for the model.
        :type  X_train: np.ndarray
        :param y_train: Labels corresponding to `X_train`.  1 for the positive class, 0 for the negative class.
        :type  y_train: np.ndarray
        :param layers: List of layers to feed into the TF-agents custom Sequential(!) layer.
        :type  layers: list
        :param imb_ratio: The imbalance ratio of the data.
        :type  imb_ratio: float
        :param loss_fn: Callable loss function
        :type  loss_fn: tf.compat.v1.losses

        :return: None
        :rtype: NoneType
        """
        if imb_ratio is None:
            imb_ratio = imbalance_ratio(y_train)

        self.train_env = TFPyEnvironment(
            ClassifierEnv(X_train, y_train, imb_ratio))
        self.global_episode = tf.Variable(
            0, name="global_episode", dtype=np.int64,
            trainable=False)  # Global train episode counter

        # Custom epsilon decay: https://github.com/tensorflow/agents/issues/339
        epsilon_decay = tf.compat.v1.train.polynomial_decay(
            1.0,
            self.global_episode,
            self.decay_episodes,
            end_learning_rate=self.min_epsilon)

        self.q_net = Sequential(layers, self.train_env.observation_spec())

        self.agent = DdqnAgent(
            self.train_env.time_step_spec(),
            self.train_env.action_spec(),
            q_network=self.q_net,
            optimizer=Adam(learning_rate=self.learning_rate),
            td_errors_loss_fn=loss_fn,
            train_step_counter=self.global_episode,
            target_update_period=self.target_update_period,
            target_update_tau=self.target_update_tau,
            gamma=self.gamma,
            epsilon_greedy=epsilon_decay,
            n_step_update=self.n_step_update,
            gradient_clipping=self.gradient_clipping)
        self.agent.initialize()

        self.random_policy = RandomTFPolicy(self.train_env.time_step_spec(),
                                            self.train_env.action_spec())
        self.replay_buffer = TFUniformReplayBuffer(
            data_spec=self.agent.collect_data_spec,
            batch_size=self.train_env.batch_size,
            max_length=self.memory_length)

        self.warmup_driver = DynamicStepDriver(
            self.train_env,
            self.random_policy,
            observers=[self.replay_buffer.add_batch],
            num_steps=self.warmup_steps)  # Uses a random policy

        self.collect_driver = DynamicStepDriver(
            self.train_env,
            self.agent.collect_policy,
            observers=[self.replay_buffer.add_batch],
            num_steps=self.collect_steps_per_episode
        )  # Uses the epsilon-greedy policy of the agent

        self.agent.train = common.function(self.agent.train)  # Optimization
        self.warmup_driver.run = common.function(self.warmup_driver.run)
        self.collect_driver.run = common.function(self.collect_driver.run)

        self.compiled = True

    def train(self, *args) -> None:
        """Starts the training of the model. Includes warmup period, metrics collection and model saving.

        :param *args: All arguments will be passed to `collect_metrics()`.
            This can be useful to pass callables, testing environments or validation data.
            Overwrite the TrainDDQN.collect_metrics() function to use your own *args.
        :type  *args: Any

        :return: None
        :rtype: NoneType, last step is saving the model as a side-effect
        """
        assert self.compiled, "Model must be compiled with model.compile_model(X_train, y_train, layers) before training."

        # Warmup period, fill memory with random actions
        if self.progressbar:
            print(
                f"\033[92mCollecting data for {self.warmup_steps:_} steps... This might take a few minutes...\033[0m"
            )

        self.warmup_driver.run(
            time_step=None,
            policy_state=self.random_policy.get_initial_state(
                self.train_env.batch_size))

        if self.progressbar:
            print(
                f"\033[92m{self.replay_buffer.num_frames():_} frames collected!\033[0m"
            )

        dataset = self.replay_buffer.as_dataset(
            sample_batch_size=self.batch_size,
            num_steps=self.n_step_update + 1,
            num_parallel_calls=data.experimental.AUTOTUNE).prefetch(
                data.experimental.AUTOTUNE)
        iterator = iter(dataset)

        def _train():
            experiences, _ = next(iterator)
            return self.agent.train(experiences).loss

        _train = common.function(_train)  # Optimization

        ts = None
        policy_state = self.agent.collect_policy.get_initial_state(
            self.train_env.batch_size)
        self.collect_metrics(*args)  # Initial collection for step 0
        pbar = tqdm(total=self.episodes,
                    disable=(not self.progressbar),
                    desc="Training the DDQN")  # TQDM progressbar
        for _ in range(self.episodes):
            if not self.global_episode % self.collect_every:
                # Collect a few steps using collect_policy and save to `replay_buffer`
                if self.collect_steps_per_episode != 0:
                    ts, policy_state = self.collect_driver.run(
                        time_step=ts, policy_state=policy_state)
                pbar.update(
                    self.collect_every
                )  # More stable TQDM updates, collecting could take some time

            # Sample a batch of data from `replay_buffer` and update the agent's network
            train_loss = _train()

            if not self.global_episode % self.val_every:
                with self.writer.as_default():
                    tf.summary.scalar("train_loss",
                                      train_loss,
                                      step=self.global_episode)

                self.collect_metrics(*args)
        pbar.close()

    def collect_metrics(self,
                        X_val: np.ndarray,
                        y_val: np.ndarray,
                        save_best: str = None):
        """Collects metrics using the trained Q-network.

        :param X_val: Features of validation data, same shape as X_train
        :type  X_val: np.ndarray
        :param y_val: Labels of validation data, same shape as y_train
        :type  y_val: np.ndarray
        :param save_best: Saving the best model of all validation runs based on given metric:
            Choose one of: {Gmean, F1, Precision, Recall, TP, TN, FP, FN}
            This improves stability since the model at the last episode is not guaranteed to be the best model.
        :type  save_best: str
        """
        y_pred = network_predictions(self.agent._target_q_network, X_val)
        stats = classification_metrics(y_val, y_pred)
        avgQ = np.mean(decision_function(self.agent._target_q_network,
                                         X_val))  # Max action for each x in X

        if save_best is not None:
            if not hasattr(self, "best_score"):  # If no best model yet
                self.best_score = 0.0

            if stats.get(save_best) >= self.best_score:  # Overwrite best model
                self.save_network(
                )  # Saving directly to avoid shallow copy without trained weights
                self.best_score = stats.get(save_best)

        with self.writer.as_default():
            tf.summary.scalar(
                "AverageQ", avgQ,
                step=self.global_episode)  # Average Q-value for this epoch
            for k, v in stats.items():
                tf.summary.scalar(k, v, step=self.global_episode)

    def evaluate(self, X_test, y_test, X_train=None, y_train=None):
        """
        Final evaluation of trained Q-network with X_test and y_test.
        Optional PR and ROC curve comparison to X_train, y_train to ensure no overfitting is taking place.

        :param X_test: Features of test data, same shape as X_train
        :type  X_test: np.ndarray
        :param y_test: Labels of test data, same shape as y_train
        :type  y_test: np.ndarray
        :param X_train: Features of train data
        :type  X_train: np.ndarray
        :param y_train: Labels of train data
        :type  y_train: np.ndarray
        """
        if hasattr(self, "best_score"):
            print(f"\033[92mBest score: {self.best_score:6f}!\033[0m")
            network = self.load_network(
                self.model_path)  # Load best saved model
        else:
            network = self.agent._target_q_network  # Load latest target model

        if (X_train is not None) and (y_train is not None):
            plot_pr_curve(network, X_test, y_test, X_train, y_train)
            plot_roc_curve(network, X_test, y_test, X_train, y_train)

        y_pred = network_predictions(network, X_test)
        return classification_metrics(y_test, y_pred)

    def save_network(self):
        """Saves Q-network as pickle to `model_path`."""
        with open(self.model_path, "wb") as f:  # Save Q-network as pickle
            pickle.dump(self.agent._target_q_network, f)

    @staticmethod
    def load_network(fp: str):
        """Static method to load Q-network pickle from given filepath.

        :param fp: Filepath to the saved pickle of the network
        :type  fp: str

        :returns: The network-object loaded from a pickle file.
        :rtype: tensorflow.keras.models.Model
        """
        with open(fp, "rb") as f:  # Load the Q-network
            network = pickle.load(f)
        return network
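# A hedged usage sketch of the TrainDDQN wrapper above. The data arrays, layer
# sizes and hyperparameters are placeholder assumptions (ClassifierEnv is assumed
# to expose a two-action positive/negative classification task, as suggested by
# the docstrings), not values from the source project.
import numpy as np
from tensorflow.keras.layers import Dense

X_train = np.random.rand(1000, 10).astype(np.float32)
y_train = np.random.randint(0, 2, size=1000)
X_val = np.random.rand(200, 10).astype(np.float32)
y_val = np.random.randint(0, 2, size=200)

trainer = TrainDDQN(episodes=500, warmup_steps=1000, learning_rate=1e-3,
                    gamma=0.99, min_epsilon=0.05, decay_episodes=250)
trainer.compile_model(X_train, y_train, layers=[Dense(64, activation="relu"),
                                                Dense(2)])
trainer.train(X_val, y_val)
print(trainer.evaluate(X_val, y_val))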
Example #28
def create_random_policy(env: TFEnvironment):
    return RandomTFPolicy(action_spec=env.action_spec(),
                          time_step_spec=env.time_step_spec())
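# A minimal usage sketch, assuming a CartPole environment purely for illustration.
from tf_agents.environments import suite_gym
from tf_agents.environments.tf_py_environment import TFPyEnvironment

cartpole_env = TFPyEnvironment(suite_gym.load("CartPole-v0"))
cartpole_policy = create_random_policy(cartpole_env)
first_action = cartpole_policy.action(cartpole_env.reset()).action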
Example #29

def collect_step(environment, policy):
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)

    # Add trajectory to the replay buffer
    replay_buffer.add_batch(traj)


# execute the random policy in the environment for a few steps
# and record the data (observations, actions, rewards etc) in the replay buffer
print("Collecting initial random steps")
random_policy = RandomTFPolicy(train_env.time_step_spec(),
                               train_env.action_spec())
for _ in range(initial_collect_steps):
    collect_step(train_env, random_policy)

dataset = replay_buffer.as_dataset(num_parallel_calls=3,
                                   sample_batch_size=batch_size,
                                   num_steps=2).prefetch(3)
iterator = iter(dataset)

# train the agent
print("Training the agent")
tf_agent.train = common.function(tf_agent.train)

# Reset the train step
tf_agent.train_step_counter.assign(0)
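
# A hedged sketch of the training loop that typically follows this setup (the
# iteration count is a placeholder; tf_agent, train_env, iterator and
# collect_step come from the snippet above).
num_train_iterations = 1000
for _ in range(num_train_iterations):
    # Collect one step with the agent's collect policy and store it in the buffer.
    collect_step(train_env, tf_agent.collect_policy)

    # Sample a minibatch of transitions and take one gradient step.
    experience, _ = next(iterator)
    train_loss = tf_agent.train(experience).loss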
Example #30
def main(_):
    # Environment
    env_name = "Breakout-v4"
    train_num_parallel_environments = 5
    max_steps_per_episode = 1000
    # Replay buffer
    replay_buffer_capacity = 50000
    init_replay_buffer = 500
    # Driver
    collect_steps_per_iteration = 1 * train_num_parallel_environments
    # Training
    train_batch_size = 32
    train_iterations = 100000
    train_summary_interval = 200
    train_checkpoint_interval = 200
    # Evaluation
    eval_num_parallel_environments = 5
    eval_summary_interval = 500
    eval_num_episodes = 20
    # File paths
    path = pathlib.Path(__file__)
    parent_dir = path.parent.resolve()
    folder_name = path.stem + time.strftime("_%Y%m%d_%H%M%S")
    train_checkpoint_dir = str(parent_dir / folder_name / "train_checkpoint")
    train_summary_dir = str(parent_dir / folder_name / "train_summary")
    eval_summary_dir = str(parent_dir / folder_name / "eval_summary")

    # Parallel training environment
    tf_env = TFPyEnvironment(
        ParallelPyEnvironment([
            lambda: suite_atari.load(
                env_name,
                env_wrappers=
                [lambda env: TimeLimit(env, duration=max_steps_per_episode)],
                gym_env_wrappers=[AtariPreprocessing, FrameStack4],
            )
        ] * train_num_parallel_environments))
    tf_env.seed([42] * tf_env.batch_size)
    tf_env.reset()

    # Parallel evaluation environment
    eval_tf_env = TFPyEnvironment(
        ParallelPyEnvironment([
            lambda: suite_atari.load(
                env_name,
                env_wrappers=
                [lambda env: TimeLimit(env, duration=max_steps_per_episode)],
                gym_env_wrappers=[AtariPreprocessing, FrameStack4],
            )
        ] * eval_num_parallel_environments))
    eval_tf_env.seed([42] * eval_tf_env.batch_size)
    eval_tf_env.reset()

    # Creating the Deep Q-Network
    preprocessing_layer = keras.layers.Lambda(
        lambda obs: tf.cast(obs, np.float32) / 255.)

    conv_layer_params = [(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]
    fc_layer_params = [512]

    q_net = QNetwork(tf_env.observation_spec(),
                     tf_env.action_spec(),
                     preprocessing_layers=preprocessing_layer,
                     conv_layer_params=conv_layer_params,
                     fc_layer_params=fc_layer_params)

    # Creating the DQN Agent
    optimizer = keras.optimizers.RMSprop(lr=2.5e-4,
                                         rho=0.95,
                                         momentum=0.0,
                                         epsilon=0.00001,
                                         centered=True)

    epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=1.0,  # initial ε
        decay_steps=2500000,
        end_learning_rate=0.01)  # final ε

    global_step = tf.compat.v1.train.get_or_create_global_step()

    agent = DqnAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        target_update_period=200,
        td_errors_loss_fn=keras.losses.Huber(reduction="none"),
        gamma=0.99,  # discount factor
        train_step_counter=global_step,
        epsilon_greedy=lambda: epsilon_fn(global_step))
    agent.initialize()

    # Creating the Replay Buffer
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=replay_buffer_capacity)

    # Observer: Replay Buffer Observer
    replay_buffer_observer = replay_buffer.add_batch

    # Observer: Training Metrics
    train_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(batch_size=tf_env.batch_size),
        tf_metrics.AverageEpisodeLengthMetric(batch_size=tf_env.batch_size),
    ]

    # Creating the Collect Driver
    collect_driver = DynamicStepDriver(tf_env,
                                       agent.collect_policy,
                                       observers=[replay_buffer_observer] +
                                       train_metrics,
                                       num_steps=collect_steps_per_iteration)

    # Initialize replay buffer
    initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                            tf_env.action_spec())
    init_driver = DynamicStepDriver(
        tf_env,
        initial_collect_policy,
        observers=[replay_buffer_observer,
                   ShowProgress()],
        num_steps=init_replay_buffer)
    final_time_step, final_policy_state = init_driver.run()

    # Creating the Dataset
    dataset = replay_buffer.as_dataset(sample_batch_size=train_batch_size,
                                       num_steps=2,
                                       num_parallel_calls=3).prefetch(3)

    # Optimize by wrapping some of the code in a graph using TF function.
    collect_driver.run = function(collect_driver.run)
    agent.train = function(agent.train)

    print("\n\n++++++++++++++++++++++++++++++++++\n")

    # Create checkpoint
    train_checkpointer = Checkpointer(
        ckpt_dir=train_checkpoint_dir,
        max_to_keep=1,
        agent=agent,
        # replay_buffer=replay_buffer,
        global_step=global_step,
        # metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics')
    )

    # Restore checkpoint
    # train_checkpointer.initialize_or_restore()

    # Summary writers and metrics
    train_summary_writer = tf.summary.create_file_writer(train_summary_dir)
    eval_summary_writer = tf.summary.create_file_writer(eval_summary_dir)
    eval_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(batch_size=eval_tf_env.batch_size,
                                       buffer_size=eval_num_episodes),
        tf_metrics.AverageEpisodeLengthMetric(
            batch_size=eval_tf_env.batch_size, buffer_size=eval_num_episodes)
    ]

    # Create evaluate callback function
    eval_callback = evaluate(eval_metrics=eval_metrics,
                             eval_tf_env=eval_tf_env,
                             eval_policy=agent.policy,
                             eval_num_episodes=eval_num_episodes,
                             train_step=global_step,
                             eval_summary_writer=eval_summary_writer)

    # Train agent
    train_agent(tf_env=tf_env,
                train_iterations=train_iterations,
                global_step=global_step,
                agent=agent,
                dataset=dataset,
                collect_driver=collect_driver,
                train_metrics=train_metrics,
                train_checkpointer=train_checkpointer,
                train_checkpoint_interval=train_checkpoint_interval,
                train_summary_writer=train_summary_writer,
                train_summary_interval=train_summary_interval,
                eval_summary_interval=eval_summary_interval,
                eval_callback=eval_callback)

    print("\n\n++++++++++ END OF TF_AGENTS RL TRAINING ++++++++++\n\n")