Example No. 1
    def __init__(
        self,
        time_step_spec: TimeStep,
        action_spec: NestedTensorSpec,
        transition_model: Union[Tuple[TrainableTransitionModel,
                                      TransitionModelTrainingSpec],
                                TransitionModel],
        reward_model: RewardModel,
        initial_state_distribution_model: InitialStateDistributionModel,
        trajectory_optimiser: TrajectoryOptimiser,
        debug_summaries: bool = False,
        train_step_counter: Optional[tf.Variable] = None,
    ):
        """
        :param time_step_spec: A nest of tf.TypeSpec representing the time_steps.
        :param action_spec: A nest of BoundedTensorSpec representing the actions.
        :param transition_model: A component of the environment model that describes the
            transition dynamics. Either a tuple containing a trainable transition model together
            with training specs, or a pre-specified transition model.
        :param reward_model: A component of the environment model that describes the
            rewards. At the moment only pre-specified reward models are allowed, i.e. the agent
            assumes the reward function is known.
        :param initial_state_distribution_model: A component of the environment model that
            describes the initial state distribution (which can be either deterministic or
            probabilistic). At the moment only pre-specified initial state distribution models
            are allowed, i.e. the agent assumes the initial state distribution is known.
        :param trajectory_optimiser: A TrajectoryOptimiser which takes an environment model and
            optimises a sequence of actions over a given horizon.
        :param debug_summaries: A bool; if true, subclasses should gather debug summaries.
        :param train_step_counter: An optional counter to increment every time the train op is run.
            Defaults to the global_step.
        """

        # setting up the environment model and policy
        if isinstance(transition_model, tuple):
            _transition_model, _ = transition_model
        else:
            _transition_model = transition_model  # type: ignore
        environment_model = EnvironmentModel(
            transition_model=_transition_model,
            reward_model=reward_model,
            termination_model=ConstantFalseTermination(
                _transition_model.observation_space_spec),
            initial_state_distribution_model=initial_state_distribution_model,
        )
        planning_policy = PlanningPolicy(environment_model,
                                         trajectory_optimiser)

        super().__init__(
            time_step_spec,
            action_spec,
            transition_model,
            reward_model,
            environment_model.termination_model,
            initial_state_distribution_model,
            planning_policy,
            planning_policy,
            debug_summaries=debug_summaries,
            train_step_counter=train_step_counter,
        )
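
The constructor above accepts either a bare transition model or a (model, training spec) tuple. The following is a minimal, self-contained sketch of that unpacking pattern in isolation; the function name and the use of Any in place of the library's own types are assumptions made for illustration, not part of the snippet above.

from typing import Any, Optional, Tuple, Union

def unpack_transition_model(
    transition_model: Union[Tuple[Any, Any], Any],
) -> Tuple[Any, Optional[Any]]:
    """Return (model, training_spec), with training_spec set to None if absent."""
    if isinstance(transition_model, tuple):
        # callers may pass a (trainable model, training spec) pair
        model, training_spec = transition_model
        return model, training_spec
    # otherwise a pre-specified transition model was passed directly
    return transition_model, None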
Example No. 2
def assert_rollouts_are_close_to_actuals(model, max_steps):
    tf_env = tf_py_environment.TFPyEnvironment(
        create_pendulum_environment(max_steps))
    collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                    tf_env.action_spec())

    test_trajectory = policy_evaluation(tf_env,
                                        collect_policy,
                                        num_episodes=1,
                                        max_buffer_capacity=200,
                                        use_function=True)

    start_state = test_trajectory.observation[0, 0, :]

    env_model = TFTimeLimit(
        EnvironmentModel(
            model,
            PendulumReward(tf_env.observation_spec(), tf_env.action_spec()),
            ConstantFalseTermination(tf_env.observation_spec()),
            DeterministicInitialStateModel(start_state),
            batch_size=30,
        ),
        max_steps + 1,
    )

    replayed_trajectories = replay_actions_across_batch_transition_models(
        env_model, test_trajectory.action[0])

    prediction_mean = tf.reduce_mean(replayed_trajectories.observation, axis=0)
    np.testing.assert_allclose(prediction_mean,
                               test_trajectory.observation[0],
                               atol=1e-1,
                               rtol=2e-1)
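
The tolerances above combine an absolute and a relative term: np.testing.assert_allclose passes when |actual - desired| <= atol + rtol * |desired| holds elementwise. A small self-contained illustration of that criterion (the values are made up for the example):

import numpy as np

desired = np.array([1.0, 2.0, 0.0])
actual = np.array([1.15, 2.3, 0.05])  # each entry is within 1e-1 + 2e-1 * |desired|
np.testing.assert_allclose(actual, desired, atol=1e-1, rtol=2e-1)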
Example No. 3
def get_optimiser_and_environment_model(
        time_step_space,
        observation_space,
        action_space,
        population_size,
        number_of_particles,
        horizon,
        optimiser_policy_trajectory_optimiser_factory,
        sample_shape=(),
):
    reward = ConstantReward(observation_space, action_space, -1.0)

    batched_transition_network = DummyEnsembleTransitionNetwork(
        observation_space)
    batched_transition_model = KerasTransitionModel(
        [batched_transition_network],
        observation_space,
        action_space,
    )

    observation = create_uniform_distribution_from_spec(
        observation_space).sample(sample_shape=sample_shape)
    environment_model = EnvironmentModel(
        transition_model=batched_transition_model,
        reward_model=reward,
        termination_model=ConstantFalseTermination(observation_space),
        initial_state_distribution_model=DeterministicInitialStateModel(
            observation),
        batch_size=population_size,
    )
    trajectory_optimiser = optimiser_policy_trajectory_optimiser_factory(
        time_step_space, action_space, horizon, population_size,
        number_of_particles)
    return trajectory_optimiser, environment_model
Example No. 4
def test_replay_actions_across_batches(observation_space, action_space,
                                       horizon, batch_size):
    transition_network = DummyEnsembleTransitionNetwork(observation_space)
    transition_model = KerasTransitionModel(
        [transition_network],
        observation_space,
        action_space,
    )
    reward = ConstantReward(observation_space, action_space, 0.0)
    termination = ConstantFalseTermination(observation_space)
    initial_state_sampler = create_uniform_initial_state_distribution(
        observation_space)

    env_model = TFTimeLimit(
        EnvironmentModel(transition_model, reward, termination,
                         initial_state_sampler, batch_size),
        horizon,
    )

    actions_distribution = create_uniform_initial_state_distribution(
        observation_space)
    actions = actions_distribution.sample((horizon, ))
    trajectory = replay_actions_across_batch_transition_models(
        env_model, actions)

    assert (trajectory.observation.shape == (
        batch_size,
        horizon,
    ) + observation_space.shape)
Example No. 5
def test_batched_environment_model(observation_space, action_space,
                                   batch_size):
    transition_network = DummyEnsembleTransitionNetwork(observation_space)
    transition_model = KerasTransitionModel(
        [transition_network],
        observation_space,
        action_space,
    )
    reward = ConstantReward(observation_space, action_space, 0.0)
    termination = ConstantFalseTermination(observation_space)
    initial_state_sampler = create_uniform_initial_state_distribution(
        observation_space)

    env_model = EnvironmentModel(transition_model, reward, termination,
                                 initial_state_sampler, batch_size)
    action_distr = create_uniform_distribution_from_spec(action_space)
    single_action = action_distr.sample()
    batch_actions = tf.convert_to_tensor(
        [single_action for _ in range(batch_size)])

    first_step = env_model.reset()
    assert (first_step.step_type == [
        StepType.FIRST for _ in range(batch_size)
    ]).numpy().all()
    assert first_step.observation.shape == [batch_size] + list(
        observation_space.shape)

    next_step = env_model.step(batch_actions)
    assert (next_step.step_type == [StepType.MID
                                    for _ in range(batch_size)]).numpy().all()
    assert next_step.observation.shape == [batch_size] + list(
        observation_space.shape)
    assert next_step.reward.shape == [batch_size]
Example No. 6
def test_generate_virtual_rollouts(observation_space, action_space, batch_size,
                                   horizon):
    observation = create_uniform_distribution_from_spec(
        observation_space).sample()
    network = DummyEnsembleTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = EnvironmentModel(
        transition_model=model,
        reward_model=ConstantReward(observation_space, action_space, -1.0),
        termination_model=ConstantFalseTermination(observation_space),
        initial_state_distribution_model=DeterministicInitialStateModel(
            observation),
        batch_size=batch_size,
    )
    random_policy = RandomTFPolicy(time_step_spec(observation_space),
                                   action_space)

    replay_buffer, driver, wrapped_env_model = virtual_rollouts_buffer_and_driver(
        env_model, random_policy, horizon)

    driver.run(wrapped_env_model.reset())
    trajectory = replay_buffer.gather_all()

    mid_steps = repeat(1, horizon - 1)
    expected_step_types = tf.constant(list(chain([0], mid_steps, [2])))
    batched_step_types = replicate(expected_step_types, (batch_size, ))
    np.testing.assert_array_equal(batched_step_types, trajectory.step_type)
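
The expected step types above follow TF-Agents' encoding, where StepType.FIRST == 0, StepType.MID == 1 and StepType.LAST == 2, so a rollout of length horizon yields horizon + 1 step types per batch entry. A minimal sketch of that layout (plain ints stand in for the StepType constants):

from itertools import chain, repeat

def expected_episode_step_types(horizon):
    # one FIRST, horizon - 1 MIDs, one LAST: horizon + 1 entries in total
    return list(chain([0], repeat(1, horizon - 1), [2]))

assert expected_episode_step_types(3) == [0, 1, 1, 2]  # FIRST, MID, MID, LAST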
Example No. 7
def _create_wrapped_environment(observation_space, action_space, reward):
    network = LinearTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    return EnvironmentModel(
        model,
        ConstantReward(observation_space, action_space, reward),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
    )
Example No. 8
def get_cross_entropy_policy(observation_space, action_space, horizon,
                             batch_size):
    time_step_space = time_step_spec(observation_space)
    network = LinearTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = EnvironmentModel(
        model,
        ConstantReward(observation_space, action_space),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
        batch_size,
    )
    policy = CrossEntropyMethodPolicy(time_step_space, action_space, horizon,
                                      batch_size)
    return env_model, policy
Example No. 9
def _wrapped_environment_fixture(observation_space, action_space, batch_size):
    observation = create_uniform_distribution_from_spec(
        observation_space).sample()
    network = DummyEnsembleTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = EnvironmentModel(
        transition_model=model,
        reward_model=ConstantReward(observation_space, action_space, -1.0),
        termination_model=ConstantFalseTermination(observation_space),
        initial_state_distribution_model=DeterministicInitialStateModel(
            observation),
        batch_size=batch_size,
    )
    wrapped_environment_model = TFTimeLimit(env_model, 2)

    action = create_uniform_distribution_from_spec(action_space).sample(
        (batch_size, ))

    return wrapped_environment_model, action
Example No. 10
def test_random_shooting_with_dynamic_step_driver(observation_space, action_space):
    """
    This test uses the environment wrapper as an adapter so that a driver from TF-Agents can be used
    to generate a rollout. This also serves as an example of how to construct "random shooting"
    rollouts from an environment model.

    The assertion in this test is that the selected action has the expected log_prob value,
    consistent with sampling from a uniform distribution. All this really checks is that the
    preceding code has run successfully.
    """

    network = LinearTransitionNetwork(observation_space)
    environment = KerasTransitionModel([network], observation_space, action_space)
    wrapped_environment = EnvironmentModel(
        environment,
        ConstantReward(observation_space, action_space, 0.0),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
    )

    random_policy = RandomTFPolicy(
        wrapped_environment.time_step_spec(), action_space, emit_log_probability=True
    )

    transition_observer = _RecordLastLogProbTransitionObserver()

    driver = DynamicStepDriver(
        env=wrapped_environment,
        policy=random_policy,
        transition_observers=[transition_observer],
    )
    driver.run()

    last_log_prob = transition_observer.last_log_probability

    uniform_distribution = create_uniform_distribution_from_spec(action_space)
    action_log_prob = uniform_distribution.log_prob(transition_observer.action)
    expected = np.sum(action_log_prob.numpy().astype(np.float32))
    actual = np.sum(last_log_prob.numpy())

    np.testing.assert_array_almost_equal(actual, expected, decimal=4)
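
For an in-range action, the log-probability under a uniform distribution does not depend on the action itself: over [low, high]^d it is -sum(log(high - low)). A minimal sketch using TensorFlow Probability directly (the bounds below are made up; the real test derives the distribution from action_space):

import numpy as np
import tensorflow_probability as tfp

uniform = tfp.distributions.Uniform(low=[-1.0, -1.0], high=[1.0, 1.0])
action = np.array([0.3, -0.7], dtype=np.float32)
log_prob = np.sum(uniform.log_prob(action).numpy())  # -log(2) per dimension
np.testing.assert_allclose(log_prob, -2.0 * np.log(2.0), rtol=1e-6)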
Example No. 11
def test_tf_time_limit_wrapper_with_environment_model(observation_space,
                                                      action_space,
                                                      trajectory_length):
    """
    This test checks that the environment wrapper can in turn be wrapped by the `TimeLimit`
    environment wrapper from TF-Agents.
    """
    ts_spec = time_step_spec(observation_space)

    network = LinearTransitionNetwork(observation_space)
    environment = KerasTransitionModel([network], observation_space,
                                       action_space)
    wrapped_environment = TFTimeLimit(
        EnvironmentModel(
            environment,
            ConstantReward(observation_space, action_space, 0.0),
            ConstantFalseTermination(observation_space),
            create_uniform_initial_state_distribution(observation_space),
        ),
        trajectory_length,
    )

    collect_policy = RandomTFPolicy(ts_spec, action_space)
    replay_buffer_capacity = 1001
    policy_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        collect_policy.trajectory_spec,
        batch_size=1,
        max_length=replay_buffer_capacity)

    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        wrapped_environment,
        collect_policy,
        observers=[policy_training_buffer.add_batch],
        num_episodes=1,
    )
    collect_driver.run()

    trajectories = policy_training_buffer.gather_all()

    assert trajectories.step_type.shape == (1, trajectory_length + 1)
Example No. 12
def test_invalid_num_elites(observation_space, action_space, horizon):

    # some fixed parameters
    population_size = 10
    number_of_particles = 1

    # set up the environment model
    network = LinearTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    environment_model = EnvironmentModel(
        model,
        ConstantReward(observation_space, action_space),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
        population_size,
    )

    # set up the trajectory optimiser
    time_step_space = time_step_spec(observation_space)
    optimiser = cross_entropy_method_trajectory_optimisation(
        time_step_space,
        action_space,
        horizon=horizon,
        population_size=population_size,
        number_of_particles=number_of_particles,
        num_elites=population_size + 1,
        learning_rate=0.1,
        max_iterations=1,
    )

    # remember the time step comes from the real environment with batch size 1
    observation = create_uniform_distribution_from_spec(
        observation_space).sample(sample_shape=(1, ))
    initial_time_step = restart(observation, batch_size=1)

    # run
    with pytest.raises(AssertionError) as excinfo:
        optimiser.optimise(initial_time_step, environment_model)

    assert "num_elites" in str(excinfo)
Example No. 13
def test_mismatch_between_optimizer_and_environment_model_batch_size(
        observation_space, action_space,
        optimiser_policy_trajectory_optimiser_factory):
    time_step_space = time_step_spec(observation_space)
    environment_model = EnvironmentModel(
        StubTrainableTransitionModel(observation_space,
                                     action_space,
                                     predict_state_difference=True),
        ConstantReward(observation_space, action_space),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
    )
    population_size = environment_model.batch_size + 1
    trajectory_optimiser = optimiser_policy_trajectory_optimiser_factory(
        time_step_space, action_space, 1, population_size, 1)
    # remember the time step comes from the real environment with batch size 1
    observation = create_uniform_distribution_from_spec(
        observation_space).sample(sample_shape=(1, ))
    time_step = restart(observation, batch_size=1)
    with pytest.raises(AssertionError) as excinfo:
        _ = trajectory_optimiser.optimise(time_step, environment_model)

    assert "batch_size parameter is not equal to environment_model.batch_size" in str(
        excinfo)
Example No. 14
def test_trajectory_optimiser_with_particles_actions_shape(
        action_space, horizon, population_size, number_of_particles):
    observation = create_uniform_distribution_from_spec(
        OBSERVATION_SPACE_SPEC).sample(sample_shape=(population_size *
                                                     number_of_particles, ))
    transition_model = TrajectoryOptimiserTransitionModel(
        action_space, repeat(observation))
    reward = ConstantReward(OBSERVATION_SPACE_SPEC, action_space, -1.0)
    termination_model = ConstantFalseTermination(OBSERVATION_SPACE_SPEC)
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=termination_model,
        initial_state_distribution_model=DeterministicInitialStateModel(
            StepType.FIRST),
        batch_size=population_size * number_of_particles,
    )

    time_step_space = time_step_spec(OBSERVATION_SPACE_SPEC)

    policy = RandomTFPolicy(time_step_space,
                            action_space,
                            automatic_state_reset=False)
    trajectory_optimiser = PolicyTrajectoryOptimiser(
        policy,
        horizon=horizon,
        population_size=population_size,
        number_of_particles=number_of_particles,
        max_iterations=2,
    )

    initial_time_step = restart(tf.expand_dims(observation[0], axis=0))
    optimal_actions = trajectory_optimiser.optimise(initial_time_step,
                                                    environment_model)

    assert optimal_actions.shape == (horizon + 1, ) + action_space.shape
Example No. 15
def test_constant_termination(observation_space):
    constant_termination = ConstantFalseTermination(observation_space)

    observation = create_uniform_distribution_from_spec(observation_space).sample()

    assert not constant_termination.terminates(observation)
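
A constant-false termination model simply reports "not terminated" for every observation. The class below is a self-contained sketch of that behaviour (an assumption made for illustration, not the library's implementation):

import tensorflow as tf

class SketchConstantFalseTermination:
    def terminates(self, observation):
        # never terminate, regardless of the observation
        return tf.constant(False)

assert not SketchConstantFalseTermination().terminates(tf.constant([0.1, 0.2]))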
Example No. 16
    def __init__(
        self,
        transition_model: Union[
            Tuple[TrainableTransitionModel, TransitionModelTrainingSpec], TransitionModel
        ],
        reward_model: RewardModel,
        initial_state_distribution_model: InitialStateDistributionModel,
        model_free_agent: TFAgent,
        planner_batch_size: int,
        planning_horizon: int,
        model_free_training_iterations: int,
        debug_summaries=False,
        train_step_counter=None,
    ):
        """
        :param transition_model: A component of the environment model that describes the
            transition dynamics. Either a tuple containing a trainable transition model together
            with training specs, or a pre-specified transition model.
        :param reward_model: A component of the environment model that describes the
            rewards. At the moment only pre-specified reward models are allowed, i.e. the agent
            assumes the reward function is known.
        :param initial_state_distribution_model: A component of the environment model that
            describes the initial state distribution (which can be either deterministic or
            probabilistic). At the moment only pre-specified initial state distribution models
            are allowed, i.e. the agent assumes the initial state distribution is known.
        :param model_free_agent: A model-free agent that is trained virtually with samples from
            the model. This could be, for example, an on-policy agent such as PPO or TRPO, or an
            off-policy agent such as DDPG, SAC or TD3.
        :param planner_batch_size: Number of parallel virtual trajectories.
        :param planning_horizon: Number of steps taken in the environment in each virtual rollout.
        :param model_free_training_iterations: Number of model-free training iterations per
            train call.
        :param debug_summaries: A bool; if true, subclasses should gather debug summaries.
        :param train_step_counter: An optional counter to increment every time the train op is run.
            Defaults to the global_step.
        """
        assert planner_batch_size > 0, "Planner batch size must be positive."
        assert planning_horizon > 0, "Planning horizon must be positive."
        assert (
            model_free_training_iterations > 0
        ), "Model-free train iterations must be positive."

        self._assert_model_free_agent(model_free_agent)

        # for setting up the environment model and policy
        if isinstance(transition_model, tuple):
            _transition_model, _ = transition_model
        else:
            _transition_model = transition_model  # type: ignore

        assert (
            _transition_model.observation_space_spec
            == model_free_agent.time_step_spec.observation
        ), "Transition model observation spec needs to match the model-free agent's one."
        assert (
            _transition_model.action_space_spec == model_free_agent.action_spec
        ), "Transition model action spec needs to match the model-free agent's one."

        super().__init__(
            model_free_agent.time_step_spec,
            model_free_agent.action_spec,
            transition_model,
            reward_model,
            ConstantFalseTermination(_transition_model.observation_space_spec),
            initial_state_distribution_model,
            model_free_agent.policy,
            model_free_agent.collect_policy,
            debug_summaries=debug_summaries,
            train_step_counter=train_step_counter,
        )

        environment_model = EnvironmentModel(
            transition_model=_transition_model,
            reward_model=reward_model,
            termination_model=self.termination_model,
            initial_state_distribution_model=initial_state_distribution_model,
            batch_size=planner_batch_size,
        )

        (
            self._virtual_rollouts_replay_buffer,
            self._virtual_rollouts_driver,
            self._environment_model,
        ) = virtual_rollouts_buffer_and_driver(
            environment_model, model_free_agent.collect_policy, planning_horizon
        )

        self._model_free_agent = model_free_agent
        self._model_free_training_iterations = model_free_training_iterations