示例#1
0
def test_replay_actions_across_batches(observation_space, action_space,
                                       horizon, batch_size):
    transition_network = DummyEnsembleTransitionNetwork(observation_space)
    transition_model = KerasTransitionModel(
        [transition_network],
        observation_space,
        action_space,
    )
    reward = ConstantReward(observation_space, action_space, 0.0)
    termination = ConstantFalseTermination(observation_space)
    initial_state_sampler = create_uniform_initial_state_distribution(
        observation_space)

    env_model = TFTimeLimit(
        EnvironmentModel(transition_model, reward, termination,
                         initial_state_sampler, batch_size),
        horizon,
    )

    actions_distribution = create_uniform_initial_state_distribution(
        observation_space)
    actions = actions_distribution.sample((horizon, ))
    trajectory = replay_actions_across_batch_transition_models(
        env_model, actions)

    assert (trajectory.observation.shape == (
        batch_size,
        horizon,
    ) + observation_space.shape)
示例#2
0
def get_optimiser_and_environment_model(
        time_step_space,
        observation_space,
        action_space,
        population_size,
        number_of_particles,
        horizon,
        optimiser_policy_trajectory_optimiser_factory,
        sample_shape=(),
):
    reward = ConstantReward(observation_space, action_space, -1.0)

    batched_transition_network = DummyEnsembleTransitionNetwork(
        observation_space)
    batched_transition_model = KerasTransitionModel(
        [batched_transition_network],
        observation_space,
        action_space,
    )

    observation = create_uniform_distribution_from_spec(
        observation_space).sample(sample_shape=sample_shape)
    environment_model = EnvironmentModel(
        transition_model=batched_transition_model,
        reward_model=reward,
        termination_model=ConstantFalseTermination(observation_space),
        initial_state_distribution_model=DeterministicInitialStateModel(
            observation),
        batch_size=population_size,
    )
    trajectory_optimiser = optimiser_policy_trajectory_optimiser_factory(
        time_step_space, action_space, horizon, population_size,
        number_of_particles)
    return trajectory_optimiser, environment_model
示例#3
0
def test_generate_virtual_rollouts(observation_space, action_space, batch_size,
                                   horizon):
    observation = create_uniform_distribution_from_spec(
        observation_space).sample()
    network = DummyEnsembleTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = EnvironmentModel(
        transition_model=model,
        reward_model=ConstantReward(observation_space, action_space, -1.0),
        termination_model=ConstantFalseTermination(observation_space),
        initial_state_distribution_model=DeterministicInitialStateModel(
            observation),
        batch_size=batch_size,
    )
    random_policy = RandomTFPolicy(time_step_spec(observation_space),
                                   action_space)

    replay_buffer, driver, wrapped_env_model = virtual_rollouts_buffer_and_driver(
        env_model, random_policy, horizon)

    driver.run(wrapped_env_model.reset())
    trajectory = replay_buffer.gather_all()

    mid_steps = repeat(1, horizon - 1)
    expected_step_types = tf.constant(list(chain([0], mid_steps, [2])))
    batched_step_types = replicate(expected_step_types, (batch_size, ))
    np.testing.assert_array_equal(batched_step_types, trajectory.step_type)
示例#4
0
def _create_env_model(observation_space, action_space):
    batch_size = 3
    time_limit = 5

    terminations = MutableBatchConstantTermination(observation_space, batch_size)
    observation = create_uniform_distribution_from_spec(observation_space).sample()
    network = DummyEnsembleTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = TFTimeLimit(
        EnvironmentModel(
            transition_model=model,
            reward_model=ConstantReward(observation_space, action_space, -1.0),
            termination_model=terminations,
            initial_state_distribution_model=DeterministicInitialStateModel(observation),
            batch_size=batch_size,
        ),
        duration=time_limit,
    )

    actions = create_uniform_distribution_from_spec(action_space).sample((batch_size,))

    # Initial time step
    env_model.reset()

    observations = np.squeeze(
        np.repeat(np.expand_dims(observation, axis=0), batch_size, axis=0)
    )
    return terminations, observations, actions, env_model
def test_batched_environment_model(observation_space, action_space,
                                   batch_size):
    transition_network = DummyEnsembleTransitionNetwork(observation_space)
    transition_model = KerasTransitionModel(
        [transition_network],
        observation_space,
        action_space,
    )
    reward = ConstantReward(observation_space, action_space, 0.0)
    termination = ConstantFalseTermination(observation_space)
    initial_state_sampler = create_uniform_initial_state_distribution(
        observation_space)

    env_model = EnvironmentModel(transition_model, reward, termination,
                                 initial_state_sampler, batch_size)
    action_distr = create_uniform_distribution_from_spec(action_space)
    single_action = action_distr.sample()
    batch_actions = tf.convert_to_tensor(
        [single_action for _ in range(batch_size)])

    first_step = env_model.reset()
    assert (first_step.step_type == [
        StepType.FIRST for _ in range(batch_size)
    ]).numpy().all()
    assert first_step.observation.shape == [batch_size] + list(
        observation_space.shape)

    next_step = env_model.step(batch_actions)
    assert (next_step.step_type == [StepType.MID
                                    for _ in range(batch_size)]).numpy().all()
    assert next_step.observation.shape == [batch_size] + list(
        observation_space.shape)
    assert next_step.reward.shape == [batch_size]
示例#6
0
def _create_wrapped_environment(observation_space, action_space, reward):
    network = LinearTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    return EnvironmentModel(
        model,
        ConstantReward(observation_space, action_space, reward),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
    )
示例#7
0
def test_tf_env_wrapper_is_reset_at_the_start_of_each_iteration(action_space):

    observations_array = [
        # First iteration
        [StepType.FIRST, StepType.FIRST],
        [StepType.LAST, StepType.MID],
        [StepType.FIRST, StepType.MID],
        [StepType.LAST, StepType.LAST],
        # Second iteration
        [StepType.FIRST, StepType.FIRST],
        [StepType.MID, StepType.MID],
        [StepType.MID, StepType.MID],
        [StepType.MID, StepType.LAST],
        [StepType.MID, StepType.FIRST],
    ]
    observations = [
        tf.concat(ob_array, axis=0) for ob_array in observations_array
    ]

    transition_model = TrajectoryOptimiserTransitionModel(
        action_space, iter(observations))
    reward = ConstantReward(OBSERVATION_SPACE_SPEC, action_space, -1.0)
    termination_model = TrajectoryOptimiserTerminationModel(
        OBSERVATION_SPACE_SPEC)
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=termination_model,
        initial_state_distribution_model=DeterministicInitialStateModel(
            StepType.FIRST),
        batch_size=2,
    )

    time_step_space = time_step_spec(OBSERVATION_SPACE_SPEC)

    policy = RandomTFPolicy(time_step_space,
                            action_space,
                            automatic_state_reset=False,
                            validate_args=False)
    policy_state_updater = StubPolicyStateUpdater()
    trajectory_optimiser = PolicyTrajectoryOptimiser(
        policy,
        horizon=3,
        population_size=2,
        max_iterations=2,
        policy_state_updater=policy_state_updater,
    )

    initial_time_step = restart(tf.expand_dims(tf.constant(StepType.FIRST),
                                               axis=0),
                                batch_size=1)

    trajectory_optimiser.optimise(initial_time_step, environment_model)

    for stored_trajectories in policy_state_updater.step_types:
        np.testing.assert_equal(stored_trajectories[:, 0], np.array([0, 0]))
示例#8
0
def _fixture(observation_space, action_space, batch_size):
    observation_distr = create_uniform_distribution_from_spec(
        observation_space)
    batch_observations = observation_distr.sample(batch_size)

    reward = ConstantReward(observation_space, action_space, REWARD_TARGET)
    action_distr = create_uniform_distribution_from_spec(action_space)
    batch_actions = action_distr.sample(batch_size)

    return reward, batch_observations, batch_actions, batch_size
示例#9
0
def test_trajectory_optimiser_pathological_trajectories(
        action_space, horizon, batch_size):
    """
    The replay buffer is a FIFO buffer of fixed capacity. Ensure that the capacity is sufficient
    such that the initial observation is still present in the buffer even in the pathological case
    where all trajectories are of length 2.
    """

    # construct the environment model
    observations = list(
        chain.from_iterable(
            repeat(
                [
                    replicate(tf.constant(StepType.FIRST), [batch_size]),
                    replicate(tf.constant(StepType.LAST), [batch_size]),
                ],
                horizon,
            )))

    transition_model = TrajectoryOptimiserTransitionModel(
        action_space, iter(observations))
    reward = ConstantReward(OBSERVATION_SPACE_SPEC, action_space, -1.0)
    termination_model = TrajectoryOptimiserTerminationModel(
        OBSERVATION_SPACE_SPEC)
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=termination_model,
        initial_state_distribution_model=DeterministicInitialStateModel(
            StepType.FIRST),
        batch_size=batch_size,
    )

    time_step_space = time_step_spec(OBSERVATION_SPACE_SPEC)
    policy = RandomTFPolicy(time_step_space, action_space)
    stub_policy_state_updater = StubPolicyStateUpdater()
    trajectory_optimiser = PolicyTrajectoryOptimiser(
        policy,
        horizon,
        population_size=batch_size,
        max_iterations=1,
        policy_state_updater=stub_policy_state_updater,
    )

    time_step = restart(tf.expand_dims(tf.constant(StepType.FIRST), axis=0),
                        batch_size=1)

    trajectory_optimiser.optimise(time_step, environment_model)

    stored_trajectory = stub_policy_state_updater.step_types[0]
    assert stored_trajectory[0][0] == StepType.FIRST
def get_cross_entropy_policy(observation_space, action_space, horizon,
                             batch_size):
    time_step_space = time_step_spec(observation_space)
    network = LinearTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = EnvironmentModel(
        model,
        ConstantReward(observation_space, action_space),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
        batch_size,
    )
    policy = CrossEntropyMethodPolicy(time_step_space, action_space, horizon,
                                      batch_size)
    return env_model, policy
示例#11
0
def _transition_fixture(mountain_car_environment, batch_size):
    network = LinearTransitionNetwork(
        mountain_car_environment.observation_spec())
    transition_model = KerasTransitionModel(
        [network],
        mountain_car_environment.observation_spec(),
        mountain_car_environment.action_spec(),
    )

    reward_model = ConstantReward(mountain_car_environment.observation_spec(),
                                  mountain_car_environment.action_spec())

    transition = sample_uniformly_distributed_transitions(
        transition_model, 2 * batch_size, reward_model)

    return mountain_car_environment, transition
def test_train_method_increments_counter_for_generic_background_planning(
        mocker, agent_class):
    """
    The docstring for the `_train` method of a TFAgent requires that the implementation increments
    the `train_step_counter`.
    """
    population_size = 1
    horizon = 10
    model_free_training_iterations = 1

    mf_agent = create_mock_model_free_agent(mocker, TIMESTEP_SPEC, ACTION_SPEC,
                                            agent_class)
    network = LinearTransitionNetwork(OBSERVATION_SPEC)
    transition_model = KerasTransitionModel([network], OBSERVATION_SPEC,
                                            ACTION_SPEC)
    reward_model = ConstantReward(OBSERVATION_SPEC, ACTION_SPEC)
    initial_state_model = create_uniform_initial_state_distribution(
        OBSERVATION_SPEC)

    train_step_counter = common.create_variable("train_step_counter",
                                                shape=(),
                                                dtype=tf.float64)
    model_based_agent = BackgroundPlanningAgent(
        (transition_model, TransitionModelTrainingSpec(1, 1)),
        reward_model,
        initial_state_model,
        mf_agent,
        population_size,
        horizon,
        model_free_training_iterations,
        train_step_counter=train_step_counter,
    )

    dummy_trajectories = generate_dummy_trajectories(
        OBSERVATION_SPEC,
        ACTION_SPEC,
        batch_size=population_size,
        trajectory_length=horizon)
    train_kwargs = {
        TRAIN_ARGSPEC_COMPONENT_ID: EnvironmentModelComponents.TRANSITION.value
    }
    model_based_agent.train(dummy_trajectories, **train_kwargs)

    assert train_step_counter.value() == 1
示例#13
0
def _wrapped_environment_fixture(observation_space, action_space, batch_size):
    observation = create_uniform_distribution_from_spec(
        observation_space).sample()
    network = DummyEnsembleTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = EnvironmentModel(
        transition_model=model,
        reward_model=ConstantReward(observation_space, action_space, -1.0),
        termination_model=ConstantFalseTermination(observation_space),
        initial_state_distribution_model=DeterministicInitialStateModel(
            observation),
        batch_size=batch_size,
    )
    wrapped_environment_model = TFTimeLimit(env_model, 2)

    action = create_uniform_distribution_from_spec(action_space).sample(
        (batch_size, ))

    return wrapped_environment_model, action
示例#14
0
def test_random_shooting_with_dynamic_step_driver(observation_space, action_space):
    """
    This test uses the environment wrapper as an adapter so that a driver from TF-Agents can be used
    to generate a rollout. This also serves as an example of how to construct "random shooting"
    rollouts from an environment model.

    The assertion in this test is that selected action has the expected log_prob value consistent
    with optimisers from a uniform distribution. All this is really checking is that the preceeding
    code has run successfully.
    """

    network = LinearTransitionNetwork(observation_space)
    environment = KerasTransitionModel([network], observation_space, action_space)
    wrapped_environment = EnvironmentModel(
        environment,
        ConstantReward(observation_space, action_space, 0.0),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
    )

    random_policy = RandomTFPolicy(
        wrapped_environment.time_step_spec(), action_space, emit_log_probability=True
    )

    transition_observer = _RecordLastLogProbTransitionObserver()

    driver = DynamicStepDriver(
        env=wrapped_environment,
        policy=random_policy,
        transition_observers=[transition_observer],
    )
    driver.run()

    last_log_prob = transition_observer.last_log_probability

    uniform_distribution = create_uniform_distribution_from_spec(action_space)
    action_log_prob = uniform_distribution.log_prob(transition_observer.action)
    expected = np.sum(action_log_prob.numpy().astype(np.float32))
    actual = np.sum(last_log_prob.numpy())

    np.testing.assert_array_almost_equal(actual, expected, decimal=4)
def test_train_method_increments_counter_for_model_free_supported_agents(
    mocker, agent_class, train_component
):
    """
    The docstring for the `_train` method of a TFAgent requires that the implementation increments
    the `train_step_counter`.
    """
    population_size = 1
    number_of_particles = 1
    horizon = 10

    mf_agent = create_mock_model_free_agent(mocker, TIMESTEP_SPEC, ACTION_SPEC, agent_class)
    trajectory_optimiser = random_shooting_trajectory_optimisation(
        TIMESTEP_SPEC, ACTION_SPEC, horizon, population_size, number_of_particles
    )
    network = LinearTransitionNetwork(OBSERVATION_SPEC)
    transition_model = KerasTransitionModel([network], OBSERVATION_SPEC, ACTION_SPEC)
    reward_model = ConstantReward(OBSERVATION_SPEC, ACTION_SPEC)
    initial_state_model = create_uniform_initial_state_distribution(OBSERVATION_SPEC)

    train_step_counter = common.create_variable(
        "train_step_counter", shape=(), dtype=tf.float64
    )
    agent = ModelFreeSupportedDecisionTimePlanningAgent(
        TIMESTEP_SPEC,
        ACTION_SPEC,
        (transition_model, TransitionModelTrainingSpec(1, 1)),
        reward_model,
        initial_state_model,
        trajectory_optimiser,
        mf_agent,
        train_step_counter=train_step_counter,
    )

    dummy_trajectories = generate_dummy_trajectories(
        OBSERVATION_SPEC, ACTION_SPEC, batch_size=population_size, trajectory_length=horizon
    )
    train_kwargs = {TRAIN_ARGSPEC_COMPONENT_ID: train_component.value}
    agent.train(dummy_trajectories, **train_kwargs)

    assert train_step_counter.value() == 1
def test_sample_trajectory_for_mountain_car():
    tf_env = tf_py_environment.TFPyEnvironment(
        suite_gym.load("MountainCar-v0"))

    network = LinearTransitionNetwork(tf_env.observation_spec())
    model = KerasTransitionModel(
        [network],
        tf_env.observation_spec(),
        tf_env.action_spec(),
    )
    reward = ConstantReward(tf_env.observation_spec(), tf_env.action_spec(),
                            -1.0)
    terminates = MountainCarTermination(tf_env.observation_spec())
    initial_state_sampler = MountainCarInitialState(tf_env.observation_spec())
    environment = TFTimeLimit(EnvironmentModel(model, reward, terminates,
                                               initial_state_sampler),
                              duration=200)

    collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                    tf_env.action_spec())
    replay_buffer_capacity = 1001
    policy_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        collect_policy.trajectory_spec,
        batch_size=1,
        max_length=replay_buffer_capacity)

    collect_episodes_per_iteration = 2
    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        environment,
        collect_policy,
        observers=[policy_training_buffer.add_batch],
        num_episodes=collect_episodes_per_iteration,
    )

    collect_driver.run()

    trajectory = policy_training_buffer.gather_all()

    first_batch_step_type = trajectory.step_type[0, :]
    assert (first_batch_step_type[0] == StepType.FIRST
            and first_batch_step_type[-1] == StepType.LAST)
示例#17
0
def test_tf_time_limit_wrapper_with_environment_model(observation_space,
                                                      action_space,
                                                      trajectory_length):
    """
    This test checks that the environment wrapper can in turn be wrapped by the `TimeLimit`
    environment wrapper from TF-Agents.
    """
    ts_spec = time_step_spec(observation_space)

    network = LinearTransitionNetwork(observation_space)
    environment = KerasTransitionModel([network], observation_space,
                                       action_space)
    wrapped_environment = TFTimeLimit(
        EnvironmentModel(
            environment,
            ConstantReward(observation_space, action_space, 0.0),
            ConstantFalseTermination(observation_space),
            create_uniform_initial_state_distribution(observation_space),
        ),
        trajectory_length,
    )

    collect_policy = RandomTFPolicy(ts_spec, action_space)
    replay_buffer_capacity = 1001
    policy_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        collect_policy.trajectory_spec,
        batch_size=1,
        max_length=replay_buffer_capacity)

    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        wrapped_environment,
        collect_policy,
        observers=[policy_training_buffer.add_batch],
        num_episodes=1,
    )
    collect_driver.run()

    trajectories = policy_training_buffer.gather_all()

    assert trajectories.step_type.shape == (1, trajectory_length + 1)
示例#18
0
def test_invalid_num_elites(observation_space, action_space, horizon):

    # some fixed parameters
    population_size = 10
    number_of_particles = 1

    # set up the environment model
    network = LinearTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    environment_model = EnvironmentModel(
        model,
        ConstantReward(observation_space, action_space),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
        population_size,
    )

    # set up the trajectory optimizer
    time_step_space = time_step_spec(observation_space)
    optimiser = cross_entropy_method_trajectory_optimisation(
        time_step_space,
        action_space,
        horizon=horizon,
        population_size=population_size,
        number_of_particles=number_of_particles,
        num_elites=population_size + 1,
        learning_rate=0.1,
        max_iterations=1,
    )

    # remember the time step comes from the real environment with batch size 1
    observation = create_uniform_distribution_from_spec(
        observation_space).sample(sample_shape=(1, ))
    initial_time_step = restart(observation, batch_size=1)

    # run
    with pytest.raises(AssertionError) as excinfo:
        optimiser.optimise(initial_time_step, environment_model)

    assert "num_elites" in str(excinfo)
def test_train_oracle_transition_model():
    """
    Ensure that a non-trainable oracle transition model does not cause the agent `train` method to
    fail.
    """
    population_size = 1
    number_of_particles = 1
    horizon = 10

    trajectory_optimiser = random_shooting_trajectory_optimisation(
        TIMESTEP_SPEC, ACTION_SPEC, horizon, population_size, number_of_particles
    )
    transition_model = StubTransitionModel(OBSERVATION_SPEC, ACTION_SPEC)
    reward_model = ConstantReward(OBSERVATION_SPEC, ACTION_SPEC)
    initial_state_model = create_uniform_initial_state_distribution(OBSERVATION_SPEC)

    train_step_counter = common.create_variable(
        "train_step_counter", shape=(), dtype=tf.float64
    )
    with pytest.warns(RuntimeWarning):
        agent = DecisionTimePlanningAgent(
            TIMESTEP_SPEC,
            ACTION_SPEC,
            transition_model,
            reward_model,
            initial_state_model,
            trajectory_optimiser,
            train_step_counter=train_step_counter,
        )

    dummy_trajectories = generate_dummy_trajectories(
        OBSERVATION_SPEC, ACTION_SPEC, batch_size=population_size, trajectory_length=horizon
    )
    train_kwargs = {TRAIN_ARGSPEC_COMPONENT_ID: EnvironmentModelComponents.TRANSITION.value}
    loss_info = agent.train(dummy_trajectories, **train_kwargs)

    assert loss_info.loss is None
    assert loss_info.extra is None
示例#20
0
def test_trajectory_optimiser_with_particles_actions_shape(
        action_space, horizon, population_size, number_of_particles):
    observation = create_uniform_distribution_from_spec(
        OBSERVATION_SPACE_SPEC).sample(sample_shape=(population_size *
                                                     number_of_particles, ))
    transition_model = TrajectoryOptimiserTransitionModel(
        action_space, repeat(observation))
    reward = ConstantReward(OBSERVATION_SPACE_SPEC, action_space, -1.0)
    termination_model = ConstantFalseTermination(OBSERVATION_SPACE_SPEC)
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=termination_model,
        initial_state_distribution_model=DeterministicInitialStateModel(
            StepType.FIRST),
        batch_size=population_size * number_of_particles,
    )

    time_step_space = time_step_spec(OBSERVATION_SPACE_SPEC)

    policy = RandomTFPolicy(time_step_space,
                            action_space,
                            automatic_state_reset=False)
    trajectory_optimiser = PolicyTrajectoryOptimiser(
        policy,
        horizon=horizon,
        population_size=population_size,
        number_of_particles=number_of_particles,
        max_iterations=2,
    )

    initial_time_step = restart(tf.expand_dims(observation[0], axis=0))
    optimal_actions = trajectory_optimiser.optimise(initial_time_step,
                                                    environment_model)

    assert optimal_actions.shape == (horizon + 1, ) + action_space.shape
示例#21
0
def test_mismatch_between_optimizer_and_environment_model_batch_size(
        observation_space, action_space,
        optimiser_policy_trajectory_optimiser_factory):
    time_step_space = time_step_spec(observation_space)
    environment_model = EnvironmentModel(
        StubTrainableTransitionModel(observation_space,
                                     action_space,
                                     predict_state_difference=True),
        ConstantReward(observation_space, action_space),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
    )
    population_size = environment_model.batch_size + 1
    trajectory_optimiser = optimiser_policy_trajectory_optimiser_factory(
        time_step_space, action_space, 1, population_size, 1)
    # remember the time step comes from the real environment with batch size 1
    observation = create_uniform_distribution_from_spec(
        observation_space).sample(sample_shape=(1, ))
    time_step = restart(observation, batch_size=1)
    with pytest.raises(AssertionError) as excinfo:
        _ = trajectory_optimiser.optimise(time_step, environment_model)

    assert "batch_size parameter is not equal to environment_model.batch_size" in str(
        excinfo)
示例#22
0
batch_size = 64

training_spec = KerasTrainingSpec(
    epochs=5000,
    training_batch_size=256,
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
    verbose=0,
)
linear_transition_network = LinearTransitionNetwork(tf_env.observation_spec())
trajectory_sampling_strategy = InfiniteHorizonTrajectorySampling(batch_size, 1)
transition_model = KerasTransitionModel(
    [linear_transition_network],
    tf_env.observation_spec(),
    tf_env.action_spec(),
)
reward_model = ConstantReward(tf_env.observation_spec(), tf_env.action_spec())
sample_transitions = sample_uniformly_distributed_transitions(
    transition_model, 1000, reward_model
)

# %%
plot_mountain_car_transitions(
    sample_transitions.observation.numpy(),
    sample_transitions.action.numpy(),
    sample_transitions.next_observation.numpy(),
)

# %% [markdown]
"""
## TF-Agents Agent
示例#23
0
def test_trajectory_optimiser_each_iteration_starts_with_the_initial_observation(
        action_space, horizon, batch_size, max_iterations):
    class WrappedRandomTFPolicy(TFPolicy):
        def __init__(
            self,
            ts_spec: ts.TimeStep,
            action_spec: types.NestedTensorSpec,
            env_model: EnvironmentModel,
        ):
            super().__init__(ts_spec, action_spec)

            self._internal_policy = RandomTFPolicy(ts_spec, action_space)

            self._environment_model = env_model

        def _action(
            self,
            time_step: ts.TimeStep,
            policy_state: types.NestedTensor,
            seed: Optional[types.Seed],
        ) -> policy_step.PolicyStep:
            np.testing.assert_array_equal(
                time_step.observation,
                self._environment_model.current_time_step().observation)
            return self._internal_policy._action(time_step, policy_state, seed)

        def _distribution(
                self, time_step: ts.TimeStep,
                policy_state: types.NestedTensorSpec
        ) -> policy_step.PolicyStep:
            raise NotImplementedError()

    observations = list(
        repeat(replicate(tf.constant(StepType.MID), [batch_size]),
               max_iterations * (horizon + 1)))

    transition_model = TrajectoryOptimiserTransitionModel(
        action_space, iter(observations))
    reward = ConstantReward(OBSERVATION_SPACE_SPEC, action_space, -1.0)
    termination_model = TrajectoryOptimiserTerminationModel(
        OBSERVATION_SPACE_SPEC)
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=termination_model,
        initial_state_distribution_model=DeterministicInitialStateModel(
            StepType.FIRST),
        batch_size=batch_size,
    )

    time_step_space = time_step_spec(OBSERVATION_SPACE_SPEC)

    policy = WrappedRandomTFPolicy(time_step_space, action_space,
                                   environment_model)
    trajectory_optimiser = PolicyTrajectoryOptimiser(
        policy,
        horizon=horizon,
        population_size=batch_size,
        max_iterations=max_iterations,
    )

    initial_time_step = restart(tf.expand_dims(tf.constant(StepType.FIRST),
                                               axis=0),
                                batch_size=1)

    trajectory_optimiser.optimise(initial_time_step, environment_model)