def test_replay_actions_across_batches(observation_space, action_space, horizon, batch_size):
    transition_network = DummyEnsembleTransitionNetwork(observation_space)
    transition_model = KerasTransitionModel(
        [transition_network],
        observation_space,
        action_space,
    )
    reward = ConstantReward(observation_space, action_space, 0.0)
    termination = ConstantFalseTermination(observation_space)
    initial_state_sampler = create_uniform_initial_state_distribution(observation_space)
    env_model = TFTimeLimit(
        EnvironmentModel(
            transition_model, reward, termination, initial_state_sampler, batch_size
        ),
        horizon,
    )

    # Actions are replayed across the batch, so they are sampled from the action space.
    actions_distribution = create_uniform_distribution_from_spec(action_space)
    actions = actions_distribution.sample((horizon,))

    trajectory = replay_actions_across_batch_transition_models(env_model, actions)

    assert trajectory.observation.shape == (batch_size, horizon) + observation_space.shape
def get_optimiser_and_environment_model(
    time_step_space,
    observation_space,
    action_space,
    population_size,
    number_of_particles,
    horizon,
    optimiser_policy_trajectory_optimiser_factory,
    sample_shape=(),
):
    reward = ConstantReward(observation_space, action_space, -1.0)

    batched_transition_network = DummyEnsembleTransitionNetwork(observation_space)
    batched_transition_model = KerasTransitionModel(
        [batched_transition_network],
        observation_space,
        action_space,
    )

    observation = create_uniform_distribution_from_spec(observation_space).sample(
        sample_shape=sample_shape
    )
    environment_model = EnvironmentModel(
        transition_model=batched_transition_model,
        reward_model=reward,
        termination_model=ConstantFalseTermination(observation_space),
        initial_state_distribution_model=DeterministicInitialStateModel(observation),
        batch_size=population_size,
    )

    trajectory_optimiser = optimiser_policy_trajectory_optimiser_factory(
        time_step_space, action_space, horizon, population_size, number_of_particles
    )

    return trajectory_optimiser, environment_model
def test_generate_virtual_rollouts(observation_space, action_space, batch_size, horizon):
    observation = create_uniform_distribution_from_spec(observation_space).sample()
    network = DummyEnsembleTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = EnvironmentModel(
        transition_model=model,
        reward_model=ConstantReward(observation_space, action_space, -1.0),
        termination_model=ConstantFalseTermination(observation_space),
        initial_state_distribution_model=DeterministicInitialStateModel(observation),
        batch_size=batch_size,
    )
    random_policy = RandomTFPolicy(time_step_spec(observation_space), action_space)

    replay_buffer, driver, wrapped_env_model = virtual_rollouts_buffer_and_driver(
        env_model, random_policy, horizon
    )

    driver.run(wrapped_env_model.reset())
    trajectory = replay_buffer.gather_all()

    mid_steps = repeat(1, horizon - 1)
    expected_step_types = tf.constant(list(chain([0], mid_steps, [2])))
    batched_step_types = replicate(expected_step_types, (batch_size,))
    np.testing.assert_array_equal(batched_step_types, trajectory.step_type)
def _create_env_model(observation_space, action_space):
    batch_size = 3
    time_limit = 5

    terminations = MutableBatchConstantTermination(observation_space, batch_size)
    observation = create_uniform_distribution_from_spec(observation_space).sample()
    network = DummyEnsembleTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = TFTimeLimit(
        EnvironmentModel(
            transition_model=model,
            reward_model=ConstantReward(observation_space, action_space, -1.0),
            termination_model=terminations,
            initial_state_distribution_model=DeterministicInitialStateModel(observation),
            batch_size=batch_size,
        ),
        duration=time_limit,
    )

    actions = create_uniform_distribution_from_spec(action_space).sample((batch_size,))

    # Initial time step
    env_model.reset()

    observations = np.squeeze(
        np.repeat(np.expand_dims(observation, axis=0), batch_size, axis=0)
    )
    return terminations, observations, actions, env_model
def test_batched_environment_model(observation_space, action_space, batch_size):
    transition_network = DummyEnsembleTransitionNetwork(observation_space)
    transition_model = KerasTransitionModel(
        [transition_network],
        observation_space,
        action_space,
    )
    reward = ConstantReward(observation_space, action_space, 0.0)
    termination = ConstantFalseTermination(observation_space)
    initial_state_sampler = create_uniform_initial_state_distribution(observation_space)

    env_model = EnvironmentModel(
        transition_model, reward, termination, initial_state_sampler, batch_size
    )

    action_distr = create_uniform_distribution_from_spec(action_space)
    single_action = action_distr.sample()
    batch_actions = tf.convert_to_tensor([single_action for _ in range(batch_size)])

    first_step = env_model.reset()
    assert (
        first_step.step_type == [StepType.FIRST for _ in range(batch_size)]
    ).numpy().all()
    assert first_step.observation.shape == [batch_size] + list(observation_space.shape)

    next_step = env_model.step(batch_actions)
    assert (next_step.step_type == [StepType.MID for _ in range(batch_size)]).numpy().all()
    assert next_step.observation.shape == [batch_size] + list(observation_space.shape)
    assert next_step.reward.shape == [batch_size]
def _create_wrapped_environment(observation_space, action_space, reward):
    network = LinearTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    return EnvironmentModel(
        model,
        ConstantReward(observation_space, action_space, reward),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
    )
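

# Illustrative usage sketch (not from the original test suite): one way the helper above
# could be exercised. It assumes `EnvironmentModel` defaults to a batch size of 1 when
# none is given, and that `ConstantReward` yields the supplied constant at every step.
def _example_step_constant_reward_environment(observation_space, action_space):
    env_model = _create_wrapped_environment(observation_space, action_space, -1.0)
    env_model.reset()
    action = create_uniform_distribution_from_spec(action_space).sample((1,))
    next_step = env_model.step(action)
    np.testing.assert_allclose(next_step.reward.numpy(), [-1.0])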
def test_tf_env_wrapper_is_reset_at_the_start_of_each_iteration(action_space):
    observations_array = [
        # First iteration
        [StepType.FIRST, StepType.FIRST],
        [StepType.LAST, StepType.MID],
        [StepType.FIRST, StepType.MID],
        [StepType.LAST, StepType.LAST],
        # Second iteration
        [StepType.FIRST, StepType.FIRST],
        [StepType.MID, StepType.MID],
        [StepType.MID, StepType.MID],
        [StepType.MID, StepType.LAST],
        [StepType.MID, StepType.FIRST],
    ]
    observations = [tf.concat(ob_array, axis=0) for ob_array in observations_array]
    transition_model = TrajectoryOptimiserTransitionModel(action_space, iter(observations))
    reward = ConstantReward(OBSERVATION_SPACE_SPEC, action_space, -1.0)
    termination_model = TrajectoryOptimiserTerminationModel(OBSERVATION_SPACE_SPEC)
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=termination_model,
        initial_state_distribution_model=DeterministicInitialStateModel(StepType.FIRST),
        batch_size=2,
    )
    time_step_space = time_step_spec(OBSERVATION_SPACE_SPEC)
    policy = RandomTFPolicy(
        time_step_space, action_space, automatic_state_reset=False, validate_args=False
    )
    policy_state_updater = StubPolicyStateUpdater()
    trajectory_optimiser = PolicyTrajectoryOptimiser(
        policy,
        horizon=3,
        population_size=2,
        max_iterations=2,
        policy_state_updater=policy_state_updater,
    )
    initial_time_step = restart(
        tf.expand_dims(tf.constant(StepType.FIRST), axis=0), batch_size=1
    )

    trajectory_optimiser.optimise(initial_time_step, environment_model)

    for stored_trajectories in policy_state_updater.step_types:
        np.testing.assert_equal(stored_trajectories[:, 0], np.array([0, 0]))
def _fixture(observation_space, action_space, batch_size):
    observation_distr = create_uniform_distribution_from_spec(observation_space)
    batch_observations = observation_distr.sample(batch_size)
    reward = ConstantReward(observation_space, action_space, REWARD_TARGET)
    action_distr = create_uniform_distribution_from_spec(action_space)
    batch_actions = action_distr.sample(batch_size)
    return reward, batch_observations, batch_actions, batch_size
def test_trajectory_optimiser_pathological_trajectories(action_space, horizon, batch_size):
    """
    The replay buffer is a FIFO buffer of fixed capacity. Ensure that the capacity is
    sufficient such that the initial observation is still present in the buffer even in
    the pathological case where all trajectories are of length 2.
    """
    # construct the environment model
    observations = list(
        chain.from_iterable(
            repeat(
                [
                    replicate(tf.constant(StepType.FIRST), [batch_size]),
                    replicate(tf.constant(StepType.LAST), [batch_size]),
                ],
                horizon,
            )
        )
    )
    transition_model = TrajectoryOptimiserTransitionModel(action_space, iter(observations))
    reward = ConstantReward(OBSERVATION_SPACE_SPEC, action_space, -1.0)
    termination_model = TrajectoryOptimiserTerminationModel(OBSERVATION_SPACE_SPEC)
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=termination_model,
        initial_state_distribution_model=DeterministicInitialStateModel(StepType.FIRST),
        batch_size=batch_size,
    )
    time_step_space = time_step_spec(OBSERVATION_SPACE_SPEC)
    policy = RandomTFPolicy(time_step_space, action_space)
    stub_policy_state_updater = StubPolicyStateUpdater()
    trajectory_optimiser = PolicyTrajectoryOptimiser(
        policy,
        horizon,
        population_size=batch_size,
        max_iterations=1,
        policy_state_updater=stub_policy_state_updater,
    )
    time_step = restart(tf.expand_dims(tf.constant(StepType.FIRST), axis=0), batch_size=1)

    trajectory_optimiser.optimise(time_step, environment_model)

    stored_trajectory = stub_policy_state_updater.step_types[0]
    assert stored_trajectory[0][0] == StepType.FIRST
def get_cross_entropy_policy(observation_space, action_space, horizon, batch_size):
    time_step_space = time_step_spec(observation_space)
    network = LinearTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = EnvironmentModel(
        model,
        ConstantReward(observation_space, action_space),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
        batch_size,
    )
    policy = CrossEntropyMethodPolicy(time_step_space, action_space, horizon, batch_size)
    return env_model, policy
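

# Minimal usage sketch (an assumption, not taken from the original tests): the helper
# above can be exercised by resetting the environment model and querying the policy once.
# This assumes `CrossEntropyMethodPolicy` follows the standard TF-Agents `TFPolicy`
# interface (`get_initial_state` / `action`).
def _example_cross_entropy_policy_action(observation_space, action_space):
    env_model, policy = get_cross_entropy_policy(
        observation_space, action_space, horizon=5, batch_size=4
    )
    time_step = env_model.reset()
    policy_state = policy.get_initial_state(batch_size=4)
    action_step = policy.action(time_step, policy_state)
    assert action_step.action.shape[0] == 4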
def _transition_fixture(mountain_car_environment, batch_size):
    network = LinearTransitionNetwork(mountain_car_environment.observation_spec())
    transition_model = KerasTransitionModel(
        [network],
        mountain_car_environment.observation_spec(),
        mountain_car_environment.action_spec(),
    )
    reward_model = ConstantReward(
        mountain_car_environment.observation_spec(), mountain_car_environment.action_spec()
    )
    transition = sample_uniformly_distributed_transitions(
        transition_model, 2 * batch_size, reward_model
    )
    return mountain_car_environment, transition
def test_train_method_increments_counter_for_generic_background_planning(mocker, agent_class):
    """
    The docstring for the `_train` method of a TFAgent requires that the implementation
    increments the `train_step_counter`.
    """
    population_size = 1
    horizon = 10
    model_free_training_iterations = 1
    mf_agent = create_mock_model_free_agent(mocker, TIMESTEP_SPEC, ACTION_SPEC, agent_class)
    network = LinearTransitionNetwork(OBSERVATION_SPEC)
    transition_model = KerasTransitionModel([network], OBSERVATION_SPEC, ACTION_SPEC)
    reward_model = ConstantReward(OBSERVATION_SPEC, ACTION_SPEC)
    initial_state_model = create_uniform_initial_state_distribution(OBSERVATION_SPEC)
    train_step_counter = common.create_variable(
        "train_step_counter", shape=(), dtype=tf.float64
    )
    model_based_agent = BackgroundPlanningAgent(
        (transition_model, TransitionModelTrainingSpec(1, 1)),
        reward_model,
        initial_state_model,
        mf_agent,
        population_size,
        horizon,
        model_free_training_iterations,
        train_step_counter=train_step_counter,
    )

    dummy_trajectories = generate_dummy_trajectories(
        OBSERVATION_SPEC, ACTION_SPEC, batch_size=population_size, trajectory_length=horizon
    )
    train_kwargs = {
        TRAIN_ARGSPEC_COMPONENT_ID: EnvironmentModelComponents.TRANSITION.value
    }
    model_based_agent.train(dummy_trajectories, **train_kwargs)

    assert train_step_counter.value() == 1
def _wrapped_environment_fixture(observation_space, action_space, batch_size):
    observation = create_uniform_distribution_from_spec(observation_space).sample()
    network = DummyEnsembleTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = EnvironmentModel(
        transition_model=model,
        reward_model=ConstantReward(observation_space, action_space, -1.0),
        termination_model=ConstantFalseTermination(observation_space),
        initial_state_distribution_model=DeterministicInitialStateModel(observation),
        batch_size=batch_size,
    )
    wrapped_environment_model = TFTimeLimit(env_model, 2)

    action = create_uniform_distribution_from_spec(action_space).sample((batch_size,))

    return wrapped_environment_model, action
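

# Illustrative sketch (an assumption, not part of the original suite): with a duration of
# two steps, the `TFTimeLimit` wrapper should truncate each rollout after two transitions,
# so the second call to `step` is expected to return `StepType.LAST` for every batch
# element. The construction below mirrors the fixture above.
def _example_time_limit_truncation(observation_space, action_space, batch_size):
    observation = create_uniform_distribution_from_spec(observation_space).sample()
    model = KerasTransitionModel(
        [DummyEnsembleTransitionNetwork(observation_space)], observation_space, action_space
    )
    wrapped_env = TFTimeLimit(
        EnvironmentModel(
            transition_model=model,
            reward_model=ConstantReward(observation_space, action_space, -1.0),
            termination_model=ConstantFalseTermination(observation_space),
            initial_state_distribution_model=DeterministicInitialStateModel(observation),
            batch_size=batch_size,
        ),
        2,
    )
    action = create_uniform_distribution_from_spec(action_space).sample((batch_size,))

    wrapped_env.reset()
    wrapped_env.step(action)  # first transition -> StepType.MID
    last_step = wrapped_env.step(action)  # second transition hits the time limit
    assert (last_step.step_type.numpy() == StepType.LAST).all()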
def test_random_shooting_with_dynamic_step_driver(observation_space, action_space):
    """
    This test uses the environment wrapper as an adapter so that a driver from TF-Agents
    can be used to generate a rollout. This also serves as an example of how to construct
    "random shooting" rollouts from an environment model.

    The assertion in this test is that the selected action has the expected log_prob
    value, consistent with sampling from a uniform distribution. All this is really
    checking is that the preceding code has run successfully.
    """
    network = LinearTransitionNetwork(observation_space)
    environment = KerasTransitionModel([network], observation_space, action_space)
    wrapped_environment = EnvironmentModel(
        environment,
        ConstantReward(observation_space, action_space, 0.0),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
    )

    random_policy = RandomTFPolicy(
        wrapped_environment.time_step_spec(), action_space, emit_log_probability=True
    )
    transition_observer = _RecordLastLogProbTransitionObserver()
    driver = DynamicStepDriver(
        env=wrapped_environment,
        policy=random_policy,
        transition_observers=[transition_observer],
    )
    driver.run()

    last_log_prob = transition_observer.last_log_probability
    uniform_distribution = create_uniform_distribution_from_spec(action_space)
    action_log_prob = uniform_distribution.log_prob(transition_observer.action)
    expected = np.sum(action_log_prob.numpy().astype(np.float32))
    actual = np.sum(last_log_prob.numpy())
    np.testing.assert_array_almost_equal(actual, expected, decimal=4)
def test_train_method_increments_counter_for_model_free_supported_agents(
    mocker, agent_class, train_component
):
    """
    The docstring for the `_train` method of a TFAgent requires that the implementation
    increments the `train_step_counter`.
    """
    population_size = 1
    number_of_particles = 1
    horizon = 10
    mf_agent = create_mock_model_free_agent(mocker, TIMESTEP_SPEC, ACTION_SPEC, agent_class)
    trajectory_optimiser = random_shooting_trajectory_optimisation(
        TIMESTEP_SPEC, ACTION_SPEC, horizon, population_size, number_of_particles
    )
    network = LinearTransitionNetwork(OBSERVATION_SPEC)
    transition_model = KerasTransitionModel([network], OBSERVATION_SPEC, ACTION_SPEC)
    reward_model = ConstantReward(OBSERVATION_SPEC, ACTION_SPEC)
    initial_state_model = create_uniform_initial_state_distribution(OBSERVATION_SPEC)
    train_step_counter = common.create_variable(
        "train_step_counter", shape=(), dtype=tf.float64
    )
    agent = ModelFreeSupportedDecisionTimePlanningAgent(
        TIMESTEP_SPEC,
        ACTION_SPEC,
        (transition_model, TransitionModelTrainingSpec(1, 1)),
        reward_model,
        initial_state_model,
        trajectory_optimiser,
        mf_agent,
        train_step_counter=train_step_counter,
    )

    dummy_trajectories = generate_dummy_trajectories(
        OBSERVATION_SPEC, ACTION_SPEC, batch_size=population_size, trajectory_length=horizon
    )
    train_kwargs = {TRAIN_ARGSPEC_COMPONENT_ID: train_component.value}
    agent.train(dummy_trajectories, **train_kwargs)

    assert train_step_counter.value() == 1
def test_sample_trajectory_for_mountain_car():
    tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load("MountainCar-v0"))

    network = LinearTransitionNetwork(tf_env.observation_spec())
    model = KerasTransitionModel(
        [network],
        tf_env.observation_spec(),
        tf_env.action_spec(),
    )
    reward = ConstantReward(tf_env.observation_spec(), tf_env.action_spec(), -1.0)
    terminates = MountainCarTermination(tf_env.observation_spec())
    initial_state_sampler = MountainCarInitialState(tf_env.observation_spec())
    environment = TFTimeLimit(
        EnvironmentModel(model, reward, terminates, initial_state_sampler), duration=200
    )

    collect_policy = RandomTFPolicy(tf_env.time_step_spec(), tf_env.action_spec())
    replay_buffer_capacity = 1001
    policy_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        collect_policy.trajectory_spec, batch_size=1, max_length=replay_buffer_capacity
    )

    collect_episodes_per_iteration = 2
    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        environment,
        collect_policy,
        observers=[policy_training_buffer.add_batch],
        num_episodes=collect_episodes_per_iteration,
    )

    collect_driver.run()

    trajectory = policy_training_buffer.gather_all()
    first_batch_step_type = trajectory.step_type[0, :]
    assert (
        first_batch_step_type[0] == StepType.FIRST
        and first_batch_step_type[-1] == StepType.LAST
    )
def test_tf_time_limit_wrapper_with_environment_model(
    observation_space, action_space, trajectory_length
):
    """
    This test checks that the environment wrapper can in turn be wrapped by the
    `TimeLimit` environment wrapper from TF-Agents.
    """
    ts_spec = time_step_spec(observation_space)

    network = LinearTransitionNetwork(observation_space)
    environment = KerasTransitionModel([network], observation_space, action_space)
    wrapped_environment = TFTimeLimit(
        EnvironmentModel(
            environment,
            ConstantReward(observation_space, action_space, 0.0),
            ConstantFalseTermination(observation_space),
            create_uniform_initial_state_distribution(observation_space),
        ),
        trajectory_length,
    )

    collect_policy = RandomTFPolicy(ts_spec, action_space)
    replay_buffer_capacity = 1001
    policy_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        collect_policy.trajectory_spec, batch_size=1, max_length=replay_buffer_capacity
    )

    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        wrapped_environment,
        collect_policy,
        observers=[policy_training_buffer.add_batch],
        num_episodes=1,
    )
    collect_driver.run()

    trajectories = policy_training_buffer.gather_all()

    assert trajectories.step_type.shape == (1, trajectory_length + 1)
def test_invalid_num_elites(observation_space, action_space, horizon):
    # some fixed parameters
    population_size = 10
    number_of_particles = 1

    # set up the environment model
    network = LinearTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    environment_model = EnvironmentModel(
        model,
        ConstantReward(observation_space, action_space),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
        population_size,
    )

    # set up the trajectory optimizer
    time_step_space = time_step_spec(observation_space)
    optimiser = cross_entropy_method_trajectory_optimisation(
        time_step_space,
        action_space,
        horizon=horizon,
        population_size=population_size,
        number_of_particles=number_of_particles,
        num_elites=population_size + 1,
        learning_rate=0.1,
        max_iterations=1,
    )

    # remember the time step comes from the real environment with batch size 1
    observation = create_uniform_distribution_from_spec(observation_space).sample(
        sample_shape=(1,)
    )
    initial_time_step = restart(observation, batch_size=1)

    # run
    with pytest.raises(AssertionError) as excinfo:
        optimiser.optimise(initial_time_step, environment_model)

    assert "num_elites" in str(excinfo)
def test_train_oracle_transition_model():
    """
    Ensure that a non-trainable oracle transition model does not cause the agent `train`
    method to fail.
    """
    population_size = 1
    number_of_particles = 1
    horizon = 10
    trajectory_optimiser = random_shooting_trajectory_optimisation(
        TIMESTEP_SPEC, ACTION_SPEC, horizon, population_size, number_of_particles
    )
    transition_model = StubTransitionModel(OBSERVATION_SPEC, ACTION_SPEC)
    reward_model = ConstantReward(OBSERVATION_SPEC, ACTION_SPEC)
    initial_state_model = create_uniform_initial_state_distribution(OBSERVATION_SPEC)
    train_step_counter = common.create_variable(
        "train_step_counter", shape=(), dtype=tf.float64
    )
    with pytest.warns(RuntimeWarning):
        agent = DecisionTimePlanningAgent(
            TIMESTEP_SPEC,
            ACTION_SPEC,
            transition_model,
            reward_model,
            initial_state_model,
            trajectory_optimiser,
            train_step_counter=train_step_counter,
        )

    dummy_trajectories = generate_dummy_trajectories(
        OBSERVATION_SPEC, ACTION_SPEC, batch_size=population_size, trajectory_length=horizon
    )
    train_kwargs = {
        TRAIN_ARGSPEC_COMPONENT_ID: EnvironmentModelComponents.TRANSITION.value
    }
    loss_info = agent.train(dummy_trajectories, **train_kwargs)

    assert loss_info.loss is None
    assert loss_info.extra is None
def test_trajectory_optimiser_with_particles_actions_shape(
    action_space, horizon, population_size, number_of_particles
):
    observation = create_uniform_distribution_from_spec(OBSERVATION_SPACE_SPEC).sample(
        sample_shape=(population_size * number_of_particles,)
    )
    transition_model = TrajectoryOptimiserTransitionModel(action_space, repeat(observation))
    reward = ConstantReward(OBSERVATION_SPACE_SPEC, action_space, -1.0)
    termination_model = ConstantFalseTermination(OBSERVATION_SPACE_SPEC)
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=termination_model,
        initial_state_distribution_model=DeterministicInitialStateModel(StepType.FIRST),
        batch_size=population_size * number_of_particles,
    )
    time_step_space = time_step_spec(OBSERVATION_SPACE_SPEC)
    policy = RandomTFPolicy(time_step_space, action_space, automatic_state_reset=False)
    trajectory_optimiser = PolicyTrajectoryOptimiser(
        policy,
        horizon=horizon,
        population_size=population_size,
        number_of_particles=number_of_particles,
        max_iterations=2,
    )
    initial_time_step = restart(tf.expand_dims(observation[0], axis=0))

    optimal_actions = trajectory_optimiser.optimise(initial_time_step, environment_model)

    assert optimal_actions.shape == (horizon + 1,) + action_space.shape
def test_mismatch_between_optimizer_and_environment_model_batch_size(
    observation_space, action_space, optimiser_policy_trajectory_optimiser_factory
):
    time_step_space = time_step_spec(observation_space)
    environment_model = EnvironmentModel(
        StubTrainableTransitionModel(
            observation_space, action_space, predict_state_difference=True
        ),
        ConstantReward(observation_space, action_space),
        ConstantFalseTermination(observation_space),
        create_uniform_initial_state_distribution(observation_space),
    )
    population_size = environment_model.batch_size + 1
    trajectory_optimiser = optimiser_policy_trajectory_optimiser_factory(
        time_step_space, action_space, 1, population_size, 1
    )

    # remember the time step comes from the real environment with batch size 1
    observation = create_uniform_distribution_from_spec(observation_space).sample(
        sample_shape=(1,)
    )
    time_step = restart(observation, batch_size=1)

    with pytest.raises(AssertionError) as excinfo:
        _ = trajectory_optimiser.optimise(time_step, environment_model)

    assert "batch_size parameter is not equal to environment_model.batch_size" in str(
        excinfo
    )
batch_size = 64
training_spec = KerasTrainingSpec(
    epochs=5000,
    training_batch_size=256,
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
    verbose=0,
)
linear_transition_network = LinearTransitionNetwork(tf_env.observation_spec())
trajectory_sampling_strategy = InfiniteHorizonTrajectorySampling(batch_size, 1)
transition_model = KerasTransitionModel(
    [linear_transition_network],
    tf_env.observation_spec(),
    tf_env.action_spec(),
)
reward_model = ConstantReward(tf_env.observation_spec(), tf_env.action_spec())

sample_transitions = sample_uniformly_distributed_transitions(
    transition_model, 1000, reward_model
)

# %%
plot_mountain_car_transitions(
    sample_transitions.observation.numpy(),
    sample_transitions.action.numpy(),
    sample_transitions.next_observation.numpy(),
)

# %% [markdown]
"""
## TF-Agents Agent
def test_trajectory_optimiser_each_iteration_starts_with_the_initial_observation(
    action_space, horizon, batch_size, max_iterations
):
    class WrappedRandomTFPolicy(TFPolicy):
        def __init__(
            self,
            ts_spec: ts.TimeStep,
            action_spec: types.NestedTensorSpec,
            env_model: EnvironmentModel,
        ):
            super().__init__(ts_spec, action_spec)

            self._internal_policy = RandomTFPolicy(ts_spec, action_space)
            self._environment_model = env_model

        def _action(
            self,
            time_step: ts.TimeStep,
            policy_state: types.NestedTensor,
            seed: Optional[types.Seed],
        ) -> policy_step.PolicyStep:
            np.testing.assert_array_equal(
                time_step.observation,
                self._environment_model.current_time_step().observation,
            )
            return self._internal_policy._action(time_step, policy_state, seed)

        def _distribution(
            self, time_step: ts.TimeStep, policy_state: types.NestedTensorSpec
        ) -> policy_step.PolicyStep:
            raise NotImplementedError()

    observations = list(
        repeat(
            replicate(tf.constant(StepType.MID), [batch_size]),
            max_iterations * (horizon + 1),
        )
    )
    transition_model = TrajectoryOptimiserTransitionModel(action_space, iter(observations))
    reward = ConstantReward(OBSERVATION_SPACE_SPEC, action_space, -1.0)
    termination_model = TrajectoryOptimiserTerminationModel(OBSERVATION_SPACE_SPEC)
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=termination_model,
        initial_state_distribution_model=DeterministicInitialStateModel(StepType.FIRST),
        batch_size=batch_size,
    )
    time_step_space = time_step_spec(OBSERVATION_SPACE_SPEC)
    policy = WrappedRandomTFPolicy(time_step_space, action_space, environment_model)

    trajectory_optimiser = PolicyTrajectoryOptimiser(
        policy,
        horizon=horizon,
        population_size=batch_size,
        max_iterations=max_iterations,
    )

    initial_time_step = restart(
        tf.expand_dims(tf.constant(StepType.FIRST), axis=0), batch_size=1
    )

    trajectory_optimiser.optimise(initial_time_step, environment_model)