def test_ensemble_size_set_correctly():
    """
    For ensemble transition models, the ensemble size needs to be larger than 1.
    """
    # setup the environment and prespecified model components
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(observation_spec)

    # trajectory optimiser
    trajectory_optimiser_type = TrajectoryOptimizationType.CrossEntropyMethod
    transition_model_type = TransitionModelType.DeterministicEnsemble
    trajectory_sampler_type = TrajectorySamplerType.TS1

    # some parameters need to be set correctly
    ensemble_size = 1
    population_size = 10
    number_of_particles = 1
    horizon = 1

    # define the agent; many transition model and trajectory optimiser parameters
    # can be arbitrary
    with pytest.raises(AssertionError) as excinfo:
        PetsAgent(
            time_step_spec,
            action_spec,
            transition_model_type,
            1,
            10,
            tf.nn.relu,
            ensemble_size,
            False,
            1,
            1,
            [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
            reward_model,
            initial_state_distribution_model,
            trajectory_sampler_type,
            trajectory_optimiser_type,
            horizon,
            population_size,
            number_of_particles,
        )

    assert "ensemble_size should be > 1" in str(excinfo.value)
def test_planning_policy_batch_environment_model():
    """
    Ensure that the planning policy is operational.
    """
    # number of trajectories for planning and planning horizon
    population_size = 3
    planner_horizon = 5
    number_of_particles = 1

    # setup the environment and a model of it
    py_env = suite_gym.load("MountainCar-v0")
    tf_env = TFPyEnvironment(py_env)
    reward = MountainCarReward(tf_env.observation_spec(), tf_env.action_spec())
    terminates = MountainCarTermination(tf_env.observation_spec())
    network = LinearTransitionNetwork(tf_env.observation_spec())
    transition_model = KerasTransitionModel(
        [network],
        tf_env.observation_spec(),
        tf_env.action_spec(),
    )
    initial_state = MountainCarInitialState(tf_env.observation_spec())
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=terminates,
        initial_state_distribution_model=initial_state,
    )

    # setup the trajectory optimiser
    random_policy = RandomTFPolicy(tf_env.time_step_spec(), tf_env.action_spec())
    trajectory_optimiser = PolicyTrajectoryOptimiser(
        random_policy, planner_horizon, population_size, number_of_particles
    )
    planning_policy = PlanningPolicy(environment_model, trajectory_optimiser)

    # test whether it runs
    collect_driver_planning_policy = DynamicEpisodeDriver(
        tf_env, planning_policy, num_episodes=1
    )
    time_step = tf_env.reset()
    collect_driver_planning_policy.run(time_step)
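# The helper below is an illustrative sketch, not part of the original test
# suite. It assumes that PlanningPolicy follows the standard TF-Agents TFPolicy
# interface, so a single planned action can also be computed directly from a
# time step instead of wrapping the policy in a driver.
def _plan_single_step_sketch(environment, planning_policy):
    """Reset the environment, plan one action and apply it (hypothetical usage)."""
    time_step = environment.reset()
    action_step = planning_policy.action(time_step)
    return environment.step(action_step.action)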
def test_sample_trajectory_for_mountain_car():
    tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load("MountainCar-v0"))
    network = LinearTransitionNetwork(tf_env.observation_spec())
    model = KerasTransitionModel(
        [network],
        tf_env.observation_spec(),
        tf_env.action_spec(),
    )
    reward = ConstantReward(tf_env.observation_spec(), tf_env.action_spec(), -1.0)
    terminates = MountainCarTermination(tf_env.observation_spec())
    initial_state_sampler = MountainCarInitialState(tf_env.observation_spec())
    environment = TFTimeLimit(
        EnvironmentModel(model, reward, terminates, initial_state_sampler),
        duration=200,
    )

    collect_policy = RandomTFPolicy(tf_env.time_step_spec(), tf_env.action_spec())
    replay_buffer_capacity = 1001
    policy_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        collect_policy.trajectory_spec,
        batch_size=1,
        max_length=replay_buffer_capacity,
    )

    collect_episodes_per_iteration = 2
    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        environment,
        collect_policy,
        observers=[policy_training_buffer.add_batch],
        num_episodes=collect_episodes_per_iteration,
    )
    collect_driver.run()

    trajectory = policy_training_buffer.gather_all()
    first_batch_step_type = trajectory.step_type[0, :]
    assert (
        first_batch_step_type[0] == StepType.FIRST
        and first_batch_step_type[-1] == StepType.LAST
    )
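# Illustrative follow-up sketch, not part of the original test: trajectories
# collected into a TFUniformReplayBuffer can also be exposed as a tf.data.Dataset
# of (trajectory, sample_info) pairs, which is the usual route when the buffer
# feeds a training loop rather than a one-off gather_all() check.
def _as_training_dataset_sketch(replay_buffer, sample_batch_size=32, num_steps=2):
    """Return a dataset of sub-trajectories sampled from the buffer (hypothetical usage)."""
    return replay_buffer.as_dataset(
        sample_batch_size=sample_batch_size,
        num_steps=num_steps,
    )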
    sample_transitions.next_observation.numpy(),
)

# %% [markdown]
"""
## Training on samples

We define an environment which uses the trained transition model for the dynamics,
along with a reward function, an episode termination condition, an initial state
distribution, and a bound on episode length.
"""

# %%
reward = MountainCarReward(tf_env.observation_spec(), tf_env.action_spec())
terminates = MountainCarTermination(tf_env.observation_spec())
initial_state_distribution = MountainCarInitialState(tf_env.observation_spec())
environment_model = TFTimeLimit(
    EnvironmentModel(transition_model, reward, terminates, initial_state_distribution),
    duration=200,
)

# %% [markdown]
"""
The agent is trained on data gathered from the environment model. Using the
environment interface means the TF-Agents drivers can be used to generate rollouts.
"""

# %%
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=tf_agent.collect_data_spec,
    batch_size=tf_env.batch_size,
def test_all_mepo_variants_work(transition_model, trajectory_sampler, model_free_agent_type):
    """
    The MepoAgent has prespecified transition model, trajectory sampler and
    model-free agent types. Here we check that all combinations execute without
    errors.
    """
    # setup the environment and prespecified model components
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(observation_spec)

    # some parameters need to be set correctly
    ensemble_size = 2
    num_elites = 10
    population_size = num_elites + 10
    horizon = 1

    # define the agent; many transition model and trajectory optimiser parameters
    # can be arbitrary
    agent = MepoAgent(
        time_step_spec,
        action_spec,
        transition_model,
        1,
        10,
        tf.nn.relu,
        ensemble_size,
        False,
        1,
        1,
        [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
        reward_model,
        initial_state_distribution_model,
        trajectory_sampler,
        horizon,
        population_size,
        model_free_agent_type,
        1,
        10,
        tf.nn.relu,
        2,
    )

    # we need some training data
    random_policy = RandomTFPolicy(
        time_step_spec,
        action_spec,
        info_spec=agent.collect_policy.info_spec,
    )
    model_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        random_policy.trajectory_spec, batch_size=1, max_length=1000
    )
    collect_driver_random_policy = TFDriver(
        tf_env,
        random_policy,
        observers=[model_training_buffer.add_batch],
        max_steps=10,
        disable_tf_function=True,
    )
    initial_time_step = tf_env.reset()
    collect_driver_random_policy.run(initial_time_step)

    # train the transition model on the collected data
    pets_agent_trainer = BackgroundPlanningAgentTrainer(10, 10)
    tf_training_scheduler = pets_agent_trainer.create_training_scheduler(
        agent, model_training_buffer
    )
    training_losses = tf_training_scheduler.maybe_train(tf.constant(10, dtype=tf.int64))
    assert EnvironmentModelComponents.TRANSITION in training_losses

    # test the agent
    collect_driver_planning_policy = TFDriver(
        tf_env,
        agent.collect_policy,
        observers=[model_training_buffer.add_batch],
        max_steps=10,
        disable_tf_function=True,
    )
    time_step = tf_env.reset()
    collect_driver_planning_policy.run(time_step)