Example #1
    def test_train_tf_agents(self):
        env_name = "CartPole-v0"
        model_name = "tf_agents_dqn"
        env = gym.make(env_name)
        train_env = environment_converter.gym_to_tf(env)
        fc_layer_params = (100, )
        q_net = q_network.QNetwork(
            input_tensor_spec=train_env.observation_spec(),
            action_spec=train_env.action_spec(),
            fc_layer_params=fc_layer_params,
        )
        optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3)
        agent = dqn_agent.DqnAgent(
            time_step_spec=train_env.time_step_spec(),
            action_spec=train_env.action_spec(),
            q_network=q_net,
            optimizer=optimizer,
            td_errors_loss_fn=common.element_wise_squared_loss,
        )
        agent.initialize()

        train(
            model=agent,
            env=env,
            total_timesteps=1500,
            stop_threshold=4000,
            model_name=model_name,
            maximum_episode_reward=195,
        )
        trained_env = get_saved_environments()[0]
        trained_models = get_trained_model_names(trained_env)
        model_saved = model_name in trained_models
        shutil.rmtree(save_path)
        self.assertTrue(model_saved)
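
The TF-Agents excerpts (Examples #1, #2, and #7 below) omit their imports. A minimal sketch of what they appear to rely on, assuming the standard tf_agents package layout; train, train_multiple, environment_converter, get_saved_environments, get_trained_model_names, and save_path come from the library under test, whose import path is not shown in these excerpts:

import shutil

import gym
import tensorflow as tf
from tf_agents.agents.dqn import dqn_agent
from tf_agents.agents.ppo import ppo_agent
from tf_agents.agents.reinforce import reinforce_agent
from tf_agents.networks import actor_distribution_network, q_network, value_network
from tf_agents.utils import common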
Example #2
    def test_tf_agents_on_policy_agent(self):
        learning_rate = 1e-3
        actor_fc_layers = (200, 100)
        value_fc_layers = (200, 100)
        env_name = "CartPole-v0"
        gym_env = gym.make(env_name)
        model_name = "ppo_tf_agent"
        train_env = environment_converter.gym_to_tf(gym_env)
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            train_env.observation_spec(),
            train_env.action_spec(),
            fc_layer_params=actor_fc_layers,
        )
        value_net = value_network.ValueNetwork(train_env.observation_spec(),
                                               fc_layer_params=value_fc_layers)

        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate)
        agent = ppo_agent.PPOAgent(
            train_env.time_step_spec(),
            train_env.action_spec(),
            optimizer,
            actor_net=actor_net,
            value_net=value_net,
        )
        agent.initialize()

        # Train
        train(agent, gym_env, 2000, 195, model_name, 200)
        trained_env = get_saved_environments()[0]
        trained_models = get_trained_model_names(trained_env)
        model_saved = model_name in trained_models
        shutil.rmtree(save_path)
        self.assertTrue(model_saved)
Example #3
    def test_multiple_stable_baselines(self):
        env_name = "CartPole-v0"
        env = gym.make(env_name)
        models = [
            DQN("MlpPolicy", gym.make(env_name), learning_rate=1e-3),
            A2C(policy="MlpPolicy", env=gym.make(env_name), verbose=1),
            PPO(policy="MlpPolicy", env=gym.make(env_name), verbose=1),
        ]
        model_names = ["Simple DQN", "A2C", "PPO"]
        train_multiple(models, env, 1470, 195, model_names, 200)
        trained_env = get_saved_environments()[0]
        trained_models = get_trained_model_names(trained_env)
        model_saved = set(model_names) == set(trained_models)
        shutil.rmtree(save_path)
        self.assertTrue(model_saved)
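
The Stable-Baselines examples (Examples #3 and #4) likewise omit their imports. The use of a PPO class (rather than PPO1/PPO2) suggests stable_baselines3; treat the package name as an assumption:

import shutil

import gym
from stable_baselines3 import A2C, DQN, PPO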
Example #4
    def test_train_stable_baselines(self):
        env = gym.make("CartPole-v0")
        model_name = "dqn_test"
        model = DQN(
            policy="MlpPolicy",
            env=env,
            learning_rate=1e-3,
            verbose=1,
        )
        train(
            model=model,
            env=env,
            total_timesteps=1500,
            stop_threshold=4000,
            model_name=model_name,
            maximum_episode_reward=195,
        )
        trained_env = get_saved_environments()[0]
        trained_models = get_trained_model_names(trained_env)
        model_saved = model_name in trained_models
        shutil.rmtree(save_path)
        self.assertTrue(model_saved)
Example #5
def check_models_are_saved(model_names: typing.List[str]) -> bool:
    trained_env = get_saved_environments()[0]
    trained_models = get_trained_model_names(trained_env)
    return set(model_names) == set(trained_models)
Example #6
def check_model_is_saved(model_name: str) -> bool:
    trained_env = get_saved_environments()[0]
    trained_models = get_trained_model_names(trained_env)
    return model_name in set(trained_models)
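
check_models_are_saved and check_model_is_saved factor out the save-check pattern that the test methods above repeat inline. A hedged sketch of Example #4 rewritten against these helpers (assuming they are importable from the same test utilities as train and save_path):

    def test_train_stable_baselines(self):
        env = gym.make("CartPole-v0")
        model = DQN(policy="MlpPolicy", env=env, learning_rate=1e-3, verbose=1)
        train(
            model=model,
            env=env,
            total_timesteps=1500,
            stop_threshold=4000,
            model_name="dqn_test",
            maximum_episode_reward=195,
        )
        # Replaces the inline get_saved_environments()/get_trained_model_names() check.
        model_saved = check_model_is_saved("dqn_test")
        shutil.rmtree(save_path)
        self.assertTrue(model_saved)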
Example #7
    def test_multiple_tf_agents(self):
        env_name = "CartPole-v0"
        # DQN
        env = gym.make(env_name)
        train_env = environment_converter.gym_to_tf(env)
        fc_layer_params = (100, )
        q_net = q_network.QNetwork(
            input_tensor_spec=train_env.observation_spec(),
            action_spec=train_env.action_spec(),
            fc_layer_params=fc_layer_params,
        )
        optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3)
        dqn_tf_agent = dqn_agent.DqnAgent(
            time_step_spec=train_env.time_step_spec(),
            action_spec=train_env.action_spec(),
            q_network=q_net,
            optimizer=optimizer,
            td_errors_loss_fn=common.element_wise_squared_loss,
        )
        dqn_tf_agent.initialize()

        # PPO
        env = gym.make(env_name)
        actor_fc_layers = (200, 100)
        value_fc_layers = (200, 100)
        learning_rate = 1e-3
        train_env = environment_converter.gym_to_tf(env)
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            train_env.observation_spec(),
            train_env.action_spec(),
            fc_layer_params=actor_fc_layers,
        )
        value_net = value_network.ValueNetwork(train_env.observation_spec(),
                                               fc_layer_params=value_fc_layers)
        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate)

        ppo_tf_agent = ppo_agent.PPOAgent(
            train_env.time_step_spec(),
            train_env.action_spec(),
            optimizer,
            actor_net=actor_net,
            value_net=value_net,
        )
        ppo_tf_agent.initialize()

        # REINFORCE:
        env = gym.make(env_name)
        train_env = environment_converter.gym_to_tf(env)
        learning_rate = 1e-3
        fc_layer_params = (100, )
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            train_env.observation_spec(),
            train_env.action_spec(),
            fc_layer_params=fc_layer_params,
        )
        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate)
        train_step_counter = tf.compat.v2.Variable(0)
        reinforce_tf_agent = reinforce_agent.ReinforceAgent(
            train_env.time_step_spec(),
            train_env.action_spec(),
            actor_network=actor_net,
            optimizer=optimizer,
            normalize_returns=True,
            train_step_counter=train_step_counter,
        )
        reinforce_tf_agent.initialize()

        agents = [dqn_tf_agent, ppo_tf_agent, reinforce_tf_agent]
        agent_names = ["dqn_agent", "ppo_agent", "reinforce_agent"]

        train_multiple(agents, env, 1470, 195, agent_names, 200)

        trained_env = get_saved_environments()[0]
        trained_models = get_trained_model_names(trained_env)
        model_saved = set(agent_names) == set(trained_models)
        shutil.rmtree(save_path)
        self.assertTrue(model_saved)