Example #1
def run_task(*_):
    sess = tf.Session()
    sess.__enter__()
    snapshot = joblib.load(latent_policy_pkl)
    latent_policy = snapshot["policy"]
    inner_env = SimpleReacherEnv(goal_position=(0.65, 0.3, 0.3),
                                 control_method="position_control",
                                 completion_bonus=30)

    env = TfEnv(EmbeddedPolicyEnv(inner_env, latent_policy))
    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        init_std=20,
        # std_share_network=False,
        # adaptive_std=True
    )
    baseline = GaussianMLPBaseline(env_spec=env.spec, include_action_to_input=False)

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=1024,  # 4096
        max_path_length=100,
        n_itr=1500,
        discount=0.99,
        step_size=0.2,
        policy_ent_coeff=1e-6,
        plot=True,
    )
    algo.train(sess=sess)
Example #2
def run_task(*_):
    sess = tf.Session()
    sess.__enter__()
    latent_policy = joblib.load(latent_policy_pkl)["policy"]

    inner_env = PointEnv(goal=(1.4, 1.4), completion_bonus=100)
    env = TfEnv(EmbeddedPolicyEnv(inner_env, latent_policy))

    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=(64, 64),
                               init_std=20,
                               std_share_network=False,
                               adaptive_std=True)
    baseline = GaussianMLPBaseline(env_spec=env.spec, include_action_to_input=False)

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=1024,  # 4096
        max_path_length=50,
        n_itr=1500,
        discount=0.99,
        step_size=0.2,
        policy_ent_coeff=1e-6,
        plot=True,
        use_mpc_es=True,
    )
    algo.train(sess=sess)
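
Both snippets above rely on module-level setup that the listing does not show. Below is a minimal sketch of that setup, assuming the pre-2019 garage package layout; latent_policy_pkl is a hypothetical placeholder path, and SimpleReacherEnv, PointEnv and EmbeddedPolicyEnv come from the project that produced the latent-policy snapshot, so their import paths are omitted.

import joblib
import tensorflow as tf

from garage.tf.algos import PPO
from garage.tf.baselines import GaussianMLPBaseline
from garage.tf.envs import TfEnv
from garage.tf.policies import GaussianMLPPolicy

# Joblib snapshot expected to hold a trained latent-conditioned policy
# under the "policy" key (placeholder path, adjust to your experiment).
latent_policy_pkl = "/path/to/latent_policy_snapshot.pkl"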
Example #3
def run_task(*_):
    """
    Wrap PPO training task in the run_task function.

    :param _:
    :return:
    """
    env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))

    policy = GaussianMLPPolicy(
        name="policy", env_spec=env.spec, hidden_sizes=(64, 64))

    baseline = GaussianMLPBaseline(env_spec=env.spec)

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=2048,
        max_path_length=100,
        n_itr=488,
        discount=0.99,
        step_size=0.01,
        optimizer_args=dict(batch_size=32, max_epochs=10),
        plot=False)
    algo.train()
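
The run_task wrapper above is meant to be handed to a launcher rather than called directly. A minimal launcher sketch, assuming garage's run_experiment helper from the same generation as the snippet; the exact import path (garage.experiment vs. the older garage.misc.instrument) and keyword names have varied across garage versions.

from garage.experiment import run_experiment

run_experiment(
    run_task,
    n_parallel=1,          # number of sampler workers
    snapshot_mode="last",  # keep only the latest snapshot
    seed=1,
    plot=False,
)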
Example #4
def run_task(v):
    v = SimpleNamespace(**v)

    # Environment
    env = SimpleReacherEnv(
        goal_position=GOALS[0],
        control_method="position_control",
        completion_bonus=5,
    )

    env = TfEnv(env)

    # Policy
    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=(64, 32),
        init_std=v.policy_init_std,
    )

    baseline = GaussianMLPBaseline(env_spec=env.spec)

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=v.batch_size,  # 4096
        max_path_length=v.max_path_length,
        n_itr=1000,
        discount=0.99,
        step_size=0.2,
        optimizer_args=dict(batch_size=32, max_epochs=10),
        plot=True,
    )
    algo.train()
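
This example, like Examples #5 and #8 below, unpacks a variant dict v inside run_task(v). A hedged sketch of how that dict might be supplied: the keys mirror the v.* attributes used above, while the concrete values and the variant= keyword are illustrative assumptions rather than settings from the original experiment.

from garage.experiment import run_experiment

run_experiment(
    run_task,
    variant=dict(
        batch_size=4096,        # matches the commented-out hint above
        max_path_length=100,
        policy_init_std=1.0,
    ),
    seed=1,
    plot=True,
)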
Example #5
def run_task(v):
    v = SimpleNamespace(**v)

    # Environment
    env = SimpleReacherEnv(
        goal_position=GOALS[0],
        control_method="position_control",
        # control_cost_coeff=1.0,
        action_scale=0.04,
        randomize_start_jpos=True,
        completion_bonus=0.1,
        # terminate_on_collision=True,
        collision_penalty=0.0,
    )
    env = TfEnv(env)

    # Policy
    policy = GaussianMLPPolicy(
        name="Policy",
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        std_share_network=True,
        init_std=v.policy_init_std,
    )

    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args=dict(hidden_sizes=(64, 64)),
    )

    # baseline = CollisionAwareBaseline(
    #     env_spec=env.spec,
    #     regressor_args=dict(hidden_sizes=(64, 64)),
    # )

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=v.batch_size,  # 4096
        max_path_length=v.max_path_length,
        n_itr=10000,
        discount=0.99,
        step_size=0.2,
        policy_ent_coeff=0.,
        optimizer_args=dict(batch_size=32, max_epochs=10),
        plot=True,
    )
    algo.train()
Example #6
    def test_ppo_pendulum(self):
        """Test PPO with Pendulum environment."""
        logger._tensorboard = TensorBoardOutput()
        env = TfEnv(normalize(gym.make("Pendulum-v0")))
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=1024,
            max_path_length=100,
            n_itr=10,
            discount=0.99,
            gae_lambda=0.98,
            policy_ent_coeff=0.0,
            plot=False,
        )
        last_avg_ret = algo.train(sess=self.sess)
        assert last_avg_ret > -1000
Example #7
    def test_ppo_pendulum_with_model(self):
        """Test PPO with model, with Pendulum environment."""
        logger.reset()
        env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
        policy = GaussianMLPPolicyWithModel(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=2048,
            max_path_length=100,
            n_itr=10,
            discount=0.99,
            lr_clip_range=0.01,
            optimizer_args=dict(batch_size=32, max_epochs=10),
            plot=False,
        )
        last_avg_ret = algo.train(sess=self.sess)
        assert last_avg_ret > 40

        env.close()
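
The two test methods above call algo.train(sess=self.sess), so they assume a fixture that creates a fresh TensorFlow graph and default session per test; garage's own test suite provides a base class along these lines (TfGraphTestCase at the time). A minimal sketch with illustrative class and attribute names:

import unittest

import tensorflow as tf


class TestPPOExamples(unittest.TestCase):
    def setUp(self):
        # Fresh graph and default session per test, so the snippets can
        # pass self.sess into algo.train().
        self.graph = tf.Graph().as_default()
        self.graph.__enter__()
        self.sess = tf.Session()
        self.sess.__enter__()

    def tearDown(self):
        self.sess.__exit__(None, None, None)
        self.graph.__exit__(None, None, None)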
Example #8
def run_task(v):
    v = SimpleNamespace(**v)

    # Environment
    env = SimplePusherEnv(action_scale=0.04,
                          control_method="position_control",
                          completion_bonus=0.1,
                          collision_penalty=0.05)

    env = TfEnv(env)

    # Policy
    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=(256, 128),
        init_std=v.policy_init_std,
    )

    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args=dict(hidden_sizes=(256, 128)),
    )

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=v.batch_size,  # 4096
        max_path_length=v.max_path_length,
        n_itr=2000,
        discount=0.99,
        step_size=0.2,
        optimizer_args=dict(batch_size=32, max_epochs=10),
        plot=True,
    )
    algo.train()
Example #9
def run_garage(env, seed, log_dir):
    """
    Create garage model and training.

    Replace the ppo with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trail.
    :param log_dir: Log dir path.
    :return:
    """
    ext.set_seed(seed)

    with tf.Graph().as_default():
        env = TfEnv(normalize(env))

        policy = GaussianMLPPolicy(
            name="policy",
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=True,
            ),
        )

        algo = PPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=2048,
            max_path_length=100,
            n_itr=488,
            discount=0.99,
            gae_lambda=0.95,
            clip_range=0.1,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(
                    learning_rate=3e-4,
                    epsilon=1e-5,
                ),
            ),
            plot=False,
        )

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, "progress.csv")
        garage_logger.add_tabular_output(tabular_log_file)
        garage_logger.set_tensorboard_dir(log_dir)

        algo.train()

        garage_logger.remove_tabular_output(tabular_log_file)

        return tabular_log_file
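
A hedged usage sketch for run_garage above; the environment id, seed and log directory are illustrative, not values from the original benchmark.

import gym

csv_path = run_garage(
    gym.make("Swimmer-v2"),  # any continuous-control gym task works here
    seed=1,
    log_dir="/tmp/ppo_garage_swimmer",
)
print("progress written to", csv_path)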