Example #1
def run_task(v):
    env = TfEnv(normalize(CartpoleEnv()))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers,
        # each with 32 hidden units.
        hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=v["step_size"],
        # Enable live plotting of the policy during training
        plot=True,
    )
    algo.train()
Example #2
def run_task(v):
    """
    We wrap the main training loop in the run_task function so that
    run_experiment can easily execute variants of the experiment on different
    machines
    """
    env = TfEnv(env_name="CartPole-v1")

    policy = CategoricalMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers,
        # each with 32 hidden units.
        hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=v["step_size"],
        # Enable live plotting of the policy during training
        plot=True,
    )
    algo.train()
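
The docstring above mentions run_experiment, but the launcher itself sits outside the snippet. Below is a minimal driver sketch, assuming garage's run_experiment forwards a variant dict to run_task as its v argument (the keyword names here are illustrative, not confirmed by the snippet):

from garage.experiment import run_experiment

run_experiment(
    run_task,
    # run_task reads v["step_size"] from this variant dict
    variant=dict(step_size=0.01),
    n_parallel=1,
    snapshot_mode="last",
    seed=1,
)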
Example #3
def run_task(*_):

    env = TfEnv(
        normalize(
            GridworldGathererEnv(
                plot={
                    'visitation': {
                        # 'save': '~/garage/data/local/gridworld/instant-run',
                        'save': False,
                        'live': True
                    }
                })))

    policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))

    baseline = GaussianMLPBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=5000,
        max_path_length=100,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
    )

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as session:
        algo.train(sess=session)
Example #4
    def test_dm_control_tf_policy(self):
        task = ALL_TASKS[0]

        with self.graph.as_default():
            env = TfEnv(DmControlEnv.from_suite(*task))

            policy = GaussianMLPPolicy(
                env_spec=env.spec,
                hidden_sizes=(32, 32),
            )

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=10,
                max_path_length=5,
                n_itr=1,
                discount=0.99,
                step_size=0.01,
            )
            algo.train()
            env.close()
Example #5
def run_task(*_):

    env = TfEnv(
        normalize(
            MinibotEnv(
                use_maps=[0, 1],  # pass 'all' to train on every map
                discretized=True)))

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=5000,
                max_path_length=100,
                n_itr=15,
                discount=0.99,
                step_size=0.01,
                plot=plot,  # "plot" is a module-level flag defined outside this snippet
                pause_for_plot=False)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as session:
        algo.train(sess=session)
Example #6
def run_task(*_):
    env = TfEnv(
        normalize(
            OneHotMultiTaskEnv(task_env_cls=PR2ArmClockEnv,
                               task_args=TASK_ARGS,
                               task_kwargs=TASK_KWARGS)))

    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=400000000,  # effectively train until interrupted
        discount=0.99,
        step_size=0.01,
        plot=True,
    )
    algo.train()
Example #7
def run_garage(env, seed, log_dir):
    """
    Create garage model and training.

    Replace the trpo with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trail.
    :param log_dir: Log dir path.
    :return:import baselines.common.tf_util as U
    """
    ext.set_seed(seed)

    with tf.Graph().as_default():
        env = TfEnv(normalize(env))

        policy = GaussianMLPPolicy(
            name="policy",
            env_spec=env.spec,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(32, 32),
                use_trust_region=True,
            ),
        )

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=1024,
            max_path_length=100,
            n_itr=976,
            discount=0.99,
            gae_lambda=0.98,
            clip_range=0.1,
            policy_ent_coeff=0.0,
            plot=False,
        )

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, "progress.csv")
        garage_logger.add_tabular_output(tabular_log_file)
        garage_logger.set_tensorboard_dir(log_dir)

        algo.train()

        garage_logger.remove_tabular_output(tabular_log_file)

        return tabular_log_file
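
Since run_garage returns the path of its progress.csv, the results can be inspected after training. A consumer sketch; it assumes env was created by the caller, and "AverageReturn" as the column name is an assumption about the logger's tabular keys:

import csv

log_file = run_garage(env, seed=1, log_dir="/tmp/trpo_benchmark")
with open(log_file) as f:
    rows = list(csv.DictReader(f))
# The last row holds the final iteration's statistics
print(rows[-1].get("AverageReturn"))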
Example #8
    def test_trpo_pendulum(self):
        """Test TRPO with Pendulum environment."""
        logger.reset()
        env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=2048,
            max_path_length=100,
            n_itr=10,
            discount=0.99,
            gae_lambda=0.98,
            policy_ent_coeff=0.0,
            plot=False,
        )
        last_avg_ret = algo.train(sess=self.sess)
        assert last_avg_ret > 50

        env.close()
Example #9
def run_task(v):
    env = normalize(PointEnv())

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=v['step_size'],
        # plot=True,
    )
    algo.train()
Example #10
def run_task(*_):
    env = TfEnv(normalize(gym.make("Pendulum-v0")))

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.max_episode_steps,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        plot=True,
    )
    algo.train()
Example #11
def run_task(*_):
    env = TfEnv(normalize(gym.make('MountainCar-v0')))

    policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.max_episode_steps,
        n_itr=150,
        discount=0.99,
        step_size=0.1,
        plot=True,
    )
    algo.train()
Example #12
def run_task(*_):
    env = FlatGoalEnv(SawyerPickEnv(), obs_keys=["state_observation"])
    env = TfEnv(normalize(env))

    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=4000,
                max_path_length=500,
                n_itr=500,
                discount=0.99,
                step_size=0.01,
                plot=True)
    algo.train()
Example #13
def run_task(*_):
    """Wrap TRPO training task in the run_task function."""
    env = TfEnv(normalize(CartpoleEnv()))

    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=4000,
                max_path_length=100,
                n_itr=100,
                discount=0.99,
                max_kl_step=0.01,
                plot=False)
    algo.train()
Example #14
def run_task(v):
    v = SimpleNamespace(**v)

    # Environment
    env = FlatTorqueReacher(
        fix_goal=True,
        fixed_goal=GOALS[0],
        reward_type="hand_distance",
        hand_distance_completion_bonus=0.,
        torque_limit_pct=0.2,
        indicator_threshold=0.03,
        velocity_penalty_coeff=0.01,
        action_scale=10.0,
        hide_goal_pos=True,
    )
    env = TfEnv(normalize(env))

    # Policy
    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=(64, 32),
        init_std=v.policy_init_std,
    )

    baseline = GaussianMLPBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=v.batch_size,  # 4096
        max_path_length=v.max_path_length,
        n_itr=1000,
        discount=0.99,
        step_size=0.01,
        plot=True,
        # optimizer_args=dict(max_grad_norm=0.5)
    )
    algo.train()
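
run_task above unpacks its variant dict into a SimpleNamespace, so the launcher must supply every attribute it reads. A hypothetical variant, assuming the run_experiment(variant=...) interface sketched earlier (the policy_init_std and max_path_length values are illustrative; batch_size mirrors the inline comment):

variant = dict(
    policy_init_std=1.0,
    batch_size=4096,
    max_path_length=100,
)
run_experiment(run_task, variant=variant, seed=1)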
Example #15
def run_task(*_):
    env = TfEnv(normalize(PointEnv(goal=(-1, 0))))

    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=100,
        discount=0.99,
        step_size=0.01,
        plot=False,
        force_batch_sampler=True,
    )
    algo.train()
Example #16
    def test_categorical_policies(self, policy_cls):
        env = TfEnv(normalize(gym.make("CartPole-v0")))

        policy = policy_cls(name="policy", env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=4000,
            max_path_length=100,
            n_itr=1,
            discount=0.99,
            step_size=0.01,
            plot=True,
            optimizer=ConjugateGradientOptimizer,
            optimizer_args=dict(hvp_approach=FiniteDifferenceHvp(
                base_eps=1e-5)),
        )
        algo.train(sess=self.sess)
Example #17
    def test_gaussian_policies(self, policy_cls):
        logger._tensorboard = TensorBoardOutput()
        env = TfEnv(normalize(CartpoleEnv()))

        policy = policy_cls(name="policy", env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=4000,
            max_path_length=100,
            n_itr=1,
            discount=0.99,
            step_size=0.01,
            plot=True,
            optimizer=ConjugateGradientOptimizer,
            optimizer_args=dict(hvp_approach=FiniteDifferenceHvp(
                base_eps=1e-5)),
        )
        algo.train(sess=self.sess)
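
The two tests above reference ConjugateGradientOptimizer and FiniteDifferenceHvp without showing their imports. In garage's TF branch of this era they live in garage.tf.optimizers (an assumption; the module path may differ across versions):

from garage.tf.optimizers import (ConjugateGradientOptimizer,
                                  FiniteDifferenceHvp)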
Example #18
def run_task(vv):

    env = TfEnv(normalize(gym.make('HalfCheetah-v1')))

    policy = GaussianMLPPolicy(env_spec=env.spec,
                               hidden_sizes=(32, 32),
                               name="policy")

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=vv["step_size"],
        # Uncomment both lines (this and the plot parameter below) to enable
        # plotting
        # plot=True,
    )
    algo.train()
Example #19
import gym

from garage.baselines import LinearFeatureBaseline
from garage.experiment import run_experiment
from garage.tf.algos import TRPO
from garage.tf.envs import TfEnv
from garage.tf.policies import CategoricalMLPPolicy

# Need to wrap in a TF environment and set force_reset to True;
# see https://github.com/openai/rllab/issues/87#issuecomment-282519288
env = TfEnv(gym.make("CartPole-v0"))

policy = CategoricalMLPPolicy(name="policy",
                              env_spec=env.spec,
                              hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=200,
    n_itr=120,
    discount=0.99,
    max_kl_step=0.01,
)

run_experiment(algo.train(), n_parallel=1, snapshot_mode="last", seed=1)
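
Note that the launcher above evaluates algo.train() eagerly and passes its result to run_experiment, a pattern inherited from rllab's stub-based launcher. In plain garage, the run_task wrapper style used elsewhere on this page is the more usual form; a sketch reusing this example's imports:

def run_task(*_):
    env = TfEnv(gym.make("CartPole-v0"))
    policy = CategoricalMLPPolicy(name="policy",
                                  env_spec=env.spec,
                                  hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=4000,
                max_path_length=200,
                n_itr=120,
                discount=0.99,
                max_kl_step=0.01)
    algo.train()


run_experiment(run_task, n_parallel=1, snapshot_mode="last", seed=1)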
Example #20
from garage.baselines import LinearFeatureBaseline
from garage.envs import normalize
from garage.misc.instrument import run_experiment
from garage.tf.algos import TRPO
from garage.tf.envs import TfEnv
from garage.tf.policies import GaussianMLPPolicy

from sandbox.embed2learn.envs.mujoco import PR2ArmEnv

env = TfEnv(normalize(PR2ArmEnv()))

policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    hidden_sizes=(32, 32),
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=100,
    discount=0.99,
    step_size=0.01,
    plot=True,
)
algo.train()