def test_param_space_noise(self):
        env = TfEnv(normalize(PointEnvMAML()))
        obs = env.reset()

        policy = MAMLImprovedGaussianMLPPolicy(name="policy33",
                                               env_spec=env.spec,
                                               hidden_sizes=(16, 16),
                                               hidden_nonlinearity=tf.nn.tanh,
                                               param_noise_std=0.0)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        import rllab.misc.logger as logger

        logger.set_snapshot_dir('/tmp/')
        logger.set_snapshot_mode('last')

        algo = MAMLTRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=3,
            max_path_length=10,
            meta_batch_size=4,
            num_grad_updates=1,
            n_itr=1,
            discount=0.99,
            step_size=0.01,
        )
        algo.train()

        tf.reset_default_graph()
        pkl_file = os.path.join('/tmp/', 'params.pkl')
        with tf.Session() as sess:
            data = joblib.load(pkl_file)
            policy = data['policy']
            # with zero parameter-space noise, repeated queries return the same mean
            action_1 = policy.get_action(obs)[1]['mean']
            action_2 = policy.get_action(obs)[1]['mean']
            diff = np.sum((action_1 - action_2)**2)

            self.assertAlmostEqual(diff, 0.0)

            # a nonzero param_noise_std perturbs the parameters, so the means differ
            action_1 = policy.get_action(obs, param_noise_std=1.0)[1]['mean']
            action_2 = policy.get_action(obs, param_noise_std=1.0)[1]['mean']
            diff = np.sum((action_1 - action_2)**2)

            self.assertGreaterEqual(diff, 0.1)

            # the same holds when the noise level is set as a policy attribute
            policy.param_noise_std = 1.0
            action_1 = policy.get_action(obs)[1]['mean']
            action_2 = policy.get_action(obs)[1]['mean']
            diff = np.sum((action_1 - action_2)**2)
            self.assertGreaterEqual(diff, 0.1)

    def test_serialization(self):

        env = TfEnv(normalize(PointEnvMAML()))
        obs = env.reset()

        policy = MAMLImprovedGaussianMLPPolicy(
            name="policy56",
            env_spec=env.spec,
            hidden_sizes=(16, 16),
            hidden_nonlinearity=tf.nn.tanh,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        import rllab.misc.logger as logger

        logger.set_snapshot_dir('/tmp/')
        logger.set_snapshot_mode('last')

        algo = MAMLTRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=2,
            max_path_length=10,
            meta_batch_size=4,
            num_grad_updates=1,
            n_itr=1,
            discount=0.99,
            step_size=0.01,
        )
        algo.train()

        tf.reset_default_graph()
        pkl_file = os.path.join('/tmp/', 'params.pkl')
        with tf.Session() as sess:
            data = joblib.load(pkl_file)
            policy = data['policy']
            action_before = policy.get_action(obs)[1]['mean']

            dump_string = pickle.dumps(policy)

        tf.reset_default_graph()
        with tf.Session() as sess:
            policy_loaded = pickle.loads(dump_string)
            action_after = policy_loaded.get_action(obs)[1]['mean']

        # the action mean must be preserved across a pickle round-trip
        diff = np.sum(np.abs(action_before - action_after))
        self.assertAlmostEqual(diff, 0.0, places=3)

    def test_get_mean_stepsize(self):

        env = TfEnv(normalize(PointEnvMAML()))
        obs = env.reset()

        policy = MAMLImprovedGaussianMLPPolicy(name="policy2",
                                               env_spec=env.spec,
                                               hidden_sizes=(16, 16),
                                               hidden_nonlinearity=tf.nn.tanh,
                                               trainable_step_size=True,
                                               grad_step_size=0.7)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            mean_stepsize_1 = policy.get_mean_step_size()

        # the mean learned step size must equal the configured grad_step_size
        self.assertAlmostEqual(mean_stepsize_1, 0.7, places=5)
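
# The tests above rely on the rllab-style get_action contract: the policy returns a
# (sampled_action, agent_info) pair whose agent_info dict carries the Gaussian's
# 'mean' and 'log_std'. A minimal sketch of that usage (the helper name here is
# illustrative, not part of the original code):
def _query_action_stats(policy, obs):
    action, agent_info = policy.get_action(obs)
    return action, agent_info['mean'], agent_info['log_std']
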
exp_names = [gen_name + name for name in names]

all_avg_returns = []
for step_i, initial_params_file in zip(range(len(step_sizes)),
                                       initial_params_files):
    avg_returns = []
    for goal in goals:
        goal = list(goal)

        if initial_params_file is not None and 'oracle' in initial_params_file:
            env = normalize(PointEnvRandGoalOracle(goal=goal))
            n_itr = 1
        else:
            env = normalize(PointEnvRandGoal(goal=goal))
            n_itr = 5
        env = TfEnv(env)
        policy = GaussianMLPPolicy(  # random policy
            name='policy',
            env_spec=env.spec,
            hidden_sizes=(100, 100),
        )

        # when restoring from a params file, the policy is passed to VPG via
        # load_policy below rather than using the freshly initialized one
        if initial_params_file is not None:
            policy = None

        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = VPG(
            env=env,
            policy=policy,
            load_policy=initial_params_file,
            baseline=baseline,
            # the remaining arguments are cut off in this excerpt; the values
            # below are illustrative placeholders
            batch_size=4000,
            max_path_length=100,
            n_itr=n_itr,
        )
        algo.train()
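
        # A hedged sketch of how the per-goal evaluation typically continues from
        # here: roll out the fine-tuned policy and record its average return. The
        # rollout helper and the number of evaluation paths are assumptions, not
        # taken from the original script.
        import numpy as np
        from rllab.sampler.utils import rollout
        paths = [rollout(env, algo.policy, max_path_length=100) for _ in range(10)]
        avg_returns.append(np.mean([np.sum(path['rewards']) for path in paths]))
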
from sandbox_maml.rocky.tf.algos.trpo import TRPO
from rllab_maml.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab_maml.envs.box2d.cartpole_env import CartpoleEnv
from rllab_maml.envs.normalized_env import normalize
from sandbox_maml.rocky.tf.policies.gaussian_gru_policy import GaussianGRUPolicy
from sandbox_maml.rocky.tf.policies.gaussian_lstm_policy import GaussianLSTMPolicy
from sandbox_maml.rocky.tf.envs.base import TfEnv
import sandbox_maml.rocky.tf.core.layers as L
from sandbox_maml.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer, FiniteDifferenceHvp
from rllab_maml.misc.instrument import stub, run_experiment_lite

stub(globals())

env = TfEnv(normalize(CartpoleEnv()))

policy = GaussianLSTMPolicy(
    name="policy",
    env_spec=env.spec,
    lstm_layer_cls=L.TfBasicLSTMLayer,
    # gru_layer_cls=L.GRULayer,
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=10,
    # the arguments below are cut off in this excerpt and are illustrative; the
    # optimizer mirrors the otherwise unused imports above
    discount=0.99,
    step_size=0.01,
    optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)),
)

# with stub(globals()) above, the stubbed algo.train() call is handed to
# run_experiment_lite; n_parallel and seed here are illustrative
run_experiment_lite(
    algo.train(),
    n_parallel=1,
    seed=1,
)

# should also code up alternative KL thing
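
# VG is not defined in these excerpts. In rllab-style launchers it is typically a
# VariantGenerator subclass whose @variant methods enumerate the hyperparameters
# read out of v[...] in the loops below. A hedged sketch; the import path and all
# values are assumptions, not taken from the original scripts.
from rllab_maml.misc.instrument import VariantGenerator, variant

class VG(VariantGenerator):
    @variant
    def fast_lr(self):
        return [0.1]

    @variant
    def meta_step_size(self):
        return [0.01]

    @variant
    def direc(self):
        return [True, False]

    @variant
    def task_var(self):
        return [0, 1, 2]
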

variants = VG().variants()

max_path_length = 200
num_grad_updates = 1
use_maml = True

for v in variants:
    direc = v['direc']
    learning_rate = v['meta_step_size']

    if direc:
        env = TfEnv(normalize(HalfCheetahEnvRandDirec()))
    else:
        env = TfEnv(normalize(HalfCheetahEnvRand()))
    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        grad_step_size=v['fast_lr'],
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=(100, 100),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = MAMLTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        # the remaining arguments are cut off in this excerpt; the values below
        # mirror the settings defined above and are otherwise illustrative
        batch_size=20,
        max_path_length=max_path_length,
        meta_batch_size=40,
        num_grad_updates=num_grad_updates,
        n_itr=500,
        step_size=learning_rate,
    )


# should also code up alternative KL thing

variants = VG().variants()

max_path_length = 200
num_grad_updates = 1
use_maml = True

for v in variants:
    task_var = v['task_var']

    if task_var == 0:
        env = TfEnv(normalize(AntEnvRandDirec()))
        task_var = 'direc'
    elif task_var == 1:
        env = TfEnv(normalize(AntEnvRand()))
        task_var = 'vel'
    elif task_var == 2:
        env = TfEnv(normalize(AntEnvRandGoal()))
        task_var = 'pos'
    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        grad_step_size=v['fast_lr'],
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=(100, 100),
    )
baselines = ['linear']
fast_batch_size = 20  # 10 works for [0.1, 0.2], 20 doesn't improve much for [0,0.2]
meta_batch_size = 40  # 10 also works, but much less stable, 20 is fairly stable, 40 is more stable
max_path_length = 100
num_grad_updates = 1
meta_step_size = 0.01

use_maml = True
interpreter_path = sys.executable

for fast_learning_rate in fast_learning_rates:
    for learning_rate in learning_rates:
        for bas in baselines:
            stub(globals())

            env = TfEnv(normalize(PointEnvRandGoal()))
            policy = MAMLGaussianMLPPolicy(
                name="policy",
                env_spec=env.spec,
                grad_step_size=fast_learning_rate,
                hidden_nonlinearity=tf.nn.relu,
                hidden_sizes=(100, 100),
            )
            if bas == 'zero':
                baseline = ZeroBaseline(env_spec=env.spec)
            elif 'linear' in bas:
                baseline = LinearFeatureBaseline(env_spec=env.spec)
            else:
                baseline = GaussianMLPBaseline(env_spec=env.spec)
            algo = MAMLTRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                # the remaining arguments are cut off in this excerpt; the values
                # below follow the settings defined above and are illustrative
                batch_size=fast_batch_size,
                max_path_length=max_path_length,
                meta_batch_size=meta_batch_size,
                num_grad_updates=num_grad_updates,
                n_itr=100,
                step_size=meta_step_size,
            )

fast_learning_rates = [0.1]
baselines = ['linear']
fast_batch_size = 20
meta_batch_size = 60
max_path_length = 10
num_grad_updates = 1
meta_step_size = 0.01

use_maml = True

for fast_learning_rate in fast_learning_rates:
    for bas in baselines:
        stub(globals())

        env = TfEnv(normalize(GridWorldEnvRand('four-state')))
        policy = MAMLCategoricalMLPPolicy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=fast_learning_rate,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
        )
        if bas == 'zero':
            baseline = ZeroBaseline(env_spec=env.spec)
        elif 'linear' in bas:
            baseline = LinearFeatureBaseline(env_spec=env.spec)
        else:
            baseline = GaussianMLPBaseline(env_spec=env.spec)
        algo = MAMLTRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            # the remaining arguments are cut off in this excerpt; the values
            # below follow the settings defined above and are illustrative
            batch_size=fast_batch_size,
            max_path_length=max_path_length,
            meta_batch_size=meta_batch_size,
            num_grad_updates=num_grad_updates,
            n_itr=100,
            step_size=meta_step_size,
        )
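
        # Because stub(globals()) is called above, rllab-style launchers hand the
        # stubbed algo.train() call to run_experiment_lite instead of invoking it
        # directly. A hedged sketch: the import mirrors the CartPole example
        # earlier, and the seed and exp_prefix values are illustrative.
        from rllab_maml.misc.instrument import run_experiment_lite
        run_experiment_lite(
            algo.train(),
            n_parallel=1,
            snapshot_mode="last",
            seed=1,
            exp_prefix='maml_trpo_gridworld',
        )
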
variants = VG().variants()

max_path_length = 200
num_grad_updates = 1
use_maml = True

for v in variants:
    task_var = v['task_var']
    oracle = v['oracle']

    if task_var == 0:
        task_var = 'direc'
        exp_prefix = 'bugfix_trpo_maml_antdirec' + str(max_path_length)
        if oracle:
            env = TfEnv(normalize(AntEnvDirecOracle()))
        else:
            env = TfEnv(normalize(AntEnvRandDirec()))
    elif task_var == 1:
        task_var = 'vel'
        exp_prefix = 'posticml_trpo_maml_ant' + str(max_path_length)
        if oracle:
            env = TfEnv(normalize(AntEnvOracle()))
        else:
            env = TfEnv(normalize(AntEnvRand()))
    elif task_var == 2:
        task_var = 'pos'
        exp_prefix = 'posticml_trpo_maml_antpos_' + str(max_path_length)
        if oracle:
            env = TfEnv(normalize(AntEnvRandGoalOracle()))
        else:
            env = TfEnv(normalize(AntEnvRandGoal()))  # non-oracle case, as in the earlier Ant example
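
    # The excerpt ends above. A hedged sketch of how such a launcher typically
    # continues, mirroring the policy/baseline/MAMLTRPO construction in the earlier
    # Ant example and in the unit test; every value below is illustrative, and the
    # variant keys 'fast_lr' and 'meta_step_size' are assumptions.
    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        grad_step_size=v['fast_lr'],
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=(100, 100),
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = MAMLTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=20,
        max_path_length=max_path_length,
        meta_batch_size=40,
        num_grad_updates=num_grad_updates,
        n_itr=500,
        step_size=v['meta_step_size'],
    )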