def run_task(variant):
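    """Train a Gaussian MLP policy on CartPole with VPG and a linear feature baseline."""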
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from sandbox.rocky.tf.algos.vpg import VPG
    from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
    from rllab.envs.box2d.cartpole_env import CartpoleEnv
    from sandbox.rocky.tf.envs.base import TfEnv

    env_name = variant['Environment']
    if env_name == 'Cartpole':
        env = TfEnv(CartpoleEnv())
    else:
        raise ValueError('Unsupported environment: %s' % env_name)
    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=(100, 100))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algorithm = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        n_itr=100,
        start_itr=0,
        batch_size=1000,
        max_path_length=1000,
        discount=0.99,
    )
    algorithm.train()
Example #2
def experiment(variant, saveDir):
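    """Fine-tune a policy loaded from `initial_params_file` on the Sawyer
    pick-and-place task with VPG, logging progress to `variant['saveDir']`."""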

    initial_params_file = variant['initial_params_file']

    goalIndex = variant['goalIndex']

    init_step_size = variant['init_step_size']

    baseEnv = SawyerPickPlace_finnMAMLEnv()
    env = TfEnv(NormalizedBoxEnv(baseEnv))
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = VPG(
        env=env,
        policy=None,
        load_policy=initial_params_file,
        baseline=baseline,
        batch_size=10000,  # 2x
        max_path_length=150,
        n_itr=10,
        reset_arg=goalIndex,
        optimizer_args={
            'init_learning_rate': init_step_size,
            'tf_optimizer_args': {
                'learning_rate': 0.5 * init_step_size
            },
            'tf_optimizer_cls': tf.train.GradientDescentOptimizer
        })

    import os

    saveDir = variant['saveDir']

    if not os.path.isdir(saveDir):
        os.mkdir(saveDir)

    logger.set_snapshot_dir(saveDir)
    logger.add_tabular_output(saveDir + 'progress.csv')

    algo.train()
Example #3
def opt_vpg(env,
            baseline,
            policy,
            learning_rate=1e-5,
            batch_size=4000,
            **kwargs):
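    """Build a VPG instance whose policy is optimised with Adam via a FirstOrderOptimizer."""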
    # no idea what batch size, learning rate, etc. should be
    optimiser = FirstOrderOptimizer(
        tf_optimizer_cls=tf.train.AdamOptimizer,
        tf_optimizer_args=dict(learning_rate=learning_rate),
        # batch_size actually gets passed to BatchPolopt (parent of VPG)
        # instead of TF optimiser (makes sense, I guess)
        batch_size=None,
        max_epochs=1)
    return VPG(env=env,
               policy=policy,
               baseline=baseline,
               n_itr=int(1e9),
               optimizer=optimiser,
               batch_size=batch_size,
               **kwargs)
Example #4
        policy = GaussianMLPPolicy(  # random policy
            name='policy',
            env_spec=env.spec,
            hidden_sizes=(100, 100),
        )


        if initial_params_file is not None:
            policy = None

        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = VPG(
            env=env,
            policy=policy,
            load_policy=initial_params_file,
            baseline=baseline,
            batch_size=4000,  # 2x
            max_path_length=100,
            n_itr=n_itr,
            optimizer_args={
                'init_learning_rate': step_sizes[step_i],
                'tf_optimizer_args': {
                    'learning_rate': 0.5 * step_sizes[step_i]
                },
                'tf_optimizer_cls': tf.train.GradientDescentOptimizer
            }
        )


        run_experiment_lite(
            algo.train(),
            # Number of parallel workers for sampling
            n_parallel=4,
            # Only keep the snapshot parameters for the last iteration
            snapshot_mode="last",
            # Specifies the seed for the experiment. If this is not provided, a random seed
            # will be used
            seed=4,
Example #5
from sandbox.rocky.tf.algos.vpg import VPG
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.envs.base import TfEnv
from rllab.misc.instrument import stub, run_experiment_lite
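# Train a two-hidden-layer Gaussian MLP policy on normalized CartPole with VPG.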

env = TfEnv(normalize(CartpoleEnv()))

policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = VPG(env=env,
           policy=policy,
           baseline=baseline,
           batch_size=10000,
           max_path_length=100,
           n_itr=4,
           discount=0.99,
           optimizer_args=dict(tf_optimizer_args=dict(learning_rate=0.01, )))
algo.train()
Example #6
def get_algo(env, policy, es, qf, baseline, max_path_length,
        batch_size, replay_pool_size, discount,
        scale_reward, learning_rate, replacement_prob,
        policy_updates_ratio,
        step_size, gae_lambda,
        sample_backups,
        qprop_min_itr,
        qf_updates_ratio,
        qprop_use_qf_baseline,
        qprop_eta_option,
        algo_name,
        qf_learning_rate,
        n_itr,
        **kwargs):
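    """Construct a DDPG, TRPO/Q-Prop, or VPG/Q-VPG algorithm from the given components."""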
    algo = None
    min_pool_size = 1000
    qf_batch_size = 64
    qf_baseline = None

    print('Creating algo=%s with n_itr=%d, max_path_length=%d...'%(
        algo_name, n_itr, max_path_length))

    if algo_name in ['ddpg']:
        algo = DDPG(
            env=env,
            policy=policy,
            es=es,
            qf=qf,
            batch_size=qf_batch_size,
            max_path_length=max_path_length,
            epoch_length=batch_size, # make comparable to batchopt methods
            min_pool_size=min_pool_size,
            replay_pool_size=replay_pool_size,
            n_epochs=n_itr,
            discount=discount,
            scale_reward=scale_reward,
            qf_learning_rate=qf_learning_rate,
            policy_learning_rate=learning_rate,
            replacement_prob=replacement_prob,
            policy_updates_ratio=policy_updates_ratio,
            # Uncomment both lines (this and the plot parameter below) to enable plotting
            # plot=True,
        )
    elif algo_name in ['trpo', 'qprop']:
        if qf is not None:
            qf_baseline = QfunctionBaseline(env_spec=env.spec,
                policy=policy, qf=qf)
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=batch_size,
            max_path_length=max_path_length,
            n_itr=n_itr,
            discount=discount,
            step_size=step_size,
            gae_lambda=gae_lambda,
            # Uncomment both lines (this and the plot parameter below) to enable plotting
            # plot=True,
            sample_backups=sample_backups,
            qf=qf,
            qf_batch_size=qf_batch_size,
            min_pool_size=min_pool_size,
            scale_reward=scale_reward,
            qprop_min_itr=qprop_min_itr,
            qf_updates_ratio=qf_updates_ratio,
            qprop_eta_option=qprop_eta_option,
            replay_pool_size=replay_pool_size,
            replacement_prob=replacement_prob,
            qf_baseline=qf_baseline,
            qf_learning_rate=qf_learning_rate,
            qprop_use_qf_baseline=qprop_use_qf_baseline,
        )
    elif algo_name in ['vpg', 'qvpg']:
        if qf is not None:
            qf_baseline = QfunctionBaseline(env_spec=env.spec,
                policy=policy, qf=qf)
        algo = VPG(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=batch_size,
            max_path_length=max_path_length,
            n_itr=n_itr,
            discount=discount,
            gae_lambda=gae_lambda,
            optimizer_args=dict(
                tf_optimizer_args=dict(
                    learning_rate=learning_rate,
                )
            ),
            qf=qf,
            qf_batch_size=qf_batch_size,
            min_pool_size=min_pool_size,
            scale_reward=scale_reward,
            qprop_min_itr=qprop_min_itr,
            qf_updates_ratio=qf_updates_ratio,
            qprop_eta_option=qprop_eta_option,
            replay_pool_size=replay_pool_size,
            qf_baseline=qf_baseline,
            qf_learning_rate=qf_learning_rate,
            qprop_use_qf_baseline=qprop_use_qf_baseline,
        )
    return algo
Example #7
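            # Fine-tune a loaded policy on the current goal with VPG, using a
            # vectorized sampler and optionally recording a video.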
            if initial_params_file is not None:
                policy = None
            make_video1 = goalnum in [0, 1, 2]
            baseline = LinearFeatureBaseline(env_spec=env.spec)
            algo = VPG(
                env=env,
                policy=policy,
                load_policy=initial_params_file,
                baseline=baseline,
                batch_size=2000,
                max_path_length=100,
                n_itr=n_itr,
                #step_size=10.0,
                sampler_cls=VectorizedSampler,  # added by RK 6/19
                sampler_args=dict(n_envs=1),
                reset_arg=goal,
                optimizer=None,
                optimizer_args={
                    'init_learning_rate': step_size,
                    'tf_optimizer_args': {
                        'learning_rate': 0.5 * step_size
                    },
                    'tf_optimizer_cls': tf.train.GradientDescentOptimizer
                },
                # extra_input="onehot_exploration", # added by RK 6/19
                # extra_input_dim=5, # added by RK 6/19
                make_video=make_video1)
            exp_name = 'mamlil' + desc + str(run_id) + "_n_itr" + str(
                n_itr) + "_goal" + str(goalnum)

            run_experiment_lite(
Example #8
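# For each bandit goal, fine-tune a policy loaded from args.path with VPG.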
for goal in goals:
    goal = list(goal)

    env = TfEnv(normalize(RandomBanditEnv(k=args.k, n=args.n, goal=goal)))
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    if not args.path:
        raise Exception("Please enter a valid path for the parameter file")

    algo = VPG(env=env,
               policy=None,
               load_policy=args.path,
               baseline=baseline,
               batch_size=args.n * args.num_goals,
               max_path_length=args.n,
               n_itr=n_itr,
               optimizer_args={
                   'init_learning_rate': args.step_size,
                   'tf_optimizer_args': {
                       'learning_rate': 0.5 * args.step_size
                   },
                   'tf_optimizer_cls': tf.train.GradientDescentOptimizer
               })

    run_experiment_lite(
        algo.train(),
        # Number of parallel workers for sampling
        n_parallel=4,
        snapshot_mode="last",
        seed=4,
        exp_prefix='trpobandit_test',
        exp_name='test',
Example #9
def get_algo(env, policy, es, qf, baseline, max_path_length, batch_size,
             replay_pool_size, discount, scale_reward, learning_rate,
             replacement_prob, policy_updates_ratio, step_size, gae_lambda,
             sample_backups, kl_sample_backups, qprop_eta_option, qprop_unbias,
             qprop_nu, algo_name, n_itr, recurrent, updates_ratio,
             policy_use_target, policy_batch_size, policy_sample_last,
             ac_delta, ac_sample_backups, save_freq, restore_auto,
             qf_learning_rate, qf_use_target, qf_mc_ratio, qf_batch_size,
             qf_residual_phi, **kwargs):
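    """Construct the requested algorithm (a DDPG variant, a TRPO/Q-Prop variant,
    or VPG/Q-VPG) from the given components and hyperparameters."""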
    algo = None
    algo_class = None
    min_pool_size = 1000
    qf_baseline = None
    extra_kwargs = dict()

    print('Creating algo=%s with n_itr=%d, max_path_length=%d...' %
          (algo_name, n_itr, max_path_length))
    if algo_name in [
            'ddpg',
            'dspg',
            'dspgoff',
            'dqn',
            'dsqn',
            'trpg',
            'trpgoff',
    ]:
        if algo_name in [
                'trpg',
        ]:
            extra_kwargs['policy_update_method'] = 'cg'
        algo = DDPG(
            env=env,
            policy=policy,
            policy_use_target=policy_use_target,
            es=es,
            qf=qf,
            qf_use_target=qf_use_target,
            policy_batch_size=policy_batch_size,
            qf_batch_size=qf_batch_size,
            qf_mc_ratio=qf_mc_ratio,
            qf_residual_phi=qf_residual_phi,
            max_path_length=max_path_length,
            epoch_length=batch_size,  # make comparable to batchopt methods
            min_pool_size=min_pool_size,
            replay_pool_size=replay_pool_size,
            n_epochs=n_itr,
            discount=discount,
            scale_reward=scale_reward,
            qf_learning_rate=qf_learning_rate,
            policy_learning_rate=learning_rate,
            policy_step_size=step_size,
            policy_sample_last=policy_sample_last,
            replacement_prob=replacement_prob,
            policy_updates_ratio=policy_updates_ratio,
            updates_ratio=updates_ratio,
            save_freq=save_freq,
            restore_auto=restore_auto,
            **extra_kwargs,
        )
        algo_class = 'DDPG'
    elif algo_name in [
            'trpo',
            'nuqprop',
            'nuqfqprop',
            'actrpo',
            'acqftrpo',
            'qprop',
            'mqprop',
            'qfqprop',
            'nafqprop',
    ]:
        if recurrent:
            extra_kwargs['optimizer'] = \
                ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
        if algo_name in [
                'actrpo',
                'acqftrpo',
        ]:
            extra_kwargs['ac_delta'] = ac_delta
            extra_kwargs['qprop'] = False  # disable qprop
            if ac_delta == 0: qf = None
        if algo_name in [
                'mqprop',
        ]:
            extra_kwargs['mqprop'] = True
        if algo_name in [
                'nuqprop',
                'nuqfqprop',
        ]:
            extra_kwargs['qprop_nu'] = qprop_nu
        if qf is not None:
            qf_baseline = QfunctionBaseline(env_spec=env.spec,
                                            policy=policy,
                                            qf=qf)
        algo = TRPO(env=env,
                    policy=policy,
                    baseline=baseline,
                    batch_size=batch_size,
                    max_path_length=max_path_length,
                    n_itr=n_itr,
                    discount=discount,
                    step_size=step_size,
                    gae_lambda=gae_lambda,
                    sample_backups=sample_backups,
                    kl_sample_backups=kl_sample_backups,
                    qf=qf,
                    qf_use_target=qf_use_target,
                    qf_batch_size=qf_batch_size,
                    qf_mc_ratio=qf_mc_ratio,
                    qf_residual_phi=qf_residual_phi,
                    min_pool_size=min_pool_size,
                    scale_reward=scale_reward,
                    qf_updates_ratio=updates_ratio,
                    qprop_eta_option=qprop_eta_option,
                    qprop_unbias=qprop_unbias,
                    replay_pool_size=replay_pool_size,
                    replacement_prob=replacement_prob,
                    qf_baseline=qf_baseline,
                    qf_learning_rate=qf_learning_rate,
                    ac_sample_backups=ac_sample_backups,
                    policy_sample_last=policy_sample_last,
                    save_freq=save_freq,
                    restore_auto=restore_auto,
                    **extra_kwargs)
        algo_class = 'TRPO'
    elif algo_name in [
            'vpg',
            'qvpg',
    ]:
        if qf is not None:
            qf_baseline = QfunctionBaseline(env_spec=env.spec,
                                            policy=policy,
                                            qf=qf)
        algo = VPG(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=batch_size,
            max_path_length=max_path_length,
            n_itr=n_itr,
            discount=discount,
            gae_lambda=gae_lambda,
            optimizer_args=dict(
                tf_optimizer_args=dict(learning_rate=learning_rate, )),
            qf=qf,
            qf_use_target=qf_use_target,
            qf_batch_size=qf_batch_size,
            qf_mc_ratio=qf_mc_ratio,
            qf_residual_phi=qf_residual_phi,
            min_pool_size=min_pool_size,
            scale_reward=scale_reward,
            qf_updates_ratio=updates_ratio,
            qprop_eta_option=qprop_eta_option,
            qprop_unbias=qprop_unbias,
            replay_pool_size=replay_pool_size,
            qf_baseline=qf_baseline,
            qf_learning_rate=qf_learning_rate,
            save_freq=save_freq,
            restore_auto=restore_auto,
        )
        algo_class = 'VPG'
    print('[get_algo] Instantiating %s.' % algo_class)
    return algo
Example #10
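# Fine-tune a loaded policy on Sawyer pick goals 10-19 with VPG, one experiment per goal.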
for goal in range(10, 20):

    stub(globals())
    env = TfEnv(SawyerPickEnv(goal_idx=goal))

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = VPG(
        env=env,
        policy=None,
        load_policy=initial_params_file,
        baseline=baseline,
        batch_size=10000,  # 2x
        max_path_length=100,
        n_itr=10,
        optimizer_args={
            'init_learning_rate': 0.1,
            'tf_optimizer_args': {
                'learning_rate': 0.01
            },
            'tf_optimizer_cls': tf.train.GradientDescentOptimizer
        },
        reset_arg=goal,
    )

    run_experiment_lite(
        algo.train(),
        # Number of parallel workers for sampling
        n_parallel=4,
        # Keep the snapshot parameters for every iteration
        snapshot_mode="all",
Example #11
from sandbox.rocky.tf.envs.base import TfEnv

stub(globals())
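# Train a Gaussian MLP policy on the random-goal point environment with VPG and a zero baseline.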

#env = TfEnv(normalize(PointEnv()))
env = TfEnv(normalize(PointEnvRandGoal()))
policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
)
#baseline = LinearFeatureBaseline(env_spec=env.spec)
baseline = ZeroBaseline(env_spec=env.spec)
algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    #batch_size=20,
    max_path_length=5,
    n_itr=100,
    #plot=True,
)
run_experiment_lite(
    algo.train(),
    n_parallel=1,
    snapshot_mode="last",
    seed=1,
    exp_prefix='deleteme',
    exp_name='deleteme',
    #plot=True,
)
Example #12
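# Same point-environment setup, but on the fixed-goal PointEnv with a linear feature baseline.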
from examples.point_env import PointEnv  # assumed module path; matches the usage below
from examples.point_env_randgoal import PointEnvRandGoal
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import stub, run_experiment_lite
#from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
#from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.policies.minimal_gauss_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.envs.base import TfEnv

stub(globals())

env = TfEnv(normalize(PointEnv()))
#env = TfEnv(normalize(PointEnvRandGoal()))
policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
)
baseline = LinearFeatureBaseline(env_spec=env.spec)
algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    #plot=True,
)
run_experiment_lite(
    algo.train(),
    n_parallel=1,
    snapshot_mode="last",
    seed=1,
    #plot=True,
)
Example #13
def experiment(variant):
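    """Fine-tune a policy loaded from `initial_params_file` on a Sawyer push or
    pick-and-place task drawn from a pickled goal file, using VPG."""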

    seed = variant['seed']
    tf.set_random_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    initial_params_file = variant['initial_params_file']
    goalIndex = variant['goalIndex']
    init_step_size = variant['init_step_size']

    regionSize = variant['regionSize']

    mode = variant['mode']

    if 'docker' in mode:
        taskFilePrefix = '/root/code'
    else:
        taskFilePrefix = '/home/russellm'

    if variant['valRegionSize'] is not None:
        valRegionSize = variant['valRegionSize']

        tasksFile = taskFilePrefix + '/multiworld/multiworld/envs/goals/pickPlace_' + valRegionSize + '_val.pkl'

    else:
        tasksFile = taskFilePrefix + '/multiworld/multiworld/envs/goals/pickPlace_' + regionSize + '.pkl'

    tasks = pickle.load(open(tasksFile, 'rb'))

    envType = variant['envType']
    if envType == 'Push':
        baseEnv = SawyerPushEnv(tasks=tasks)
    else:
        assert (envType) == 'PickPlace'
        baseEnv = SawyerPickPlaceEnv(tasks=tasks)

    env = FinnMamlEnv(
        FlatGoalEnv(baseEnv,
                    obs_keys=['state_observation', 'state_desired_goal']))
    env = TfEnv(NormalizedBoxEnv(env))
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = VPG(
        env=env,
        policy=None,
        load_policy=initial_params_file,
        baseline=baseline,
        batch_size=7500,  # 2x
        max_path_length=150,
        n_itr=10,
        reset_arg=goalIndex,
        optimizer_args={
            'init_learning_rate': init_step_size,
            'tf_optimizer_args': {
                'learning_rate': 0.1 * init_step_size
            },
            'tf_optimizer_cls': tf.train.GradientDescentOptimizer
        })
    import os
    saveDir = variant['saveDir']
    currPath = ''
    for _dir in saveDir.split('/'):
        currPath += _dir + '/'
        if not os.path.isdir(currPath):
            os.mkdir(currPath)

    logger.set_snapshot_dir(saveDir)
    logger.add_tabular_output(saveDir + 'progress.csv')
    algo.train()
Example #14
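# Train a two-hidden-layer Gaussian MLP policy on normalized CartPole for 40 VPG iterations.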
from sandbox.rocky.tf.algos.vpg import VPG
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.envs.base import TfEnv
from rllab.misc.instrument import stub, run_experiment_lite

env = TfEnv(normalize(CartpoleEnv()))

policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32)
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=10000,
    max_path_length=100,
    n_itr=40,
    discount=0.99,
    optimizer_args=dict(
        tf_optimizer_args=dict(
            learning_rate=0.01,
        )
    )
)
algo.train()
Example #15
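# Fragment: the environment (SwimmerEnv in the branch shown) and batch size are chosen
# by a condition whose first branch is cut off; VPG then trains a (100, 100) policy.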
    batch_size = 200
else:
    env = TfEnv(normalize(SwimmerEnv()))
    batch_size = 20
policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    hidden_sizes=(100,100),
)
baseline = LinearFeatureBaseline(env_spec=env.spec)
#baseline = ZeroBaseline(env_spec=env.spec)
algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=500*batch_size,
    max_path_length=500,
    n_itr=500,
    #plot=True,
    optimizer_args={'tf_optimizer_args':{'learning_rate': 1e-3}},
)
run_experiment_lite(
    algo.train(),
    n_parallel=1,  # try increasing this to make it faster??? (Maybe need to modify code for this)
    snapshot_mode="last",
    seed=1,
    exp_prefix='vpgswimmer',
    #exp_name='basic',
    exp_name='randomenv',
    #plot=True,
)