def run_task(variant):
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from sandbox.rocky.tf.algos.vpg import VPG
    from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
    from rllab.envs.box2d.cartpole_env import CartpoleEnv
    from sandbox.rocky.tf.envs.base import TfEnv

    env_name = variant['Environment']
    if env_name == 'Cartpole':
        env = TfEnv(CartpoleEnv())

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=(100, 100),
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algorithm = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        n_itr=100,
        start_itr=0,
        batch_size=1000,
        max_path_length=1000,
        discount=0.99,
    )
    algorithm.train()
def experiment(variant, saveDir):
    initial_params_file = variant['initial_params_file']
    goalIndex = variant['goalIndex']
    init_step_size = variant['init_step_size']

    baseEnv = SawyerPickPlace_finnMAMLEnv()
    env = TfEnv(NormalizedBoxEnv(baseEnv))
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = VPG(
        env=env,
        policy=None,
        load_policy=initial_params_file,
        baseline=baseline,
        batch_size=10000,  # 2x
        max_path_length=150,
        n_itr=10,
        reset_arg=goalIndex,
        optimizer_args={
            'init_learning_rate': init_step_size,
            'tf_optimizer_args': {
                'learning_rate': 0.5 * init_step_size
            },
            'tf_optimizer_cls': tf.train.GradientDescentOptimizer
        })

    import os
    saveDir = variant['saveDir']
    if not os.path.isdir(saveDir):
        os.mkdir(saveDir)

    logger.set_snapshot_dir(saveDir)
    logger.add_tabular_output(saveDir + 'progress.csv')
    algo.train()
def opt_vpg(env, baseline, policy, learning_rate=1e-5, batch_size=4000,
            **kwargs):
    # Reasonable defaults for batch size, learning rate, etc. are unclear here.
    optimiser = FirstOrderOptimizer(
        tf_optimizer_cls=tf.train.AdamOptimizer,
        tf_optimizer_args=dict(learning_rate=learning_rate),
        # batch_size actually gets passed to BatchPolopt (the parent of VPG)
        # rather than to the TF optimiser.
        batch_size=None,
        max_epochs=1)
    return VPG(env=env,
               policy=policy,
               baseline=baseline,
               n_itr=int(1e9),
               optimizer=optimiser,
               batch_size=batch_size,
               **kwargs)
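A hedged usage sketch for the factory above, assuming env, policy, and baseline objects built as in the surrounding snippets; the extra keyword arguments are forwarded to VPG, and the learning rate, max_path_length, and discount values are illustrative, not defaults from the source:

env = TfEnv(normalize(CartpoleEnv()))
policy = GaussianMLPPolicy(name="policy", env_spec=env.spec, hidden_sizes=(32, 32))
baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = opt_vpg(env, baseline, policy,
               learning_rate=1e-4,   # overrides the 1e-5 default
               batch_size=4000,      # routed to BatchPolopt, not the TF optimiser
               max_path_length=100,  # forwarded to VPG via **kwargs
               discount=0.99)        # forwarded to VPG via **kwargs
algo.train()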
policy = GaussianMLPPolicy(  # random policy
    name='policy',
    env_spec=env.spec,
    hidden_sizes=(100, 100),
)
if initial_params_file is not None:
    policy = None

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = VPG(
    env=env,
    policy=policy,
    load_policy=initial_params_file,
    baseline=baseline,
    batch_size=4000,  # 2x
    max_path_length=100,
    n_itr=n_itr,
    optimizer_args={'init_learning_rate': step_sizes[step_i],
                    'tf_optimizer_args': {'learning_rate': 0.5 * step_sizes[step_i]},
                    'tf_optimizer_cls': tf.train.GradientDescentOptimizer}
)

run_experiment_lite(
    algo.train(),
    # Number of parallel workers for sampling
    n_parallel=4,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    seed=4,
from sandbox.rocky.tf.algos.vpg import VPG
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.envs.base import TfEnv
from rllab.misc.instrument import stub, run_experiment_lite

env = TfEnv(normalize(CartpoleEnv()))

policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=10000,
    max_path_length=100,
    n_itr=4,
    discount=0.99,
    optimizer_args=dict(tf_optimizer_args=dict(learning_rate=0.01, )))
algo.train()
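The script above imports stub and run_experiment_lite but then calls algo.train() directly. As a hedged variant that mirrors the launcher pattern used in the other snippets in this listing, the same experiment can be dispatched through run_experiment_lite by calling stub(globals()) right after the imports and replacing the final algo.train() line with the call below (exp_prefix and exp_name are illustrative names):

run_experiment_lite(
    algo.train(),
    n_parallel=1,
    snapshot_mode="last",
    seed=1,
    exp_prefix='vpg_cartpole',
    exp_name='vpg_cartpole',
)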
def get_algo(env, policy, es, qf, baseline, max_path_length,
             batch_size, replay_pool_size, discount,
             scale_reward, learning_rate, replacement_prob,
             policy_updates_ratio,
             step_size, gae_lambda, sample_backups,
             qprop_min_itr, qf_updates_ratio,
             qprop_use_qf_baseline, qprop_eta_option,
             algo_name, qf_learning_rate,
             n_itr, **kwargs):
    algo = None
    min_pool_size = 1000
    qf_batch_size = 64
    qf_baseline = None

    print('Creating algo=%s with n_itr=%d, max_path_length=%d...' % (
        algo_name, n_itr, max_path_length))

    if algo_name in ['ddpg']:
        algo = DDPG(
            env=env,
            policy=policy,
            es=es,
            qf=qf,
            batch_size=qf_batch_size,
            max_path_length=max_path_length,
            epoch_length=batch_size,  # make comparable to batchopt methods
            min_pool_size=min_pool_size,
            replay_pool_size=replay_pool_size,
            n_epochs=n_itr,
            discount=discount,
            scale_reward=scale_reward,
            qf_learning_rate=qf_learning_rate,
            policy_learning_rate=learning_rate,
            replacement_prob=replacement_prob,
            policy_updates_ratio=policy_updates_ratio,
            # Uncomment both lines (this and the plot parameter below) to enable plotting
            # plot=True,
        )
    elif algo_name in ['trpo', 'qprop']:
        if qf is not None:
            qf_baseline = QfunctionBaseline(env_spec=env.spec, policy=policy, qf=qf)
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=batch_size,
            max_path_length=max_path_length,
            n_itr=n_itr,
            discount=discount,
            step_size=step_size,
            gae_lambda=gae_lambda,
            # Uncomment both lines (this and the plot parameter below) to enable plotting
            # plot=True,
            sample_backups=sample_backups,
            qf=qf,
            qf_batch_size=qf_batch_size,
            min_pool_size=min_pool_size,
            scale_reward=scale_reward,
            qprop_min_itr=qprop_min_itr,
            qf_updates_ratio=qf_updates_ratio,
            qprop_eta_option=qprop_eta_option,
            replay_pool_size=replay_pool_size,
            replacement_prob=replacement_prob,
            qf_baseline=qf_baseline,
            qf_learning_rate=qf_learning_rate,
            qprop_use_qf_baseline=qprop_use_qf_baseline,
        )
    elif algo_name in ['vpg', 'qvpg']:
        if qf is not None:
            qf_baseline = QfunctionBaseline(env_spec=env.spec, policy=policy, qf=qf)
        algo = VPG(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=batch_size,
            max_path_length=max_path_length,
            n_itr=n_itr,
            discount=discount,
            gae_lambda=gae_lambda,
            optimizer_args=dict(
                tf_optimizer_args=dict(
                    learning_rate=learning_rate,
                )
            ),
            qf=qf,
            qf_batch_size=qf_batch_size,
            min_pool_size=min_pool_size,
            scale_reward=scale_reward,
            qprop_min_itr=qprop_min_itr,
            qf_updates_ratio=qf_updates_ratio,
            qprop_eta_option=qprop_eta_option,
            replay_pool_size=replay_pool_size,
            qf_baseline=qf_baseline,
            qf_learning_rate=qf_learning_rate,
            qprop_use_qf_baseline=qprop_use_qf_baseline,
        )
    return algo
if initial_params_file is not None:
    policy = None
make_video1 = goalnum in [0, 1, 2]
baseline = LinearFeatureBaseline(env_spec=env.spec)
algo = VPG(
    env=env,
    policy=policy,
    load_policy=initial_params_file,
    baseline=baseline,
    batch_size=2000,
    max_path_length=100,
    n_itr=n_itr,
    #step_size=10.0,
    sampler_cls=VectorizedSampler,  # added by RK 6/19
    sampler_args=dict(n_envs=1),
    reset_arg=goal,
    optimizer=None,
    optimizer_args={
        'init_learning_rate': step_size,
        'tf_optimizer_args': {
            'learning_rate': 0.5 * step_size
        },
        'tf_optimizer_cls': tf.train.GradientDescentOptimizer
    },
    # extra_input="onehot_exploration",  # added by RK 6/19
    # extra_input_dim=5,  # added by RK 6/19
    make_video=make_video1)
exp_name = 'mamlil' + desc + str(run_id) + "_n_itr" + str(
    n_itr) + "_goal" + str(goalnum)
run_experiment_lite(
for goal in goals:
    goal = list(goal)
    env = TfEnv(normalize(RandomBanditEnv(k=args.k, n=args.n, goal=goal)))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    if not args.path:
        raise Exception("Please enter a valid path for the parameter file")
    algo = VPG(env=env,
               policy=None,
               load_policy=args.path,
               baseline=baseline,
               batch_size=args.n * args.num_goals,
               max_path_length=args.n,
               n_itr=n_itr,
               optimizer_args={
                   'init_learning_rate': args.step_size,
                   'tf_optimizer_args': {
                       'learning_rate': 0.5 * args.step_size
                   },
                   'tf_optimizer_cls': tf.train.GradientDescentOptimizer
               })
    run_experiment_lite(
        algo.train(),
        # Number of parallel workers for sampling
        n_parallel=4,
        snapshot_mode="last",
        seed=4,
        exp_prefix='trpobandit_test',
        exp_name='test',
def get_algo(env, policy, es, qf, baseline, max_path_length,
             batch_size, replay_pool_size, discount,
             scale_reward, learning_rate, replacement_prob,
             policy_updates_ratio,
             step_size, gae_lambda,
             sample_backups, kl_sample_backups,
             qprop_eta_option, qprop_unbias, qprop_nu,
             algo_name, n_itr, recurrent, updates_ratio,
             policy_use_target, policy_batch_size, policy_sample_last,
             ac_delta, ac_sample_backups,
             save_freq, restore_auto,
             qf_learning_rate, qf_use_target,
             qf_mc_ratio, qf_batch_size, qf_residual_phi,
             **kwargs):
    algo = None
    algo_class = None
    min_pool_size = 1000
    qf_baseline = None
    extra_kwargs = dict()

    print('Creating algo=%s with n_itr=%d, max_path_length=%d...' %
          (algo_name, n_itr, max_path_length))

    if algo_name in ['ddpg', 'dspg', 'dspgoff', 'dqn', 'dsqn', 'trpg', 'trpgoff']:
        if algo_name in ['trpg']:
            extra_kwargs['policy_update_method'] = 'cg'
        algo = DDPG(
            env=env,
            policy=policy,
            policy_use_target=policy_use_target,
            es=es,
            qf=qf,
            qf_use_target=qf_use_target,
            policy_batch_size=policy_batch_size,
            qf_batch_size=qf_batch_size,
            qf_mc_ratio=qf_mc_ratio,
            qf_residual_phi=qf_residual_phi,
            max_path_length=max_path_length,
            epoch_length=batch_size,  # make comparable to batchopt methods
            min_pool_size=min_pool_size,
            replay_pool_size=replay_pool_size,
            n_epochs=n_itr,
            discount=discount,
            scale_reward=scale_reward,
            qf_learning_rate=qf_learning_rate,
            policy_learning_rate=learning_rate,
            policy_step_size=step_size,
            policy_sample_last=policy_sample_last,
            replacement_prob=replacement_prob,
            policy_updates_ratio=policy_updates_ratio,
            updates_ratio=updates_ratio,
            save_freq=save_freq,
            restore_auto=restore_auto,
            **extra_kwargs,
        )
        algo_class = 'DDPG'
    elif algo_name in ['trpo', 'nuqprop', 'nuqfqprop', 'actrpo', 'acqftrpo',
                       'qprop', 'mqprop', 'qfqprop', 'nafqprop']:
        if recurrent:
            extra_kwargs['optimizer'] = \
                ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
        if algo_name in ['actrpo', 'acqftrpo']:
            extra_kwargs['ac_delta'] = ac_delta
            extra_kwargs['qprop'] = False  # disable qprop
            if ac_delta == 0:
                qf = None
        if algo_name in ['mqprop']:
            extra_kwargs['mqprop'] = True
        if algo_name in ['nuqprop', 'nuqfqprop']:
            extra_kwargs['qprop_nu'] = qprop_nu
        if qf is not None:
            qf_baseline = QfunctionBaseline(env_spec=env.spec, policy=policy, qf=qf)
        algo = TRPO(env=env,
                    policy=policy,
                    baseline=baseline,
                    batch_size=batch_size,
                    max_path_length=max_path_length,
                    n_itr=n_itr,
                    discount=discount,
                    step_size=step_size,
                    gae_lambda=gae_lambda,
                    sample_backups=sample_backups,
                    kl_sample_backups=kl_sample_backups,
                    qf=qf,
                    qf_use_target=qf_use_target,
                    qf_batch_size=qf_batch_size,
                    qf_mc_ratio=qf_mc_ratio,
                    qf_residual_phi=qf_residual_phi,
                    min_pool_size=min_pool_size,
                    scale_reward=scale_reward,
                    qf_updates_ratio=updates_ratio,
                    qprop_eta_option=qprop_eta_option,
                    qprop_unbias=qprop_unbias,
                    replay_pool_size=replay_pool_size,
                    replacement_prob=replacement_prob,
                    qf_baseline=qf_baseline,
                    qf_learning_rate=qf_learning_rate,
                    ac_sample_backups=ac_sample_backups,
                    policy_sample_last=policy_sample_last,
                    save_freq=save_freq,
                    restore_auto=restore_auto,
                    **extra_kwargs)
        algo_class = 'TRPO'
    elif algo_name in ['vpg', 'qvpg']:
        if qf is not None:
            qf_baseline = QfunctionBaseline(env_spec=env.spec, policy=policy, qf=qf)
        algo = VPG(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=batch_size,
            max_path_length=max_path_length,
            n_itr=n_itr,
            discount=discount,
            gae_lambda=gae_lambda,
            optimizer_args=dict(
                tf_optimizer_args=dict(learning_rate=learning_rate, )),
            qf=qf,
            qf_use_target=qf_use_target,
            qf_batch_size=qf_batch_size,
            qf_mc_ratio=qf_mc_ratio,
            qf_residual_phi=qf_residual_phi,
            min_pool_size=min_pool_size,
            scale_reward=scale_reward,
            qf_updates_ratio=updates_ratio,
            qprop_eta_option=qprop_eta_option,
            qprop_unbias=qprop_unbias,
            replay_pool_size=replay_pool_size,
            qf_baseline=qf_baseline,
            qf_learning_rate=qf_learning_rate,
            save_freq=save_freq,
            restore_auto=restore_auto,
        )
        algo_class = 'VPG'

    print('[get_algo] Instantiating %s.' % algo_class)
    return algo
for goal in range(10, 20):
    stub(globals())
    env = TfEnv(SawyerPickEnv(goal_idx=goal))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = VPG(
        env=env,
        policy=None,
        load_policy=initial_params_file,
        baseline=baseline,
        batch_size=10000,  # 2x
        max_path_length=100,
        n_itr=10,
        optimizer_args={
            'init_learning_rate': 0.1,
            'tf_optimizer_args': {
                'learning_rate': 0.01
            },
            'tf_optimizer_cls': tf.train.GradientDescentOptimizer
        },
        reset_arg=goal,
    )
    run_experiment_lite(
        algo.train(),
        # Number of parallel workers for sampling
        n_parallel=4,
        # Keep the snapshot parameters for every iteration
        snapshot_mode="all",
from sandbox.rocky.tf.envs.base import TfEnv

stub(globals())

#env = TfEnv(normalize(PointEnv()))
env = TfEnv(normalize(PointEnvRandGoal()))
policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
)
#baseline = LinearFeatureBaseline(env_spec=env.spec)
baseline = ZeroBaseline(env_spec=env.spec)
algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    #batch_size=20,
    max_path_length=5,
    n_itr=100,
    #plot=True,
)

run_experiment_lite(
    algo.train(),
    n_parallel=1,
    snapshot_mode="last",
    seed=1,
    exp_prefix='deleteme',
    exp_name='deleteme',
    #plot=True,
)
# PointEnv, LinearFeatureBaseline, and VPG are used below but were not imported
# in the original snippet; the import paths are assumed from the sibling examples.
from examples.point_env import PointEnv
from examples.point_env_randgoal import PointEnvRandGoal
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import stub, run_experiment_lite
from sandbox.rocky.tf.algos.vpg import VPG
#from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
#from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.policies.minimal_gauss_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.envs.base import TfEnv

stub(globals())

env = TfEnv(normalize(PointEnv()))
#env = TfEnv(normalize(PointEnvRandGoal()))
policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
)
baseline = LinearFeatureBaseline(env_spec=env.spec)
algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    #plot=True,
)

run_experiment_lite(
    algo.train(),
    n_parallel=1,
    snapshot_mode="last",
    seed=1,
    #plot=True,
)
def experiment(variant):
    seed = variant['seed']
    tf.set_random_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    initial_params_file = variant['initial_params_file']
    goalIndex = variant['goalIndex']
    init_step_size = variant['init_step_size']
    regionSize = variant['regionSize']
    mode = variant['mode']

    if 'docker' in mode:
        taskFilePrefix = '/root/code'
    else:
        taskFilePrefix = '/home/russellm'

    if variant['valRegionSize'] is not None:
        valRegionSize = variant['valRegionSize']
        tasksFile = taskFilePrefix + '/multiworld/multiworld/envs/goals/pickPlace_' + valRegionSize + '_val.pkl'
    else:
        tasksFile = taskFilePrefix + '/multiworld/multiworld/envs/goals/pickPlace_' + regionSize + '.pkl'

    tasks = pickle.load(open(tasksFile, 'rb'))
    envType = variant['envType']

    if envType == 'Push':
        baseEnv = SawyerPushEnv(tasks=tasks)
    else:
        assert envType == 'PickPlace'
        baseEnv = SawyerPickPlaceEnv(tasks=tasks)

    env = FinnMamlEnv(
        FlatGoalEnv(baseEnv, obs_keys=['state_observation', 'state_desired_goal']))
    env = TfEnv(NormalizedBoxEnv(env))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = VPG(
        env=env,
        policy=None,
        load_policy=initial_params_file,
        baseline=baseline,
        batch_size=7500,  # 2x
        max_path_length=150,
        n_itr=10,
        reset_arg=goalIndex,
        optimizer_args={
            'init_learning_rate': init_step_size,
            'tf_optimizer_args': {
                'learning_rate': 0.1 * init_step_size
            },
            'tf_optimizer_cls': tf.train.GradientDescentOptimizer
        })

    import os
    saveDir = variant['saveDir']
    currPath = ''
    for _dir in saveDir.split('/'):
        currPath += _dir + '/'
        if not os.path.isdir(currPath):
            os.mkdir(currPath)

    logger.set_snapshot_dir(saveDir)
    logger.add_tabular_output(saveDir + 'progress.csv')
    algo.train()
# The VPG, baseline, environment, and normalize imports were clipped from this
# snippet; they are restored here to match the identical cartpole example above.
from sandbox.rocky.tf.algos.vpg import VPG
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.envs.base import TfEnv
from rllab.misc.instrument import stub, run_experiment_lite

env = TfEnv(normalize(CartpoleEnv()))

policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32)
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=10000,
    max_path_length=100,
    n_itr=40,
    discount=0.99,
    optimizer_args=dict(
        tf_optimizer_args=dict(
            learning_rate=0.01,
        )
    )
)
algo.train()
    batch_size = 200
else:
    env = TfEnv(normalize(SwimmerEnv()))
    batch_size = 20

policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    hidden_sizes=(100, 100),
)
baseline = LinearFeatureBaseline(env_spec=env.spec)
#baseline = ZeroBaseline(env_spec=env.spec)

algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=500 * batch_size,
    max_path_length=500,
    n_itr=500,
    #plot=True,
    optimizer_args={'tf_optimizer_args': {'learning_rate': 1e-3}},
)

run_experiment_lite(
    algo.train(),
    n_parallel=1,  # try increasing this to make it faster??? (Maybe need to modify code for this)
    snapshot_mode="last",
    seed=1,
    exp_prefix='vpgswimmer',
    #exp_name='basic',
    exp_name='randomenv',
    #plot=True,
)