def run_train_task(vv):
    env = TfEnv(normalize(vv['env']()))

    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes'],
        grad_step_size=vv['fast_lr'],
        hidden_nonlinearity=vv['hidden_nonlinearity'],
        trainable_step_size=vv['trainable_step_size'],
        bias_transform=vv['bias_transform'])

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = MAMLTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=vv['fast_batch_size'],  # number of trajs for grad update
        max_path_length=vv['path_length'],
        meta_batch_size=vv['meta_batch_size'],
        num_grad_updates=vv['num_grad_updates'],
        n_itr=vv['n_itr'],
        discount=vv['discount'],
        step_size=vv["meta_step_size"],
        parallel_sampler=vv['parallel_sampler'],
    )
    algo.train()
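# A minimal sketch of how a `run_train_task` like the one above is typically
# launched, assuming rllab's VariantGenerator / run_experiment_lite pattern
# used throughout this codebase. The hyperparameter values below are
# illustrative placeholders, not the original experiment configuration.
import tensorflow as tf
from rllab.misc.instrument import VariantGenerator, run_experiment_lite

vg = VariantGenerator()
vg.add('env', [PointEnvMAML])          # env class, instantiated inside the task
vg.add('hidden_sizes', [(100, 100)])
vg.add('hidden_nonlinearity', [tf.nn.tanh])
vg.add('fast_lr', [0.1])               # inner-loop gradient step size
vg.add('trainable_step_size', [False])
vg.add('bias_transform', [False])
vg.add('fast_batch_size', [20])        # trajectories per inner grad update
vg.add('path_length', [100])
vg.add('meta_batch_size', [40])        # tasks per meta update
vg.add('num_grad_updates', [1])
vg.add('n_itr', [300])
vg.add('discount', [0.99])
vg.add('meta_step_size', [0.01])
vg.add('parallel_sampler', [True])
vg.add('seed', [1, 11, 21])

for v in vg.variants():
    run_experiment_lite(
        run_train_task,
        exp_prefix='maml_trpo_point',
        snapshot_mode='last',
        seed=v['seed'],
        variant=v,
    )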
def run_train_task(vv):
    env = TfEnv(normalize(vv['env'](
        log_scale_limit=vv['log_scale_limit'],
        target_velocity=vv['target_velocity'],
    )))

    dynamics_model = MLPDynamicsEnsemble(
        name="dyn_model",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_model'],
        weight_normalization=vv['weight_normalization_model'],
        num_models=vv['num_models'],
        optimizer=vv['optimizer_model'],
        valid_split_ratio=vv['valid_split_ratio'],
        rolling_average_persitency=vv['rolling_average_persitency']
    )

    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_policy'],
        hidden_nonlinearity=vv['hidden_nonlinearity_policy'],
        grad_step_size=vv['fast_lr'],
        trainable_step_size=vv['trainable_step_size'],
        bias_transform=vv['bias_transform'],
        num_tasks=vv['meta_batch_size']
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = ModelMAMLTRPO(
        env=env,
        policy=policy,
        dynamics_model=dynamics_model,
        baseline=baseline,
        n_itr=vv['n_itr'],
        n_iter=vv['n_itr'],
        batch_size_env_samples=vv['batch_size_env_samples'],
        batch_size_dynamics_samples=vv['batch_size_dynamics_samples'],
        meta_batch_size=vv['meta_batch_size'],
        initial_random_samples=vv['initial_random_samples'],
        num_maml_steps_per_iter=vv['num_maml_steps_per_iter'],
        reset_from_env_traj=vv.get('reset_from_env_traj', False),
        max_path_length_env=vv['path_length_env'],
        max_path_length_dyn=vv.get('path_length_dyn', None),
        dynamic_model_max_epochs=vv.get('dynamic_model_max_epochs', (500, 500)),
        discount=vv['discount'],
        step_size=vv["meta_step_size"],
        num_grad_updates=1,
        retrain_model_when_reward_decreases=vv['retrain_model_when_reward_decreases'],
        reset_policy_std=vv['reset_policy_std'],
        reinit_model_cycle=vv['reinit_model_cycle'],
        frac_gpu=vv.get('frac_gpu', 0.85),
        log_real_performance=True,
        clip_obs=vv.get('clip_obs', True),
        entropy_bonus=vv['entropy_bonus'],
        tailored_exploration=vv['tailored_exploration']
    )
    algo.train()
def run_train_task(vv):
    env = TfEnv(normalize(vv['env']()))

    policy = vv['policy'](
        name="policy",
        env_spec=env.spec,
        num_tasks=vv['meta_batch_size'],
        hidden_sizes=vv['hidden_sizes'],
        grad_step_size=vv['fast_lr'],
        hidden_nonlinearity=vv['hidden_nonlinearity'],
        trainable_step_size=vv['trainable_step_size'],
        bias_transform=vv['bias_transform'])

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    optimizer_args = dict(max_epochs=vv['max_epochs'])

    algo = vv['algo'](
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=vv['fast_batch_size'],  # number of trajs for grad update
        max_path_length=vv['path_length'],
        meta_batch_size=vv['meta_batch_size'],
        num_grad_updates=vv['num_grad_updates'],
        n_itr=vv['n_itr'],
        discount=vv['discount'],
        entropy_bonus=vv['entropy_bonus'],
        clip_eps=vv['clip_eps'],
        target_inner_step=vv['target_inner_step'],
        init_kl_penalty=vv['init_kl_penalty'],
        optimizer_args=optimizer_args,
    )
    algo.train()
def run_train_task(vv):
    import sys

    print(vv['exp_prefix'])
    # redirect stdout to a log file in the experiment directory
    sysout_log_path = os.path.join(config.LOG_DIR, 'local', vv['exp_prefix'],
                                   vv['exp_name'], 'stdout.log')
    sysout_log_file = open(sysout_log_path, 'w')
    sys.stdout = sysout_log_file

    env = TfEnv(normalize(vv['env'](log_scale_limit=vv['log_scale_limit'])))

    dynamics_model = MLPDynamicsEnsemble(
        name="dyn_model",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_model'],
        weight_normalization=vv['weight_normalization_model'],
        num_models=vv['num_models'],
        optimizer=vv['optimizer_model'],
        valid_split_ratio=vv['valid_split_ratio'],
        rolling_average_persitency=vv['rolling_average_persitency'])

    policy = MAMLImprovedGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_policy'],
        hidden_nonlinearity=vv['hidden_nonlinearity_policy'],
        grad_step_size=vv['fast_lr'],
        trainable_step_size=vv['trainable_step_size'],
        bias_transform=vv['bias_transform'],
        param_noise_std=vv['param_noise_std'])

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = ModelMAMLTRPO(
        env=env,
        policy=policy,
        dynamics_model=dynamics_model,
        baseline=baseline,
        n_itr=vv['n_itr'],
        n_iter=vv['n_itr'],
        batch_size_env_samples=vv['batch_size_env_samples'],
        batch_size_dynamics_samples=vv['batch_size_dynamics_samples'],
        meta_batch_size=vv['meta_batch_size'],
        initial_random_samples=vv['initial_random_samples'],
        num_maml_steps_per_iter=vv['num_maml_steps_per_iter'],
        reset_from_env_traj=vv.get('reset_from_env_traj', False),
        max_path_length_env=vv['path_length_env'],
        max_path_length_dyn=vv.get('path_length_dyn', None),
        discount=vv['discount'],
        step_size=vv["meta_step_size"],
        num_grad_updates=1,
        retrain_model_when_reward_decreases=vv['retrain_model_when_reward_decreases'],
        reset_policy_std=vv['reset_policy_std'],
        reinit_model_cycle=vv['reinit_model_cycle'],
        frac_gpu=vv.get('frac_gpu', 0.85),
        clip_obs=vv.get('clip_obs', True))

    algo.train()
    sysout_log_file.close()
def run_train_task(vv):
    env = TfEnv(normalize(vv['env'](log_scale_limit=vv['log_scale_limit'])))

    # deliberately degraded dynamics ensemble (biased outputs + gaussian noise)
    dynamics_model = BadDynamicsEnsemble(
        name="dyn_model",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_model'],
        weight_normalization=vv['weight_normalization_model'],
        num_models=vv['num_models'],
        optimizer=vv['optimizer_model'],
        output_bias_range=vv['output_bias_range'],
        gaussian_noise_output_std=vv['output_noise_std'],
    )

    policy = MAMLImprovedGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_policy'],
        hidden_nonlinearity=vv['hidden_nonlinearity_policy'],
        grad_step_size=vv['fast_lr'],
        trainable_step_size=vv['trainable_step_size'],
        bias_transform=vv['bias_transform'],
        param_noise_std=vv['param_noise_std'])

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = ModelMAMLTRPO(
        env=env,
        policy=policy,
        dynamics_model=dynamics_model,
        baseline=baseline,
        n_itr=vv['n_itr'],
        n_iter=vv['n_itr'],
        batch_size_env_samples=vv['batch_size_env_samples'],
        batch_size_dynamics_samples=vv['batch_size_dynamics_samples'],
        meta_batch_size=vv['meta_batch_size'],
        initial_random_samples=vv['initial_random_samples'],
        dynamic_model_epochs=vv['dynamic_model_epochs'],
        num_maml_steps_per_iter=vv['num_maml_steps_per_iter'],
        reset_from_env_traj=vv['reset_from_env_traj'],
        max_path_length_env=vv['path_length_env'],
        discount=vv['discount'],
        step_size=vv["meta_step_size"],
        num_grad_updates=1,
        retrain_model_when_reward_decreases=vv['retrain_model_when_reward_decreases'],
        reset_policy_std=vv['reset_policy_std'],
        reinit_model_cycle=vv['reinit_model_cycle'],
        frac_gpu=vv.get('frac_gpu', 0.85),
        log_real_performance=True,
        resample_output_bias=vv['resample_output_bias'])

    algo.train()
def run_train_task(vv):
    env = TfEnv(normalize(vv['env_class'](
        fix_goal=vv['fix_goal'],
        reward_type=vv['reward_type'],
        init_puck_low=INIT_PUCK_TARGET - vv['init_slack'],
        init_puck_high=INIT_PUCK_TARGET + vv['init_slack'],
        puck_goal_low=PUCK_GOAL_TARGET - vv['goal_slack'],
        puck_goal_high=PUCK_GOAL_TARGET + vv['goal_slack'],
    )))

    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        num_tasks=vv['meta_batch_size'],
        hidden_sizes=vv['hidden_sizes'],
        grad_step_size=vv['fast_lr'],
        hidden_nonlinearity=vv['hidden_nonlinearity'],
        trainable_step_size=vv['trainable_step_size'],
        bias_transform=vv['bias_transform'])

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    optimizer_args = dict(
        max_epochs=vv['max_epochs'],
        batch_size=vv['num_batches'],
        tf_optimizer_args=dict(learning_rate=vv['outer_lr']),
    )

    algo = MAMLPPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=vv['fast_batch_size'],  # number of trajs for grad update
        max_path_length=vv['path_length'],
        meta_batch_size=vv['meta_batch_size'],
        num_grad_updates=vv['num_grad_updates'],
        n_itr=vv['n_itr'],
        discount=vv['discount'],
        entropy_bonus=vv['entropy_bonus'],
        clip_eps=vv['clip_eps'],
        clip_outer=vv['clip_outer'],
        target_outer_step=vv['target_outer_step'],
        target_inner_step=vv['target_inner_step'],
        init_outer_kl_penalty=vv['init_outer_kl_penalty'],
        init_inner_kl_penalty=vv['init_inner_kl_penalty'],
        adaptive_outer_kl_penalty=vv['adaptive_outer_kl_penalty'],
        adaptive_inner_kl_penalty=vv['adaptive_inner_kl_penalty'],
        parallel_sampler=vv['parallel_sampler'],
        optimizer_args=optimizer_args,
    )
    algo.train()
def test_param_space_noise(self):
    env = TfEnv(normalize(PointEnvMAML()))
    obs = env.reset()

    policy = MAMLImprovedGaussianMLPPolicy(
        name="policy33",
        env_spec=env.spec,
        hidden_sizes=(16, 16),
        hidden_nonlinearity=tf.nn.tanh,
        param_noise_std=0.0)

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    import rllab.misc.logger as logger
    logger.set_snapshot_dir('/tmp/')
    logger.set_snapshot_mode('last')

    algo = MAMLTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=3,
        max_path_length=10,
        meta_batch_size=4,
        num_grad_updates=1,
        n_itr=1,
        discount=0.99,
        step_size=0.01,
    )
    algo.train()

    tf.reset_default_graph()
    pkl_file = os.path.join('/tmp/', 'params.pkl')

    with tf.Session() as sess:
        data = joblib.load(pkl_file)
        policy = data['policy']

        # without parameter noise, repeated queries must be deterministic
        action_1 = policy.get_action(obs)[1]['mean']
        action_2 = policy.get_action(obs)[1]['mean']
        diff = np.sum((action_1 - action_2) ** 2)
        self.assertAlmostEqual(diff, 0.0)

        # per-call parameter noise should perturb the mean action
        action_1 = policy.get_action(obs, param_noise_std=1.0)[1]['mean']
        action_2 = policy.get_action(obs, param_noise_std=1.0)[1]['mean']
        diff = np.sum((action_1 - action_2) ** 2)
        self.assertGreaterEqual(diff, 0.1)

        # the same should hold when the noise std is set on the policy itself
        policy.param_noise_std = 1.0
        action_1 = policy.get_action(obs)[1]['mean']
        action_2 = policy.get_action(obs)[1]['mean']
        diff = np.sum((action_1 - action_2) ** 2)
        self.assertGreaterEqual(diff, 0.1)
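# Illustration of the behavior checked above: with parameter-space noise, the
# policy perturbs its weights with fresh zero-mean Gaussian noise on every
# query, so the mean action for a fixed observation varies between calls.
# This is a standalone sketch with a hypothetical linear policy, not the
# MAMLImprovedGaussianMLPPolicy implementation.
import numpy as np

def noisy_mean_action(W, b, obs, param_noise_std=0.0):
    # draw a fresh weight perturbation per call
    W_noisy = W + np.random.normal(0.0, param_noise_std, size=W.shape)
    return W_noisy @ obs + b

obs = np.ones(4)
W, b = np.zeros((2, 4)), np.zeros(2)
a1 = noisy_mean_action(W, b, obs, param_noise_std=1.0)
a2 = noisy_mean_action(W, b, obs, param_noise_std=1.0)
assert np.sum((a1 - a2) ** 2) > 0.0  # repeated queries differ under noise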
def test_serialization(self):
    env = TfEnv(normalize(PointEnvMAML()))
    obs = env.reset()

    policy = MAMLImprovedGaussianMLPPolicy(
        name="policy56",
        env_spec=env.spec,
        hidden_sizes=(16, 16),
        hidden_nonlinearity=tf.nn.tanh,
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    import rllab.misc.logger as logger
    logger.set_snapshot_dir('/tmp/')
    logger.set_snapshot_mode('last')

    algo = MAMLTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=2,
        max_path_length=10,
        meta_batch_size=4,
        num_grad_updates=1,
        n_itr=1,
        discount=0.99,
        step_size=0.01,
    )
    algo.train()

    tf.reset_default_graph()
    pkl_file = os.path.join('/tmp/', 'params.pkl')

    with tf.Session() as sess:
        # load the snapshot written during training and pickle the policy
        data = joblib.load(pkl_file)
        policy = data['policy']
        action_before = policy.get_action(obs)[1]['mean']
        dump_string = pickle.dumps(policy)

    tf.reset_default_graph()
    with tf.Session() as sess:
        # unpickling must restore the same deterministic behavior
        policy_loaded = pickle.loads(dump_string)
        action_after = policy_loaded.get_action(obs)[1]['mean']

    diff = np.sum(np.abs(action_before - action_after))
    self.assertAlmostEqual(diff, 0.0, places=3)
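# The two tests above are methods of a unittest.TestCase subclass; a
# conventional entry point (sketch) runs them directly:
import unittest

if __name__ == '__main__':
    unittest.main()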
def run_train_task(vv):
    env = TfEnv(normalize(vv['env']()))

    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        num_tasks=vv['meta_batch_size'],
        hidden_sizes=vv['hidden_sizes'],
        grad_step_size=vv['fast_lr'],
        hidden_nonlinearity=vv['hidden_nonlinearity'],
        trainable_step_size=vv['trainable_step_size'],
        bias_transform=vv['bias_transform'])

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = MAMLPPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=vv['fast_batch_size'],  # number of trajs for grad update
        max_path_length=vv['path_length'],
        meta_batch_size=vv['meta_batch_size'],
        num_grad_updates=vv['num_grad_updates'],
        n_itr=vv['n_itr'],
        discount=vv['discount'],
        entropy_bonus=vv['entropy_bonus'],
        clip_eps=vv['clip_eps'],
        clip_outer=vv['clip_outer'],
        target_outer_step=vv['target_outer_step'],
        target_inner_step=vv['target_inner_step'],
        init_outer_kl_penalty=vv['init_outer_kl_penalty'],
        init_inner_kl_penalty=vv['init_inner_kl_penalty'],
        adaptive_outer_kl_penalty=vv['adaptive_outer_kl_penalty'],
        adaptive_inner_kl_penalty=vv['adaptive_inner_kl_penalty'],
        max_epochs=vv['max_epochs'],
        num_batches=vv['num_batches'],
        tf_optimizer_args=dict(learning_rate=vv['outer_lr']),
        parallel_sampler=vv['parallel_sampler'],
        multi_adam=vv['multi_adam'],
    )
    algo.train()
# NOTE: this snippet begins mid-branch; the condition below is a hypothetical
# stand-in for the truncated `if`, and `goal`, `initial_params_file`, `step_i`,
# and `step_sizes` are defined by the enclosing script.
if use_oracle_env:  # hypothetical flag; the original condition is truncated
    env = normalize(PointEnvRandGoalOracle(goal=goal))
    n_itr = 1
else:
    env = normalize(PointEnvRandGoal(goal=goal))
    n_itr = 5

env = TfEnv(env)

policy = GaussianMLPPolicy(  # random policy
    name='policy',
    env_spec=env.spec,
    hidden_sizes=(100, 100),
)

if initial_params_file is not None:
    policy = None  # VPG loads the policy from initial_params_file instead

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = VPG(
    env=env,
    policy=policy,
    load_policy=initial_params_file,
    baseline=baseline,
    batch_size=4000,  # 2x
    max_path_length=100,
    n_itr=n_itr,
    optimizer_args={
        'init_learning_rate': step_sizes[step_i],
        'tf_optimizer_args': {'learning_rate': 0.5 * step_sizes[step_i]},
        'tf_optimizer_cls': tf.train.GradientDescentOptimizer,
    })
def run_train_task(vv):
    env = TfEnv(normalize(vv['env'](
        init_sampling_boundaries=vv['point_env_setup']['init_sampling_boundaries'],
        goal=vv['point_env_setup']['goal'])))

    dynamics_model = PointEnvFakeModelEnsemble(
        env_spec=env.spec,
        num_models=vv['num_models'],
        error_range_around_goal=vv['fake_model_setup']['error_range_around_goal'],
        bias_range=vv['fake_model_setup']['bias_range'],
        error_std=vv['fake_model_setup']['error_std'],
        goal=vv['point_env_setup']['goal'],
        error_at_goal=vv['fake_model_setup']['error_at_goal'],
        smooth_error=vv['smooth_error'],
    )

    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_policy'],
        hidden_nonlinearity=vv['hidden_nonlinearity_policy'],
        grad_step_size=vv['fast_lr'],
        trainable_step_size=vv['trainable_step_size'],
        bias_transform=vv['bias_transform'],
        num_tasks=vv['meta_batch_size'])

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = ModelMAMLTRPO(
        env=env,
        policy=policy,
        dynamics_model=dynamics_model,
        baseline=baseline,
        n_itr=vv['n_itr'],
        n_iter=vv['n_itr'],
        batch_size_env_samples=vv['batch_size_env_samples'],
        batch_size_dynamics_samples=vv['batch_size_dynamics_samples'],
        meta_batch_size=vv['meta_batch_size'],
        initial_random_samples=vv['initial_random_samples'],
        num_maml_steps_per_iter=vv['num_maml_steps_per_iter'],
        reset_from_env_traj=vv.get('reset_from_env_traj', False),
        max_path_length_env=vv['path_length_env'],
        max_path_length_dyn=vv.get('path_length_dyn', None),
        dynamic_model_max_epochs=vv.get('dynamic_model_max_epochs', (500, 500)),
        discount=vv['discount'],
        step_size=vv["meta_step_size"],
        num_grad_updates=1,
        retrain_model_when_reward_decreases=vv['retrain_model_when_reward_decreases'],
        reset_policy_std=vv['reset_policy_std'],
        reinit_model_cycle=vv['reinit_model_cycle'],
        frac_gpu=vv.get('frac_gpu', 0.85),
        log_real_performance=True,
        clip_obs=vv.get('clip_obs', True),
        entropy_bonus=vv['entropy_bonus'],
    )
    algo.train()