Example #1
def run_train_task(vv):
    env = TfEnv(normalize(vv['env']()))

    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes'],
        grad_step_size=vv['fast_lr'],
        hidden_nonlinearity=vv['hidden_nonlinearity'],
        trainable_step_size=vv['trainable_step_size'],
        bias_transform=vv['bias_transform'])

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = MAMLTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=vv['fast_batch_size'],  # number of trajs for grad update
        max_path_length=vv['path_length'],
        meta_batch_size=vv['meta_batch_size'],
        num_grad_updates=vv['num_grad_updates'],
        n_itr=vv['n_itr'],
        discount=vv['discount'],
        step_size=vv["meta_step_size"],
        parallel_sampler=vv['parallel_sampler'],
    )
    algo.train()
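A minimal sketch of the variant dict vv this launcher expects. The key names are exactly the lookups in the function above; the values are illustrative placeholders only, and the env class is borrowed from the test snippets further down (the same imports as in the surrounding snippets, e.g. tensorflow as tf, are assumed).

# Illustrative only: key names come from run_train_task above, values are placeholders.
example_vv = {
    'env': PointEnvMAML,            # any env class that takes no constructor args
    'hidden_sizes': (100, 100),
    'hidden_nonlinearity': tf.nn.tanh,
    'fast_lr': 0.1,                 # inner (adaptation) gradient step size
    'trainable_step_size': False,
    'bias_transform': False,
    'fast_batch_size': 20,          # trajectories per inner gradient update
    'path_length': 100,
    'meta_batch_size': 40,
    'num_grad_updates': 1,
    'n_itr': 500,
    'discount': 0.99,
    'meta_step_size': 0.01,         # outer TRPO step size
    'parallel_sampler': True,
}

run_train_task(example_vv)
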
def run_train_task(vv):

    env = TfEnv(normalize(vv['env'](
        log_scale_limit=vv['log_scale_limit'],
        target_velocity=vv['target_velocity'],
    )))

    dynamics_model = MLPDynamicsEnsemble(
        name="dyn_model",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_model'],
        weight_normalization=vv['weight_normalization_model'],
        num_models=vv['num_models'],
        optimizer=vv['optimizer_model'],
        valid_split_ratio=vv['valid_split_ratio'],
        rolling_average_persitency=vv['rolling_average_persitency']  # (sic) spelling matches the model's keyword
    )

    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_policy'],
        hidden_nonlinearity=vv['hidden_nonlinearity_policy'],
        grad_step_size=vv['fast_lr'],
        trainable_step_size=vv['trainable_step_size'],
        bias_transform=vv['bias_transform'],
        num_tasks=vv['meta_batch_size']
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = ModelMAMLTRPO(
        env=env,
        policy=policy,
        dynamics_model=dynamics_model,
        baseline=baseline,
        n_itr=vv['n_itr'],
        n_iter=vv['n_itr'],
        batch_size_env_samples=vv['batch_size_env_samples'],
        batch_size_dynamics_samples=vv['batch_size_dynamics_samples'],
        meta_batch_size=vv['meta_batch_size'],
        initial_random_samples=vv['initial_random_samples'],
        num_maml_steps_per_iter=vv['num_maml_steps_per_iter'],
        reset_from_env_traj=vv.get('reset_from_env_traj', False),
        max_path_length_env=vv['path_length_env'],
        max_path_length_dyn=vv.get('path_length_dyn', None),
        dynamic_model_max_epochs=vv.get('dynamic_model_max_epochs', (500, 500)),
        discount=vv['discount'],
        step_size=vv["meta_step_size"],
        num_grad_updates=1,
        retrain_model_when_reward_decreases=vv['retrain_model_when_reward_decreases'],
        reset_policy_std=vv['reset_policy_std'],
        reinit_model_cycle=vv['reinit_model_cycle'],
        frac_gpu=vv.get('frac_gpu', 0.85),
        log_real_performance=True,
        clip_obs=vv.get('clip_obs', True),
        entropy_bonus=vv['entropy_bonus'],
        tailored_exploration=vv['tailored_exploration']
    )
    algo.train()
def run_train_task(vv):
    env = TfEnv(normalize(vv['env']()))

    policy = vv['policy'](name="policy",
                          env_spec=env.spec,
                          num_tasks=vv['meta_batch_size'],
                          hidden_sizes=vv['hidden_sizes'],
                          grad_step_size=vv['fast_lr'],
                          hidden_nonlinearity=vv['hidden_nonlinearity'],
                          trainable_step_size=vv['trainable_step_size'],
                          bias_transform=vv['bias_transform'])

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    optimizer_args = dict(max_epochs=vv['max_epochs'])

    algo = vv['algo'](
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=vv['fast_batch_size'],  # number of trajs for grad update
        max_path_length=vv['path_length'],
        meta_batch_size=vv['meta_batch_size'],
        num_grad_updates=vv['num_grad_updates'],
        n_itr=vv['n_itr'],
        discount=vv['discount'],
        entropy_bonus=vv['entropy_bonus'],
        clip_eps=vv['clip_eps'],
        target_inner_step=vv['target_inner_step'],
        init_kl_penalty=vv['init_kl_penalty'],
        optimizer_args=optimizer_args,
    )
    algo.train()
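This variant also pulls the policy and algorithm classes out of vv, so the same launcher can be reused across algorithms. A hedged sketch of the extra entries, building on the example_vv sketch above; the classes named here appear in the other snippets, the numeric values are placeholders, and whether a given algorithm class accepts exactly the keyword arguments the launcher forwards (clip_eps, target_inner_step, init_kl_penalty) depends on its constructor.

# Illustrative only: extends the example_vv sketch above with the extra keys
# this class-parameterized launcher reads.
example_vv_ppo = dict(
    example_vv,
    policy=MAMLGaussianMLPPolicy,   # policy class used in the other snippets
    algo=MAMLPPO,                   # or any MAML algo accepting the kwargs forwarded above
    max_epochs=5,                   # passed through optimizer_args
    entropy_bonus=0.0,
    clip_eps=0.3,
    target_inner_step=0.01,
    init_kl_penalty=1.0,
)

run_train_task(example_vv_ppo)
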
Example #4
def run_train_task(vv):
    import sys
    print(vv['exp_prefix'])
    sysout_log_path = os.path.join(config.LOG_DIR, 'local', vv['exp_prefix'],
                                   vv['exp_name'], 'stdout.log')
    sysout_log_file = open(sysout_log_path, 'w')
    sys.stdout = sysout_log_file

    env = TfEnv(normalize(vv['env'](log_scale_limit=vv['log_scale_limit'])))

    dynamics_model = MLPDynamicsEnsemble(
        name="dyn_model",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_model'],
        weight_normalization=vv['weight_normalization_model'],
        num_models=vv['num_models'],
        optimizer=vv['optimizer_model'],
        valid_split_ratio=vv['valid_split_ratio'],
        rolling_average_persitency=vv['rolling_average_persitency'])

    policy = MAMLImprovedGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_policy'],
        hidden_nonlinearity=vv['hidden_nonlinearity_policy'],
        grad_step_size=vv['fast_lr'],
        trainable_step_size=vv['trainable_step_size'],
        bias_transform=vv['bias_transform'],
        param_noise_std=vv['param_noise_std'])

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = ModelMAMLTRPO(
        env=env,
        policy=policy,
        dynamics_model=dynamics_model,
        baseline=baseline,
        n_itr=vv['n_itr'],
        n_iter=vv['n_itr'],
        batch_size_env_samples=vv['batch_size_env_samples'],
        batch_size_dynamics_samples=vv['batch_size_dynamics_samples'],
        meta_batch_size=vv['meta_batch_size'],
        initial_random_samples=vv['initial_random_samples'],
        num_maml_steps_per_iter=vv['num_maml_steps_per_iter'],
        reset_from_env_traj=vv.get('reset_from_env_traj', False),
        max_path_length_env=vv['path_length_env'],
        max_path_length_dyn=vv.get('path_length_dyn', None),
        discount=vv['discount'],
        step_size=vv["meta_step_size"],
        num_grad_updates=1,
        retrain_model_when_reward_decreases=vv['retrain_model_when_reward_decreases'],
        reset_policy_std=vv['reset_policy_std'],
        reinit_model_cycle=vv['reinit_model_cycle'],
        frac_gpu=vv.get('frac_gpu', 0.85),
        clip_obs=vv.get('clip_obs', True))
    algo.train()

    # restore stdout before closing the log file
    sys.stdout = sys.__stdout__
    sysout_log_file.close()
def run_train_task(vv):

    env = TfEnv(normalize(vv['env'](log_scale_limit=vv['log_scale_limit'])))

    dynamics_model = BadDynamicsEnsemble(
        name="dyn_model",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_model'],
        weight_normalization=vv['weight_normalization_model'],
        num_models=vv['num_models'],
        optimizer=vv['optimizer_model'],
        output_bias_range=vv['output_bias_range'],
        gaussian_noise_output_std=vv['output_noise_std'],
    )

    policy = MAMLImprovedGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_policy'],
        hidden_nonlinearity=vv['hidden_nonlinearity_policy'],
        grad_step_size=vv['fast_lr'],
        trainable_step_size=vv['trainable_step_size'],
        bias_transform=vv['bias_transform'],
        param_noise_std=vv['param_noise_std'])

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = ModelMAMLTRPO(
        env=env,
        policy=policy,
        dynamics_model=dynamics_model,
        baseline=baseline,
        n_itr=vv['n_itr'],
        n_iter=vv['n_itr'],
        batch_size_env_samples=vv['batch_size_env_samples'],
        batch_size_dynamics_samples=vv['batch_size_dynamics_samples'],
        meta_batch_size=vv['meta_batch_size'],
        initial_random_samples=vv['initial_random_samples'],
        dynamic_model_epochs=vv['dynamic_model_epochs'],
        num_maml_steps_per_iter=vv['num_maml_steps_per_iter'],
        reset_from_env_traj=vv['reset_from_env_traj'],
        max_path_length_env=vv['path_length_env'],
        discount=vv['discount'],
        step_size=vv["meta_step_size"],
        num_grad_updates=1,
        retrain_model_when_reward_decreases=vv['retrain_model_when_reward_decreases'],
        reset_policy_std=vv['reset_policy_std'],
        reinit_model_cycle=vv['reinit_model_cycle'],
        frac_gpu=vv.get('frac_gpu', 0.85),
        log_real_performance=True,
        resample_output_bias=vv['resample_output_bias'])
    algo.train()
def run_train_task(vv):

    env = TfEnv(
        normalize(vv['env_class'](
            fix_goal=vv['fix_goal'],
            reward_type=vv['reward_type'],
            init_puck_low=INIT_PUCK_TARGET - vv['init_slack'],
            init_puck_high=INIT_PUCK_TARGET + vv['init_slack'],
            puck_goal_low=PUCK_GOAL_TARGET - vv['goal_slack'],
            puck_goal_high=PUCK_GOAL_TARGET + vv['goal_slack'],
        )))

    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        num_tasks=vv['meta_batch_size'],
        hidden_sizes=vv['hidden_sizes'],
        grad_step_size=vv['fast_lr'],
        hidden_nonlinearity=vv['hidden_nonlinearity'],
        trainable_step_size=vv['trainable_step_size'],
        bias_transform=vv['bias_transform'])

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    optimizer_args = dict(
        max_epochs=vv['max_epochs'],
        batch_size=vv['num_batches'],
        tf_optimizer_args=dict(learning_rate=vv['outer_lr']),
    )

    algo = MAMLPPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=vv['fast_batch_size'],  # number of trajs for grad update
        max_path_length=vv['path_length'],
        meta_batch_size=vv['meta_batch_size'],
        num_grad_updates=vv['num_grad_updates'],
        n_itr=vv['n_itr'],
        discount=vv['discount'],
        entropy_bonus=vv['entropy_bonus'],
        clip_eps=vv['clip_eps'],
        clip_outer=vv['clip_outer'],
        target_outer_step=vv['target_outer_step'],
        target_inner_step=vv['target_inner_step'],
        init_outer_kl_penalty=vv['init_outer_kl_penalty'],
        init_inner_kl_penalty=vv['init_inner_kl_penalty'],
        adaptive_outer_kl_penalty=vv['adaptive_outer_kl_penalty'],
        adaptive_inner_kl_penalty=vv['adaptive_inner_kl_penalty'],
        parallel_sampler=vv['parallel_sampler'],
        optimizer_args=optimizer_args,
    )
    algo.train()
    def test_param_space_noise(self):
        env = TfEnv(normalize(PointEnvMAML()))
        obs = env.reset()

        policy = MAMLImprovedGaussianMLPPolicy(name="policy33",
                                               env_spec=env.spec,
                                               hidden_sizes=(16, 16),
                                               hidden_nonlinearity=tf.nn.tanh,
                                               param_noise_std=0.0)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        import rllab.misc.logger as logger

        logger.set_snapshot_dir('/tmp/')
        logger.set_snapshot_mode('last')

        algo = MAMLTRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=3,
            max_path_length=10,
            meta_batch_size=4,
            num_grad_updates=1,
            n_itr=1,
            discount=0.99,
            step_size=0.01,
        )
        algo.train()

        tf.reset_default_graph()
        pkl_file = os.path.join('/tmp/', 'params.pkl')
        with tf.Session() as sess:
            data = joblib.load(pkl_file)
            policy = data['policy']
            action_1 = policy.get_action(obs)[1]['mean']
            action_2 = policy.get_action(obs)[1]['mean']
            diff = np.sum((action_1 - action_2)**2)

            self.assertAlmostEqual(diff, 0.0)

            action_1 = policy.get_action(obs, param_noise_std=1.0)[1]['mean']
            action_2 = policy.get_action(obs, param_noise_std=1.0)[1]['mean']
            diff = np.sum((action_1 - action_2)**2)

            self.assertGreaterEqual(diff, 0.1)

            policy.param_noise_std = 1.0
            action_1 = policy.get_action(obs)[1]['mean']
            action_2 = policy.get_action(obs)[1]['mean']
            diff = np.sum((action_1 - action_2)**2)
            self.assertGreaterEqual(diff, 0.1)
    def test_serialization(self):

        env = TfEnv(normalize(PointEnvMAML()))
        obs = env.reset()

        policy = MAMLImprovedGaussianMLPPolicy(
            name="policy56",
            env_spec=env.spec,
            hidden_sizes=(16, 16),
            hidden_nonlinearity=tf.nn.tanh,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        import rllab.misc.logger as logger

        logger.set_snapshot_dir('/tmp/')
        logger.set_snapshot_mode('last')

        algo = MAMLTRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=2,
            max_path_length=10,
            meta_batch_size=4,
            num_grad_updates=1,
            n_itr=1,
            discount=0.99,
            step_size=0.01,
        )
        algo.train()

        tf.reset_default_graph()
        pkl_file = os.path.join('/tmp/', 'params.pkl')
        with tf.Session() as sess:
            data = joblib.load(pkl_file)
            policy = data['policy']
            action_before = policy.get_action(obs)[1]['mean']

            dump_string = pickle.dumps(policy)

        tf.reset_default_graph()
        with tf.Session() as sess:
            policy_loaded = pickle.loads(dump_string)
            action_after = policy_loaded.get_action(obs)[1]['mean']

        diff = np.sum(np.abs(action_before - action_after))
        self.assertAlmostEqual(diff, 0.0, places=3)
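The two checks above take self, so they are assumed to be methods of a unittest.TestCase subclass; a minimal sketch of how such a suite would be wrapped and run (the class name is illustrative, not taken from the original test module):

import unittest

class TestMAMLImprovedGaussianMLPPolicy(unittest.TestCase):
    # test_param_space_noise and test_serialization from above go here
    # as methods of the TestCase.
    pass

if __name__ == '__main__':
    unittest.main()
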
def run_train_task(vv):
    env = TfEnv(normalize(vv['env']()))

    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        num_tasks=vv['meta_batch_size'],
        hidden_sizes=vv['hidden_sizes'],
        grad_step_size=vv['fast_lr'],
        hidden_nonlinearity=vv['hidden_nonlinearity'],
        trainable_step_size=vv['trainable_step_size'],
        bias_transform=vv['bias_transform'])

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = MAMLPPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=vv['fast_batch_size'],  # number of trajs for grad update
        max_path_length=vv['path_length'],
        meta_batch_size=vv['meta_batch_size'],
        num_grad_updates=vv['num_grad_updates'],
        n_itr=vv['n_itr'],
        discount=vv['discount'],
        entropy_bonus=vv['entropy_bonus'],
        clip_eps=vv['clip_eps'],
        clip_outer=vv['clip_outer'],
        target_outer_step=vv['target_outer_step'],
        target_inner_step=vv['target_inner_step'],
        init_outer_kl_penalty=vv['init_outer_kl_penalty'],
        init_inner_kl_penalty=vv['init_inner_kl_penalty'],
        adaptive_outer_kl_penalty=vv['adaptive_outer_kl_penalty'],
        adaptive_inner_kl_penalty=vv['adaptive_inner_kl_penalty'],
        max_epochs=vv['max_epochs'],
        num_batches=vv['num_batches'],
        tf_optimizer_args=dict(learning_rate=vv['outer_lr']),
        parallel_sampler=vv['parallel_sampler'],
        multi_adam=vv['multi_adam'],
    )
    algo.train()
            # Excerpt: goal, initial_params_file, step_sizes and step_i come
            # from the enclosing experiment code, which is not shown here.
            env = normalize(PointEnvRandGoalOracle(goal=goal))
            n_itr = 1
        else:
            env = normalize(PointEnvRandGoal(goal=goal))
            n_itr = 5
        env = TfEnv(env)
        policy = GaussianMLPPolicy(  # random policy
            name='policy',
            env_spec=env.spec,
            hidden_sizes=(100, 100),
        )

        # When an initial params file is given, the policy built above is
        # discarded and VPG loads it from the file via load_policy below.
        if initial_params_file is not None:
            policy = None

        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = VPG(
            env=env,
            policy=policy,
            load_policy=initial_params_file,
            baseline=baseline,
            batch_size=4000,  # 2x
            max_path_length=100,
            n_itr=n_itr,
            optimizer_args={
                'init_learning_rate': step_sizes[step_i],
                'tf_optimizer_args': {
                    'learning_rate': 0.5 * step_sizes[step_i]
                },
                'tf_optimizer_cls': tf.train.GradientDescentOptimizer
            })
def run_train_task(vv):

    env = TfEnv(normalize(vv['env'](
        init_sampling_boundaries=vv['point_env_setup']['init_sampling_boundaries'],
        goal=vv['point_env_setup']['goal'],
    )))

    dynamics_model = PointEnvFakeModelEnsemble(
        env_spec=env.spec,
        num_models=vv['num_models'],
        error_range_around_goal=vv['fake_model_setup']['error_range_around_goal'],
        bias_range=vv['fake_model_setup']['bias_range'],
        error_std=vv['fake_model_setup']['error_std'],
        goal=vv['point_env_setup']['goal'],
        error_at_goal=vv['fake_model_setup']['error_at_goal'],
        smooth_error=vv['smooth_error'],
    )

    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_policy'],
        hidden_nonlinearity=vv['hidden_nonlinearity_policy'],
        grad_step_size=vv['fast_lr'],
        trainable_step_size=vv['trainable_step_size'],
        bias_transform=vv['bias_transform'],
        num_tasks=vv['meta_batch_size'])

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = ModelMAMLTRPO(
        env=env,
        policy=policy,
        dynamics_model=dynamics_model,
        baseline=baseline,
        n_itr=vv['n_itr'],
        n_iter=vv['n_itr'],
        batch_size_env_samples=vv['batch_size_env_samples'],
        batch_size_dynamics_samples=vv['batch_size_dynamics_samples'],
        meta_batch_size=vv['meta_batch_size'],
        initial_random_samples=vv['initial_random_samples'],
        num_maml_steps_per_iter=vv['num_maml_steps_per_iter'],
        reset_from_env_traj=vv.get('reset_from_env_traj', False),
        max_path_length_env=vv['path_length_env'],
        max_path_length_dyn=vv.get('path_length_dyn', None),
        dynamic_model_max_epochs=vv.get('dynamic_model_max_epochs', (500, 500)),
        discount=vv['discount'],
        step_size=vv["meta_step_size"],
        num_grad_updates=1,
        retrain_model_when_reward_decreases=vv['retrain_model_when_reward_decreases'],
        reset_policy_std=vv['reset_policy_std'],
        reinit_model_cycle=vv['reinit_model_cycle'],
        frac_gpu=vv.get('frac_gpu', 0.85),
        log_real_performance=True,
        clip_obs=vv.get('clip_obs', True),
        entropy_bonus=vv['entropy_bonus'],
    )
    algo.train()
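Because this launcher indexes into nested sub-dicts (point_env_setup, fake_model_setup), here is a short sketch of the nested variant layout it expects; the structure mirrors the lookups above and every value is a placeholder.

# Illustrative only: the nested structure read by the launcher above.
example_vv_nested = {
    'point_env_setup': {
        'init_sampling_boundaries': (-2.0, 2.0),   # placeholder bounds
        'goal': (1.0, 1.0),                        # placeholder goal
    },
    'fake_model_setup': {
        'error_range_around_goal': 0.1,
        'bias_range': 0.05,
        'error_std': 0.01,
        'error_at_goal': 0.0,
    },
    'smooth_error': True,
    'num_models': 5,
    # ...plus the flat keys the launcher also reads (env, hidden_sizes_policy,
    # fast_lr, meta_batch_size, n_itr, discount, meta_step_size, entropy_bonus, ...).
}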