Example No. 1
def run_train_task(vv):

    env = TfEnv(normalize(vv['env'](log_scale_limit=vv['log_scale_limit'])))

    dynamics_model = MLPDynamicsEnsemble(
        name="dyn_model",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_model'],
        weight_normalization=vv['weight_normalization_model'],
        num_models=vv['num_models'],
        valid_split_ratio=vv['valid_split_ratio'],
        rolling_average_persitency=vv['rolling_average_persitency'])  # (sic) kwarg spelling follows the library

    policy = MPCController(
        name="policy",
        env=env,
        dynamics_model=dynamics_model,
        discount=vv['discount'],
        n_candidates=vv['n_candidates'],
        horizon=vv['horizon'],
    )

    algo = ModelMPCBatchPolopt(
        env=env,
        policy=policy,
        dynamics_model=dynamics_model,
        batch_size_env_samples=vv['batch_size_env_samples'],
        initial_random_samples=vv['initial_random_samples'],
        dynamic_model_max_epochs=vv['dynamic_model_epochs'],
        max_path_length=vv['path_length'],
        n_itr=vv['n_itr'],
        discount=vv['discount'],
        step_size=vv["step_size"],
        reinit_model_cycle=vv['reinit_model_cycle'])
    algo.train()
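Each of these runners receives a single variant dictionary vv that carries every hyperparameter. A minimal sketch of such a dict for Example No. 1 follows; the values are illustrative rather than the authors' settings, and the env entry must be an environment class, not an instance (HalfCheetahEnvRandParams is an assumed class name, not confirmed by this listing):

def make_example_vv():
    # Illustrative variant dict; every key matches what run_train_task reads.
    return dict(
        env=HalfCheetahEnvRandParams,       # assumed env class accepting log_scale_limit
        log_scale_limit=1.0,
        hidden_sizes_model=(512, 512),
        weight_normalization_model=True,
        num_models=5,
        valid_split_ratio=0.2,
        rolling_average_persitency=0.95,    # (sic) key spelling mirrors the kwarg above
        discount=0.99,
        n_candidates=1000,
        horizon=10,
        batch_size_env_samples=5000,
        initial_random_samples=True,
        dynamic_model_epochs=(500, 500),
        path_length=100,
        n_itr=50,
        step_size=0.01,
        reinit_model_cycle=0,
    )

run_train_task(make_example_vv())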
Example No. 2
def run_train_task(vv):
    env = TfEnv(normalize(vv['env']()))

    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes'],
        grad_step_size=vv['fast_lr'],
        hidden_nonlinearity=vv['hidden_nonlinearity'],
        trainable_step_size=vv['trainable_step_size'],
        bias_transform=vv['bias_transform'])

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = MAMLTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=vv['fast_batch_size'],  # number of trajs for grad update
        max_path_length=vv['path_length'],
        meta_batch_size=vv['meta_batch_size'],
        num_grad_updates=vv['num_grad_updates'],
        n_itr=vv['n_itr'],
        discount=vv['discount'],
        step_size=vv["meta_step_size"],
        parallel_sampler=vv['parallel_sampler'],
    )
    algo.train()
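In rllab-based projects, task functions like this are usually not called directly but handed to the experiment launcher together with a variant generator. A sketch under that assumption (rllab's VariantGenerator and run_experiment_lite; the prefix and sweep values are illustrative):

from rllab.misc.instrument import VariantGenerator, run_experiment_lite

vg = VariantGenerator()
vg.add('fast_lr', [0.01, 0.1])       # inner-loop step sizes to sweep
vg.add('meta_batch_size', [40])
# ... one vg.add(key, values) per key that run_train_task reads from vv

for variant in vg.variants():
    run_experiment_lite(
        run_train_task,              # the function defined above
        exp_prefix='maml-trpo',      # illustrative
        variant=variant,
    )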
Example No. 3
def run_train_task(vv):
    env = TfEnv(normalize(vv['env']()))

    policy = vv['policy'](name="policy",
                          env_spec=env.spec,
                          num_tasks=vv['meta_batch_size'],
                          hidden_sizes=vv['hidden_sizes'],
                          grad_step_size=vv['fast_lr'],
                          hidden_nonlinearity=vv['hidden_nonlinearity'],
                          trainable_step_size=vv['trainable_step_size'],
                          bias_transform=vv['bias_transform'])

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    optimizer_args = dict(max_epochs=vv['max_epochs'])

    algo = vv['algo'](
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=vv['fast_batch_size'],  # number of trajs for grad update
        max_path_length=vv['path_length'],
        meta_batch_size=vv['meta_batch_size'],
        num_grad_updates=vv['num_grad_updates'],
        n_itr=vv['n_itr'],
        discount=vv['discount'],
        entropy_bonus=vv['entropy_bonus'],
        clip_eps=vv['clip_eps'],
        target_inner_step=vv['target_inner_step'],
        init_kl_penalty=vv['init_kl_penalty'],
        optimizer_args=optimizer_args,
    )
    algo.train()
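Unlike Example No. 2, this runner also pulls the policy and the algorithm out of the variant as classes, so one launcher script can sweep over methods as well as hyperparameters. The corresponding variant entries would look like this (class names taken from the neighboring examples):

vv = dict(
    policy=MAMLGaussianMLPPolicy,   # a class; instantiated inside run_train_task
    algo=MAMLPPO,                   # e.g. MAMLPPO or MAMLTRPO
    # ... plus the hyperparameter keys read above (fast_lr, meta_batch_size, ...)
)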
Example No. 4
def run_train_task(vv):

    env = TfEnv(normalize(vv['env'](
        log_scale_limit=vv['log_scale_limit'],
        target_velocity=vv['target_velocity'],
    )))

    dynamics_model = MLPDynamicsEnsemble(
        name="dyn_model",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_model'],
        weight_normalization=vv['weight_normalization_model'],
        num_models=vv['num_models'],
        optimizer=vv['optimizer_model'],
        valid_split_ratio=vv['valid_split_ratio'],
        rolling_average_persitency=vv['rolling_average_persitency']
    )

    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_policy'],
        hidden_nonlinearity=vv['hidden_nonlinearity_policy'],
        grad_step_size=vv['fast_lr'],
        trainable_step_size=vv['trainable_step_size'],
        bias_transform=vv['bias_transform'],
        num_tasks=vv['meta_batch_size']
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = ModelMAMLTRPO(
        env=env,
        policy=policy,
        dynamics_model=dynamics_model,
        baseline=baseline,
        n_itr=vv['n_itr'],
        n_iter=vv['n_itr'],  # both spellings are passed through, as in the source
        batch_size_env_samples=vv['batch_size_env_samples'],
        batch_size_dynamics_samples=vv['batch_size_dynamics_samples'],
        meta_batch_size=vv['meta_batch_size'],
        initial_random_samples=vv['initial_random_samples'],
        num_maml_steps_per_iter=vv['num_maml_steps_per_iter'],
        reset_from_env_traj=vv.get('reset_from_env_traj', False),
        max_path_length_env=vv['path_length_env'],
        max_path_length_dyn=vv.get('path_length_dyn', None),
        dynamic_model_max_epochs=vv.get('dynamic_model_max_epochs', (500, 500)),
        discount=vv['discount'],
        step_size=vv["meta_step_size"],
        num_grad_updates=1,
        retrain_model_when_reward_decreases=vv['retrain_model_when_reward_decreases'],
        reset_policy_std=vv['reset_policy_std'],
        reinit_model_cycle=vv['reinit_model_cycle'],
        frac_gpu=vv.get('frac_gpu', 0.85),
        log_real_performance=True,
        clip_obs=vv.get('clip_obs', True),
        entropy_bonus=vv['entropy_bonus'],
        tailored_exploration=vv['tailored_exploration']
    )
    algo.train()
Example No. 5
def run_eval_task(vv):

    # load policy and baseline - warning: this resets the tf graph;
    # it also returns the tensorflow session, which must be used by the subsequent code
    baseline, env, params_pickle_file = eval.load_baseline_and_env(vv)

    tf.reset_default_graph()

    # fix the mujoco parameters
    env_class = eval.get_env_class(env)
    env = TfEnv(
        normalize(
            env_class(log_scale_limit=vv["log_scale_limit"],
                      fix_params=True,
                      random_seed=vv['env_param_seed'])))

    step_size = vv['fast_lr']

    policy = None  # the policy itself is loaded from the pickle file via load_policy below

    algo = VPG(env=env,
               policy=policy,
               load_policy=params_pickle_file,
               baseline=baseline,
               batch_size=20000,
               max_path_length=100,
               n_itr=5,
               optimizer_args={
                   'init_learning_rate': step_size,
                   'tf_optimizer_args': {
                       'learning_rate': step_size
                   },
                   'tf_optimizer_cls': tf.train.GradientDescentOptimizer
               })
    algo.train()
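Note that the finetuning uses plain gradient descent (tf.train.GradientDescentOptimizer) at the inner-loop rate, step_size = vv['fast_lr'], so evaluation mirrors the MAML adaptation step. To evaluate across several fixed environment parameterizations, one could loop over seeds; a sketch (the loop is not part of the source):

for seed in range(5):                       # illustrative number of tasks
    vv_eval = dict(vv, env_param_seed=seed)
    run_eval_task(vv_eval)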
Example No. 6
def run_train_task(vv):
    import sys
    print(vv['exp_prefix'])
    sysout_log_path = os.path.join(config.LOG_DIR, 'local', vv['exp_prefix'],
                                   vv['exp_name'], 'stdout.log')
    sysout_log_file = open(sysout_log_path, 'w')
    sys.stdout = sysout_log_file

    env = TfEnv(normalize(vv['env'](log_scale_limit=vv['log_scale_limit'])))

    dynamics_model = MLPDynamicsEnsemble(
        name="dyn_model",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_model'],
        weight_normalization=vv['weight_normalization_model'],
        num_models=vv['num_models'],
        optimizer=vv['optimizer_model'],
        valid_split_ratio=vv['valid_split_ratio'],
        rolling_average_persitency=vv['rolling_average_persitency'])

    policy = MAMLImprovedGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_policy'],
        hidden_nonlinearity=vv['hidden_nonlinearity_policy'],
        grad_step_size=vv['fast_lr'],
        trainable_step_size=vv['trainable_step_size'],
        bias_transform=vv['bias_transform'],
        param_noise_std=vv['param_noise_std'])

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = ModelMAMLTRPO(
        env=env,
        policy=policy,
        dynamics_model=dynamics_model,
        baseline=baseline,
        n_itr=vv['n_itr'],
        n_iter=vv['n_itr'],
        batch_size_env_samples=vv['batch_size_env_samples'],
        batch_size_dynamics_samples=vv['batch_size_dynamics_samples'],
        meta_batch_size=vv['meta_batch_size'],
        initial_random_samples=vv['initial_random_samples'],
        num_maml_steps_per_iter=vv['num_maml_steps_per_iter'],
        reset_from_env_traj=vv.get('reset_from_env_traj', False),
        max_path_length_env=vv['path_length_env'],
        max_path_length_dyn=vv.get('path_length_dyn', None),
        discount=vv['discount'],
        step_size=vv["meta_step_size"],
        num_grad_updates=1,
        retrain_model_when_reward_decreases=vv['retrain_model_when_reward_decreases'],
        reset_policy_std=vv['reset_policy_std'],
        reinit_model_cycle=vv['reinit_model_cycle'],
        frac_gpu=vv.get('frac_gpu', 0.85),
        clip_obs=vv.get('clip_obs', True))
    algo.train()

    sysout_log_file.close()
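This runner redirects sys.stdout into a log file but never restores it, and if algo.train() raises, the file is never closed. A safer equivalent sketch using the standard library (assuming the training body is factored into a hypothetical run_train_task_body):

import contextlib
import os

def run_train_task_logged(vv):
    sysout_log_path = os.path.join(config.LOG_DIR, 'local', vv['exp_prefix'],
                                   vv['exp_name'], 'stdout.log')
    # the context managers restore stdout and close the file even on error
    with open(sysout_log_path, 'w') as f, contextlib.redirect_stdout(f):
        run_train_task_body(vv)   # hypothetical: Example No. 6 minus the redirection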
Example No. 7
def run_train_task(vv):

    env = TfEnv(normalize(vv['env'](log_scale_limit=vv['log_scale_limit'])))

    dynamics_model = BadDynamicsEnsemble(
        name="dyn_model",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_model'],
        weight_normalization=vv['weight_normalization_model'],
        num_models=vv['num_models'],
        optimizer=vv['optimizer_model'],
        output_bias_range=vv['output_bias_range'],
        gaussian_noise_output_std=vv['output_noise_std'],
    )

    policy = MAMLImprovedGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_policy'],
        hidden_nonlinearity=vv['hidden_nonlinearity_policy'],
        grad_step_size=vv['fast_lr'],
        trainable_step_size=vv['trainable_step_size'],
        bias_transform=vv['bias_transform'],
        param_noise_std=vv['param_noise_std'])

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = ModelMAMLTRPO(
        env=env,
        policy=policy,
        dynamics_model=dynamics_model,
        baseline=baseline,
        n_itr=vv['n_itr'],
        n_iter=vv['n_itr'],
        batch_size_env_samples=vv['batch_size_env_samples'],
        batch_size_dynamics_samples=vv['batch_size_dynamics_samples'],
        meta_batch_size=vv['meta_batch_size'],
        initial_random_samples=vv['initial_random_samples'],
        dynamic_model_epochs=vv['dynamic_model_epochs'],
        num_maml_steps_per_iter=vv['num_maml_steps_per_iter'],
        reset_from_env_traj=vv['reset_from_env_traj'],
        max_path_length_env=vv['path_length_env'],
        discount=vv['discount'],
        step_size=vv["meta_step_size"],
        num_grad_updates=1,
        retrain_model_when_reward_decreases=vv['retrain_model_when_reward_decreases'],
        reset_policy_std=vv['reset_policy_std'],
        reinit_model_cycle=vv['reinit_model_cycle'],
        frac_gpu=vv.get('frac_gpu', 0.85),
        log_real_performance=True,
        resample_output_bias=vv['resample_output_bias'])
    algo.train()
Example No. 8
def run_train_task(vv):

    env = TfEnv(
        normalize(vv['env_class'](
            fix_goal=vv['fix_goal'],
            reward_type=vv['reward_type'],
            init_puck_low=INIT_PUCK_TARGET - vv['init_slack'],
            init_puck_high=INIT_PUCK_TARGET + vv['init_slack'],
            puck_goal_low=PUCK_GOAL_TARGET - vv['goal_slack'],
            puck_goal_high=PUCK_GOAL_TARGET + vv['goal_slack'],
        )))

    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        num_tasks=vv['meta_batch_size'],
        hidden_sizes=vv['hidden_sizes'],
        grad_step_size=vv['fast_lr'],
        hidden_nonlinearity=vv['hidden_nonlinearity'],
        trainable_step_size=vv['trainable_step_size'],
        bias_transform=vv['bias_transform'])

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    optimizer_args = dict(
        max_epochs=vv['max_epochs'],
        batch_size=vv['num_batches'],
        tf_optimizer_args=dict(learning_rate=vv['outer_lr']),
    )

    algo = MAMLPPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=vv['fast_batch_size'],  # number of trajs for grad update
        max_path_length=vv['path_length'],
        meta_batch_size=vv['meta_batch_size'],
        num_grad_updates=vv['num_grad_updates'],
        n_itr=vv['n_itr'],
        discount=vv['discount'],
        entropy_bonus=vv['entropy_bonus'],
        clip_eps=vv['clip_eps'],
        clip_outer=vv['clip_outer'],
        target_outer_step=vv['target_outer_step'],
        target_inner_step=vv['target_inner_step'],
        init_outer_kl_penalty=vv['init_outer_kl_penalty'],
        init_inner_kl_penalty=vv['init_inner_kl_penalty'],
        adaptive_outer_kl_penalty=vv['adaptive_outer_kl_penalty'],
        adaptive_inner_kl_penalty=vv['adaptive_inner_kl_penalty'],
        parallel_sampler=vv['parallel_sampler'],
        optimizer_args=optimizer_args,
    )
    algo.train()
Example No. 9
def run_train_task(vv):
    import sys
    print(vv['exp_prefix'])
    sysout_log_path = os.path.join(config.LOG_DIR, 'local', vv['exp_prefix'],
                                   vv['exp_name'], 'stdout.log')
    sysout_log_file = open(sysout_log_path, 'w')
    sys.stdout = sysout_log_file

    env = TfEnv(normalize(vv['env'](log_scale_limit=vv['log_scale_limit'])))

    dynamics_model = MLPDynamicsEnsemble(
        name="dyn_model",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_model'],
        weight_normalization=vv['weight_normalization_model'],
        num_models=vv['num_models'],
        valid_split_ratio=vv['valid_split_ratio'],
        rolling_average_persitency=vv['rolling_average_persitency'])

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_policy'],
        hidden_nonlinearity=vv['hidden_nonlinearity_policy'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = ModelTRPO(
        env=env,
        policy=policy,
        dynamics_model=dynamics_model,
        baseline=baseline,
        batch_size_env_samples=vv['batch_size_env_samples'],
        batch_size_dynamics_samples=vv['batch_size_dynamics_samples'],
        initial_random_samples=vv['initial_random_samples'],
        num_gradient_steps_per_iter=vv['num_gradient_steps_per_iter'],
        max_path_length=vv['path_length'],
        n_itr=vv['n_itr'],
        retrain_model_when_reward_decreases=vv['retrain_model_when_reward_decreases'],
        discount=vv['discount'],
        step_size=vv["step_size"],
        reset_policy_std=vv['reset_policy_std'],
        reinit_model_cycle=vv['reinit_model_cycle'])
    algo.train()

    sysout_log_file.close()
Example No. 10
def run_train_task(vv):

    env = TfEnv(normalize(vv['env'](log_scale_limit=vv['log_scale_limit'])))

    dynamics_model = BadDynamicsEnsemble(
        name="dyn_model",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_model'],
        weight_normalization=vv['weight_normalization_model'],
        num_models=vv['num_models'],
        output_bias_range=vv['output_bias_range'],
        gaussian_noise_output_std=vv['output_noise_std'],
    )

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_policy'],
        hidden_nonlinearity=vv['hidden_nonlinearity_policy'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = ModelTRPO(
        env=env,
        policy=policy,
        dynamics_model=dynamics_model,
        baseline=baseline,
        batch_size_env_samples=vv['batch_size_env_samples'],
        batch_size_dynamics_samples=vv['batch_size_dynamics_samples'],
        initial_random_samples=vv['initial_random_samples'],
        dynamic_model_epochs=vv['dynamic_model_epochs'],
        num_gradient_steps_per_iter=vv['num_gradient_steps_per_iter'],
        retrain_model_when_reward_decreases=vv['retrain_model_when_reward_decreases'],
        max_path_length=vv['path_length'],
        n_itr=vv['n_itr'],
        discount=vv['discount'],
        step_size=vv["step_size"],
        reset_policy_std=vv['reset_policy_std'],
        reinit_model_cycle=vv['reinit_model_cycle'],
        resample_output_bias=vv['resample_output_bias'])
    algo.train()
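Examples No. 7 and No. 10 deliberately replace the learned MLPDynamicsEnsemble with a BadDynamicsEnsemble whose predictions carry a sampled output bias and added Gaussian noise, which lets one study how policy optimization degrades as model error grows. The corruption is controlled by three variant keys (values illustrative; the comments read off the parameter names, not confirmed semantics):

vv.update(
    output_bias_range=(0.0, 0.5),   # presumably the range the output bias is sampled from
    output_noise_std=0.1,           # std of Gaussian noise added to model outputs
    resample_output_bias=True,      # presumably redraws the bias periodically
)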
Example No. 11
def run_train_task(vv):
    env = TfEnv(normalize(vv['env']()))

    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        num_tasks=vv['meta_batch_size'],
        hidden_sizes=vv['hidden_sizes'],
        grad_step_size=vv['fast_lr'],
        hidden_nonlinearity=vv['hidden_nonlinearity'],
        trainable_step_size=vv['trainable_step_size'],
        bias_transform=vv['bias_transform'])

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = MAMLPPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=vv['fast_batch_size'],  # number of trajs for grad update
        max_path_length=vv['path_length'],
        meta_batch_size=vv['meta_batch_size'],
        num_grad_updates=vv['num_grad_updates'],
        n_itr=vv['n_itr'],
        discount=vv['discount'],
        entropy_bonus=vv['entropy_bonus'],
        clip_eps=vv['clip_eps'],
        clip_outer=vv['clip_outer'],
        target_outer_step=vv['target_outer_step'],
        target_inner_step=vv['target_inner_step'],
        init_outer_kl_penalty=vv['init_outer_kl_penalty'],
        init_inner_kl_penalty=vv['init_inner_kl_penalty'],
        adaptive_outer_kl_penalty=vv['adaptive_outer_kl_penalty'],
        adaptive_inner_kl_penalty=vv['adaptive_inner_kl_penalty'],
        max_epochs=vv['max_epochs'],
        num_batches=vv['num_batches'],
        tf_optimizer_args=dict(learning_rate=vv['outer_lr']),
        parallel_sampler=vv['parallel_sampler'],
        multi_adam=vv['multi_adam'],
    )
    algo.train()
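Example No. 8 bundled the outer-optimizer settings into optimizer_args, whereas this example passes max_epochs, num_batches, and tf_optimizer_args directly to MAMLPPO. Assuming MAMLPPO forwards these top-level kwargs to its optimizer, the two spellings configure the same thing:

# spelling A (Example No. 8): settings bundled into optimizer_args
#   MAMLPPO(..., optimizer_args=dict(max_epochs=5, batch_size=4,
#                                    tf_optimizer_args=dict(learning_rate=1e-3)))
# spelling B (this example): the same settings as top-level keyword arguments
#   MAMLPPO(..., max_epochs=5, num_batches=4,
#           tf_optimizer_args=dict(learning_rate=1e-3))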
Example No. 12
def run_train_task(vv):

    env = TfEnv(normalize(vv['env'](
        init_sampling_boundaries=vv['point_env_setup']['init_sampling_boundaries'],
        goal=vv['point_env_setup']['goal'])))

    dynamics_model = PointEnvFakeModelEnsemble(
        env_spec=env.spec,
        num_models=vv['num_models'],
        error_range_around_goal=vv['fake_model_setup']['error_range_around_goal'],
        bias_range=vv['fake_model_setup']['bias_range'],
        error_std=vv['fake_model_setup']['error_std'],
        goal=vv['point_env_setup']['goal'],
        error_at_goal=vv['fake_model_setup']['error_at_goal'],
        smooth_error=vv['smooth_error'],
    )

    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_policy'],
        hidden_nonlinearity=vv['hidden_nonlinearity_policy'],
        grad_step_size=vv['fast_lr'],
        trainable_step_size=vv['trainable_step_size'],
        bias_transform=vv['bias_transform'],
        num_tasks=vv['meta_batch_size'])

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = ModelMAMLTRPO(
        env=env,
        policy=policy,
        dynamics_model=dynamics_model,
        baseline=baseline,
        n_itr=vv['n_itr'],
        n_iter=vv['n_itr'],
        batch_size_env_samples=vv['batch_size_env_samples'],
        batch_size_dynamics_samples=vv['batch_size_dynamics_samples'],
        meta_batch_size=vv['meta_batch_size'],
        initial_random_samples=vv['initial_random_samples'],
        num_maml_steps_per_iter=vv['num_maml_steps_per_iter'],
        reset_from_env_traj=vv.get('reset_from_env_traj', False),
        max_path_length_env=vv['path_length_env'],
        max_path_length_dyn=vv.get('path_length_dyn', None),
        dynamic_model_max_epochs=vv.get('dynamic_model_max_epochs', (500, 500)),
        discount=vv['discount'],
        step_size=vv["meta_step_size"],
        num_grad_updates=1,
        retrain_model_when_reward_decreases=vv['retrain_model_when_reward_decreases'],
        reset_policy_std=vv['reset_policy_std'],
        reinit_model_cycle=vv['reinit_model_cycle'],
        frac_gpu=vv.get('frac_gpu', 0.85),
        log_real_performance=True,
        clip_obs=vv.get('clip_obs', True),
        entropy_bonus=vv['entropy_bonus'],
    )
    algo.train()
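This last runner reads two nested sub-dicts from the variant. A sketch of their shape, with keys taken from the code above and illustrative values:

vv_point = dict(
    point_env_setup=dict(
        init_sampling_boundaries=(-2.0, 2.0),   # illustrative
        goal=(1.0, 1.0),
    ),
    fake_model_setup=dict(
        error_range_around_goal=0.5,
        bias_range=0.1,
        error_std=0.05,
        error_at_goal=0.0,
    ),
    smooth_error=True,
    # ... plus the remaining flat keys read by run_train_task
)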