Example no. 1
def auto_benchmark_ddpg_garage_tf():
    """Create garage TensorFlow DDPG model and training.
    Training over different environments and seeds.
    """
    @wrap_experiment
    def ddpg_garage_tf(ctxt, env_id, seed):
        """Create garage TensorFlow DDPG model and training.
        Args:
            ctxt (garage.experiment.ExperimentContext): The experiment
                configuration used by LocalRunner to create the
                snapshotter.
            env_id (str): Environment id of the task.
            seed (int): Random positive integer for the trial.
        """
        deterministic.set_seed(seed)

        with LocalTFRunner(ctxt) as runner:
            env = TfEnv(normalize(gym.make(env_id)))

            action_noise = OUStrategy(env.spec,
                                      sigma=hyper_parameters['sigma'])

            policy = ContinuousMLPPolicy(
                env_spec=env.spec,
                hidden_sizes=hyper_parameters['policy_hidden_sizes'],
                hidden_nonlinearity=tf.nn.relu,
                output_nonlinearity=tf.nn.tanh)

            qf = ContinuousMLPQFunction(
                env_spec=env.spec,
                hidden_sizes=hyper_parameters['qf_hidden_sizes'],
                hidden_nonlinearity=tf.nn.relu)

            replay_buffer = SimpleReplayBuffer(
                env_spec=env.spec,
                size_in_transitions=hyper_parameters['replay_buffer_size'],
                time_horizon=hyper_parameters['n_rollout_steps'])

            algo = DDPG(env_spec=env.spec,
                        policy=policy,
                        qf=qf,
                        replay_buffer=replay_buffer,
                        steps_per_epoch=hyper_parameters['steps_per_epoch'],
                        policy_lr=hyper_parameters['policy_lr'],
                        qf_lr=hyper_parameters['qf_lr'],
                        target_update_tau=hyper_parameters['tau'],
                        n_train_steps=hyper_parameters['n_train_steps'],
                        discount=hyper_parameters['discount'],
                        min_buffer_size=int(1e4),
                        exploration_strategy=action_noise,
                        policy_optimizer=tf.compat.v1.train.AdamOptimizer,
                        qf_optimizer=tf.compat.v1.train.AdamOptimizer)

            runner.setup(algo, env)
            runner.train(n_epochs=hyper_parameters['n_epochs'],
                         batch_size=hyper_parameters['n_rollout_steps'])

    for env_id, seed, log_dir in benchmark_helper.iterate_experiments(
            ddpg_garage_tf.__name__, tasks, seeds):
        ddpg_garage_tf(dict(log_dir=log_dir), env_id=env_id, seed=seed)
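
The DDPG benchmark above relies on a module-level hyper_parameters dict and on tasks and seeds sequences defined elsewhere in the benchmark script. As a rough sketch only: the key names below mirror the lookups in the code, while the concrete values are illustrative placeholders, not the benchmark's actual settings.

# Hypothetical module-level configuration assumed by the DDPG benchmark above.
# Only the key names are taken from the code; the values are placeholders.
hyper_parameters = {
    'policy_lr': 1e-4,
    'qf_lr': 1e-3,
    'policy_hidden_sizes': [64, 64],
    'qf_hidden_sizes': [64, 64],
    'n_epochs': 300,
    'steps_per_epoch': 20,
    'n_rollout_steps': 100,
    'n_train_steps': 50,
    'discount': 0.9,
    'tau': 1e-2,
    'replay_buffer_size': int(1e6),
    'sigma': 0.2,
}

tasks = ['HalfCheetah-v2']  # environment ids to benchmark
seeds = [1, 10, 100]        # one trial is run per (task, seed) pair
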
Example no. 2
def auto_benchmark_ppo_garage_pytorch():
    """Create garage PyTorch PPO model and training.

    Training over different environments and seeds.

    """
    @wrap_experiment
    def ppo_garage_pytorch(ctxt, env_id, seed):
        """Create garage PyTorch PPO model and training.

        Args:
            ctxt (garage.experiment.ExperimentContext): The experiment
                configuration used by LocalRunner to create the
                snapshotter.
            env_id (str): Environment id of the task.
            seed (int): Random positive integer for the trial.

        """
        deterministic.set_seed(seed)

        runner = LocalRunner(ctxt)
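        # No TF session is required for the PyTorch algorithms, so a plain
        # LocalRunner is used here rather than the LocalTFRunner context
        # manager seen in the TensorFlow examples.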

        env = TfEnv(normalize(gym.make(env_id)))

        policy = PyTorch_GMP(env.spec,
                             hidden_sizes=(32, 32),
                             hidden_nonlinearity=torch.tanh,
                             output_nonlinearity=None)

        value_function = LinearFeatureBaseline(env_spec=env.spec)

        algo = PyTorch_PPO(env_spec=env.spec,
                           policy=policy,
                           value_function=value_function,
                           optimizer=torch.optim.Adam,
                           policy_lr=3e-4,
                           max_path_length=hyper_parameters['max_path_length'],
                           discount=0.99,
                           gae_lambda=0.95,
                           center_adv=True,
                           lr_clip_range=0.2,
                           minibatch_size=128,
                           max_optimization_epochs=10)

        runner.setup(algo, env)
        runner.train(n_epochs=hyper_parameters['n_epochs'],
                     batch_size=hyper_parameters['batch_size'])

    for env_id, seed, log_dir in benchmark_helper.iterate_experiments(
            ppo_garage_pytorch,
            tasks,
            seeds,
            use_tf=False,
            xcolumn='TotalEnvSteps',
            xlabel='Total Environment Steps',
            ycolumn='Evaluation/AverageReturn',
            ylabel='Average Return'):
        ppo_garage_pytorch(dict(log_dir=log_dir), env_id=env_id, seed=seed)
Example no. 3
def auto_benchmark_trpo_baselines():
    """Create TRPO baselines model and training.

    Training over different environments and seeds.

    """
    def trpo_baselines(log_dir, env_id, seed):
        """Create Baseline model and training.

        Args:
            log_dir (str): Experiment log directory.
            env_id (str): Environment id of the task.
            seed (int): Random positive integer for the trial.

        """
        # Set up TF Session
        ncpu = max(multiprocessing.cpu_count() // 2, 1)
        config = tf.compat.v1.ConfigProto(allow_soft_placement=True,
                                          intra_op_parallelism_threads=ncpu,
                                          inter_op_parallelism_threads=ncpu)
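        # Entering the session by hand (instead of a with-block) leaves it
        # installed as the default TF session for the baselines calls below.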
        tf.compat.v1.Session(config=config).__enter__()

        # Set up logger for baselines
        configure(dir=log_dir,
                  format_strs=['stdout', 'log', 'csv', 'tensorboard'])
        baselines_logger.info('rank {}: seed={}, logdir={}'.format(
            0, seed, baselines_logger.get_dir()))

        set_global_seeds(seed)

        env = AutoStopEnv(env_name=env_id, max_path_length=100)

        trpo_mpi.learn(network='mlp',
                       env=env,
                       total_timesteps=hyper_parameters['batch_size'] *
                       hyper_parameters['n_epochs'],
                       timesteps_per_batch=hyper_parameters['batch_size'],
                       gamma=hyper_parameters['discount'],
                       lam=hyper_parameters['gae_lambda'],
                       max_kl=hyper_parameters['max_kl'],
                       cg_iters=10,
                       cg_damping=0.1,
                       vf_iters=5,
                       vf_stepsize=1e-3)

    for env_id, seed, log_dir in benchmark_helper.iterate_experiments(
            trpo_baselines,
            tasks,
            seeds,
            use_tf=True,
            xcolumn='TimestepsSoFar',
            xlabel='Total Environment Steps',
            ycolumn='EpRewMean',
            ylabel='Average Return'):
        trpo_baselines(log_dir=log_dir, env_id=env_id, seed=seed)
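
The TRPO benchmarks (this baselines variant and the garage TF/PyTorch variants that follow) read their settings from a similar module-level hyper_parameters dict. Again as an illustrative sketch, with only the key names taken from the code and placeholder values:

# Hypothetical shared configuration for the TRPO benchmarks; values are placeholders.
hyper_parameters = {
    'hidden_sizes': [32, 32],
    'max_kl': 0.01,
    'gae_lambda': 0.97,
    'discount': 0.99,
    'max_path_length': 100,
    'n_epochs': 999,
    'batch_size': 1024,
}
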
Example no. 4
def auto_benchmark_trpo_garage_tf():
    """Create garage TensorFlow TRPO model and training.

    Training over different environments and seeds.

    """
    @wrap_experiment
    def trpo_garage_tf(ctxt, env_id, seed):
        """Create garage Tensorflow TROI model and training.

        Args:
            ctxt (garage.experiment.ExperimentContext): The experiment
                configuration used by LocalRunner to create the
                snapshotter.
            env_id (str): Environment id of the task.
            seed (int): Random positive integer for the trial.

        """
        deterministic.set_seed(seed)

        with LocalTFRunner(ctxt) as runner:
            env = TfEnv(normalize(gym.make(env_id)))

            policy = GaussianMLPPolicy(
                env_spec=env.spec,
                hidden_sizes=hyper_parameters['hidden_sizes'],
                hidden_nonlinearity=tf.nn.tanh,
                output_nonlinearity=None,
            )

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            algo = TRPO(env_spec=env.spec,
                        policy=policy,
                        baseline=baseline,
                        max_path_length=hyper_parameters['max_path_length'],
                        discount=hyper_parameters['discount'],
                        gae_lambda=hyper_parameters['gae_lambda'],
                        max_kl_step=hyper_parameters['max_kl'])

            runner.setup(algo, env)
            runner.train(n_epochs=hyper_parameters['n_epochs'],
                         batch_size=hyper_parameters['batch_size'])

    for env_id, seed, log_dir in benchmark_helper.iterate_experiments(
            trpo_garage_tf,
            tasks,
            seeds,
            use_tf=True,
            xcolumn='TotalEnvSteps',
            xlabel='Total Environment Steps',
            ycolumn='Evaluation/AverageReturn',
            ylabel='Average Return'):
        trpo_garage_tf(dict(log_dir=log_dir), env_id=env_id, seed=seed)
Example no. 5
def auto_benchmark_trpo_garage_pytorch():
    """Create garage PyTorch TRPO model and training.

    Training over different environments and seeds.

    """

    @wrap_experiment
    def trpo_garage_pytorch(ctxt, env_id, seed):
        """Create garage PyTorch TRPO model and training.

        Args:
            ctxt (garage.experiment.ExperimentContext): The experiment
                configuration used by LocalRunner to create the
                snapshotter.
            env_id (str): Environment id of the task.
            seed (int): Random positive integer for the trial.

        """
        deterministic.set_seed(seed)

        runner = LocalRunner(ctxt)

        env = TfEnv(normalize(gym.make(env_id)))

        policy = PyTorch_GMP(env.spec,
                             hidden_sizes=hyper_parameters['hidden_sizes'],
                             hidden_nonlinearity=torch.tanh,
                             output_nonlinearity=None)

        value_function = GaussianMLPValueFunction(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=torch.tanh,
            output_nonlinearity=None)

        algo = PyTorch_TRPO(
            env_spec=env.spec,
            policy=policy,
            value_function=value_function,
            max_path_length=hyper_parameters['max_path_length'],
            discount=hyper_parameters['discount'],
            gae_lambda=hyper_parameters['gae_lambda'])

        runner.setup(algo, env)
        runner.train(n_epochs=hyper_parameters['n_epochs'],
                     batch_size=hyper_parameters['batch_size'])

    for env_id, seed, log_dir in benchmark_helper.iterate_experiments(
            trpo_garage_pytorch.__name__, tasks, seeds):
        trpo_garage_pytorch(dict(log_dir=log_dir), env_id=env_id, seed=seed)
Example no. 6
def auto_benchmark_vpg_garage_tf():
    """Create garage TensorFlow VPG model and training.

    Training over different environments and seeds.

    """

    @wrap_experiment
    def vpg_garage_tf(ctxt, env_id, seed):
        """Create garage TensorFlow VPG model and training.

        Args:
            ctxt (garage.experiment.ExperimentContext): The experiment
                configuration used by LocalRunner to create the
                snapshotter.
            env_id (str): Environment id of the task.
            seed (int): Random positive integer for the trial.

        """
        deterministic.set_seed(seed)

        with LocalTFRunner(ctxt) as runner:
            env = TfEnv(normalize(gym.make(env_id)))

            policy = TF_GMP(
                env_spec=env.spec,
                hidden_sizes=hyper_parameters['hidden_sizes'],
                hidden_nonlinearity=tf.nn.tanh,
                output_nonlinearity=None,
            )

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            algo = TF_VPG(env_spec=env.spec,
                          policy=policy,
                          baseline=baseline,
                          max_path_length=hyper_parameters['max_path_length'],
                          discount=hyper_parameters['discount'],
                          center_adv=hyper_parameters['center_adv'],
                          optimizer_args=dict(tf_optimizer_args=dict(
                              learning_rate=hyper_parameters['learning_rate']),
                                              verbose=True))

            runner.setup(algo, env)
            runner.train(n_epochs=hyper_parameters['n_epochs'],
                         batch_size=hyper_parameters['batch_size'])

    for env_id, seed, log_dir in benchmark_helper.iterate_experiments(
            vpg_garage_tf, tasks, seeds):
        vpg_garage_tf(dict(log_dir=log_dir), env_id=env_id, seed=seed)
Example no. 7
def auto_benchmark_vpg_garage_pytorch():
    """Create garage PyTorch VPG model and training.

    Training over different environments and seeds.

    """

    @wrap_experiment
    def vpg_garage_pytorch(ctxt, env_id, seed):
        """Create garage PyTorch VPG model and training.

        Args:
            ctxt (garage.experiment.ExperimentContext): The experiment
                configuration used by LocalRunner to create the
                snapshotter.
            env_id (str): Environment id of the task.
            seed (int): Random positive integer for the trial.

        """
        deterministic.set_seed(seed)

        runner = LocalRunner(ctxt)

        env = TfEnv(normalize(gym.make(env_id)))

        policy = PyTorch_GMP(env.spec,
                             hidden_sizes=hyper_parameters['hidden_sizes'],
                             hidden_nonlinearity=torch.tanh,
                             output_nonlinearity=None)

        value_function = LinearFeatureBaseline(env_spec=env.spec)

        algo = PyTorch_VPG(env_spec=env.spec,
                           policy=policy,
                           optimizer=torch.optim.Adam,
                           policy_lr=hyper_parameters['learning_rate'],
                           value_function=value_function,
                           max_path_length=hyper_parameters['max_path_length'],
                           discount=hyper_parameters['discount'],
                           center_adv=hyper_parameters['center_adv'])

        runner.setup(algo, env)
        runner.train(n_epochs=hyper_parameters['n_epochs'],
                     batch_size=hyper_parameters['batch_size'])

    for env_id, seed, log_dir in benchmark_helper.iterate_experiments(
            vpg_garage_pytorch, tasks, seeds):
        vpg_garage_pytorch(dict(log_dir=log_dir), env_id=env_id, seed=seed)
Example no. 8
def auto_benchmark_ppo_garage_tf():
    """Create garage TensorFlow PPO model and training.

    Training over different environments and seeds.

    """
    @wrap_experiment
    def ppo_garage_tf(ctxt, env_id, seed):
        """Create garage TensorFlow PPO model and training.

        Args:
            ctxt (garage.experiment.ExperimentContext): The experiment
                configuration used by LocalRunner to create the
                snapshotter.
            env_id (str): Environment id of the task.
            seed (int): Random positive integer for the trial.

        """
        deterministic.set_seed(seed)

        with LocalTFRunner(ctxt) as runner:
            env = TfEnv(normalize(gym.make(env_id)))

            policy = TF_GMP(
                env_spec=env.spec,
                hidden_sizes=(32, 32),
                hidden_nonlinearity=tf.nn.tanh,
                output_nonlinearity=None,
            )

            baseline = TF_GMB(
                env_spec=env.spec,
                regressor_args=dict(
                    hidden_sizes=(32, 32),
                    use_trust_region=False,
                    optimizer=FirstOrderOptimizer,
                    optimizer_args=dict(
                        batch_size=32,
                        max_epochs=10,
                        tf_optimizer_args=dict(learning_rate=3e-4),
                    ),
                ),
            )

            algo = TF_PPO(env_spec=env.spec,
                          policy=policy,
                          baseline=baseline,
                          max_path_length=hyper_parameters['max_path_length'],
                          discount=0.99,
                          gae_lambda=0.95,
                          center_adv=True,
                          lr_clip_range=0.2,
                          optimizer_args=dict(
                              batch_size=32,
                              max_epochs=10,
                              tf_optimizer_args=dict(learning_rate=3e-4),
                              verbose=True))

            runner.setup(algo, env)
            runner.train(n_epochs=hyper_parameters['n_epochs'],
                         batch_size=hyper_parameters['batch_size'])

    for env_id, seed, log_dir in benchmark_helper.iterate_experiments(
            ppo_garage_tf.__name__, tasks, seeds):
        ppo_garage_tf(dict(log_dir=log_dir), env_id=env_id, seed=seed)
Example no. 9
def auto_benchmark_ppo_baselines():
    """Create PPO baselines model and training.

    Training over different environments and seeds.

    """
    def ppo_baselines(log_dir, env_id, seed):
        """Create baselines model and training.

        Args:
            log_dir (str): Experiment log directory.
            env_id (str): Environment id of the task.
            seed (int): Random positive integer for the trial.

        """
        # Set up TF Session
        ncpu = max(multiprocessing.cpu_count() // 2, 1)
        config = tf.compat.v1.ConfigProto(allow_soft_placement=True,
                                          intra_op_parallelism_threads=ncpu,
                                          inter_op_parallelism_threads=ncpu)
        tf.compat.v1.Session(config=config).__enter__()

        # Set up baselines logger
        configure(dir=log_dir,
                  format_strs=['stdout', 'log', 'csv', 'tensorboard'])
        baselines_logger.info('rank {}: seed={}, logdir={}'.format(
            0, seed, baselines_logger.get_dir()))

        set_global_seeds(seed)

        env = DummyVecEnv([
            lambda: bench.Monitor(gym.make(env_id),
                                  baselines_logger.get_dir(),
                                  allow_early_resets=True)
        ])

        ppo2.learn(network='mlp',
                   env=env,
                   nsteps=hyper_parameters['batch_size'],
                   nminibatches=32,
                   lam=0.95,
                   gamma=0.99,
                   noptepochs=10,
                   log_interval=1,
                   ent_coef=0.0,
                   max_grad_norm=None,
                   lr=3e-4,
                   cliprange=0.2,
                   total_timesteps=(hyper_parameters['batch_size'] *
                                    hyper_parameters['n_epochs']))

    for env_id, seed, log_dir in benchmark_helper.iterate_experiments(
            ppo_baselines,
            tasks,
            seeds,
            use_tf=True,
            xcolumn='misc/total_timesteps',
            xlabel='Total Environment Steps',
            ycolumn='eprewmean',
            ylabel='Average Return'):
        ppo_baselines(log_dir=log_dir, env_id=env_id, seed=seed)
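
One detail worth noting: total_timesteps is passed as batch_size * n_epochs while nsteps equals batch_size, so with the single DummyVecEnv environment ppo2 collects one batch and performs roughly one PPO update per configured epoch, keeping the sample budget in line with the garage runs. A quick sanity check with made-up values:

# Illustrative values only; batch_size and n_epochs come from hyper_parameters.
batch_size, n_epochs = 2048, 500
total_timesteps = batch_size * n_epochs      # as passed to ppo2.learn above
n_updates = total_timesteps // batch_size    # one env, so nbatch == nsteps == batch_size
assert n_updates == n_epochs
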
Example no. 10
def auto_benchmark_ppo_garage_pytorch():
    """Create garage PyTorch PPO model and training.

    Training over different environments and seeds.

    """
    @wrap_experiment
    def ppo_garage_pytorch(ctxt, env_id, seed):
        """Create garage PyTorch PPO model and training.

        Args:
            ctxt (garage.experiment.ExperimentContext): The experiment
                configuration used by LocalRunner to create the
                snapshotter.
            env_id (str): Environment id of the task.
            seed (int): Random positive integer for the trial.

        """
        deterministic.set_seed(seed)

        runner = LocalRunner(ctxt)

        env = TfEnv(normalize(gym.make(env_id)))

        policy = PyTorch_GMP(env.spec,
                             hidden_sizes=(32, 32),
                             hidden_nonlinearity=torch.tanh,
                             output_nonlinearity=None)

        value_function = GaussianMLPValueFunction(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=torch.tanh,
            output_nonlinearity=None)

        policy_optimizer = OptimizerWrapper(
            (torch.optim.Adam, dict(lr=2.5e-4)),
            policy,
            max_optimization_epochs=10,
            minibatch_size=64)
        vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)),
                                        value_function,
                                        max_optimization_epochs=10,
                                        minibatch_size=64)

        algo = PyTorch_PPO(env_spec=env.spec,
                           policy=policy,
                           value_function=value_function,
                           policy_optimizer=policy_optimizer,
                           vf_optimizer=vf_optimizer,
                           max_path_length=hyper_parameters['max_path_length'],
                           discount=0.99,
                           gae_lambda=0.95,
                           center_adv=True,
                           lr_clip_range=0.2)

        runner.setup(algo, env)
        runner.train(n_epochs=hyper_parameters['n_epochs'],
                     batch_size=hyper_parameters['batch_size'])

    for env_id, seed, log_dir in benchmark_helper.iterate_experiments(
            ppo_garage_pytorch.__name__, tasks, seeds):
        ppo_garage_pytorch(dict(log_dir=log_dir), env_id=env_id, seed=seed)
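
Each auto_benchmark_* function above is self-contained: it loops over the (env_id, seed, log_dir) triples produced by benchmark_helper.iterate_experiments and runs one trial per triple, logging into the corresponding log_dir. A minimal, hypothetical entry point for running a subset of the benchmarks from this file would be:

if __name__ == '__main__':
    # Each call trains every configured (task, seed) combination for that algorithm.
    auto_benchmark_ppo_garage_pytorch()
    auto_benchmark_trpo_garage_tf()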