Example #1
def test_spota_ppo(env, spota_hparam, ex_dir):
    # Environment and domain randomization
    randomizer = get_default_randomizer(env)
    env = DomainRandWrapperBuffer(env, randomizer)

    # Policy and subroutines
    policy = FNNPolicy(env.spec, [16, 16], hidden_nonlin=to.tanh)
    value_fcn = FNN(input_size=env.obs_space.flat_dim,
                    output_size=1,
                    hidden_sizes=[16, 16],
                    hidden_nonlin=to.tanh)
    critic_hparam = dict(gamma=0.998,
                         lamda=0.95,
                         num_epoch=3,
                         batch_size=64,
                         lr=1e-3)
    critic_cand = GAE(value_fcn, **critic_hparam)
    critic_refs = GAE(deepcopy(value_fcn), **critic_hparam)

    subrtn_hparam_cand = dict(
        # min_rollouts=0,  # will be overwritten by SPOTA
        min_steps=0,  # will be overwritten by SPOTA
        max_iter=2,
        num_epoch=3,
        eps_clip=0.1,
        batch_size=64,
        num_sampler_envs=4,
        std_init=0.5,
        lr=1e-2)
    subrtn_hparam_refs = deepcopy(subrtn_hparam_cand)  # the reference solutions use the same settings

    sr_cand = PPO(ex_dir, env, policy, critic_cand, **subrtn_hparam_cand)
    sr_refs = PPO(ex_dir, env, deepcopy(policy), critic_refs,
                  **subrtn_hparam_refs)

    # Create algorithm and train
    algo = SPOTA(ex_dir, env, sr_cand, sr_refs, **spota_hparam)
    algo.train()
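
Note the candidate/reference pattern above: the reference subroutine receives deepcopies of the policy and value function, so the two PPO instances train fully independent parameters. A minimal plain-PyTorch sketch (hypothetical module, not pyrado code) of why the copy matters:

import torch as to
from copy import deepcopy

net_cand = to.nn.Linear(4, 1)
net_refs = deepcopy(net_cand)   # independent parameter tensors, not a shared reference
net_refs.weight.data.add_(1.)   # updating the copy...
assert not to.equal(net_cand.weight, net_refs.weight)  # ...leaves the original untouched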
Example #2
def test_adr(env, ex_dir, subrtn_hparam, actor_hparam, value_fcn_hparam,
             critic_hparam, adr_hparam):
    # Create the subroutine for the meta-algorithm
    actor = FNNPolicy(spec=env.spec, **actor_hparam)
    value_fcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace),
                          **value_fcn_hparam)
    critic = GAE(value_fcn, **critic_hparam)
    subroutine = PPO(ex_dir, env, actor, critic, **subrtn_hparam)

    # Create algorithm and train
    particle_hparam = dict(actor=actor_hparam,
                           value_fcn=value_fcn_hparam,
                           critic=critic_hparam)
    algo = ADR(ex_dir,
               env,
               subroutine,
               svpg_particle_hparam=particle_hparam,
               **adr_hparam)
    algo.train()
    assert algo.curr_iter == algo.max_iter
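
The gamma/lamda pairs passed to GAE in these examples parameterize generalized advantage estimation (Schulman et al., 2016). A minimal sketch of the recursion in plain PyTorch, not pyrado's implementation:

import torch as to

def gae_advantages(rewards: to.Tensor, values: to.Tensor, gamma: float, lamda: float) -> to.Tensor:
    # rewards: length T; values: length T+1, incl. the bootstrap value of the final state
    adv = to.zeros_like(rewards)
    gae = 0.
    for t in reversed(range(rewards.numel())):
        delta = rewards[t] + gamma*values[t + 1] - values[t]  # one-step TD residual
        gae = delta + gamma*lamda*gae                         # discounted, lamda-weighted sum of residuals
        adv[t] = gae
    return adv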
Example #3
def test_actor_critic(env, linear_policy, ex_dir, algo, algo_hparam,
                      value_fcn_type, use_cuda):
    # Create value function
    if value_fcn_type == 'fnn-plain':
        value_fcn = FNN(input_size=env.obs_space.flat_dim,
                        output_size=1,
                        hidden_sizes=[16, 16],
                        hidden_nonlin=to.tanh,
                        use_cuda=use_cuda)
    else:
        vf_spec = EnvSpec(env.obs_space, ValueFunctionSpace)
        if value_fcn_type == 'fnn':
            value_fcn = FNNPolicy(vf_spec,
                                  hidden_sizes=[16, 16],
                                  hidden_nonlin=to.tanh,
                                  use_cuda=use_cuda)
        else:
            value_fcn = RNNPolicy(vf_spec,
                                  hidden_size=16,
                                  num_recurrent_layers=1,
                                  use_cuda=use_cuda)

    # Create critic
    critic_hparam = dict(
        gamma=0.98,
        lamda=0.95,
        batch_size=32,
        lr=1e-3,
        standardize_adv=False,
    )
    critic = GAE(value_fcn, **critic_hparam)

    # Common hyper-parameters
    common_hparam = dict(max_iter=3, min_rollouts=3, num_sampler_envs=1)
    # Add algorithm-specific hyper-parameters, if any
    common_hparam.update(algo_hparam)

    # Create algorithm and train
    algo = algo(ex_dir, env, linear_policy, critic, **common_hparam)
    algo.train()
    assert algo.curr_iter == algo.max_iter
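
The common_hparam.update(algo_hparam) call above implements a simple override scheme: on key collisions, the algorithm-specific settings win over the common defaults. A minimal illustration:

common = dict(max_iter=3, min_rollouts=3)
common.update(dict(max_iter=10))  # the specific value overrides the common default
assert common == dict(max_iter=10, min_rollouts=3)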
Example #4
def test_snapshots_notmeta(ex_dir, env, policy, algo_class, algo_hparam):
    # Collect hyper-parameters, create algorithm, and train
    common_hparam = dict(max_iter=1, num_sampler_envs=1)
    common_hparam.update(algo_hparam)

    if issubclass(algo_class, ActorCritic):
        common_hparam.update(
            min_rollouts=3,
            critic=GAE(value_fcn=FNNPolicy(spec=EnvSpec(
                env.obs_space, ValueFunctionSpace),
                                           hidden_sizes=[16, 16],
                                           hidden_nonlin=to.tanh)))
    elif issubclass(algo_class, ParameterExploring):
        common_hparam.update(num_rollouts=1)
    else:
        raise NotImplementedError

    # Train
    algo = algo_class(ex_dir, env, policy, **common_hparam)
    algo.train()
    if isinstance(algo, ActorCritic):
        policy_posttrn_param_values = algo.policy.param_values
        critic_posttrn_value_fcn_param_values = algo.critic.value_fcn.param_values
    elif isinstance(algo, ParameterExploring):
        policy_posttrn_param_values = algo.best_policy_param

    # Save and load
    algo.save_snapshot(meta_info=None)
    algo.load_snapshot(load_dir=ex_dir, meta_info=None)
    policy_loaded = deepcopy(algo.policy)

    # Check
    assert all(policy_posttrn_param_values == policy_loaded.param_values)
    if algo_class in [A2C, PPO, PPO2]:
        critic_loaded = deepcopy(algo.critic)
        assert all(critic_posttrn_value_fcn_param_values ==
                   critic_loaded.value_fcn.param_values)
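
A minimal plain-PyTorch sketch of the save/load round-trip this test exercises (hypothetical path; pyrado's snapshot helpers wrap more than a bare state_dict):

import torch as to

net = to.nn.Linear(3, 1)
to.save(net.state_dict(), '/tmp/net.pt')
net_loaded = to.nn.Linear(3, 1)
net_loaded.load_state_dict(to.load('/tmp/net.pt'))
assert all(to.equal(p, q) for p, q in zip(net.parameters(), net_loaded.parameters()))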
Example #5
                            num_recurrent_layers=1)  # LSTM & GRU
    # value_fcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **value_fcn_hparam)
    # value_fcn = RNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **value_fcn_hparam)
    # value_fcn = LSTMPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **value_fcn_hparam)
    value_fcn = GRUPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace),
                          **value_fcn_hparam)
    critic_hparam = dict(
        gamma=0.995,
        lamda=0.98,
        num_epoch=1,
        batch_size=100,
        lr=1e-4,
        standardize_adv=False,
        max_grad_norm=1.,
    )
    critic_cand = GAE(value_fcn, **critic_hparam)
    critic_refs = GAE(deepcopy(value_fcn), **critic_hparam)

    subrtn_hparam_cand = dict(
        max_iter=400,
        # min_rollouts=0,  # will be overwritten by SPOTA
        min_steps=0,  # will be overwritten by SPOTA
        num_epoch=1,
        eps_clip=0.1,
        batch_size=100,
        std_init=0.8,
        max_grad_norm=1.,
        lr=1e-4,
    )
    subrtn_hparam_refs = deepcopy(subrtn_hparam_cand)  # the reference solutions use the same settings
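
The max_grad_norm=1. entries enable global gradient-norm clipping, documented in the PPO2 constructor of Example #7 below. A minimal sketch of the underlying PyTorch call:

import torch as to

net = to.nn.Linear(4, 1)
loss = net(to.randn(8, 4)).pow(2).mean()
loss.backward()
to.nn.utils.clip_grad_norm_(net.parameters(), max_norm=1.)  # rescale all grads to a joint L2 norm <= 1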
Example #6
def train_and_eval(trial: optuna.Trial, ex_dir: str, seed: [int, None]):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param ex_dir: experiment's directory, i.e. the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env = QBallBalancerSim(dt=1/250., max_steps=1500)
    env = ActNormWrapper(env)

    # Policy
    policy = FNNPolicy(
        spec=env.spec,
        hidden_sizes=trial.suggest_categorical('hidden_sizes_policy', [[16, 16], [32, 32], [64, 64]]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_policy', ['to_tanh', 'to_relu'])),
    )

    # Critic
    value_fcn = FNN(
        input_size=env.obs_space.flat_dim,
        output_size=1,
        hidden_sizes=trial.suggest_categorical('hidden_sizes_critic', [[16, 16], [32, 32], [64, 64]]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_critic', ['to_tanh', 'to_relu'])),
    )
    critic_hparam = dict(
        gamma=trial.suggest_uniform('gamma_critic', 0.99, 1.),
        lamda=trial.suggest_uniform('lamda_critic', 0.95, 1.),
        num_epoch=trial.suggest_int('num_epoch_critic', 1, 10),
        batch_size=100,
        lr=trial.suggest_loguniform('lr_critic', 1e-5, 1e-3),
        standardize_adv=trial.suggest_categorical('standardize_adv_critic', [True, False]),
        # max_grad_norm=5.,
        # lr_scheduler=scheduler.StepLR,
        # lr_scheduler_hparam=dict(step_size=10, gamma=0.9)
        # lr_scheduler=scheduler.ExponentialLR,
        # lr_scheduler_hparam=dict(gamma=0.99)
    )
    critic = GAE(value_fcn, **critic_hparam)

    # Algorithm
    algo_hparam = dict(
        num_sampler_envs=1,  # parallelize via optuna n_jobs
        max_iter=500,
        min_steps=25*env.max_steps,
        num_epoch=trial.suggest_int('num_epoch_algo', 1, 10),
        eps_clip=trial.suggest_uniform('eps_clip_algo', 0.05, 0.2),
        batch_size=100,
        std_init=0.9,
        lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3),
        # max_grad_norm=5.,
        # lr_scheduler=scheduler.StepLR,
        # lr_scheduler_hparam=dict(step_size=10, gamma=0.9)
        # lr_scheduler=scheduler.ExponentialLR,
        # lr_scheduler_hparam=dict(gamma=0.99)
    )
    algo = PPO(osp.join(ex_dir, f'trial_{trial.number}'), env, policy, critic, **algo_hparam)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelSampler(env, policy, num_envs=20, min_rollouts=min_rollouts)
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros])/min_rollouts

    return mean_ret
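
A minimal sketch of how this objective might be handed to Optuna, following the functools.partial note in the docstring (directory, seed, trial and job counts are placeholders):

import functools
import optuna

study = optuna.create_study(direction='maximize')  # train_and_eval returns a mean return to maximize
study.optimize(functools.partial(train_and_eval, ex_dir='/tmp/qbb_ppo_study', seed=0),
               n_trials=100, n_jobs=4)  # n_jobs parallelizes trials, hence num_sampler_envs=1 above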
Example #7
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 critic: GAE,
                 max_iter: int,
                 min_rollouts: int = None,
                 min_steps: int = None,
                 num_epoch: int = 3,
                 eps_clip: float = 0.1,
                 value_fcn_coeff: float = 0.5,
                 entropy_coeff: float = 1e-3,
                 batch_size: int = 32,
                 std_init: float = 1.0,
                 num_sampler_envs: int = 4,
                 max_grad_norm: float = None,
                 lr: float = 5e-4,
                 lr_scheduler=None,
                 lr_scheduler_hparam: [dict, None] = None,
                 logger: StepLogger = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots, i.e. the results, in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param critic: advantage estimation function $A(s,a) = Q(s,a) - V(s)$
        :param max_iter: number of iterations (policy updates)
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param num_epoch: number of iterations over all gathered samples during one policy update
        :param eps_clip: max/min probability ratio, see [1]
        :param value_fcn_coeff: weighting factor of the value function term in the combined loss, specific to PPO2
        :param entropy_coeff: weighting factor of the entropy term in the combined loss, specific to PPO2
        :param batch_size: number of samples per policy update batch
        :param std_init: initial standard deviation on the actions for the exploration noise
        :param num_sampler_envs: number of environments for parallel sampling
        :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping
        :param lr: (initial) learning rate for the optimizer, which can be modified by the scheduler.
                   By default, the learning rate is constant.
        :param lr_scheduler: learning rate scheduler that does one step per epoch (pass through the whole data set)
        :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created

        .. note::
            The Adam optimizer computes individual learning rates for all parameters. Thus, the learning rate scheduler
            schedules the maximum learning rate.
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)

        # Call ActorCritic's constructor
        super().__init__(env, policy, critic, save_dir, max_iter, logger)
        critic.standardize_adv = True  # enforce this for PPO2

        # Store the inputs
        self.num_epoch = num_epoch
        self.eps_clip = eps_clip
        self.value_fcn_coeff = value_fcn_coeff
        self.entropy_coeff = entropy_coeff
        self.batch_size = batch_size
        self.max_grad_norm = max_grad_norm

        # Initialize
        self.log_loss = True
        self._expl_strat = NormalActNoiseExplStrat(self._policy,
                                                   std_init=std_init)
        self.sampler = ParallelSampler(env,
                                       self._expl_strat,
                                       num_envs=num_sampler_envs,
                                       min_steps=min_steps,
                                       min_rollouts=min_rollouts)
        self.optim = to.optim.Adam(
            [{
                'params': self._expl_strat.policy.parameters()
            }, {
                'params': self._expl_strat.noise.parameters()
            }, {
                'params': self._critic.value_fcn.parameters()
            }],
            lr=lr,
            eps=1e-5)
        self._lr_scheduler = lr_scheduler
        self._lr_scheduler_hparam = lr_scheduler_hparam
        if lr_scheduler is not None:
            self._lr_scheduler = lr_scheduler(self.optim,
                                              **lr_scheduler_hparam)
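
For reference, a hedged sketch of the combined loss that eps_clip, value_fcn_coeff, and entropy_coeff suggest: the clipped surrogate from [1] plus weighted value-function and entropy terms. This is the standard PPO2-style formulation, not necessarily this class's exact code:

import torch as to

def ppo2_loss(log_probs, log_probs_old, adv, v_pred, v_target, entropy,
              eps_clip, value_fcn_coeff, entropy_coeff):
    ratio = to.exp(log_probs - log_probs_old)  # importance sampling ratio of new vs old policy
    surr = to.min(ratio*adv, to.clamp(ratio, 1. - eps_clip, 1. + eps_clip)*adv)
    value_loss = (v_pred - v_target).pow(2).mean()
    return -surr.mean() + value_fcn_coeff*value_loss - entropy_coeff*entropy.mean()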
Example #8
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 particle_hparam: dict,
                 max_iter: int,
                 num_particles: int,
                 temperature: float,
                 lr: float,
                 horizon: int,
                 std_init: float = 1.0,
                 min_rollouts: int = None,
                 min_steps: int = 10000,
                 num_sampler_envs: int = 4,
                 serial: bool = True,
                 logger: StepLogger = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots, i.e. the results, in
        :param env: the environment in which the policy operates
        :param particle_hparam: hyper-parameters for particle template construction
        :param max_iter: number of iterations
        :param num_particles: number of distinct particles
        :param temperature: temperature of the SVGD kernel, determining how strongly the particles' updates are coupled
        :param lr: the learning rate for the update of the particles
        :param horizon: horizon for each particle
        :param std_init: initial standard deviation for the exploration
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param num_sampler_envs: number of environments for parallel sampling
        :param serial: serial mode can be switched off, which allows partly controlling the flow of SVPG from outside
        :param logger: logger for every step of the algorithm
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        if not isinstance(particle_hparam, dict):
            raise pyrado.TypeErr(given=particle_hparam, expected_type=dict)
        if not all(key in particle_hparam
                   for key in ['actor', 'value_fcn', 'critic']):
            raise AttributeError("particle_hparam must contain the keys "
                                 "'actor', 'value_fcn', and 'critic'")

        # Call Algorithm's constructor
        super().__init__(save_dir, max_iter, policy=None, logger=logger)

        # Store the inputs
        self._env = env
        self.num_particles = num_particles
        self.horizon = horizon  # TODO @Robin: where is the horizon used?!
        self.lr = lr
        self.temperature = temperature
        self.serial = serial

        # Prepare placeholders for particles
        self.particles = [None] * num_particles
        self.expl_strats = [None] * num_particles
        self.optimizers = [None] * num_particles
        self.fixed_particles = [None] * num_particles
        self.fixed_expl_strats = [None] * num_particles
        self.samplers = [None] * num_particles
        self.count = 0
        self.updatecount = 0

        # Particle factory
        actor = FNNPolicy(spec=env.spec, **particle_hparam['actor'])
        value_fcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace),
                              **particle_hparam['value_fcn'])
        critic = GAE(value_fcn, **particle_hparam['critic'])
        particle = SVPGParticle(env.spec, actor, critic)

        for i in range(self.num_particles):
            self.particles[i] = deepcopy(particle)
            self.particles[i].init_param()
            self.expl_strats[i] = NormalActNoiseExplStrat(
                self.particles[i].actor, std_init)
            self.optimizers[i] = to.optim.Adam(
                self.expl_strats[i].parameters(), lr=self.lr)
            self.fixed_particles[i] = deepcopy(self.particles[i])
            self.fixed_expl_strats[i] = deepcopy(self.expl_strats[i])

            if self.serial:
                self.samplers[i] = ParallelSampler(env,
                                                   self.expl_strats[i],
                                                   num_sampler_envs,
                                                   min_rollouts=min_rollouts,
                                                   min_steps=min_steps)
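
A hedged sketch of the Stein variational update that temperature and lr parameterize, written in the standard SVPG form (pyrado's internals may differ). Particles and their policy gradients are assumed flattened into n x dim tensors:

import torch as to

def svpg_direction(params: to.Tensor, policy_grads: to.Tensor, temperature: float) -> to.Tensor:
    n = params.shape[0]
    sq_dists = to.cdist(params, params)**2                # pairwise squared distances
    h = sq_dists.median()/(2.*to.log(to.tensor(n + 1.)))  # median-heuristic kernel bandwidth
    k = to.exp(-sq_dists/h)                               # RBF kernel matrix
    attractive = k @ policy_grads/temperature             # kernel-smoothed policy gradients, scaled by 1/T
    repulsive = 2./h*(k.sum(0).unsqueeze(1)*params - k @ params)  # kernel gradient, pushes particles apart
    return (attractive + repulsive)/n

The attractive term drags each particle toward its neighbors' high-return directions, while the repulsive term keeps the particles diverse; the temperature trades the two off, which is the coupling mentioned in the docstring.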
Example #9
    policy = FNNPolicy(spec=env.spec, **policy_hparam)
    # Critic
    value_fcn_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    value_fcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace),
                          **value_fcn_hparam)
    critic_hparam = dict(
        gamma=0.995,
        lamda=0.95,
        num_epoch=10,
        batch_size=512,
        standardize_adv=False,
        standardizer=None,
        max_grad_norm=1.,
        lr=5e-4,
    )
    critic = GAE(value_fcn, **critic_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=500,
        min_steps=20 * env.max_steps,
        num_epoch=10,
        eps_clip=0.15,
        batch_size=512,
        max_grad_norm=1.,
        lr=3e-4,
        num_sampler_envs=12,
    )
    algo = PPO(ex_dir, env, policy, critic, **algo_hparam)

    # Save the hyper-parameters
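
The snippet ends at the saving step; a minimal sketch of what the comment refers to, as a plain YAML dump whose keys mirror what Example #10 reads back (pyrado provides its own experiment helpers for this):

import os.path as osp
import yaml

with open(osp.join(ex_dir, 'hyperparams.yaml'), 'w') as f:
    yaml.dump(dict(critic=critic_hparam, subroutine=algo_hparam), f)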
Example #10
if __name__ == '__main__':
    # Experiment (set seed before creating the modules)
    ex_dir = ask_for_experiment()

    # Environments
    hparams = load_dict_from_yaml(osp.join(ex_dir, 'hyperparams.yaml'))
    env_sim = joblib.load(osp.join(ex_dir, 'env_sim.pkl'))
    env_real = joblib.load(osp.join(ex_dir, 'env_real.pkl'))

    # Policy
    policy = to.load(osp.join(ex_dir, 'policy.pt'))

    # Critic
    valuefcn = to.load(osp.join(ex_dir, 'valuefcn.pt'))
    critic = GAE(valuefcn, **hparams['critic'])

    # Subroutine
    algo_hparam = hparams['subroutine']
    # algo_hparam.update({'num_sampler_envs': 1})
    ppo = PPO(ex_dir, env_sim, policy, critic, **algo_hparam)

    # Set the boundaries for the GP
    bounds = to.load(osp.join(ex_dir, 'bounds.pt'))

    # Algorithm
    algo = BayRn(ex_dir,
                 env_sim,
                 env_real,
                 subroutine=ppo,
                 bounds=bounds,
Example #11
    # Get the experiment's directory to load from
    ex_dir = ask_for_experiment() if args.ex_dir is None else args.ex_dir

    # Load the environment and the policy
    env_sim, policy, kwout = load_experiment(ex_dir, args)

    # Load the required data
    cands = to.load(osp.join(ex_dir, 'candidates.pt'))
    cands_values = to.load(osp.join(ex_dir, 'candidates_values.pt')).unsqueeze(1)
    bounds = to.load(osp.join(ex_dir, 'bounds.pt'))
    uc_normalizer = UnitCubeProjector(bounds[0, :], bounds[1, :])

    # Decide on which algorithm to use via the mode argument
    if args.mode == PPO.name:
        critic = GAE(kwout['value_fcn'], **kwout['hparams']['critic'])
        subroutine = PPO(ex_dir, env_sim, policy, critic, **kwout['hparams']['subroutine'])
    elif args.mode == PPO2.name:
        critic = GAE(kwout['value_fcn'], **kwout['hparams']['critic'])
        subroutine = PPO2(ex_dir, env_sim, policy, critic, **kwout['hparams']['subroutine'])
    elif args.mode == CEM.name:
        subroutine = CEM(ex_dir, env_sim, policy, **kwout['hparams']['subroutine'])
    elif args.mode == NES.name:
        subroutine = NES(ex_dir, env_sim, policy, **kwout['hparams']['subroutine'])
    elif args.mode == PoWER.name:
        subroutine = PoWER(ex_dir, env_sim, policy, **kwout['hparams']['subroutine'])
    else:
        raise NotImplementedError('Only PPO, PPO2, CEM, NES, and PoWER are implemented so far.')

    if args.warmstart:
        ppi = policy.param_values.data