Example #1
    def __init__(self,
                 spec: EnvSpec,
                 shared_hidden_sizes: Sequence[int],
                 shared_hidden_nonlin: Union[Callable, Sequence[Callable]],
                 head_1_size: int = None,
                 head_2_size: int = None,
                 head_1_output_nonlin: Callable = None,
                 head_2_output_nonlin: Callable = None,
                 shared_dropout: float = 0.,
                 init_param_kwargs: dict = None,
                 use_cuda: bool = False):
        """
        Constructor

        :param spec: environment specification
        :param shared_hidden_sizes: sizes of shared hidden layer outputs. Every entry creates one shared hidden layer.
        :param shared_hidden_nonlin: nonlinearity for the shared hidden layers
        :param head_1_size: size of the fully connected layer for head 1, if `None` this is set to the action space dim
        :param head_2_size: size of the fully connected layer for head 2, if `None` this is set to the action space dim
        :param head_1_output_nonlin: nonlinearity for output layer of the first head
        :param head_2_output_nonlin: nonlinearity for output layer of the second head
        :param shared_dropout: dropout probability, default = 0 deactivates dropout
        :param init_param_kwargs: additional keyword arguments for the policy parameter initialization
        :param use_cuda: `True` to move the policy to the GPU, `False` (default) to use the CPU
        """
        super().__init__(spec, use_cuda)

        # Create the feed-forward neural network
        self.shared = FNN(input_size=spec.obs_space.flat_dim,
                          output_size=shared_hidden_sizes[-1],
                          hidden_sizes=shared_hidden_sizes,
                          hidden_nonlin=shared_hidden_nonlin,
                          dropout=shared_dropout,
                          output_nonlin=None)

        # Create output layer
        head_1_size = spec.act_space.flat_dim if head_1_size is None else head_1_size
        head_2_size = spec.act_space.flat_dim if head_2_size is None else head_2_size
        self.head_1 = nn.Linear(shared_hidden_sizes[-1], head_1_size)
        self.head_2 = nn.Linear(shared_hidden_sizes[-1], head_2_size)
        self.head_1_output_nonlin = head_1_output_nonlin
        self.head_2_output_nonlin = head_2_output_nonlin

        # Call custom initialization function after PyTorch network parameter initialization
        init_param_kwargs = init_param_kwargs if init_param_kwargs is not None else dict()
        self.init_param(None, **init_param_kwargs)
        self.to(self.device)
Example #2
def test_actor_critic(ex_dir, env: SimEnv, policy: Policy, algo, algo_hparam,
                      vfcn_type, use_cuda):
    if use_cuda:
        policy._device = 'cuda'
        policy = policy.to(device='cuda')

    # Create value function
    if vfcn_type == 'fnn-plain':
        vfcn = FNN(input_size=env.obs_space.flat_dim,
                   output_size=1,
                   hidden_sizes=[16, 16],
                   hidden_nonlin=to.tanh,
                   use_cuda=use_cuda)
    else:
        vf_spec = EnvSpec(env.obs_space, ValueFunctionSpace)
        if vfcn_type == 'fnn':
            vfcn = FNNPolicy(vf_spec,
                             hidden_sizes=[16, 16],
                             hidden_nonlin=to.tanh,
                             use_cuda=use_cuda)
        else:
            vfcn = RNNPolicy(vf_spec,
                             hidden_size=16,
                             num_recurrent_layers=1,
                             use_cuda=use_cuda)

    # Create critic
    critic_hparam = dict(
        gamma=0.98,
        lamda=0.95,
        batch_size=32,
        lr=1e-3,
        standardize_adv=False,
    )
    critic = GAE(vfcn, **critic_hparam)

    # Common hyper-parameters
    common_hparam = dict(max_iter=2, min_rollouts=3, num_workers=1)
    # Add algorithm-specific hyper-parameters, if any
    common_hparam.update(algo_hparam)

    # Create algorithm and train
    algo = algo(ex_dir, env, policy, critic, **common_hparam)
    algo.train()
    assert algo.curr_iter == algo.max_iter
Example #3
def test_spota_ppo(ex_dir, env: SimEnv, spota_hparam):
    # Environment and domain randomization
    randomizer = create_default_randomizer(env)
    env = DomainRandWrapperBuffer(env, randomizer)

    # Policy and subroutines
    policy = FNNPolicy(env.spec, [16, 16], hidden_nonlin=to.tanh)
    vfcn = FNN(input_size=env.obs_space.flat_dim,
               output_size=1,
               hidden_sizes=[16, 16],
               hidden_nonlin=to.tanh)
    critic_hparam = dict(gamma=0.998,
                         lamda=0.95,
                         num_epoch=3,
                         batch_size=64,
                         lr=1e-3)
    critic_cand = GAE(vfcn, **critic_hparam)
    critic_refs = GAE(deepcopy(vfcn), **critic_hparam)

    subrtn_hparam_cand = dict(
        # min_rollouts=0,  # will be overwritten by SPOTA
        min_steps=0,  # will be overwritten by SPOTA
        max_iter=2,
        num_epoch=3,
        eps_clip=0.1,
        batch_size=64,
        num_workers=1,
        std_init=0.5,
        lr=1e-2)

    sr_cand = PPO(ex_dir, env, policy, critic_cand, **subrtn_hparam_cand)
    sr_refs = PPO(ex_dir, env, deepcopy(policy), critic_refs,
                  **subrtn_hparam_cand)

    # Create algorithm and train
    algo = SPOTA(ex_dir, env, sr_cand, sr_refs, **spota_hparam)
    algo.train()
Example #4
    # Set seed if desired
    pyrado.set_seed(args.seed, verbose=True)

    # Environment
    env_hparams = dict(dt=1/100., max_steps=500)
    env = BallOnBeamDiscSim(**env_hparams)

    # Policy
    policy_hparam = dict(
        hidden_sizes=[32, 32],
        hidden_nonlin=to.tanh
    )
    net = FNN(
        input_size=DiscreteActQValPolicy.get_qfcn_input_size(env.spec),
        output_size=DiscreteActQValPolicy.get_qfcn_output_size(),
        **policy_hparam
    )
    policy = DiscreteActQValPolicy(spec=env.spec, net=net)

    # Algorithm
    algo_hparam = dict(
        max_iter=5000,
        memory_size=10*env.max_steps,
        eps_init=0.1286,
        eps_schedule_gamma=0.9955,
        gamma=0.998,
        target_update_intvl=5,
        num_batch_updates=20,
        max_grad_norm=0.5,
        min_steps=10,
Example #5
class TwoHeadedFNNPolicy(TwoHeadedPolicy):
    """ Policy architecture which has a common body and two heads that have a separate last layer """

    name: str = 'thfnn'

    def __init__(self,
                 spec: EnvSpec,
                 shared_hidden_sizes: Sequence[int],
                 shared_hidden_nonlin: Union[Callable, Sequence[Callable]],
                 head_1_size: int = None,
                 head_2_size: int = None,
                 head_1_output_nonlin: Callable = None,
                 head_2_output_nonlin: Callable = None,
                 shared_dropout: float = 0.,
                 init_param_kwargs: dict = None,
                 use_cuda: bool = False):
        """
        Constructor

        :param spec: environment specification
        :param shared_hidden_sizes: sizes of shared hidden layer outputs. Every entry creates one shared hidden layer.
        :param shared_hidden_nonlin: nonlinearity for the shared hidden layers
        :param head_1_size: size of the fully connected layer for head 1, if `None` this is set to the action space dim
        :param head_2_size: size of the fully connected layer for head 2, if `None` this is set to the action space dim
        :param head_1_output_nonlin: nonlinearity for output layer of the first head
        :param head_2_output_nonlin: nonlinearity for output layer of the second head
        :param shared_dropout: dropout probability, default = 0 deactivates dropout
        :param init_param_kwargs: additional keyword arguments for the policy parameter initialization
        :param use_cuda: `True` to move the policy to the GPU, `False` (default) to use the CPU
        """
        super().__init__(spec, use_cuda)

        # Create the feed-forward neural network
        self.shared = FNN(input_size=spec.obs_space.flat_dim,
                          output_size=shared_hidden_sizes[-1],
                          hidden_sizes=shared_hidden_sizes,
                          hidden_nonlin=shared_hidden_nonlin,
                          dropout=shared_dropout,
                          output_nonlin=None)

        # Create output layer
        head_1_size = spec.act_space.flat_dim if head_1_size is None else head_1_size
        head_2_size = spec.act_space.flat_dim if head_2_size is None else head_2_size
        self.head_1 = nn.Linear(shared_hidden_sizes[-1], head_1_size)
        self.head_2 = nn.Linear(shared_hidden_sizes[-1], head_2_size)
        self.head_1_output_nonlin = head_1_output_nonlin
        self.head_2_output_nonlin = head_2_output_nonlin

        # Call custom initialization function after PyTorch network parameter initialization
        init_param_kwargs = init_param_kwargs if init_param_kwargs is not None else dict()
        self.init_param(None, **init_param_kwargs)
        self.to(self.device)

    def init_param(self, init_values: to.Tensor = None, **kwargs):
        if init_values is None:
            self.shared.init_param(None, **kwargs)
            init_param(self.head_1, **kwargs)
            init_param(self.head_2, **kwargs)
        else:
            self.param_values = init_values

    def forward(self, obs: to.Tensor) -> Tuple[to.Tensor, to.Tensor]:
        obs = obs.to(self.device)

        # Get the output of the last shared layer and pass it to the two heads separately
        x = self.shared(obs)
        output_1 = self.head_1(x)
        output_2 = self.head_2(x)
        if self.head_1_output_nonlin is not None:
            output_1 = self.head_1_output_nonlin(output_1)
        if self.head_2_output_nonlin is not None:
            output_2 = self.head_2_output_nonlin(output_2)
        return output_1, output_2
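
For orientation, a minimal usage sketch of the class above; the import paths, space sizes, and head nonlinearities are assumptions chosen purely for illustration, not taken from the examples on this page.

# Hypothetical usage sketch -- spec dimensions and import paths are assumptions
import torch as to
from pyrado.spaces.box import BoxSpace
from pyrado.utils.data_types import EnvSpec

spec = EnvSpec(obs_space=BoxSpace(-1., 1., shape=4),  # 4-dim observations (assumed)
               act_space=BoxSpace(-1., 1., shape=2))  # 2-dim actions (assumed)
policy = TwoHeadedFNNPolicy(spec,
                            shared_hidden_sizes=[32, 32],
                            shared_hidden_nonlin=to.tanh,
                            head_1_output_nonlin=to.tanh,  # e.g. a bounded mean head
                            head_2_output_nonlin=to.exp)   # e.g. a positive scale head
out_1, out_2 = policy(to.rand(spec.obs_space.flat_dim))  # one output tensor per head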
Example #6
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env = QBallBalancerSim(dt=1 / 250., max_steps=1500)
    env = ActNormWrapper(env)

    # Learning rate scheduler
    lrs_gamma = trial.suggest_categorical('exp_lr_scheduler_gamma',
                                          [None, 0.99, 0.995, 0.999])
    if lrs_gamma is not None:
        lr_sched = lr_scheduler.ExponentialLR
        lr_sched_hparam = dict(gamma=lrs_gamma)
    else:
        lr_sched, lr_sched_hparam = None, dict()

    # Policy
    policy = FNNPolicy(
        spec=env.spec,
        hidden_sizes=trial.suggest_categorical('hidden_sizes_policy',
                                               [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(
            trial.suggest_categorical('hidden_nonlin_policy',
                                      ['to_tanh', 'to_relu'])),
    )

    # Critic
    vfcn = FNN(
        input_size=env.obs_space.flat_dim,
        output_size=1,
        hidden_sizes=trial.suggest_categorical('hidden_sizes_critic',
                                               [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(
            trial.suggest_categorical('hidden_nonlin_critic',
                                      ['to_tanh', 'to_relu'])),
    )
    critic_hparam = dict(
        batch_size=250,
        gamma=trial.suggest_uniform('gamma_critic', 0.99, 1.),
        lamda=trial.suggest_uniform('lamda_critic', 0.95, 1.),
        num_epoch=trial.suggest_int('num_epoch_critic', 1, 10),
        lr=trial.suggest_loguniform('lr_critic', 1e-5, 1e-3),
        standardize_adv=trial.suggest_categorical('standardize_adv_critic',
                                                  [True, False]),
        max_grad_norm=trial.suggest_categorical('max_grad_norm_critic',
                                                [None, 1., 5.]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam)
    critic = GAE(vfcn, **critic_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=300,
        batch_size=250,
        min_steps=trial.suggest_int('num_rollouts_algo', 10, 30) *
        env.max_steps,
        num_epoch=trial.suggest_int('num_epoch_algo', 1, 10),
        eps_clip=trial.suggest_uniform('eps_clip_algo', 0.05, 0.2),
        std_init=trial.suggest_uniform('std_init_algo', 0.5, 1.0),
        lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3),
        max_grad_norm=trial.suggest_categorical('max_grad_norm_algo',
                                                [None, 1., 5.]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam)
    algo = PPO(osp.join(study_dir, f'trial_{trial.number}'), env, policy,
               critic, **algo_hparam)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(env,
                                     policy,
                                     num_workers=1,
                                     min_rollouts=min_rollouts)
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
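
As the docstring above notes, Optuna calls the objective with only the `trial` argument, so `study_dir` and `seed` have to be bound beforehand. A minimal sketch of that wiring (study direction, trial count, and directory name are illustrative assumptions):

import functools
import optuna

study = optuna.create_study(direction='maximize')  # the objective returns a mean return to be maximized
study.optimize(functools.partial(train_and_eval, study_dir='/tmp/qbb_ppo_study', seed=0),
               n_trials=10)
print(study.best_params)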
Example #7
    randomizer = create_empty_randomizer()
    env = ActDelayWrapper(env)
    randomizer.add_domain_params(
        UniformDomainParam(name='act_delay',
                           mean=2,
                           halfspan=2,
                           clip_lo=0,
                           roundint=True))
    env = DomainRandWrapperLive(env, randomizer)

    # Policy
    policy_hparam = dict(
        obs_layer=FNN(input_size=env.obs_space.flat_dim,
                      output_size=env.act_space.flat_dim,
                      hidden_sizes=[32, 32],
                      hidden_nonlin=to.tanh,
                      dropout=0.),
        tau_init=10.,
        tau_learnable=True,
        kappa_init=0.02,
        kappa_learnable=True,
        activation_nonlin=to.sigmoid,
        potentials_dyn_fcn=pd_cubic,
    )
    policy = ADNPolicy(spec=env.spec, **policy_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=5000,
        pop_size=None,
Example #8
def test_snapshots_notmeta(ex_dir, env: SimEnv, policy, algo_class,
                           algo_hparam):
    # Collect hyper-parameters, create algorithm, and train
    common_hparam = dict(max_iter=1, num_workers=1)
    common_hparam.update(algo_hparam)

    if issubclass(algo_class, ActorCritic):
        common_hparam.update(
            min_rollouts=3,
            critic=GAE(
                vfcn=FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace),
                               hidden_sizes=[16, 16],
                               hidden_nonlin=to.tanh)))
    elif issubclass(algo_class, ParameterExploring):
        common_hparam.update(num_rollouts=1)
    elif issubclass(algo_class, (DQL, SAC)):
        common_hparam.update(memory_size=1000,
                             num_batch_updates=2,
                             gamma=0.99,
                             min_rollouts=1)
        fnn_hparam = dict(hidden_sizes=[8, 8], hidden_nonlin=to.tanh)
        if issubclass(algo_class, DQL):
            # Override the setting
            env = BallOnBeamDiscSim(env.dt, env.max_steps)
            net = FNN(input_size=DiscreteActQValPolicy.get_qfcn_input_size(env.spec),
                      output_size=DiscreteActQValPolicy.get_qfcn_output_size(),
                      **fnn_hparam)
            policy = DiscreteActQValPolicy(spec=env.spec, net=net)
        else:
            # Override the setting
            env = ActNormWrapper(env)
            policy = TwoHeadedGRUPolicy(env.spec,
                                        shared_hidden_size=8,
                                        shared_num_recurrent_layers=1)
            obsact_space = BoxSpace.cat([env.obs_space, env.act_space])
            common_hparam.update(qfcn_1=FNNPolicy(
                spec=EnvSpec(obsact_space, ValueFunctionSpace), **fnn_hparam))
            common_hparam.update(qfcn_2=FNNPolicy(
                spec=EnvSpec(obsact_space, ValueFunctionSpace), **fnn_hparam))
    else:
        raise NotImplementedError

    # Simulate training
    algo = algo_class(ex_dir, env, policy, **common_hparam)
    algo.policy.param_values += to.tensor([42.])
    if isinstance(algo, ActorCritic):
        algo.critic.vfcn.param_values += to.tensor([42.])

    # Save and load
    algo.save_snapshot(meta_info=None)
    algo_loaded = Algorithm.load_snapshot(load_dir=ex_dir)
    assert isinstance(algo_loaded, Algorithm)
    policy_loaded = algo_loaded.policy
    if isinstance(algo, ActorCritic):
        critic_loaded = algo_loaded.critic

    # Check
    assert all(algo.policy.param_values == policy_loaded.param_values)
    if isinstance(algo, ActorCritic):
        assert all(
            algo.critic.vfcn.param_values == critic_loaded.vfcn.param_values)

    # Load the experiment. Since we did not save any hyper-parameters, we ignore the errors when loading.
    env, policy, extra = load_experiment(ex_dir)
    assert isinstance(env, Env)
    assert isinstance(policy, Policy)
    assert isinstance(extra, dict)