Example #1
def test_cuda_sampling_w_dr(default_bob, bob_pert):
    # Add randomizer
    env = DomainRandWrapperLive(default_bob, bob_pert)

    # Use a simple policy
    policy = FNNPolicy(env.spec, hidden_sizes=[8], hidden_nonlin=to.tanh, use_cuda=True)

    # Create the sampler
    sampler = ParallelSampler(env, policy, num_envs=2, min_rollouts=10)

    samples = sampler.sample()
    assert samples is not None
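A possible follow-up check (not part of the original test), assuming the rollouts behave like the `StepSequence` objects in the later examples and that the sampler honors `min_rollouts`:

    assert len(samples) >= 10  # min_rollouts=10 was requested above
    assert all(ro.length > 0 for ro in samples)  # every rollout contains at least one step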
Example #2
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 min_rollouts: int = None,
                 min_steps: int = None,
                 num_sampler_envs: int = 4,
                 logger: StepLogger = None,
                 sampler: SamplerBase = None,
                 ball_z_dim_mismatch: bool = True):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment in which the policy operates
        :param policy: policy which this algorithm is creating
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param num_sampler_envs: number of environments for parallel sampling
        :param ball_z_dim_mismatch: only useful for BallOnPlate5DSim,
                                    set to True if the controller does not have the z component (relative position)
                                    of the ball in the state vector, i.e. state is 14-dim instead of 16-dim
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        if not isinstance(policy, LinearPolicy):
            raise pyrado.TypeErr(given=policy, expected_type=LinearPolicy)

        # Call Algorithm's constructor
        super().__init__(save_dir, 1, policy, logger)

        # Store the inputs
        self._env = env
        self.ball_z_dim_mismatch = ball_z_dim_mismatch

        # Initialize variables for checking and evaluating
        if sampler is None:
            sampler = ParallelSampler(env,
                                      self._policy,
                                      num_envs=num_sampler_envs,
                                      min_steps=min_steps,
                                      min_rollouts=min_rollouts)
        self.sampler = sampler
        self.eigvals = np.array([pyrado.inf])  # initialize with something positive
Example #3
def test_adr_reward_generator(env):
    reference_env = env
    random_env = deepcopy(env)
    reward_generator = RewardGenerator(
        env_spec=random_env.spec,
        batch_size=100,
        reward_multiplier=1,
        logger=None
    )
    policy = FNNPolicy(reference_env.spec, hidden_sizes=[32], hidden_nonlin=to.tanh)
    dr = get_default_randomizer_omo()
    dr.randomize(num_samples=1)
    random_env.domain_param = dr.get_params(format='dict', dtype='numpy')
    reference_sampler = ParallelSampler(reference_env, policy, num_envs=4, min_steps=10000)
    random_sampler = ParallelSampler(random_env, policy, num_envs=4, min_steps=10000)

    losses = []
    for i in range(50):
        reference_traj = StepSequence.concat(reference_sampler.sample())
        random_traj = StepSequence.concat(random_sampler.sample())
        losses.append(reward_generator.train(reference_traj, random_traj, 10))
    assert losses[-1] < losses[0]
Example #4
def train_and_eval(trial: optuna.Trial, ex_dir: str, seed: [int, None]):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param ex_dir: experiment's directory, i.e. the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env = QBallBalancerSim(dt=1/250., max_steps=1500)
    env = ActNormWrapper(env)

    # Policy
    policy = FNNPolicy(
        spec=env.spec,
        hidden_sizes=trial.suggest_categorical('hidden_sizes_policy', [[16, 16], [32, 32], [64, 64]]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_policy', ['to_tanh', 'to_relu'])),
    )

    # Critic
    value_fcn = FNN(
        input_size=env.obs_space.flat_dim,
        output_size=1,
        hidden_sizes=trial.suggest_categorical('hidden_sizes_critic', [[16, 16], [32, 32], [64, 64]]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_critic', ['to_tanh', 'to_relu'])),
    )
    critic_hparam = dict(
        gamma=trial.suggest_uniform('gamma_critic', 0.99, 1.),
        lamda=trial.suggest_uniform('lamda_critic', 0.95, 1.),
        num_epoch=trial.suggest_int('num_epoch_critic', 1, 10),
        batch_size=100,
        lr=trial.suggest_loguniform('lr_critic', 1e-5, 1e-3),
        standardize_adv=trial.suggest_categorical('standardize_adv_critic', [True, False]),
        # max_grad_norm=5.,
        # lr_scheduler=scheduler.StepLR,
        # lr_scheduler_hparam=dict(step_size=10, gamma=0.9)
        # lr_scheduler=scheduler.ExponentialLR,
        # lr_scheduler_hparam=dict(gamma=0.99)
    )
    critic = GAE(value_fcn, **critic_hparam)

    # Algorithm
    algo_hparam = dict(
        num_sampler_envs=1,  # parallelize via optuna n_jobs
        max_iter=500,
        min_steps=25*env.max_steps,
        num_epoch=trial.suggest_int('num_epoch_algo', 1, 10),
        eps_clip=trial.suggest_uniform('eps_clip_algo', 0.05, 0.2),
        batch_size=100,
        std_init=0.9,
        lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3),
        # max_grad_norm=5.,
        # lr_scheduler=scheduler.StepLR,
        # lr_scheduler_hparam=dict(step_size=10, gamma=0.9)
        # lr_scheduler=scheduler.ExponentialLR,
        # lr_scheduler_hparam=dict(gamma=0.99)
    )
    algo = PPO(osp.join(ex_dir, f'trial_{trial.number}'), env, policy, critic, **algo_hparam)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelSampler(env, policy, num_envs=20, min_rollouts=min_rollouts)
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros])/min_rollouts

    return mean_ret
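For context, a sketch of how an objective like this is typically handed to Optuna; `functools.partial` binds the extra arguments mentioned in the docstring, while the study settings (`n_trials`, `n_jobs`) are placeholder assumptions:

import functools
import optuna

# Optuna only passes `trial` to the objective, so bind the custom arguments beforehand
objective = functools.partial(train_and_eval, ex_dir=ex_dir, seed=seed)

study = optuna.create_study(direction='maximize')  # train_and_eval returns the mean return
study.optimize(objective, n_trials=100, n_jobs=8)  # trials run in parallel, hence num_sampler_envs=1 above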
Example #5
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 critic: GAE,
                 max_iter: int,
                 min_rollouts: int = None,
                 min_steps: int = None,
                 num_epoch: int = 3,
                 eps_clip: float = 0.1,
                 batch_size: int = 64,
                 std_init: float = 1.0,
                 num_sampler_envs: int = 4,
                 max_grad_norm: float = None,
                 lr: float = 5e-4,
                 lr_scheduler=None,
                 lr_scheduler_hparam: [dict, None] = None,
                 logger: StepLogger = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param critic: advantage estimation function $A(s,a) = Q(s,a) - V(s)$
        :param max_iter: number of iterations (policy updates)
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param num_epoch: number of iterations over all gathered samples during one policy update
        :param eps_clip: max/min probability ratio, see [1]
        :param batch_size: number of samples per policy update batch
        :param std_init: initial standard deviation on the actions for the exploration noise
        :param num_sampler_envs: number of environments for parallel sampling
        :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping
        :param lr: (initial) learning rate for the optimizer, which can be modified by the scheduler.
                   By default, the learning rate is constant.
        :param lr_scheduler: learning rate scheduler that does one step per epoch (pass through the whole data set)
        :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created

        .. note::
            The Adam optimizer computes individual learning rates for all parameters. Thus, the learning rate scheduler
            schedules the maximum learning rate.
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        assert isinstance(policy, Policy)

        # Call ActorCritic's constructor
        super().__init__(env, policy, critic, save_dir, max_iter, logger)

        # Store the inputs
        self.num_epoch = num_epoch
        self.eps_clip = eps_clip
        self.batch_size = batch_size
        self.max_grad_norm = max_grad_norm

        # Initialize
        self.log_loss = True
        self._expl_strat = NormalActNoiseExplStrat(self._policy,
                                                   std_init=std_init)
        self.sampler = ParallelSampler(env,
                                       self._expl_strat,
                                       num_envs=num_sampler_envs,
                                       min_steps=min_steps,
                                       min_rollouts=min_rollouts)
        self.optim = to.optim.Adam(
            [{
                'params': self._expl_strat.policy.parameters()
            }, {
                'params': self._expl_strat.noise.parameters()
            }],
            lr=lr,
            eps=1e-5)
        self._lr_scheduler = lr_scheduler
        self._lr_scheduler_hparam = lr_scheduler_hparam
        if lr_scheduler is not None:
            self._lr_scheduler = lr_scheduler(self.optim,
                                              **lr_scheduler_hparam)
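The Adam instance above puts the policy parameters and the exploration-noise parameters into two parameter groups that share one learning rate. As a side note (a sketch, not part of this constructor), PyTorch parameter groups would also allow a separate learning rate per group, e.g. slower updates for the noise:

        self.optim = to.optim.Adam(
            [{'params': self._expl_strat.policy.parameters(), 'lr': lr},
             {'params': self._expl_strat.noise.parameters(), 'lr': lr*0.1}],  # hypothetical smaller lr for the noise
            eps=1e-5)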
Example #6
def train_and_eval(trial: optuna.Trial, ex_dir: str, seed: [int, None]):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param ex_dir: experiment's directory, i.e. the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(dt=1 / 100., max_steps=600)
    env = QQubeSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(
        shared_hidden_sizes=trial.suggest_categorical(
            'shared_hidden_sizes_policy',
            [[16, 16], [32, 32], [64, 64], [16, 16, 16], [32, 32, 32]]),
        shared_hidden_nonlin=fcn_from_str(
            trial.suggest_categorical('shared_hidden_nonlin_policy',
                                      ['to_tanh', 'to_relu'])),
    )
    policy = TwoHeadedFNNPolicy(spec=env.spec, **policy_hparam)

    # Critic
    q_fcn_hparam = dict(
        hidden_sizes=trial.suggest_categorical(
            'hidden_sizes_critic',
            [[16, 16], [32, 32], [64, 64], [16, 16, 16], [32, 32, 32]]),
        hidden_nonlin=fcn_from_str(
            trial.suggest_categorical('hidden_nonlin_critic',
                                      ['to_tanh', 'to_relu'])),
    )
    obsact_space = BoxSpace.cat([env.obs_space, env.act_space])
    q_fcn_1 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace),
                        **q_fcn_hparam)
    q_fcn_2 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace),
                        **q_fcn_hparam)

    # Algorithm
    algo_hparam = dict(
        num_sampler_envs=1,  # parallelize via optuna n_jobs
        max_iter=100 * env.max_steps,
        min_steps=trial.suggest_categorical(
            'min_steps_algo', [1]),  # , 10, env.max_steps, 10*env.max_steps
        memory_size=trial.suggest_loguniform('memory_size_algo',
                                             1e2 * env.max_steps,
                                             1e4 * env.max_steps),
        tau=trial.suggest_uniform('tau_algo', 0.99, 1.),
        alpha_init=trial.suggest_uniform('alpha_init_algo', 0.1, 0.9),
        learn_alpha=trial.suggest_categorical('learn_alpha_algo',
                                              [True, False]),
        standardize_rew=trial.suggest_categorical('standardize_rew_algo',
                                                  [False]),
        gamma=trial.suggest_uniform('gamma_algo', 0.99, 1.),
        target_update_intvl=trial.suggest_categorical(
            'target_update_intvl_algo', [1, 5]),
        num_batch_updates=trial.suggest_categorical('num_batch_updates_algo',
                                                    [1, 5]),
        batch_size=trial.suggest_categorical('batch_size_algo',
                                             [128, 256, 512]),
        lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3),
    )
    csv_logger = create_csv_step_logger(
        osp.join(ex_dir, f'trial_{trial.number}'))
    algo = SAC(ex_dir,
               env,
               policy,
               q_fcn_1,
               q_fcn_2,
               **algo_hparam,
               logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelSampler(
        env, policy, num_envs=1,
        min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
Example #7
    def eval_policy(save_dir: [str, None],
                    env_real: [RealEnv, SimEnv, MetaDomainRandWrapper],
                    policy: Policy, montecarlo_estimator: bool, prefix: str,
                    num_rollouts: int) -> to.Tensor:
        """
        Evaluate a policy on the target system (real-world platform).
        This method is static to facilitate evaluation of specific policies in hindsight.

        :param save_dir: directory to save the snapshots i.e. the results in, if `None` nothing is saved
        :param env_real: target environment for evaluation, in the sim-2-sim case this is another simulation instance
        :param policy: policy to evaluate
        :param montecarlo_estimator: estimate the return with a sample average (`True`) or a lower confidence
                                     bound (`False`) obtained from bootstrapping
        :param num_rollouts: number of rollouts to collect on the target system
        :param prefix: to control the saving for the evaluation of an initial policy, `None` to deactivate
        :return: estimated return in the target domain
        """
        if isinstance(env_real, RealEnv):
            input('Evaluating in the target domain. Hit any key to continue.')
        if save_dir is not None:
            print_cbt(f'Evaluating {prefix}_policy on the target system ...',
                      'c',
                      bright=True)

        rets_real = to.zeros(num_rollouts)
        if isinstance(env_real, RealEnv):
            # Evaluate sequentially when conducting a sim-to-real experiment
            for i in range(num_rollouts):
                rets_real[i] = rollout(env_real,
                                       policy,
                                       eval=True,
                                       no_close=False).undiscounted_return()
        elif isinstance(env_real, (SimEnv, MetaDomainRandWrapper)):
            # Create a parallel sampler when conducting a sim-to-sim experiment
            sampler = ParallelSampler(env_real,
                                      policy,
                                      num_envs=1,
                                      min_rollouts=num_rollouts)
            ros = sampler.sample()
            for i in range(num_rollouts):
                rets_real[i] = ros[i].undiscounted_return()
        else:
            raise pyrado.TypeErr(
                given=env_real,
                expected_type=[RealEnv, SimEnv, MetaDomainRandWrapper])

        if save_dir is not None:
            # Save the evaluation results
            to.save(rets_real, osp.join(save_dir, f'{prefix}_returns_real.pt'))

            print_cbt('target domain performance', bright=True)
            print(
                tabulate([['mean return',
                           to.mean(rets_real).item()],
                          ['std return', to.std(rets_real)],
                          ['min return', to.min(rets_real)],
                          ['max return', to.max(rets_real)]]))

        if montecarlo_estimator:
            return to.mean(rets_real)
        else:
            return to.from_numpy(
                bootstrap_ci(rets_real.numpy(),
                             np.mean,
                             num_reps=1000,
                             alpha=0.05,
                             ci_sides=1,
                             studentized=False)[1])
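For intuition, a simplified percentile-bootstrap sketch of such a one-sided lower confidence bound (this is not pyrado's `bootstrap_ci`, just an illustration in plain NumPy):

import numpy as np

def bootstrap_lower_bound(rets: np.ndarray, num_reps: int = 1000, alpha: float = 0.05) -> float:
    # Resample the returns with replacement and record the mean of every bootstrap sample
    rng = np.random.default_rng()
    boot_means = np.array([rng.choice(rets, size=rets.size, replace=True).mean()
                           for _ in range(num_reps)])
    # The empirical alpha-quantile of the bootstrap means serves as the one-sided lower bound
    return float(np.quantile(boot_means, alpha))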
Example #8
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 particle_hparam: dict,
                 max_iter: int,
                 num_particles: int,
                 temperature: float,
                 lr: float,
                 horizon: int,
                 std_init: float = 1.0,
                 min_rollouts: int = None,
                 min_steps: int = 10000,
                 num_sampler_envs: int = 4,
                 serial: bool = True,
                 logger: StepLogger = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment in which the policy operates
        :param particle_hparam: hyper-parameters for particle template construction
        :param max_iter: number of iterations
        :param num_particles: number of distinct particles
        :param temperature: the temperature of the SVGD determines how jointly the training takes place
        :param lr: the learning rate for the update of the particles
        :param horizon: horizon for each particle
        :param std_init: initial standard deviation for the exploration
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param num_sampler_envs: number of environments for parallel sampling
        :param serial: serial mode can be switched off, which allows the flow of SVPG to be partly controlled from outside
        :param logger: logger for every step of the algorithm
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        if not isinstance(particle_hparam, dict):
            raise pyrado.TypeErr(given=particle_hparam, expected_type=dict)
        if not all([
                key in particle_hparam
                for key in ['actor', 'value_fcn', 'critic']
        ]):
            raise AttributeError

        # Call Algorithm's constructor
        super().__init__(save_dir, max_iter, policy=None, logger=logger)

        # Store the inputs
        self._env = env
        self.num_particles = num_particles
        self.horizon = horizon  # TODO @Robin: where is the horizon used?!
        self.lr = lr
        self.temperature = temperature
        self.serial = serial

        # Prepare placeholders for particles
        self.particles = [None] * num_particles
        self.expl_strats = [None] * num_particles
        self.optimizers = [None] * num_particles
        self.fixed_particles = [None] * num_particles
        self.fixed_expl_strats = [None] * num_particles
        self.samplers = [None] * num_particles
        self.count = 0
        self.updatecount = 0

        # Particle factory
        actor = FNNPolicy(spec=env.spec, **particle_hparam['actor'])
        value_fcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace),
                              **particle_hparam['value_fcn'])
        critic = GAE(value_fcn, **particle_hparam['critic'])
        particle = SVPGParticle(env.spec, actor, critic)

        for i in range(self.num_particles):
            self.particles[i] = deepcopy(particle)
            self.particles[i].init_param()
            self.expl_strats[i] = NormalActNoiseExplStrat(
                self.particles[i].actor, std_init)
            self.optimizers[i] = to.optim.Adam(
                self.expl_strats[i].parameters(), lr=self.lr)
            self.fixed_particles[i] = deepcopy(self.particles[i])
            self.fixed_expl_strats[i] = deepcopy(self.expl_strats[i])

            if self.serial:
                self.samplers[i] = ParallelSampler(env,
                                                   self.expl_strats[i],
                                                   num_sampler_envs,
                                                   min_rollouts=min_rollouts,
                                                   min_steps=min_steps)
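For reference, the update these particles follow in SVPG (Liu et al., "Stein Variational Policy Gradient", 2017), reproduced from the paper rather than from this class; $T$ is the `temperature` argument, $k(\cdot,\cdot)$ a kernel between particle parameters, the step size corresponds to `lr`, and the prior term is omitted:

$$\theta_i \leftarrow \theta_i + \epsilon\,\phi(\theta_i), \qquad \phi(\theta_i) = \frac{1}{n}\sum_{j=1}^{n} \Big[ \nabla_{\theta_j} \frac{J(\theta_j)}{T}\, k(\theta_j, \theta_i) + \nabla_{\theta_j} k(\theta_j, \theta_i) \Big]$$

A large $T$ down-weights the return-driven first term relative to the repulsive second term, which is what the docstring means by the temperature determining "how jointly the training takes place".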
Example #9
    def __init__(self,
                 save_dir: str,
                 env: [SimEnv, StateAugmentationWrapper],
                 subroutine: Algorithm,
                 policy: Policy,
                 expl_strat: StochasticActionExplStrat,
                 max_iter: int,
                 num_rollouts: int = None,
                 steps_num: int = None,
                 apply_dynamics_noise: bool = False,
                 dyn_eps: float = 0.01,
                 dyn_phi: float = 0.1,
                 halfspan: float = 0.25,
                 apply_proccess_noise: bool = False,
                 proc_eps: float = 0.01,
                 proc_phi: float = 0.05,
                 apply_observation_noise: bool = False,
                 obs_eps: float = 0.01,
                 obs_phi: float = 0.05,
                 torch_observation: bool = True,
                 base_seed: int = None,
                 num_sampler_envs: int = 4,
                 logger: StepLogger = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment in which the agent should be trained
        :param subroutine: algorithm which performs the policy / value-function optimization
        :param policy: policy to be updated
        :param expl_strat: the exploration strategy
        :param max_iter: the maximum number of iterations
        :param num_rollouts: the number of rollouts to be performed for each update step
        :param steps_num: the number of steps to be performed for each update step
        :param apply_dynamics_noise: whether adversarially generated dynamics noise should be applied
        :param dyn_eps: the intensity of generated dynamics noise
        :param dyn_phi: the probability of applying dynamics noise
        :param halfspan: the halfspan of the uniform random distribution used to sample
        :param apply_proccess_noise: whether adversarially generated process noise should be applied
        :param proc_eps: the intensity of generated process noise
        :param proc_phi: the probability of applying process noise
        :param apply_observation_noise: whether adversarially generated observation noise should be applied
        :param obs_eps: the intensity of generated observation noise
        :param obs_phi: the probability of applying observation noise
        :param torch_observation: a function to provide a differentiable observation
        :param base_seed: the random seed
        :param num_sampler_envs: number of environments for parallel sampling
        :param logger: the logger
        """
        assert isinstance(subroutine, Algorithm)
        assert isinstance(max_iter, int) and max_iter > 0

        super().__init__(save_dir, max_iter, policy, logger)
        # Get the randomized environment (recommended to make it the most outer one in the chain)

        # Initialize adversarial wrappers
        if apply_dynamics_noise:
            assert isinstance(env, StateAugmentationWrapper)
            env = AdversarialDynamicsWrapper(env, self.policy, dyn_eps,
                                             dyn_phi, halfspan)
        if apply_proccess_noise:
            env = AdversarialStateWrapper(env,
                                          self.policy,
                                          proc_eps,
                                          proc_phi,
                                          torch_observation=torch_observation)
        if apply_observation_noise:
            env = AdversarialObservationWrapper(env, self.policy, obs_eps,
                                                obs_phi)

        self.num_rollouts = num_rollouts
        self.sampler = ParallelSampler(env,
                                       expl_strat,
                                       num_envs=num_sampler_envs,
                                       min_steps=steps_num,
                                       min_rollouts=num_rollouts,
                                       seed=base_seed)
        self._subroutine = subroutine
Example #10
class ARPL(Algorithm):
    """
    Adversarially Robust Policy Learning (ARPL)

    .. seealso::
        A. Mandlekar, Y. Zhu, A. Garg, L. Fei-Fei, S. Savarese, "Adversarially Robust Policy Learning:
        Active Construction of Physically-Plausible Perturbations", IROS, 2017
    """

    name: str = 'arpl'

    def __init__(self,
                 save_dir: str,
                 env: [SimEnv, StateAugmentationWrapper],
                 subroutine: Algorithm,
                 policy: Policy,
                 expl_strat: StochasticActionExplStrat,
                 max_iter: int,
                 num_rollouts: int = None,
                 steps_num: int = None,
                 apply_dynamics_noise: bool = False,
                 dyn_eps: float = 0.01,
                 dyn_phi: float = 0.1,
                 halfspan: float = 0.25,
                 apply_proccess_noise: bool = False,
                 proc_eps: float = 0.01,
                 proc_phi: float = 0.05,
                 apply_observation_noise: bool = False,
                 obs_eps: float = 0.01,
                 obs_phi: float = 0.05,
                 torch_observation: bool = True,
                 base_seed: int = None,
                 num_sampler_envs: int = 4,
                 logger: StepLogger = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment in which the agent should be trained
        :param subroutine: algorithm which performs the policy / value-function optimization
        :param policy: policy to be updated
        :param expl_strat: the exploration strategy
        :param max_iter: the maximum number of iterations
        :param num_rollouts: the number of rollouts to be performed for each update step
        :param steps_num: the number of steps to be performed for each update step
        :param apply_dynamics_noise: whether adversarially generated dynamics noise should be applied
        :param dyn_eps: the intensity of generated dynamics noise
        :param dyn_phi: the probability of applying dynamics noise
        :param halfspan: the halfspan of the uniform random distribution used to sample
        :param apply_proccess_noise: whether adversarially generated process noise should be applied
        :param proc_eps: the intensity of generated process noise
        :param proc_phi: the probability of applying process noise
        :param apply_observation_noise: whether adversarially generated observation noise should be applied
        :param obs_eps: the intensity of generated observation noise
        :param obs_phi: the probability of applying observation noise
        :param torch_observation: a function to provide a differentiable observation
        :param base_seed: the random seed
        :param num_sampler_envs: number of environments for parallel sampling
        :param logger: the logger
        """
        assert isinstance(subroutine, Algorithm)
        assert isinstance(max_iter, int) and max_iter > 0

        super().__init__(save_dir, max_iter, policy, logger)
        # Get the randomized environment (recommended to make it the most outer one in the chain)

        # Initialize adversarial wrappers
        if apply_dynamics_noise:
            assert isinstance(env, StateAugmentationWrapper)
            env = AdversarialDynamicsWrapper(env, self.policy, dyn_eps,
                                             dyn_phi, halfspan)
        if apply_proccess_noise:
            env = AdversarialStateWrapper(env,
                                          self.policy,
                                          proc_eps,
                                          proc_phi,
                                          torch_observation=torch_observation)
        if apply_observation_noise:
            env = AdversarialObservationWrapper(env, self.policy, obs_eps,
                                                obs_phi)

        self.num_rollouts = num_rollouts
        self.sampler = ParallelSampler(env,
                                       expl_strat,
                                       num_envs=num_sampler_envs,
                                       min_steps=steps_num,
                                       min_rollouts=num_rollouts,
                                       seed=base_seed)
        self._subroutine = subroutine

    def step(self, snapshot_mode: str, meta_info: dict = None):
        rollouts = self.sampler.sample()
        rets = [ro.undiscounted_return() for ro in rollouts]
        ret_avg = np.mean(rets)
        ret_med = np.median(rets)
        ret_std = np.std(rets)
        self.logger.add_value('num rollouts', len(rollouts))
        self.logger.add_value('avg rollout len',
                              np.mean([ro.length for ro in rollouts]))
        self.logger.add_value('avg return', ret_avg)
        self.logger.add_value('median return', ret_med)
        self.logger.add_value('std return', ret_std)

        # Sub-routine
        self._subroutine.update(rollouts)
        self._subroutine.logger.record_step()
        self._subroutine.make_snapshot(snapshot_mode, ret_avg.item())
Example #11
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: TwoHeadedPolicy,
                 q_fcn_1: Policy,
                 q_fcn_2: Policy,
                 memory_size: int,
                 gamma: float,
                 max_iter: int,
                 num_batch_updates: int,
                 tau: float = 0.995,
                 alpha_init: float = 0.2,
                 learn_alpha: bool = True,
                 target_update_intvl: int = 1,
                 standardize_rew: bool = True,
                 batch_size: int = 500,
                 min_rollouts: int = None,
                 min_steps: int = None,
                 num_sampler_envs: int = 4,
                 max_grad_norm: float = 5.,
                 lr: float = 3e-4,
                 lr_scheduler=None,
                 lr_scheduler_hparam: [dict, None] = None,
                 logger: StepLogger = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param q_fcn_1: state-action value function $Q(s,a)$, the associated target Q-function is created from a
                        re-initialized copy of this one
        :param q_fcn_2: state-action value function $Q(s,a)$, the associated target Q-function is created from a
                        re-initialized copy of this one
        :param memory_size: number of transitions in the replay memory buffer, e.g. 1000000
        :param gamma: temporal discount factor for the state values
        :param max_iter: number of iterations (policy updates)
        :param num_batch_updates: number of batch updates per algorithm steps
        :param tau: interpolation factor in the averaging of the target networks, used for the soft update a.k.a. Polyak
                    update, between 0 and 1
        :param alpha_init: initial weighting factor of the entropy term in the loss function
        :param learn_alpha: adapt the weighting factor of the entropy term
        :param target_update_intvl: number of iterations that pass before updating the target network
        :param standardize_rew: bool to flag if the rewards should be standardized
        :param batch_size: number of samples per policy update batch
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param num_sampler_envs: number of environments for parallel sampling
        :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping
        :param lr: (initial) learning rate for the optimizer, which can be modified by the scheduler.
                   By default, the learning rate is constant.
        :param lr_scheduler: learning rate scheduler type for the policy and the Q-functions that does one step
                             per `update()` call
        :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        if typed_env(env, ActNormWrapper) is None:
            raise pyrado.TypeErr(
                msg='SAC requires an environment wrapped by an ActNormWrapper!'
            )
        if not isinstance(q_fcn_1, Policy):
            raise pyrado.TypeErr(given=q_fcn_1, expected_type=Policy)
        if not isinstance(q_fcn_2, Policy):
            raise pyrado.TypeErr(given=q_fcn_2, expected_type=Policy)

        if logger is None:
            # Create logger that only logs every 100 steps of the algorithm
            logger = StepLogger(print_interval=100)
            logger.printers.append(ConsolePrinter())
            logger.printers.append(
                CSVPrinter(osp.join(save_dir, 'progress.csv')))

        # Call Algorithm's constructor
        super().__init__(save_dir, max_iter, policy, logger)

        # Store the inputs
        self._env = env
        self.q_fcn_1 = q_fcn_1
        self.q_fcn_2 = q_fcn_2
        self.q_targ_1 = deepcopy(self.q_fcn_1)
        self.q_targ_2 = deepcopy(self.q_fcn_2)
        self.q_targ_1.eval()
        self.q_targ_2.eval()
        self.gamma = gamma
        self.tau = tau
        self.learn_alpha = learn_alpha
        self.target_update_intvl = target_update_intvl
        self.standardize_rew = standardize_rew
        self.num_batch_updates = num_batch_updates
        self.batch_size = batch_size
        self.max_grad_norm = max_grad_norm

        # Initialize
        self._memory = ReplayMemory(memory_size)
        if policy.is_recurrent:
            init_expl_policy = RecurrentDummyPolicy(env.spec,
                                                    policy.hidden_size)
        else:
            init_expl_policy = DummyPolicy(env.spec)
        self.sampler_init = ParallelSampler(
            env,
            init_expl_policy,  # samples uniformly random from the action space
            num_envs=num_sampler_envs,
            min_steps=memory_size,
        )
        self._expl_strat = SACExplStrat(
            self._policy,
            std_init=1.)  # std_init will be overwritten by 2nd policy head
        self.sampler = ParallelSampler(
            env,
            self._expl_strat,
            num_envs=1,
            min_steps=min_steps,  # in [2] this would be 1
            min_rollouts=min_rollouts  # in [2] this would be None
        )
        self.sampler_eval = ParallelSampler(env,
                                            self._policy,
                                            num_envs=num_sampler_envs,
                                            min_steps=100 * env.max_steps,
                                            min_rollouts=None)
        self._optim_policy = to.optim.Adam([{
            'params': self._policy.parameters()
        }],
                                           lr=lr)
        self._optim_q_fcn_1 = to.optim.Adam(
            [{
                'params': self.q_fcn_1.parameters()
            }], lr=lr)
        self._optim_q_fcn_2 = to.optim.Adam(
            [{
                'params': self.q_fcn_2.parameters()
            }], lr=lr)
        log_alpha_init = to.log(
            to.tensor(alpha_init, dtype=to.get_default_dtype()))
        if learn_alpha:
            # Automatic entropy tuning
            self._log_alpha = nn.Parameter(log_alpha_init, requires_grad=True)
            self._alpha_optim = to.optim.Adam([{
                'params': self._log_alpha
            }],
                                              lr=lr)
            self.target_entropy = -to.prod(to.tensor(env.act_space.shape))
        else:
            self._log_alpha = log_alpha_init

        self._lr_scheduler_policy = lr_scheduler
        self._lr_scheduler_hparam = lr_scheduler_hparam
        if lr_scheduler is not None:
            self._lr_scheduler_policy = lr_scheduler(self._optim_policy,
                                                     **lr_scheduler_hparam)
            self._lr_scheduler_q_fcn_1 = lr_scheduler(self._optim_q_fcn_1,
                                                      **lr_scheduler_hparam)
            self._lr_scheduler_q_fcn_2 = lr_scheduler(self._optim_q_fcn_2,
                                                      **lr_scheduler_hparam)
Example #12
"""
Script to sample some rollouts using the ParallelSampler
"""
from tabulate import tabulate

from pyrado.environment_wrappers.action_normalization import ActNormWrapper
from pyrado.environments.pysim.ball_on_beam import BallOnBeamSim
from pyrado.policies.features import FeatureStack, identity_feat, squared_feat
from pyrado.policies.linear import LinearPolicy
from pyrado.sampling.parallel_sampler import ParallelSampler

if __name__ == '__main__':
    # Set up environment
    env = BallOnBeamSim(dt=0.02, max_steps=500)
    env = ActNormWrapper(env)

    # Set up policy
    feats = FeatureStack([identity_feat, squared_feat])
    policy = LinearPolicy(env.spec, feats)

    # Set up sampler
    sampler = ParallelSampler(env, policy, num_envs=2, min_rollouts=2000)

    # Sample and print
    ros = sampler.sample()
    print(
        tabulate({
            'StepSequence count': len(ros),
            'Step count': sum(map(len, ros)),
        }.items()))
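As a small extension (not part of the original script), the returns of the sampled rollouts could be summarized in the same tabulated style; `undiscounted_return()` is the method used by the other examples, and the NumPy import would normally sit at the top of the file:

    import numpy as np

    rets = [ro.undiscounted_return() for ro in ros]
    print(
        tabulate([
            ['avg return', np.mean(rets)],
            ['min return', np.min(rets)],
            ['max return', np.max(rets)],
        ]))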
Example #13
class SAC(Algorithm):
    """
    Soft Actor-Critic (SAC) variant with a stochastic policy, two Q-functions, and two Q-targets (no V-function)

    .. seealso::
        [1] T. Haarnoja, A. Zhou, P. Abbeel, S. Levine, "Soft Actor-Critic: Off-Policy Maximum Entropy Deep
        Reinforcement Learning with a Stochastic Actor", ICML, 2018

        [2] This implementation was inspired by https://github.com/pranz24/pytorch-soft-actor-critic
            which seems to be based on https://github.com/vitchyr/rlkit
    """

    name: str = 'sac'

    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: TwoHeadedPolicy,
                 q_fcn_1: Policy,
                 q_fcn_2: Policy,
                 memory_size: int,
                 gamma: float,
                 max_iter: int,
                 num_batch_updates: int,
                 tau: float = 0.995,
                 alpha_init: float = 0.2,
                 learn_alpha: bool = True,
                 target_update_intvl: int = 1,
                 standardize_rew: bool = True,
                 batch_size: int = 500,
                 min_rollouts: int = None,
                 min_steps: int = None,
                 num_sampler_envs: int = 4,
                 max_grad_norm: float = 5.,
                 lr: float = 3e-4,
                 lr_scheduler=None,
                 lr_scheduler_hparam: [dict, None] = None,
                 logger: StepLogger = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param q_fcn_1: state-action value function $Q(s,a)$, the associated target Q-function is created from a
                        re-initialized copy of this one
        :param q_fcn_2: state-action value function $Q(s,a)$, the associated target Q-function is created from a
                        re-initialized copy of this one
        :param memory_size: number of transitions in the replay memory buffer, e.g. 1000000
        :param gamma: temporal discount factor for the state values
        :param max_iter: number of iterations (policy updates)
        :param num_batch_updates: number of batch updates per algorithm steps
        :param tau: interpolation factor in the averaging of the target networks, used for the soft update a.k.a. Polyak
                    update, between 0 and 1
        :param alpha_init: initial weighting factor of the entropy term in the loss function
        :param learn_alpha: adapt the weighting factor of the entropy term
        :param target_update_intvl: number of iterations that pass before updating the target network
        :param standardize_rew: bool to flag if the rewards should be standardized
        :param batch_size: number of samples per policy update batch
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param num_sampler_envs: number of environments for parallel sampling
        :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping
        :param lr: (initial) learning rate for the optimizer, which can be modified by the scheduler.
                   By default, the learning rate is constant.
        :param lr_scheduler: learning rate scheduler type for the policy and the Q-functions that does one step
                             per `update()` call
        :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        if typed_env(env, ActNormWrapper) is None:
            raise pyrado.TypeErr(
                msg='SAC requires an environment wrapped by an ActNormWrapper!'
            )
        if not isinstance(q_fcn_1, Policy):
            raise pyrado.TypeErr(given=q_fcn_1, expected_type=Policy)
        if not isinstance(q_fcn_2, Policy):
            raise pyrado.TypeErr(given=q_fcn_2, expected_type=Policy)

        if logger is None:
            # Create logger that only logs every 100 steps of the algorithm
            logger = StepLogger(print_interval=100)
            logger.printers.append(ConsolePrinter())
            logger.printers.append(
                CSVPrinter(osp.join(save_dir, 'progress.csv')))

        # Call Algorithm's constructor
        super().__init__(save_dir, max_iter, policy, logger)

        # Store the inputs
        self._env = env
        self.q_fcn_1 = q_fcn_1
        self.q_fcn_2 = q_fcn_2
        self.q_targ_1 = deepcopy(self.q_fcn_1)
        self.q_targ_2 = deepcopy(self.q_fcn_2)
        self.q_targ_1.eval()
        self.q_targ_2.eval()
        self.gamma = gamma
        self.tau = tau
        self.learn_alpha = learn_alpha
        self.target_update_intvl = target_update_intvl
        self.standardize_rew = standardize_rew
        self.num_batch_updates = num_batch_updates
        self.batch_size = batch_size
        self.max_grad_norm = max_grad_norm

        # Initialize
        self._memory = ReplayMemory(memory_size)
        if policy.is_recurrent:
            init_expl_policy = RecurrentDummyPolicy(env.spec,
                                                    policy.hidden_size)
        else:
            init_expl_policy = DummyPolicy(env.spec)
        self.sampler_init = ParallelSampler(
            env,
            init_expl_policy,  # samples uniformly random from the action space
            num_envs=num_sampler_envs,
            min_steps=memory_size,
        )
        self._expl_strat = SACExplStrat(
            self._policy,
            std_init=1.)  # std_init will be overwritten by 2nd policy head
        self.sampler = ParallelSampler(
            env,
            self._expl_strat,
            num_envs=1,
            min_steps=min_steps,  # in [2] this would be 1
            min_rollouts=min_rollouts  # in [2] this would be None
        )
        self.sampler_eval = ParallelSampler(env,
                                            self._policy,
                                            num_envs=num_sampler_envs,
                                            min_steps=100 * env.max_steps,
                                            min_rollouts=None)
        self._optim_policy = to.optim.Adam([{
            'params': self._policy.parameters()
        }],
                                           lr=lr)
        self._optim_q_fcn_1 = to.optim.Adam(
            [{
                'params': self.q_fcn_1.parameters()
            }], lr=lr)
        self._optim_q_fcn_2 = to.optim.Adam(
            [{
                'params': self.q_fcn_2.parameters()
            }], lr=lr)
        log_alpha_init = to.log(
            to.tensor(alpha_init, dtype=to.get_default_dtype()))
        if learn_alpha:
            # Automatic entropy tuning
            self._log_alpha = nn.Parameter(log_alpha_init, requires_grad=True)
            self._alpha_optim = to.optim.Adam([{
                'params': self._log_alpha
            }],
                                              lr=lr)
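            # Standard SAC heuristic: target entropy = -dim(action space)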
            self.target_entropy = -to.prod(to.tensor(env.act_space.shape))
        else:
            self._log_alpha = log_alpha_init

        self._lr_scheduler_policy = lr_scheduler
        self._lr_scheduler_hparam = lr_scheduler_hparam
        if lr_scheduler is not None:
            self._lr_scheduler_policy = lr_scheduler(self._optim_policy,
                                                     **lr_scheduler_hparam)
            self._lr_scheduler_q_fcn_1 = lr_scheduler(self._optim_q_fcn_1,
                                                      **lr_scheduler_hparam)
            self._lr_scheduler_q_fcn_2 = lr_scheduler(self._optim_q_fcn_2,
                                                      **lr_scheduler_hparam)

    @property
    def expl_strat(self) -> SACExplStrat:
        return self._expl_strat

    @property
    def memory(self) -> ReplayMemory:
        """ Get the replay memory. """
        return self._memory

    @property
    def alpha(self) -> to.Tensor:
        """ Get the detached entropy coefficient. """
        return to.exp(self._log_alpha.detach())

    def step(self, snapshot_mode: str, meta_info: dict = None):
        if self._memory.isempty:
            # Warm-up phase
            print_cbt(
                'Collecting samples until the replay memory is full.',
                'w')
            # Sample steps and store them in the replay memory
            ros = self.sampler_init.sample()
            self._memory.push(ros)
        else:
            # Sample steps and store them in the replay memory
            ros = self.sampler.sample()
            self._memory.push(ros)

        # Log return-based metrics
        if self._curr_iter % self.logger.print_interval == 0:
            ros = self.sampler_eval.sample()
            rets = [ro.undiscounted_return() for ro in ros]
            ret_max = np.max(rets)
            ret_med = np.median(rets)
            ret_avg = np.mean(rets)
            ret_min = np.min(rets)
            ret_std = np.std(rets)
        else:
            ret_max, ret_med, ret_avg, ret_min, ret_std = 5 * [
                -pyrado.inf
            ]  # dummy values
        self.logger.add_value('max return', np.round(ret_max, 4))
        self.logger.add_value('median return', np.round(ret_med, 4))
        self.logger.add_value('avg return', np.round(ret_avg, 4))
        self.logger.add_value('min return', np.round(ret_min, 4))
        self.logger.add_value('std return', np.round(ret_std, 4))
        self.logger.add_value('avg rollout length',
                              np.round(np.mean([ro.length for ro in ros]), 2))
        self.logger.add_value('num rollouts', len(ros))
        self.logger.add_value('avg memory reward',
                              np.round(self._memory.avg_reward(), 4))

        # Use data in the memory to update the policy and the Q-functions
        self.update()

        # Save snapshot data
        self.make_snapshot(snapshot_mode, float(ret_avg), meta_info)

    @staticmethod
    def soft_update(target: nn.Module, source: nn.Module, tau: float = 0.995):
        """
        Moving average update, a.k.a. Polyak update.
        Modifies the input argument `target`.

        :param target: PyTorch module with parameters to be updated
        :param source: PyTorch module with parameters to update to
        :param tau: interpolation factor for averaging, between 0 and 1
        """
        if not 0 < tau < 1:
            raise pyrado.ValueErr(given=tau,
                                  g_constraint='0',
                                  l_constraint='1')

        for targ_param, src_param in zip(target.parameters(),
                                         source.parameters()):
            targ_param.data = targ_param.data * tau + src_param.data * (1. -
                                                                        tau)

    def update(self):
        """ Update the policy's and Q-functions' parameters on transitions sampled from the replay memory. """
        # Containers for logging
        policy_losses = to.zeros(self.num_batch_updates)
        expl_strat_stds = to.zeros(self.num_batch_updates)
        q_fcn_1_losses = to.zeros(self.num_batch_updates)
        q_fcn_2_losses = to.zeros(self.num_batch_updates)
        policy_grad_norm = to.zeros(self.num_batch_updates)
        q_fcn_1_grad_norm = to.zeros(self.num_batch_updates)
        q_fcn_2_grad_norm = to.zeros(self.num_batch_updates)

        for b in tqdm(range(self.num_batch_updates),
                      total=self.num_batch_updates,
                      desc=f'Updating',
                      unit='batches',
                      file=sys.stdout,
                      leave=False):

            # Sample steps and the associated next step from the replay memory
            steps, next_steps = self._memory.sample(self.batch_size)
            steps.torch(data_type=to.get_default_dtype())
            next_steps.torch(data_type=to.get_default_dtype())

            # Standardize rewards
            if self.standardize_rew:
                rewards = standardize(steps.rewards).unsqueeze(1)
            else:
                rewards = steps.rewards.unsqueeze(1)
            rew_scale = 1.
            rewards *= rew_scale

            with to.no_grad():
                # Create masks for the non-final observations
                not_done = to.tensor(1. - steps.done,
                                     dtype=to.get_default_dtype()).unsqueeze(1)

                # Compute the (next)state-(next)action values Q(s',a') from the target networks
                if self.policy.is_recurrent:
                    next_act_expl, next_log_probs, _ = self._expl_strat(
                        next_steps.observations, next_steps.hidden_states)
                else:
                    next_act_expl, next_log_probs = self._expl_strat(
                        next_steps.observations)
                next_q_val_target_1 = self.q_targ_1(
                    to.cat([next_steps.observations, next_act_expl], dim=1))
                next_q_val_target_2 = self.q_targ_2(
                    to.cat([next_steps.observations, next_act_expl], dim=1))
                next_q_val_target_min = to.min(
                    next_q_val_target_1,
                    next_q_val_target_2) - self.alpha * next_log_probs
                next_q_val = rewards + not_done * self.gamma * next_q_val_target_min

            # Compute the two Q-function losses
            # E_{(s_t, a_t) ~ D} [1/2 * (Q_i(s_t, a_t) - r_t - gamma * E_{s_{t+1} ~ p} [V(s_{t+1})] )^2]
            q_val_1 = self.q_fcn_1(
                to.cat([steps.observations, steps.actions], dim=1))
            q_val_2 = self.q_fcn_2(
                to.cat([steps.observations, steps.actions], dim=1))
            q_1_loss = nn.functional.mse_loss(q_val_1, next_q_val)
            q_2_loss = nn.functional.mse_loss(q_val_2, next_q_val)
            q_fcn_1_losses[b] = q_1_loss.data
            q_fcn_2_losses[b] = q_2_loss.data

            # Compute the policy loss
            # E_{s_t ~ D, eps_t ~ N} [log( pi( f(eps_t; s_t) ) ) - Q(s_t, f(eps_t; s_t))]
            if self.policy.is_recurrent:
                act_expl, log_probs, _ = self._expl_strat(
                    steps.observations, steps.hidden_states)
            else:
                act_expl, log_probs = self._expl_strat(steps.observations)
            q1_pi = self.q_fcn_1(to.cat([steps.observations, act_expl], dim=1))
            q2_pi = self.q_fcn_2(to.cat([steps.observations, act_expl], dim=1))
            min_q_pi = to.min(q1_pi, q2_pi)
            policy_loss = to.mean(self.alpha * log_probs - min_q_pi)
            policy_losses[b] = policy_loss.data
            expl_strat_stds[b] = to.mean(self._expl_strat.std.data)

            # Do one optimization step for each optimizer, and clip the gradients if desired
            # Q-fcn 1
            self._optim_q_fcn_1.zero_grad()
            q_1_loss.backward()
            q_fcn_1_grad_norm[b] = self.clip_grad(self.q_fcn_1, None)
            self._optim_q_fcn_1.step()
            # Q-fcn 2
            self._optim_q_fcn_2.zero_grad()
            q_2_loss.backward()
            q_fcn_2_grad_norm[b] = self.clip_grad(self.q_fcn_2, None)
            self._optim_q_fcn_2.step()
            # Policy
            self._optim_policy.zero_grad()
            policy_loss.backward()
            policy_grad_norm[b] = self.clip_grad(self._expl_strat.policy,
                                                 self.max_grad_norm)
            self._optim_policy.step()

            if self.learn_alpha:
                # Compute entropy coefficient loss
                alpha_loss = -to.mean(
                    self._log_alpha *
                    (log_probs.detach() + self.target_entropy))
                # Do one optimizer step for the entropy coefficient optimizer
                self._alpha_optim.zero_grad()
                alpha_loss.backward()
                self._alpha_optim.step()

            # Soft-update the target networks
            if (self._curr_iter * self.num_batch_updates +
                    b) % self.target_update_intvl == 0:
                SAC.soft_update(self.q_targ_1, self.q_fcn_1, self.tau)
                SAC.soft_update(self.q_targ_2, self.q_fcn_2, self.tau)

        # Update the learning rate if the schedulers have been specified
        if self._lr_scheduler_policy is not None:
            self._lr_scheduler_policy.step()
            self._lr_scheduler_q_fcn_1.step()
            self._lr_scheduler_q_fcn_2.step()

        # Logging
        self.logger.add_value('Q1 loss', to.mean(q_fcn_1_losses).item())
        self.logger.add_value('Q2 loss', to.mean(q_fcn_2_losses).item())
        self.logger.add_value('policy loss', to.mean(policy_losses).item())
        self.logger.add_value('avg policy grad norm',
                              to.mean(policy_grad_norm).item())
        self.logger.add_value('avg expl strat std',
                              to.mean(expl_strat_stds).item())
        self.logger.add_value('alpha', self.alpha.item())
        if self._lr_scheduler_policy is not None:
            self.logger.add_value('learning rate',
                                  self._lr_scheduler_policy.get_lr())
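
The `alpha_loss` above is SAC's automatic entropy-temperature tuning: `self.alpha` is presumably the exponential of `self._log_alpha`, and `self.target_entropy` is commonly chosen as the negative of the action dimensionality. A minimal stand-alone sketch of that mechanism (the variable names and the `-act_dim` heuristic are assumptions, not necessarily pyrado's defaults):

import torch as to

act_dim = 2                               # dimensionality of the action space
target_entropy = -float(act_dim)          # common heuristic for the entropy target
log_alpha = to.zeros(1, requires_grad=True)
alpha_optim = to.optim.Adam([log_alpha], lr=3e-4)

log_probs = to.randn(32).detach()         # stand-in for log pi(a|s) of a sampled batch
alpha_loss = -to.mean(log_alpha*(log_probs + target_entropy))
alpha_optim.zero_grad()
alpha_loss.backward()
alpha_optim.step()
alpha = log_alpha.exp()                   # coefficient used in the critic and policy losses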

    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        if meta_info is None:
            # This instance is not a subroutine of a meta-algorithm
            joblib.dump(self._env, osp.join(self._save_dir, 'env.pkl'))
            to.save(self.q_targ_1, osp.join(self._save_dir, 'target1.pt'))
            to.save(self.q_targ_2, osp.join(self._save_dir, 'target2.pt'))
        else:
            # This algorithm instance is a subroutine of a meta-algorithm
            if 'prefix' in meta_info and 'suffix' in meta_info:
                to.save(
                    self.q_targ_1,
                    osp.join(
                        self._save_dir,
                        f"{meta_info['prefix']}_target1_{meta_info['suffix']}.pt"
                    ))
                to.save(
                    self.q_targ_2,
                    osp.join(
                        self._save_dir,
                        f"{meta_info['prefix']}_target2_{meta_info['suffix']}.pt"
                    ))
            elif 'prefix' in meta_info and 'suffix' not in meta_info:
                to.save(
                    self.q_targ_1,
                    osp.join(self._save_dir,
                             f"{meta_info['prefix']}_target1.pt"))
                to.save(
                    self.q_targ_2,
                    osp.join(self._save_dir,
                             f"{meta_info['prefix']}_target2.pt"))
            elif 'prefix' not in meta_info and 'suffix' in meta_info:
                to.save(
                    self.q_targ_1,
                    osp.join(self._save_dir,
                             f"target1_{meta_info['suffix']}.pt"))
                to.save(
                    self.q_targ_2,
                    osp.join(self._save_dir,
                             f"target2_{meta_info['suffix']}.pt"))
            else:
                raise NotImplementedError

    def load_snapshot(self, load_dir: str = None, meta_info: dict = None):
        # Get the directory to load from
        ld = load_dir if load_dir is not None else self._save_dir
        super().load_snapshot(ld, meta_info)

        if meta_info is None:
            # This algorithm instance is not a subroutine of a meta-algorithm
            self._env = joblib.load(osp.join(ld, 'env.pkl'))
            self.q_targ_1.load_state_dict(
                to.load(osp.join(ld, 'target1.pt')).state_dict())
            self.q_targ_2.load_state_dict(
                to.load(osp.join(ld, 'target2.pt')).state_dict())
        else:
            # This algorithm instance is a subroutine of a meta-algorithm
            if 'prefix' in meta_info and 'suffix' in meta_info:
                self.q_targ_1.load_state_dict(
                    to.load(
                        osp.join(
                            ld,
                            f"{meta_info['prefix']}_target1_{meta_info['suffix']}.pt"
                        )).state_dict())
                self.q_targ_2.load_state_dict(
                    to.load(
                        osp.join(
                            ld,
                            f"{meta_info['prefix']}_target2_{meta_info['suffix']}.pt"
                        )).state_dict())
            elif 'prefix' in meta_info and 'suffix' not in meta_info:
                self.q_targ_1.load_state_dict(
                    to.load(osp.join(
                        ld, f"{meta_info['prefix']}_target1.pt")).state_dict())
                self.q_targ_2.load_state_dict(
                    to.load(osp.join(
                        ld, f"{meta_info['prefix']}_target2.pt")).state_dict())
            elif 'prefix' not in meta_info and 'suffix' in meta_info:
                self.q_targ_1.load_state_dict(
                    to.load(osp.join(
                        ld, f"target1_{meta_info['suffix']}.pt")).state_dict())
                self.q_targ_2.load_state_dict(
                    to.load(osp.join(
                        ld, f"target2_{meta_info['suffix']}.pt")).state_dict())
            else:
                raise NotImplementedError

    def reset(self, seed: int = None):
        # Reset the exploration strategy, internal variables and the random seeds
        super().reset(seed)

        # Re-initialize sampler in case env or policy changed
        self.sampler.reinit()

        # Reset the replay memory
        self._memory.reset()

        # Reset the learning rate schedulers
        if self._lr_scheduler_policy is not None:
            self._lr_scheduler_policy.last_epoch = -1
        if self._lr_scheduler_q_fcn_1 is not None:
            self._lr_scheduler_q_fcn_1.last_epoch = -1
        if self._lr_scheduler_q_fcn_2 is not None:
            self._lr_scheduler_q_fcn_2.last_epoch = -1
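
`SAC.soft_update` in the update loop above refreshes the target critics `q_targ_1`/`q_targ_2`; it presumably performs the Polyak averaging that is standard in SAC. A minimal sketch of such an update (the function name and the convention that `tau` weights the old target parameters are assumptions, not pyrado's verified API):

import torch as to
import torch.nn as nn

def soft_update_(target: nn.Module, source: nn.Module, tau: float = 0.995):
    """ Polyak-average the source parameters into the target network in place. """
    with to.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.data.mul_(tau).add_((1. - tau)*s_param.data)
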
Example #14
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 critic: GAE,
                 max_iter: int,
                 min_rollouts: int = None,
                 min_steps: int = None,
                 value_fcn_coeff: float = 0.5,
                 entropy_coeff: float = 1e-3,
                 batch_size: int = 32,
                 std_init: float = 1.0,
                 max_grad_norm: float = None,
                 num_sampler_envs: int = 4,
                 lr: float = 5e-4,
                 lr_scheduler=None,
                 lr_scheduler_hparam: [dict, None] = None,
                 logger: StepLogger = None):
        r"""
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment which the policy operates
        :param policy: policy to be updated
        :param critic: advantage estimation function $A(s,a) = Q(s,a) - V(s)$
        :param max_iter: number of iterations (policy updates)
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param value_fcn_coeff: weighting factor of the value function term in the combined loss, specific to PPO2
        :param entropy_coeff: weighting factor of the entropy term in the combined loss, specific to PPO2
        :param batch_size: number of samples per policy update batch
        :param std_init: initial standard deviation on the actions for the exploration noise
        :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping
        :param num_sampler_envs: number of environments for parallel sampling
        :param lr: (initial) learning rate for the optimizer, which can be modified by the scheduler.
                   By default, the learning rate is constant.
        :param lr_scheduler: learning rate scheduler that does one step per epoch (pass through the whole data set)
        :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler
        :param logger: logger for every step of the algorithm
        """
        # Call ActorCritic's constructor
        super().__init__(env, policy, critic, save_dir, max_iter, logger)

        # Store the inputs
        self.min_rollouts = min_rollouts
        self.min_steps = min_steps
        self.value_fcn_coeff = value_fcn_coeff
        self.entropy_coeff = entropy_coeff
        self.batch_size = batch_size
        self.max_grad_norm = max_grad_norm

        # Initialize
        self._expl_strat = NormalActNoiseExplStrat(self._policy,
                                                   std_init=std_init)
        self.sampler = ParallelSampler(env,
                                       self.expl_strat,
                                       num_envs=num_sampler_envs,
                                       min_steps=min_steps,
                                       min_rollouts=min_rollouts)
        self.optim = to.optim.RMSprop(
            [{
                'params': self._policy.parameters()
            }, {
                'params': self.expl_strat.noise.parameters()
            }, {
                'params': self._critic.value_fcn.parameters()
            }],
            lr=lr,
            eps=1e-5)
        self._lr_scheduler = lr_scheduler
        self._lr_scheduler_hparam = lr_scheduler_hparam
        if lr_scheduler is not None:
            self._lr_scheduler = lr_scheduler(self.optim,
                                              **lr_scheduler_hparam)
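
According to the docstring, `value_fcn_coeff` and `entropy_coeff` weight the value-function and entropy terms of a combined actor-critic loss. A hedged sketch of how such a loss is typically assembled (the function below is illustrative and not taken from pyrado's update step):

import torch as to

def combined_loss(policy_loss: to.Tensor, value_fcn_loss: to.Tensor, entropy: to.Tensor,
                  value_fcn_coeff: float = 0.5, entropy_coeff: float = 1e-3) -> to.Tensor:
    # The entropy term enters with a negative sign, so higher entropy lowers the loss
    # and thereby encourages exploration
    return policy_loss + value_fcn_coeff*value_fcn_loss - entropy_coeff*entropy

# e.g. combined_loss(to.tensor(0.3), to.tensor(1.2), to.tensor(0.9)) -> tensor(0.8991)
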
Example #15
    plt.show()


if __name__ == '__main__':
    # Set up environment
    dp_gt = dict(m=2., k=20., d=0.8)  # ground truth
    dp_init = dict(m=1.0, k=24., d=0.4)  # initial guess
    dt = 1/50.
    env = OneMassOscillatorSim(dt=dt, max_steps=400)
    env.reset(domain_param=dp_gt)

    # Set up policy
    policy = DummyPolicy(env.spec)

    # Sample
    sampler = ParallelSampler(env, policy, num_envs=1, min_rollouts=50, seed=1)
    ros = sampler.sample()

    # Pyro
    pyro.set_rng_seed(1001)
    pyro.enable_validation(True)

    train(
        SVI(model=model,
            guide=guide,
            optim=optim.Adam({'lr': 0.01}),
            # optim=optim.SGD({'lr': 0.001, 'momentum': 0.1}),
            loss=Trace_ELBO()),
        rollouts=ros, prior=dp_init
    )
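
The `model`, `guide`, and `train` callables used above are defined earlier in the original script and are not shown here. Purely as an illustration of what a mean-field guide over the three domain parameters could look like (all names, the LogNormal choice, and the parameter initialization below are assumptions, not the script's actual code):

import pyro
import pyro.distributions as dist
import torch as to
from torch.distributions import constraints

def guide(rollouts, prior):
    # One positive LogNormal factor per domain parameter (m, k, d)
    for name, init in prior.items():
        loc = pyro.param(f'{name}_loc', to.tensor(float(init)).log())
        scale = pyro.param(f'{name}_scale', to.tensor(0.1), constraint=constraints.positive)
        pyro.sample(name, dist.LogNormal(loc, scale))
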
Example #16
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: DiscrActQValFNNPolicy,
                 memory_size: int,
                 eps_init: float,
                 eps_schedule_gamma: float,
                 gamma: float,
                 max_iter: int,
                 num_batch_updates: int,
                 target_update_intvl: int = 5,
                 min_rollouts: int = None,
                 min_steps: int = None,
                 batch_size: int = 256,
                 num_sampler_envs: int = 4,
                 max_grad_norm: float = 0.5,
                 lr: float = 5e-4,
                 lr_scheduler=None,
                 lr_scheduler_hparam: [dict, None] = None,
                 logger: StepLogger = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: environment which the policy operates
        :param policy: (current) Q-network updated by this algorithm
        :param memory_size: number of transitions in the replay memory buffer
        :param eps_init: initial value for the probability of taking a random action, constant if `eps_schedule_gamma==1`
        :param eps_schedule_gamma: temporal discount factor for the exponential decay of epsilon
        :param gamma: temporal discount factor for the state values
        :param max_iter: number of iterations (policy updates)
        :param num_batch_updates: number of batch updates per algorithm step
        :param target_update_intvl: number of iterations that pass before updating the target network
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param batch_size: number of samples per policy update batch
        :param num_sampler_envs: number of environments for parallel sampling
        :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping
        :param lr: (initial) learning rate for the optimizer, which can be modified by the scheduler.
                   By default, the learning rate is constant.
        :param lr_scheduler: learning rate scheduler that does one step per epoch (pass through the whole data set)
        :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler
        :param logger: logger for every step of the algorithm
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        if not isinstance(policy, DiscrActQValFNNPolicy):
            raise pyrado.TypeErr(given=policy, expected_type=DiscrActQValFNNPolicy)

        if logger is None:
            # Create logger that only logs every 100 steps of the algorithm
            logger = StepLogger(print_interval=100)
            logger.printers.append(ConsolePrinter())
            logger.printers.append(CSVPrinter(osp.join(save_dir, 'progress.csv')))

        # Call Algorithm's constructor
        super().__init__(save_dir, max_iter, policy, logger)

        # Store the inputs
        self._env = env
        self.target = deepcopy(self._policy)
        self.target.eval()  # will not be trained using the optimizer
        self._memory_size = memory_size
        self.eps = eps_init
        self.gamma = gamma
        self.target_update_intvl = target_update_intvl
        self.num_batch_updates = num_batch_updates
        self.batch_size = batch_size
        self.max_grad_norm = max_grad_norm

        # Initialize
        self._expl_strat = EpsGreedyExplStrat(self._policy, eps_init, eps_schedule_gamma)
        self._memory = ReplayMemory(memory_size)
        self.sampler = ParallelSampler(
            env, self._expl_strat,
            num_envs=1,
            min_steps=min_steps,
            min_rollouts=min_rollouts
        )
        self.sampler_eval = ParallelSampler(
            env, self._policy,
            num_envs=num_sampler_envs,
            min_steps=100*env.max_steps,
            min_rollouts=None
        )
        self.optim = to.optim.RMSprop([{'params': self._policy.parameters()}], lr=lr)
        self._lr_scheduler = lr_scheduler
        self._lr_scheduler_hparam = lr_scheduler_hparam
        if lr_scheduler is not None:
            self._lr_scheduler = lr_scheduler(self.optim, **lr_scheduler_hparam)
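
`eps_schedule_gamma` presumably controls an exponential decay of the exploration probability over the iterations, which is why the docstring notes that epsilon stays constant for `eps_schedule_gamma == 1`. A small sketch of such a schedule (an assumption about what `EpsGreedyExplStrat.schedule_eps` computes, not its verified implementation):

def scheduled_eps(eps_init: float, eps_schedule_gamma: float, curr_iter: int) -> float:
    # Exponential decay; reduces to a constant schedule for eps_schedule_gamma == 1
    return eps_init*eps_schedule_gamma**curr_iter

# e.g. eps_init = 1.0 and eps_schedule_gamma = 0.99 give eps ~= 0.605 after 50 iterations
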
Example #17
class DQL(Algorithm):
    """
    Deep Q-Learning (without bells and whistles)

    .. seealso::
        [1] V. Mnih et.al., "Human-level control through deep reinforcement learning", Nature, 2015
    """

    name: str = 'dql'

    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: DiscrActQValFNNPolicy,
                 memory_size: int,
                 eps_init: float,
                 eps_schedule_gamma: float,
                 gamma: float,
                 max_iter: int,
                 num_batch_updates: int,
                 target_update_intvl: int = 5,
                 min_rollouts: int = None,
                 min_steps: int = None,
                 batch_size: int = 256,
                 num_sampler_envs: int = 4,
                 max_grad_norm: float = 0.5,
                 lr: float = 5e-4,
                 lr_scheduler=None,
                 lr_scheduler_hparam: [dict, None] = None,
                 logger: StepLogger = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: environment which the policy operates
        :param policy: (current) Q-network updated by this algorithm
        :param memory_size: number of transitions in the replay memory buffer
        :param eps_init: initial value for the probability of taking a random action, constant if `eps_schedule_gamma==1`
        :param eps_schedule_gamma: temporal discount factor for the exponential decay of epsilon
        :param gamma: temporal discount factor for the state values
        :param max_iter: number of iterations (policy updates)
        :param num_batch_updates: number of batch updates per algorithm step
        :param target_update_intvl: number of iterations that pass before updating the target network
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param batch_size: number of samples per policy update batch
        :param num_sampler_envs: number of environments for parallel sampling
        :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping
        :param lr: (initial) learning rate for the optimizer, which can be modified by the scheduler.
                   By default, the learning rate is constant.
        :param lr_scheduler: learning rate scheduler that does one step per epoch (pass through the whole data set)
        :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler
        :param logger: logger for every step of the algorithm
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        if not isinstance(policy, DiscrActQValFNNPolicy):
            raise pyrado.TypeErr(given=policy, expected_type=DiscrActQValFNNPolicy)

        if logger is None:
            # Create logger that only logs every 100 steps of the algorithm
            logger = StepLogger(print_interval=100)
            logger.printers.append(ConsolePrinter())
            logger.printers.append(CSVPrinter(osp.join(save_dir, 'progress.csv')))

        # Call Algorithm's constructor
        super().__init__(save_dir, max_iter, policy, logger)

        # Store the inputs
        self._env = env
        self.target = deepcopy(self._policy)
        self.target.eval()  # will not be trained using the optimizer
        self._memory_size = memory_size
        self.eps = eps_init
        self.gamma = gamma
        self.target_update_intvl = target_update_intvl
        self.num_batch_updates = num_batch_updates
        self.batch_size = batch_size
        self.max_grad_norm = max_grad_norm

        # Initialize
        self._expl_strat = EpsGreedyExplStrat(self._policy, eps_init, eps_schedule_gamma)
        self._memory = ReplayMemory(memory_size)
        self.sampler = ParallelSampler(
            env, self._expl_strat,
            num_envs=1,
            min_steps=min_steps,
            min_rollouts=min_rollouts
        )
        self.sampler_eval = ParallelSampler(
            env, self._policy,
            num_envs=num_sampler_envs,
            min_steps=100*env.max_steps,
            min_rollouts=None
        )
        self.optim = to.optim.RMSprop([{'params': self._policy.parameters()}], lr=lr)
        self._lr_scheduler = lr_scheduler
        self._lr_scheduler_hparam = lr_scheduler_hparam
        if lr_scheduler is not None:
            self._lr_scheduler = lr_scheduler(self.optim, **lr_scheduler_hparam)

    @property
    def expl_strat(self) -> EpsGreedyExplStrat:
        return self._expl_strat

    @property
    def memory(self) -> ReplayMemory:
        """ Get the replay memory. """
        return self._memory

    def step(self, snapshot_mode: str, meta_info: dict = None):
        # Sample steps and store them in the replay memory
        ros = self.sampler.sample()
        self._memory.push(ros)

        while len(self._memory) < self.memory.capacity:
            # Warm-up phase
            print_cbt('Collecting samples until the replay memory is full.', 'w')
            # Sample steps and store them in the replay memory
            ros = self.sampler.sample()
            self._memory.push(ros)

        # Log return-based metrics
        if self._curr_iter % self.logger.print_interval == 0:
            ros = self.sampler_eval.sample()
            rets = [ro.undiscounted_return() for ro in ros]
            ret_max = np.max(rets)
            ret_med = np.median(rets)
            ret_avg = np.mean(rets)
            ret_min = np.min(rets)
            ret_std = np.std(rets)
        else:
            ret_max, ret_med, ret_avg, ret_min, ret_std = 5*[-pyrado.inf]  # dummy values
        self.logger.add_value('max return', np.round(ret_max, 4))
        self.logger.add_value('median return', np.round(ret_med, 4))
        self.logger.add_value('avg return', np.round(ret_avg, 4))
        self.logger.add_value('min return', np.round(ret_min, 4))
        self.logger.add_value('std return', np.round(ret_std, 4))
        self.logger.add_value('avg rollout length', np.round(np.mean([ro.length for ro in ros]), 2))
        self.logger.add_value('num rollouts', len(ros))
        self.logger.add_value('avg memory reward', np.round(self._memory.avg_reward(), 4))

        # Use data in the memory to update the policy and the target Q-function
        self.update()

        # Save snapshot data
        self.make_snapshot(snapshot_mode, float(ret_avg), meta_info)

    def loss_fcn(self, q_vals: to.Tensor, expected_q_vals: to.Tensor) -> to.Tensor:
        r"""
        The Huber loss function on the one-step TD error $\delta = Q(s,a) - (r + \gamma \max_a Q(s^\prime, a))$.

        :param q_vals: state-action values $Q(s,a)$, from policy network
        :param expected_q_vals: expected state-action values $r + \gamma \max_a Q(s^\prime, a)$, from target network
        :return: loss value
        """
        return nn.functional.smooth_l1_loss(q_vals, expected_q_vals)
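
`smooth_l1_loss` is PyTorch's Huber loss with a default threshold of 1: quadratic for small TD errors and linear for large ones, which limits the influence of outlier transitions. A quick numeric check of the piecewise definition (using the default threshold):

import torch as to
import torch.nn as nn

delta = to.tensor([0.5, 2.0])  # two one-step TD errors
manual = to.where(delta.abs() < 1., 0.5*delta**2, delta.abs() - 0.5)
auto = nn.functional.smooth_l1_loss(delta, to.zeros_like(delta), reduction='none')
assert to.allclose(manual, auto)  # both are tensor([0.1250, 1.5000])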

    def update(self):
        """ Update the policy's and target Q-function's parameters on transitions sampled from the replay memory. """
        losses = to.zeros(self.num_batch_updates)
        policy_grad_norm = to.zeros(self.num_batch_updates)

        for b in tqdm(range(self.num_batch_updates), total=self.num_batch_updates,
                      desc=f'Updating', unit='batches', file=sys.stdout, leave=False):

            # Sample steps and the associated next step from the replay memory
            steps, next_steps = self._memory.sample(self.batch_size)
            steps.torch(data_type=to.get_default_dtype())
            next_steps.torch(data_type=to.get_default_dtype())

            # Create masks for the non-final observations
            not_done = to.tensor(1. - steps.done, dtype=to.get_default_dtype())

            # Compute the state-action values Q(s,a) using the current DQN policy
            q_vals = self.expl_strat.policy.q_values_chosen(steps.observations)

            # Compute the second term of TD-error
            next_v_vals = self.target.q_values_chosen(next_steps.observations).detach()
            expected_q_val = steps.rewards + not_done*self.gamma*next_v_vals

            # Compute the loss, clip the gradients if desired, and do one optimization step
            loss = self.loss_fcn(q_vals, expected_q_val)
            losses[b] = loss.data
            self.optim.zero_grad()
            loss.backward()
            policy_grad_norm[b] = self.clip_grad(self.expl_strat.policy, self.max_grad_norm)
            self.optim.step()

            # Update the target network by copying all weights and biases from the DQN policy
            if (self._curr_iter*self.num_batch_updates + b)%self.target_update_intvl == 0:
                self.target.load_state_dict(self.expl_strat.policy.state_dict())

        # Schedule the exploration parameter epsilon
        self.expl_strat.schedule_eps(self._curr_iter)

        # Update the learning rate if a scheduler has been specified
        if self._lr_scheduler is not None:
            self._lr_scheduler.step()

        # Logging
        with to.no_grad():
            self.logger.add_value('loss after', to.mean(losses).item())
        self.logger.add_value('expl strat eps', self.expl_strat.eps.item())
        self.logger.add_value('avg policy grad norm', to.mean(policy_grad_norm).item())
        if self._lr_scheduler is not None:
            self.logger.add_value('learning rate', self._lr_scheduler.get_lr())

    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        if meta_info is None:
            # This instance is not a subroutine of a meta-algorithm
            joblib.dump(self._env, osp.join(self._save_dir, 'env.pkl'))
            to.save(self.target, osp.join(self._save_dir, 'target.pt'))
        else:
            # This algorithm instance is a subroutine of a meta-algorithm
            if 'prefix' in meta_info and 'suffix' in meta_info:
                to.save(self.target,
                        osp.join(self._save_dir, f"{meta_info['prefix']}_target_{meta_info['suffix']}.pt"))
            elif 'prefix' in meta_info and 'suffix' not in meta_info:
                to.save(self.target, osp.join(self._save_dir, f"{meta_info['prefix']}_target.pt"))
            elif 'prefix' not in meta_info and 'suffix' in meta_info:
                to.save(self.target, osp.join(self._save_dir, f"target_{meta_info['suffix']}.pt"))
            else:
                raise NotImplementedError

    def load_snapshot(self, load_dir: str = None, meta_info: dict = None):
        # Get the directory to load from
        ld = load_dir if load_dir is not None else self._save_dir
        super().load_snapshot(ld, meta_info)

        if meta_info is None:
            # This algorithm instance is not a subroutine of a meta-algorithm
            self._env = joblib.load(osp.join(ld, 'env.pkl'))
            self.target.load_state_dict(to.load(osp.join(ld, 'target.pt')).state_dict())
        else:
            # This algorithm instance is a subroutine of a meta-algorithm
            if 'prefix' in meta_info and 'suffix' in meta_info:
                self.target.load_state_dict(
                    to.load(osp.join(ld, f"{meta_info['prefix']}_target_{meta_info['suffix']}.pt")).state_dict()
                )
            elif 'prefix' in meta_info and 'suffix' not in meta_info:
                self.target.load_state_dict(
                    to.load(osp.join(ld, f"{meta_info['prefix']}_target.pt")).state_dict()
                )
            elif 'prefix' not in meta_info and 'suffix' in meta_info:
                self.target.load_state_dict(
                    to.load(osp.join(ld, f"target_{meta_info['suffix']}.pt")).state_dict()
                )
            else:
                raise NotImplementedError

    def reset(self, seed: int = None):
        # Reset the exploration strategy, internal variables and the random seeds
        super().reset(seed)

        # Re-initialize sampler in case env or policy changed
        self.sampler.reinit()

        # Reset the replay memory
        self._memory.reset()

        # Reset the learning rate scheduler
        if self._lr_scheduler is not None:
            self._lr_scheduler.last_epoch = -1