Example #1
def test_cuda_sampling_w_dr(default_bob, bob_pert):
    # Add randomizer
    env = DomainRandWrapperLive(default_bob, bob_pert)

    # Use a simple policy
    policy = FNNPolicy(env.spec, hidden_sizes=[8], hidden_nonlin=to.tanh, use_cuda=True)

    # Create the sampler
    sampler = ParallelSampler(env, policy, num_envs=2, min_rollouts=10)

    samples = sampler.sample()
    assert samples is not None
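A possible follow-up check (not part of the original test), assuming the rollouts behave like the `StepSequence` objects in the later examples and that the sampler honors `min_rollouts`:

    assert len(samples) >= 10  # min_rollouts=10 was requested above
    assert all(ro.length > 0 for ro in samples)  # every rollout contains at least one step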
Example #2
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 min_rollouts: int = None,
                 min_steps: int = None,
                 num_sampler_envs: int = 4,
                 logger: StepLogger = None,
                 sampler: SamplerBase = None,
                 ball_z_dim_mismatch: bool = True):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment in which the policy operates
        :param policy: policy which this algorithm is creating
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param num_sampler_envs: number of environments for parallel sampling
        :param ball_z_dim_mismatch: only useful for BallOnPlate5DSim,
                                    set to True if the controller does not have the z component (relative position)
                                    of the ball in the state vector, i.e. state is 14-dim instead of 16-dim
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        if not isinstance(policy, LinearPolicy):
            raise pyrado.TypeErr(given=policy, expected_type=LinearPolicy)

        # Call Algorithm's constructor
        super().__init__(save_dir, 1, policy, logger)

        # Store the inputs
        self._env = env
        self.ball_z_dim_mismatch = ball_z_dim_mismatch

        # Initialize variables for checking and evaluating
        if sampler is None:
            sampler = ParallelSampler(env,
                                      self._policy,
                                      num_envs=num_sampler_envs,
                                      min_steps=min_steps,
                                      min_rollouts=min_rollouts)
        self.sampler = sampler
        self.eigvals = np.array([pyrado.inf])  # initialize with something positive
Example #3
def test_adr_reward_generator(env):
    reference_env = env
    random_env = deepcopy(env)
    reward_generator = RewardGenerator(
        env_spec=random_env.spec,
        batch_size=100,
        reward_multiplier=1,
        logger=None
    )
    policy = FNNPolicy(reference_env.spec, hidden_sizes=[32], hidden_nonlin=to.tanh)
    dr = get_default_randomizer_omo()
    dr.randomize(num_samples=1)
    random_env.domain_param = dr.get_params(format='dict', dtype='numpy')
    reference_sampler = ParallelSampler(reference_env, policy, num_envs=4, min_steps=10000)
    random_sampler = ParallelSampler(random_env, policy, num_envs=4, min_steps=10000)

    losses = []
    for i in range(50):
        reference_traj = StepSequence.concat(reference_sampler.sample())
        random_traj = StepSequence.concat(random_sampler.sample())
        losses.append(reward_generator.train(reference_traj, random_traj, 10))
    assert losses[-1] < losses[0]
Example #4
def train_and_eval(trial: optuna.Trial, ex_dir: str, seed: [int, None]):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param ex_dir: experiment's directory, i.e. the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env = QBallBalancerSim(dt=1/250., max_steps=1500)
    env = ActNormWrapper(env)

    # Policy
    policy = FNNPolicy(
        spec=env.spec,
        hidden_sizes=trial.suggest_categorical('hidden_sizes_policy', [[16, 16], [32, 32], [64, 64]]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_policy', ['to_tanh', 'to_relu'])),
    )

    # Critic
    value_fcn = FNN(
        input_size=env.obs_space.flat_dim,
        output_size=1,
        hidden_sizes=trial.suggest_categorical('hidden_sizes_critic', [[16, 16], [32, 32], [64, 64]]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_critic', ['to_tanh', 'to_relu'])),
    )
    critic_hparam = dict(
        gamma=trial.suggest_uniform('gamma_critic', 0.99, 1.),
        lamda=trial.suggest_uniform('lamda_critic', 0.95, 1.),
        num_epoch=trial.suggest_int('num_epoch_critic', 1, 10),
        batch_size=100,
        lr=trial.suggest_loguniform('lr_critic', 1e-5, 1e-3),
        standardize_adv=trial.suggest_categorical('standardize_adv_critic', [True, False]),
        # max_grad_norm=5.,
        # lr_scheduler=scheduler.StepLR,
        # lr_scheduler_hparam=dict(step_size=10, gamma=0.9)
        # lr_scheduler=scheduler.ExponentialLR,
        # lr_scheduler_hparam=dict(gamma=0.99)
    )
    critic = GAE(value_fcn, **critic_hparam)

    # Algorithm
    algo_hparam = dict(
        num_sampler_envs=1,  # parallelize via optuna n_jobs
        max_iter=500,
        min_steps=25*env.max_steps,
        num_epoch=trial.suggest_int('num_epoch_algo', 1, 10),
        eps_clip=trial.suggest_uniform('eps_clip_algo', 0.05, 0.2),
        batch_size=100,
        std_init=0.9,
        lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3),
        # max_grad_norm=5.,
        # lr_scheduler=scheduler.StepLR,
        # lr_scheduler_hparam=dict(step_size=10, gamma=0.9)
        # lr_scheduler=scheduler.ExponentialLR,
        # lr_scheduler_hparam=dict(gamma=0.99)
    )
    algo = PPO(osp.join(ex_dir, f'trial_{trial.number}'), env, policy, critic, **algo_hparam)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelSampler(env, policy, num_envs=20, min_rollouts=min_rollouts)
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros])/min_rollouts

    return mean_ret
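For context, a sketch of how an objective like this is typically handed to Optuna; `functools.partial` binds the extra arguments mentioned in the docstring, while the study settings (`n_trials`, `n_jobs`) are placeholder assumptions:

import functools
import optuna

# Optuna only passes `trial` to the objective, so bind the custom arguments beforehand
objective = functools.partial(train_and_eval, ex_dir=ex_dir, seed=seed)

study = optuna.create_study(direction='maximize')  # train_and_eval returns the mean return
study.optimize(objective, n_trials=100, n_jobs=8)  # trials run in parallel, hence num_sampler_envs=1 above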
Example #5
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 critic: GAE,
                 max_iter: int,
                 min_rollouts: int = None,
                 min_steps: int = None,
                 num_epoch: int = 3,
                 eps_clip: float = 0.1,
                 batch_size: int = 64,
                 std_init: float = 1.0,
                 num_sampler_envs: int = 4,
                 max_grad_norm: float = None,
                 lr: float = 5e-4,
                 lr_scheduler=None,
                 lr_scheduler_hparam: [dict, None] = None,
                 logger: StepLogger = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param critic: advantage estimation function $A(s,a) = Q(s,a) - V(s)$
        :param max_iter: number of iterations (policy updates)
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param num_epoch: number of iterations over all gathered samples during one policy update
        :param eps_clip: max/min probability ratio, see [1]
        :param batch_size: number of samples per policy update batch
        :param std_init: initial standard deviation on the actions for the exploration noise
        :param num_sampler_envs: number of environments for parallel sampling
        :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping
        :param lr: (initial) learning rate for the optimizer, which can be modified by the scheduler.
                   By default, the learning rate is constant.
        :param lr_scheduler: learning rate scheduler that does one step per epoch (pass through the whole data set)
        :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created

        .. note::
            The Adam optimizer computes individual learning rates for all parameters. Thus, the learning rate scheduler
            schedules the maximum learning rate.
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        assert isinstance(policy, Policy)

        # Call ActorCritic's constructor
        super().__init__(env, policy, critic, save_dir, max_iter, logger)

        # Store the inputs
        self.num_epoch = num_epoch
        self.eps_clip = eps_clip
        self.batch_size = batch_size
        self.max_grad_norm = max_grad_norm

        # Initialize
        self.log_loss = True
        self._expl_strat = NormalActNoiseExplStrat(self._policy,
                                                   std_init=std_init)
        self.sampler = ParallelSampler(env,
                                       self._expl_strat,
                                       num_envs=num_sampler_envs,
                                       min_steps=min_steps,
                                       min_rollouts=min_rollouts)
        self.optim = to.optim.Adam(
            [{
                'params': self._expl_strat.policy.parameters()
            }, {
                'params': self._expl_strat.noise.parameters()
            }],
            lr=lr,
            eps=1e-5)
        self._lr_scheduler = lr_scheduler
        self._lr_scheduler_hparam = lr_scheduler_hparam
        if lr_scheduler is not None:
            self._lr_scheduler = lr_scheduler(self.optim,
                                              **lr_scheduler_hparam)
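The Adam instance above puts the policy parameters and the exploration-noise parameters into two parameter groups that share one learning rate. As a side note (a sketch, not part of this constructor), PyTorch parameter groups would also allow a separate learning rate per group, e.g. slower updates for the noise:

        self.optim = to.optim.Adam(
            [{'params': self._expl_strat.policy.parameters(), 'lr': lr},
             {'params': self._expl_strat.noise.parameters(), 'lr': lr*0.1}],  # hypothetical smaller lr for the noise
            eps=1e-5)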
Example #6
def train_and_eval(trial: optuna.Trial, ex_dir: str, seed: [int, None]):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param ex_dir: experiment's directory, i.e. the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(dt=1 / 100., max_steps=600)
    env = QQubeSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(
        shared_hidden_sizes=trial.suggest_categorical(
            'shared_hidden_sizes_policy',
            [[16, 16], [32, 32], [64, 64], [16, 16, 16], [32, 32, 32]]),
        shared_hidden_nonlin=fcn_from_str(
            trial.suggest_categorical('shared_hidden_nonlin_policy',
                                      ['to_tanh', 'to_relu'])),
    )
    policy = TwoHeadedFNNPolicy(spec=env.spec, **policy_hparam)

    # Critic
    q_fcn_hparam = dict(
        hidden_sizes=trial.suggest_categorical(
            'hidden_sizes_critic',
            [[16, 16], [32, 32], [64, 64], [16, 16, 16], [32, 32, 32]]),
        hidden_nonlin=fcn_from_str(
            trial.suggest_categorical('hidden_nonlin_critic',
                                      ['to_tanh', 'to_relu'])),
    )
    obsact_space = BoxSpace.cat([env.obs_space, env.act_space])
    q_fcn_1 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace),
                        **q_fcn_hparam)
    q_fcn_2 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace),
                        **q_fcn_hparam)

    # Algorithm
    algo_hparam = dict(
        num_sampler_envs=1,  # parallelize via optuna n_jobs
        max_iter=100 * env.max_steps,
        min_steps=trial.suggest_categorical(
            'min_steps_algo', [1]),  # , 10, env.max_steps, 10*env.max_steps
        memory_size=trial.suggest_loguniform('memory_size_algo',
                                             1e2 * env.max_steps,
                                             1e4 * env.max_steps),
        tau=trial.suggest_uniform('tau_algo', 0.99, 1.),
        alpha_init=trial.suggest_uniform('alpha_init_algo', 0.1, 0.9),
        learn_alpha=trial.suggest_categorical('learn_alpha_algo',
                                              [True, False]),
        standardize_rew=trial.suggest_categorical('standardize_rew_algo',
                                                  [False]),
        gamma=trial.suggest_uniform('gamma_algo', 0.99, 1.),
        target_update_intvl=trial.suggest_categorical(
            'target_update_intvl_algo', [1, 5]),
        num_batch_updates=trial.suggest_categorical('num_batch_updates_algo',
                                                    [1, 5]),
        batch_size=trial.suggest_categorical('batch_size_algo',
                                             [128, 256, 512]),
        lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3),
    )
    csv_logger = create_csv_step_logger(
        osp.join(ex_dir, f'trial_{trial.number}'))
    algo = SAC(ex_dir,
               env,
               policy,
               q_fcn_1,
               q_fcn_2,
               **algo_hparam,
               logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelSampler(
        env, policy, num_envs=1,
        min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
Example #7
    def eval_policy(save_dir: [str, None],
                    env_real: [RealEnv, SimEnv, MetaDomainRandWrapper],
                    policy: Policy, montecarlo_estimator: bool, prefix: str,
                    num_rollouts: int) -> to.Tensor:
        """
        Evaluate a policy on the target system (real-world platform).
        This method is static to facilitate evaluation of specific policies in hindsight.

        :param save_dir: directory to save the snapshots i.e. the results in, if `None` nothing is saved
        :param env_real: target environment for evaluation, in the sim-2-sim case this is another simulation instance
        :param policy: policy to evaluate
        :param montecarlo_estimator: estimate the return with a sample average (`True`) or a lower confidence
                                     bound (`False`) obtained from bootstrapping
        :param num_rollouts: number of rollouts to collect on the target system
        :param prefix: to control the saving for the evaluation of an initial policy, `None` to deactivate
        :return: estimated return in the target domain
        """
        if isinstance(env_real, RealEnv):
            input('Evaluating in the target domain. Hit any key to continue.')
        if save_dir is not None:
            print_cbt(f'Evaluating {prefix}_policy on the target system ...',
                      'c',
                      bright=True)

        rets_real = to.zeros(num_rollouts)
        if isinstance(env_real, RealEnv):
            # Evaluate sequentially when conducting a sim-to-real experiment
            for i in range(num_rollouts):
                rets_real[i] = rollout(env_real,
                                       policy,
                                       eval=True,
                                       no_close=False).undiscounted_return()
        elif isinstance(env_real, (SimEnv, MetaDomainRandWrapper)):
            # Create a parallel sampler when conducting a sim-to-sim experiment
            sampler = ParallelSampler(env_real,
                                      policy,
                                      num_envs=1,
                                      min_rollouts=num_rollouts)
            ros = sampler.sample()
            for i in range(num_rollouts):
                rets_real[i] = ros[i].undiscounted_return()
        else:
            raise pyrado.TypeErr(
                given=env_real,
                expected_type=[RealEnv, SimEnv, MetaDomainRandWrapper])

        if save_dir is not None:
            # Save the evaluation results
            to.save(rets_real, osp.join(save_dir, f'{prefix}_returns_real.pt'))

            print_cbt('target domain performance', bright=True)
            print(
                tabulate([['mean return',
                           to.mean(rets_real).item()],
                          ['std return', to.std(rets_real)],
                          ['min return', to.min(rets_real)],
                          ['max return', to.max(rets_real)]]))

        if montecarlo_estimator:
            return to.mean(rets_real)
        else:
            return to.from_numpy(
                bootstrap_ci(rets_real.numpy(),
                             np.mean,
                             num_reps=1000,
                             alpha=0.05,
                             ci_sides=1,
                             studentized=False)[1])
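For intuition, a simplified percentile-bootstrap sketch of such a one-sided lower confidence bound (this is not pyrado's `bootstrap_ci`, just an illustration in plain NumPy):

import numpy as np

def bootstrap_lower_bound(rets: np.ndarray, num_reps: int = 1000, alpha: float = 0.05) -> float:
    # Resample the returns with replacement and record the mean of every bootstrap sample
    rng = np.random.default_rng()
    boot_means = np.array([rng.choice(rets, size=rets.size, replace=True).mean()
                           for _ in range(num_reps)])
    # The empirical alpha-quantile of the bootstrap means serves as the one-sided lower bound
    return float(np.quantile(boot_means, alpha))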
Example #8
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 particle_hparam: dict,
                 max_iter: int,
                 num_particles: int,
                 temperature: float,
                 lr: float,
                 horizon: int,
                 std_init: float = 1.0,
                 min_rollouts: int = None,
                 min_steps: int = 10000,
                 num_sampler_envs: int = 4,
                 serial: bool = True,
                 logger: StepLogger = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment in which the policy operates
        :param particle_hparam: hyper-parameters for particle template construction
        :param max_iter: number of iterations
        :param num_particles: number of distinct particles
        :param temperature: the temperature of the SVGD determines how jointly the training takes place
        :param lr: the learning rate for the update of the particles
        :param horizon: horizon for each particle
        :param std_init: initial standard deviation for the exploration
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param num_sampler_envs: number of environments for parallel sampling
        :param serial: serial mode can be switched off, which allows the flow of SVPG to be partly controlled from outside
        :param logger: logger for every step of the algorithm
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        if not isinstance(particle_hparam, dict):
            raise pyrado.TypeErr(given=particle_hparam, expected_type=dict)
        if not all([
                key in particle_hparam
                for key in ['actor', 'value_fcn', 'critic']
        ]):
            raise AttributeError

        # Call Algorithm's constructor
        super().__init__(save_dir, max_iter, policy=None, logger=logger)

        # Store the inputs
        self._env = env
        self.num_particles = num_particles
        self.horizon = horizon  # TODO @Robin: where is the horizon used?!
        self.lr = lr
        self.temperature = temperature
        self.serial = serial

        # Prepare placeholders for particles
        self.particles = [None] * num_particles
        self.expl_strats = [None] * num_particles
        self.optimizers = [None] * num_particles
        self.fixed_particles = [None] * num_particles
        self.fixed_expl_strats = [None] * num_particles
        self.samplers = [None] * num_particles
        self.count = 0
        self.updatecount = 0

        # Particle factory
        actor = FNNPolicy(spec=env.spec, **particle_hparam['actor'])
        value_fcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace),
                              **particle_hparam['value_fcn'])
        critic = GAE(value_fcn, **particle_hparam['critic'])
        particle = SVPGParticle(env.spec, actor, critic)

        for i in range(self.num_particles):
            self.particles[i] = deepcopy(particle)
            self.particles[i].init_param()
            self.expl_strats[i] = NormalActNoiseExplStrat(
                self.particles[i].actor, std_init)
            self.optimizers[i] = to.optim.Adam(
                self.expl_strats[i].parameters(), lr=self.lr)
            self.fixed_particles[i] = deepcopy(self.particles[i])
            self.fixed_expl_strats[i] = deepcopy(self.expl_strats[i])

            if self.serial:
                self.samplers[i] = ParallelSampler(env,
                                                   self.expl_strats[i],
                                                   num_sampler_envs,
                                                   min_rollouts=min_rollouts,
                                                   min_steps=min_steps)
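For reference, the update these particles follow in SVPG (Liu et al., "Stein Variational Policy Gradient", 2017), reproduced from the paper rather than from this class; $T$ is the `temperature` argument, $k(\cdot,\cdot)$ a kernel between particle parameters, the step size corresponds to `lr`, and the prior term is omitted:

$$\theta_i \leftarrow \theta_i + \epsilon\,\phi(\theta_i), \qquad \phi(\theta_i) = \frac{1}{n}\sum_{j=1}^{n} \Big[ \nabla_{\theta_j} \frac{J(\theta_j)}{T}\, k(\theta_j, \theta_i) + \nabla_{\theta_j} k(\theta_j, \theta_i) \Big]$$

A large $T$ down-weights the return-driven first term relative to the repulsive second term, which is what the docstring means by the temperature determining "how jointly the training takes place".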
Example #9
    def __init__(self,
                 save_dir: str,
                 env: [SimEnv, StateAugmentationWrapper],
                 subroutine: Algorithm,
                 policy: Policy,
                 expl_strat: StochasticActionExplStrat,
                 max_iter: int,
                 num_rollouts: int = None,
                 steps_num: int = None,
                 apply_dynamics_noise: bool = False,
                 dyn_eps: float = 0.01,
                 dyn_phi: float = 0.1,
                 halfspan: float = 0.25,
                 apply_proccess_noise: bool = False,
                 proc_eps: float = 0.01,
                 proc_phi: float = 0.05,
                 apply_observation_noise: bool = False,
                 obs_eps: float = 0.01,
                 obs_phi: float = 0.05,
                 torch_observation: bool = True,
                 base_seed: int = None,
                 num_sampler_envs: int = 4,
                 logger: StepLogger = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment in which the agent should be trained
        :param subroutine: algorithm which performs the policy / value-function optimization
        :param policy: policy to be updated
        :param expl_strat: the exploration strategy
        :param max_iter: the maximum number of iterations
        :param num_rollouts: the number of rollouts to be performed for each update step
        :param steps_num: the number of steps to be performed for each update step
        :param apply_dynamics_noise: whether adversarially generated dynamics noise should be applied
        :param dyn_eps: the intensity of generated dynamics noise
        :param dyn_phi: the probability of applying dynamics noise
        :param halfspan: the halfspan of the uniform random distribution used to sample
        :param apply_proccess_noise: whether adversarially generated process noise should be applied
        :param proc_eps: the intensity of generated process noise
        :param proc_phi: the probability of applying process noise
        :param apply_observation_noise: whether adversarially generated observation noise should be applied
        :param obs_eps: the intensity of generated observation noise
        :param obs_phi: the probability of applying observation noise
        :param torch_observation: a function to provide a differentiable observation
        :param base_seed: the random seed
        :param num_sampler_envs: number of environments for parallel sampling
        :param logger: the logger
        """
        assert isinstance(subroutine, Algorithm)
        assert isinstance(max_iter, int) and max_iter > 0

        super().__init__(save_dir, max_iter, policy, logger)
        # Get the randomized environment (recommended to make it the most outer one in the chain)

        # Initialize adversarial wrappers
        if apply_dynamics_noise:
            assert isinstance(env, StateAugmentationWrapper)
            env = AdversarialDynamicsWrapper(env, self.policy, dyn_eps,
                                             dyn_phi, halfspan)
        if apply_proccess_noise:
            env = AdversarialStateWrapper(env,
                                          self.policy,
                                          proc_eps,
                                          proc_phi,
                                          torch_observation=torch_observation)
        if apply_observation_noise:
            env = AdversarialObservationWrapper(env, self.policy, obs_eps,
                                                obs_phi)

        self.num_rollouts = num_rollouts
        self.sampler = ParallelSampler(env,
                                       expl_strat,
                                       num_envs=num_sampler_envs,
                                       min_steps=steps_num,
                                       min_rollouts=num_rollouts,
                                       seed=base_seed)
        self._subroutine = subroutine
Example #10
class ARPL(Algorithm):
    """
    Adversarially Robust Policy Learning (ARPL)

    .. seealso::
        A. Mandlekar, Y. Zhu, A. Garg, L. Fei-Fei, S. Savarese, "Adversarially Robust Policy Learning:
        Active Construction of Physically-Plausible Perturbations", IROS, 2017
    """

    name: str = 'arpl'

    def __init__(self,
                 save_dir: str,
                 env: [SimEnv, StateAugmentationWrapper],
                 subroutine: Algorithm,
                 policy: Policy,
                 expl_strat: StochasticActionExplStrat,
                 max_iter: int,
                 num_rollouts: int = None,
                 steps_num: int = None,
                 apply_dynamics_noise: bool = False,
                 dyn_eps: float = 0.01,
                 dyn_phi: float = 0.1,
                 halfspan: float = 0.25,
                 apply_proccess_noise: bool = False,
                 proc_eps: float = 0.01,
                 proc_phi: float = 0.05,
                 apply_observation_noise: bool = False,
                 obs_eps: float = 0.01,
                 obs_phi: float = 0.05,
                 torch_observation: bool = True,
                 base_seed: int = None,
                 num_sampler_envs: int = 4,
                 logger: StepLogger = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment in which the agent should be trained
        :param subroutine: algorithm which performs the policy / value-function optimization
        :param policy: policy to be updated
        :param expl_strat: the exploration strategy
        :param max_iter: the maximum number of iterations
        :param num_rollouts: the number of rollouts to be performed for each update step
        :param steps_num: the number of steps to be performed for each update step
        :param apply_dynamics_noise: whether adversarially generated dynamics noise should be applied
        :param dyn_eps: the intensity of generated dynamics noise
        :param dyn_phi: the probability of applying dynamics noise
        :param halfspan: the halfspan of the uniform random distribution used to sample
        :param apply_proccess_noise: whether adversarially generated process noise should be applied
        :param proc_eps: the intensity of generated process noise
        :param proc_phi: the probability of applying process noise
        :param apply_observation_noise: whether adversarially generated observation noise should be applied
        :param obs_eps: the intensity of generated observation noise
        :param obs_phi: the probability of applying observation noise
        :param torch_observation: a function to provide a differentiable observation
        :param base_seed: the random seed
        :param num_sampler_envs: number of environments for parallel sampling
        :param logger: the logger
        """
        assert isinstance(subroutine, Algorithm)
        assert isinstance(max_iter, int) and max_iter > 0

        super().__init__(save_dir, max_iter, policy, logger)
        # Get the randomized environment (recommended to make it the most outer one in the chain)

        # Initialize adversarial wrappers
        if apply_dynamics_noise:
            assert isinstance(env, StateAugmentationWrapper)
            env = AdversarialDynamicsWrapper(env, self.policy, dyn_eps,
                                             dyn_phi, halfspan)
        if apply_proccess_noise:
            env = AdversarialStateWrapper(env,
                                          self.policy,
                                          proc_eps,
                                          proc_phi,
                                          torch_observation=torch_observation)
        if apply_observation_noise:
            env = AdversarialObservationWrapper(env, self.policy, obs_eps,
                                                obs_phi)

        self.num_rollouts = num_rollouts
        self.sampler = ParallelSampler(env,
                                       expl_strat,
                                       num_envs=num_sampler_envs,
                                       min_steps=steps_num,
                                       min_rollouts=num_rollouts,
                                       seed=base_seed)
        self._subroutine = subroutine

    def step(self, snapshot_mode: str, meta_info: dict = None):
        rollouts = self.sampler.sample()
        rets = [ro.undiscounted_return() for ro in rollouts]
        ret_avg = np.mean(rets)
        ret_med = np.median(rets)
        ret_std = np.std(rets)
        self.logger.add_value('num rollouts', len(rollouts))
        self.logger.add_value('avg rollout len',
                              np.mean([ro.length for ro in rollouts]))
        self.logger.add_value('avg return', ret_avg)
        self.logger.add_value('median return', ret_med)
        self.logger.add_value('std return', ret_std)

        # Sub-routine
        self._subroutine.update(rollouts)
        self._subroutine.logger.record_step()
        self._subroutine.make_snapshot(snapshot_mode, ret_avg.item())
Example #11
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: TwoHeadedPolicy,
                 q_fcn_1: Policy,
                 q_fcn_2: Policy,
                 memory_size: int,
                 gamma: float,
                 max_iter: int,
                 num_batch_updates: int,
                 tau: float = 0.995,
                 alpha_init: float = 0.2,
                 learn_alpha: bool = True,
                 target_update_intvl: int = 1,
                 standardize_rew: bool = True,
                 batch_size: int = 500,
                 min_rollouts: int = None,
                 min_steps: int = None,
                 num_sampler_envs: int = 4,
                 max_grad_norm: float = 5.,
                 lr: float = 3e-4,
                 lr_scheduler=None,
                 lr_scheduler_hparam: [dict, None] = None,
                 logger: StepLogger = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param q_fcn_1: state-action value function $Q(s,a)$, the associated target Q-function is created from a
                        re-initialized copy of this one
        :param q_fcn_2: state-action value function $Q(s,a)$, the associated target Q-function is created from a
                        re-initialized copy of this one
        :param memory_size: number of transitions in the replay memory buffer, e.g. 1000000
        :param gamma: temporal discount factor for the state values
        :param max_iter: number of iterations (policy updates)
        :param num_batch_updates: number of batch updates per algorithm steps
        :param tau: interpolation factor in the averaging of the target networks, used for the soft update a.k.a. Polyak
                    update, between 0 and 1
        :param alpha_init: initial weighting factor of the entropy term in the loss function
        :param learn_alpha: adapt the weighting factor of the entropy term
        :param target_update_intvl: number of iterations that pass before updating the target network
        :param standardize_rew: bool to flag if the rewards should be standardized
        :param batch_size: number of samples per policy update batch
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param num_sampler_envs: number of environments for parallel sampling
        :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping
        :param lr: (initial) learning rate for the optimizer, which can be modified by the scheduler.
                   By default, the learning rate is constant.
        :param lr_scheduler: learning rate scheduler type for the policy and the Q-functions that does one step
                             per `update()` call
        :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        if typed_env(env, ActNormWrapper) is None:
            raise pyrado.TypeErr(
                msg='SAC requires an environment wrapped by an ActNormWrapper!'
            )
        if not isinstance(q_fcn_1, Policy):
            raise pyrado.TypeErr(given=q_fcn_1, expected_type=Policy)
        if not isinstance(q_fcn_2, Policy):
            raise pyrado.TypeErr(given=q_fcn_2, expected_type=Policy)

        if logger is None:
            # Create logger that only logs every 100 steps of the algorithm
            logger = StepLogger(print_interval=100)
            logger.printers.append(ConsolePrinter())
            logger.printers.append(
                CSVPrinter(osp.join(save_dir, 'progress.csv')))

        # Call Algorithm's constructor
        super().__init__(save_dir, max_iter, policy, logger)

        # Store the inputs
        self._env = env
        self.q_fcn_1 = q_fcn_1
        self.q_fcn_2 = q_fcn_2
        self.q_targ_1 = deepcopy(self.q_fcn_1)
        self.q_targ_2 = deepcopy(self.q_fcn_2)
        self.q_targ_1.eval()
        self.q_targ_2.eval()
        self.gamma = gamma
        self.tau = tau
        self.learn_alpha = learn_alpha
        self.target_update_intvl = target_update_intvl
        self.standardize_rew = standardize_rew
        self.num_batch_updates = num_batch_updates
        self.batch_size = batch_size
        self.max_grad_norm = max_grad_norm

        # Initialize
        self._memory = ReplayMemory(memory_size)
        if policy.is_recurrent:
            init_expl_policy = RecurrentDummyPolicy(env.spec,
                                                    policy.hidden_size)
        else:
            init_expl_policy = DummyPolicy(env.spec)
        self.sampler_init = ParallelSampler(
            env,
            init_expl_policy,  # samples uniformly random from the action space
            num_envs=num_sampler_envs,
            min_steps=memory_size,
        )
        self._expl_strat = SACExplStrat(
            self._policy,
            std_init=1.)  # std_init will be overwritten by 2nd policy head
        self.sampler = ParallelSampler(
            env,
            self._expl_strat,
            num_envs=1,
            min_steps=min_steps,  # in [2] this would be 1
            min_rollouts=min_rollouts  # in [2] this would be None
        )
        self.sampler_eval = ParallelSampler(env,
                                            self._policy,
                                            num_envs=num_sampler_envs,
                                            min_steps=100 * env.max_steps,
                                            min_rollouts=None)
        self._optim_policy = to.optim.Adam([{
            'params': self._policy.parameters()
        }],
                                           lr=lr)
        self._optim_q_fcn_1 = to.optim.Adam(
            [{
                'params': self.q_fcn_1.parameters()
            }], lr=lr)
        self._optim_q_fcn_2 = to.optim.Adam(
            [{
                'params': self.q_fcn_2.parameters()
            }], lr=lr)
        log_alpha_init = to.log(
            to.tensor(alpha_init, dtype=to.get_default_dtype()))
        if learn_alpha:
            # Automatic entropy tuning
            self._log_alpha = nn.Parameter(log_alpha_init, requires_grad=True)
            self._alpha_optim = to.optim.Adam([{
                'params': self._log_alpha
            }],
                                              lr=lr)
            self.target_entropy = -to.prod(to.tensor(env.act_space.shape))
        else:
            self._log_alpha = log_alpha_init

        self._lr_scheduler_policy = lr_scheduler
        self._lr_scheduler_hparam = lr_scheduler_hparam
        if lr_scheduler is not None:
            self._lr_scheduler_policy = lr_scheduler(self._optim_policy,
                                                     **lr_scheduler_hparam)
            self._lr_scheduler_q_fcn_1 = lr_scheduler(self._optim_q_fcn_1,
                                                      **lr_scheduler_hparam)
            self._lr_scheduler_q_fcn_2 = lr_scheduler(self._optim_q_fcn_2,
                                                      **lr_scheduler_hparam)
Example #12
"""
Script to sample some rollouts using the ParallelSampler
"""
from tabulate import tabulate

from pyrado.environment_wrappers.action_normalization import ActNormWrapper
from pyrado.environments.pysim.ball_on_beam import BallOnBeamSim
from pyrado.policies.features import FeatureStack, identity_feat, squared_feat
from pyrado.policies.linear import LinearPolicy
from pyrado.sampling.parallel_sampler import ParallelSampler

if __name__ == '__main__':
    # Set up environment
    env = BallOnBeamSim(dt=0.02, max_steps=500)
    env = ActNormWrapper(env)

    # Set up policy
    feats = FeatureStack([identity_feat, squared_feat])
    policy = LinearPolicy(env.spec, feats)

    # Set up sampler
    sampler = ParallelSampler(env, policy, num_envs=2, min_rollouts=2000)

    # Sample and print
    ros = sampler.sample()
    print(
        tabulate({
            'StepSequence count': len(ros),
            'Step count': sum(map(len, ros)),
        }.items()))
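As a small extension (not part of the original script), the returns of the sampled rollouts could be summarized in the same tabulated style; `undiscounted_return()` is the method used by the other examples, and the NumPy import would normally sit at the top of the file:

    import numpy as np

    rets = [ro.undiscounted_return() for ro in ros]
    print(
        tabulate([
            ['avg return', np.mean(rets)],
            ['min return', np.min(rets)],
            ['max return', np.max(rets)],
        ]))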
Example #13
class SAC(Algorithm):
    """
    Soft Actor-Critic (SAC) variant with a stochastic policy, two Q-functions, and two Q-targets (no V-function)

    .. seealso::
        [1] T. Haarnoja, A. Zhou, P. Abbeel, S. Levine, "Soft Actor-Critic: Off-Policy Maximum Entropy Deep
        Reinforcement Learning with a Stochastic Actor", ICML, 2018

        [2] This implementation was inspired by https://github.com/pranz24/pytorch-soft-actor-critic
            which seems to be based on https://github.com/vitchyr/rlkit
    """

    name: str = 'sac'

    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: TwoHeadedPolicy,
                 q_fcn_1: Policy,
                 q_fcn_2: Policy,
                 memory_size: int,
                 gamma: float,
                 max_iter: int,
                 num_batch_updates: int,
                 tau: float = 0.995,
                 alpha_init: float = 0.2,
                 learn_alpha: bool = True,
                 target_update_intvl: int = 1,
                 standardize_rew: bool = True,
                 batch_size: int = 500,
                 min_rollouts: int = None,
                 min_steps: int = None,
                 num_sampler_envs: int = 4,
                 max_grad_norm: float = 5.,
                 lr: float = 3e-4,
                 lr_scheduler=None,
                 lr_scheduler_hparam: [dict, None] = None,
                 logger: StepLogger = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param q_fcn_1: state-action value function $Q(s,a)$, the associated target Q-function is created from a
                        re-initialized copy of this one
        :param q_fcn_2: state-action value function $Q(s,a)$, the associated target Q-function is created from a
                        re-initialized copy of this one
        :param memory_size: number of transitions in the replay memory buffer, e.g. 1000000
        :param gamma: temporal discount factor for the state values
        :param max_iter: number of iterations (policy updates)
        :param num_batch_updates: number of batch updates per algorithm steps
        :param tau: interpolation factor in the averaging of the target networks, used for the soft update a.k.a. Polyak
                    update, between 0 and 1
        :param alpha_init: initial weighting factor of the entropy term in the loss function
        :param learn_alpha: adapt the weighting factor of the entropy term
        :param target_update_intvl: number of iterations that pass before updating the target network
        :param standardize_rew: bool to flag if the rewards should be standardized
        :param batch_size: number of samples per policy update batch
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param num_sampler_envs: number of environments for parallel sampling
        :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping
        :param lr: (initial) learning rate for the optimizer, which can be modified by the scheduler.
                   By default, the learning rate is constant.
        :param lr_scheduler: learning rate scheduler type for the policy and the Q-functions that does one step
                             per `update()` call
        :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        if typed_env(env, ActNormWrapper) is None:
            raise pyrado.TypeErr(
                msg='SAC requires an environment wrapped by an ActNormWrapper!'
            )
        if not isinstance(q_fcn_1, Policy):
            raise pyrado.TypeErr(given=q_fcn_1, expected_type=Policy)
        if not isinstance(q_fcn_2, Policy):
            raise pyrado.TypeErr(given=q_fcn_2, expected_type=Policy)

        if logger is None:
            # Create logger that only logs every 100 steps of the algorithm
            logger = StepLogger(print_interval=100)
            logger.printers.append(ConsolePrinter())
            logger.printers.append(
                CSVPrinter(osp.join(save_dir, 'progress.csv')))

        # Call Algorithm's constructor
        super().__init__(save_dir, max_iter, policy, logger)

        # Store the inputs
        self._env = env
        self.q_fcn_1 = q_fcn_1
        self.q_fcn_2 = q_fcn_2
        self.q_targ_1 = deepcopy(self.q_fcn_1)
        self.q_targ_2 = deepcopy(self.q_fcn_2)
        self.q_targ_1.eval()
        self.q_targ_2.eval()
        self.gamma = gamma
        self.tau = tau
        self.learn_alpha = learn_alpha
        self.target_update_intvl = target_update_intvl
        self.standardize_rew = standardize_rew
        self.num_batch_updates = num_batch_updates
        self.batch_size = batch_size
        self.max_grad_norm = max_grad_norm

        # Initialize
        self._memory = ReplayMemory(memory_size)
        if policy.is_recurrent:
            init_expl_policy = RecurrentDummyPolicy(env.spec,
                                                    policy.hidden_size)
        else:
            init_expl_policy = DummyPolicy(env.spec)
        self.sampler_init = ParallelSampler(
            env,
            init_expl_policy,  # samples uniformly random from the action space
            num_envs=num_sampler_envs,
            min_steps=memory_size,
        )
        self._expl_strat = SACExplStrat(
            self._policy,
            std_init=1.)  # std_init will be overwritten by 2nd policy head
        self.sampler = ParallelSampler(
            env,
            self._expl_strat,
            num_envs=1,
            min_steps=min_steps,  # in [2] this would be 1
            min_rollouts=min_rollouts  # in [2] this would be None
        )
        self.sampler_eval = ParallelSampler(env,
                                            self._policy,
                                            num_envs=num_sampler_envs,
                                            min_steps=100 * env.max_steps,
                                            min_rollouts=None)
        self._optim_policy = to.optim.Adam([{
            'params': self._policy.parameters()
        }],
                                           lr=lr)
        self._optim_q_fcn_1 = to.optim.Adam(
            [{
                'params': self.q_fcn_1.parameters()
            }], lr=lr)
        self._optim_q_fcn_2 = to.optim.Adam(
            [{
                'params': self.q_fcn_2.parameters()
            }], lr=lr)
        log_alpha_init = to.log(
            to.tensor(alpha_init, dtype=to.get_default_dtype()))
        if learn_alpha:
            # Automatic entropy tuning
            self._log_alpha = nn.Parameter(log_alpha_init, requires_grad=True)
            self._alpha_optim = to.optim.Adam([{
                'params': self._log_alpha
            }],
                                              lr=lr)
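            # Standard SAC heuristic: target entropy = -dim(action space)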
            self.target_entropy = -to.prod(to.tensor(env.act_space.shape))
        else:
            self._log_alpha = log_alpha_init

        self._lr_scheduler_policy = lr_scheduler
        self._lr_scheduler_hparam = lr_scheduler_hparam
        if lr_scheduler is not None:
            self._lr_scheduler_policy = lr_scheduler(self._optim_policy,
                                                     **lr_scheduler_hparam)
            self._lr_scheduler_q_fcn_1 = lr_scheduler(self._optim_q_fcn_1,
                                                      **lr_scheduler_hparam)
            self._lr_scheduler_q_fcn_2 = lr_scheduler(self._optim_q_fcn_2,
                                                      **lr_scheduler_hparam)

    @property
    def expl_strat(self) -> SACExplStrat:
        return self._expl_strat

    @property
    def memory(self) -> ReplayMemory:
        """ Get the replay memory. """
        return self._memory

    @property
    def alpha(self) -> to.Tensor:
        """ Get the detached entropy coefficient. """
        return to.exp(self._log_alpha.detach())

    def step(self, snapshot_mode: str, meta_info: dict = None):
        if self._memory.isempty:
            # Warm-up phase
            print_cbt(
                'Collecting samples until the replay memory is full.',
                'w')
            # Sample steps and store them in the replay memory
            ros = self.sampler_init.sample()
            self._memory.push(ros)
        else:
            # Sample steps and store them in the replay memory
            ros = self.sampler.sample()
            self._memory.push(ros)

        # Log return-based metrics
        if self._curr_iter % self.logger.print_interval == 0:
            ros = self.sampler_eval.sample()
            rets = [ro.undiscounted_return() for ro in ros]
            ret_max = np.max(rets)
            ret_med = np.median(rets)
            ret_avg = np.mean(rets)
            ret_min = np.min(rets)
            ret_std = np.std(rets)
        else:
            ret_max, ret_med, ret_avg, ret_min, ret_std = 5 * [
                -pyrado.inf
            ]  # dummy values
        self.logger.add_value('max return', np.round(ret_max, 4))
        self.logger.add_value('median return', np.round(ret_med, 4))
        self.logger.add_value('avg return', np.round(ret_avg, 4))
        self.logger.add_value('min return', np.round(ret_min, 4))
        self.logger.add_value('std return', np.round(ret_std, 4))
        self.logger.add_value('avg rollout length',
                              np.round(np.mean([ro.length for ro in ros]), 2))
        self.logger.add_value('num rollouts', len(ros))
        self.logger.add_value('avg memory reward',
                              np.round(self._memory.avg_reward(), 4))

        # Use data in the memory to update the policy and the Q-functions
        self.update()

        # Save snapshot data
        self.make_snapshot(snapshot_mode, float(ret_avg), meta_info)

    @staticmethod
    def soft_update(target: nn.Module, source: nn.Module, tau: float = 0.995):
        """
        Moving average update, a.k.a. Polyak update.
        Modifies the input argument `target`.

        :param target: PyTorch module with parameters to be updated
        :param source: PyTorch module with parameters to update to
        :param tau: interpolation factor for averaging, between 0 and 1
        """
        if not 0 < tau < 1:
            raise pyrado.ValueErr(given=tau,
                                  g_constraint='0',
                                  l_constraint='1')

        for targ_param, src_param in zip(target.parameters(),
                                         source.parameters()):
            targ_param.data = targ_param.data * tau + src_param.data * (1. -
                                                                        tau)

    def update(self):
        """ Update the policy's and Q-functions' parameters on transitions sampled from the replay memory. """
        # Containers for logging
        policy_losses = to.zeros(self.num_batch_updates)
        expl_strat_stds = to.zeros(self.num_batch_updates)
        q_fcn_1_losses = to.zeros(self.num_batch_updates)
        q_fcn_2_losses = to.zeros(self.num_batch_updates)
        policy_grad_norm = to.zeros(self.num_batch_updates)
        q_fcn_1_grad_norm = to.zeros(self.num_batch_updates)
        q_fcn_2_grad_norm = to.zeros(self.num_batch_updates)

        for b in tqdm(range(self.num_batch_updates),
                      total=self.num_batch_updates,
                      desc=f'Updating',
                      unit='batches',
                      file=sys.stdout,
                      leave=False):

            # Sample steps and the associated next step from the replay memory
            steps, next_steps = self._memory.sample(self.batch_size)
            steps.torch(data_type=to.get_default_dtype())
            next_steps.torch(data_type=to.get_default_dtype())

            # Standardize rewards
            if self.standardize_rew:
                rewards = standardize(steps.rewards).unsqueeze(1)
            else:
                rewards = steps.rewards.unsqueeze(1)
            rew_scale = 1.
            rewards *= rew_scale

            with to.no_grad():
                # Create masks for the non-final observations
                not_done = to.tensor(1. - steps.done,
                                     dtype=to.get_default_dtype()).unsqueeze(1)

                # Compute the (next)state-(next)action values Q(s',a') from the target networks
                if self.policy.is_recurrent:
                    next_act_expl, next_log_probs, _ = self._expl_strat(
                        next_steps.observations, next_steps.hidden_states)
                else:
                    next_act_expl, next_log_probs = self._expl_strat(
                        next_steps.observations)
                next_q_val_target_1 = self.q_targ_1(
                    to.cat([next_steps.observations, next_act_expl], dim=1))
                next_q_val_target_2 = self.q_targ_2(
                    to.cat([next_steps.observations, next_act_expl], dim=1))
                next_q_val_target_min = to.min(
                    next_q_val_target_1,
                    next_q_val_target_2) - self.alpha * next_log_probs
                next_q_val = rewards + not_done * self.gamma * next_q_val_target_min

            # Compute the two Q-function losses
            # E_{(s_t, a_t) ~ D} [1/2 * (Q_i(s_t, a_t) - r_t - gamma * E_{s_{t+1} ~ p} [V(s_{t+1})] )^2]
            q_val_1 = self.q_fcn_1(
                to.cat([steps.observations, steps.actions], dim=1))
            q_val_2 = self.q_fcn_2(
                to.cat([steps.observations, steps.actions], dim=1))
            q_1_loss = nn.functional.mse_loss(q_val_1, next_q_val)
            q_2_loss = nn.functional.mse_loss(q_val_2, next_q_val)
            q_fcn_1_losses[b] = q_1_loss.data
            q_fcn_2_losses[b] = q_2_loss.data

            # Compute the policy loss
            # E_{s_t ~ D, eps_t ~ N} [log( pi( f(eps_t; s_t) ) ) - Q(s_t, f(eps_t; s_t))]
            if self.policy.is_recurrent:
                act_expl, log_probs, _ = self._expl_strat(
                    steps.observations, steps.hidden_states)
            else:
                act_expl, log_probs = self._expl_strat(steps.observations)
            q1_pi = self.q_fcn_1(to.cat([steps.observations, act_expl], dim=1))
            q2_pi = self.q_fcn_2(to.cat([steps.observations, act_expl], dim=1))
            min_q_pi = to.min(q1_pi, q2_pi)
            policy_loss = to.mean(self.alpha * log_probs - min_q_pi)
            policy_losses[b] = policy_loss.data
            expl_strat_stds[b] = to.mean(self._expl_strat.std.data)

            # Do one optimization step for each optimizer, and clip the gradients if desired
            # Q-fcn 1
            self._optim_q_fcn_1.zero_grad()
            q_1_loss.backward()
            q_fcn_1_grad_norm[b] = self.clip_grad(self.q_fcn_1, None)
            self._optim_q_fcn_1.step()
            # Q-fcn 2
            self._optim_q_fcn_2.zero_grad()
            q_2_loss.backward()
            q_fcn_2_grad_norm[b] = self.clip_grad(self.q_fcn_2, None)
            self._optim_q_fcn_2.step()
            # Policy
            self._optim_policy.zero_grad()
            policy_loss.backward()
            policy_grad_norm[b] = self.clip_grad(self._expl_strat.policy,
                                                 self.max_grad_norm)
            self._optim_policy.step()

            if self.learn_alpha:
                # Compute entropy coefficient loss
                alpha_loss = -to.mean(
                    self._log_alpha *
                    (log_probs.detach() + self.target_entropy))
                # Do one optimizer step for the entropy coefficient optimizer
                self._alpha_optim.zero_grad()
                alpha_loss.backward()
                self._alpha_optim.step()

            # Soft-update the target networks
            if (self._curr_iter * self.num_batch_updates +
                    b) % self.target_update_intvl == 0:
                SAC.soft_update(self.q_targ_1, self.q_fcn_1, self.tau)
                SAC.soft_update(self.q_targ_2, self.q_fcn_2, self.tau)

        # Update the learning rate if the schedulers have been specified
        if self._lr_scheduler_policy is not None:
            self._lr_scheduler_policy.step()
            self._lr_scheduler_q_fcn_1.step()
            self._lr_scheduler_q_fcn_2.step()

        # Logging
        self.logger.add_value('Q1 loss', to.mean(q_fcn_1_losses).item())
        self.logger.add_value('Q2 loss', to.mean(q_fcn_2_losses).item())
        self.logger.add_value('policy loss', to.mean(policy_losses).item())
        self.logger.add_value('avg policy grad norm',
                              to.mean(policy_grad_norm).item())
        self.logger.add_value('avg expl strat std',
                              to.mean(expl_strat_stds).item())
        self.logger.add_value('alpha', self.alpha.item())
        if self._lr_scheduler_policy is not None:
            self.logger.add_value('learning rate',
                                  self._lr_scheduler_policy.get_lr())
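
The `alpha_loss` above is SAC's automatic entropy-temperature tuning: `self.alpha` is presumably the exponential of `self._log_alpha`, and `self.target_entropy` is commonly chosen as the negative of the action dimensionality. A minimal stand-alone sketch of that mechanism (the variable names and the `-act_dim` heuristic are assumptions, not necessarily pyrado's defaults):

import torch as to

act_dim = 2                               # dimensionality of the action space
target_entropy = -float(act_dim)          # common heuristic for the entropy target
log_alpha = to.zeros(1, requires_grad=True)
alpha_optim = to.optim.Adam([log_alpha], lr=3e-4)

log_probs = to.randn(32).detach()         # stand-in for log pi(a|s) of a sampled batch
alpha_loss = -to.mean(log_alpha*(log_probs + target_entropy))
alpha_optim.zero_grad()
alpha_loss.backward()
alpha_optim.step()
alpha = log_alpha.exp()                   # coefficient used in the critic and policy losses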

    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        if meta_info is None:
            # This instance is not a subroutine of a meta-algorithm
            joblib.dump(self._env, osp.join(self._save_dir, 'env.pkl'))
            to.save(self.q_targ_1, osp.join(self._save_dir, 'target1.pt'))
            to.save(self.q_targ_2, osp.join(self._save_dir, 'target2.pt'))
        else:
            # This algorithm instance is a subroutine of a meta-algorithm
            if 'prefix' in meta_info and 'suffix' in meta_info:
                to.save(
                    self.q_targ_1,
                    osp.join(
                        self._save_dir,
                        f"{meta_info['prefix']}_target1_{meta_info['suffix']}.pt"
                    ))
                to.save(
                    self.q_targ_2,
                    osp.join(
                        self._save_dir,
                        f"{meta_info['prefix']}_target2_{meta_info['suffix']}.pt"
                    ))
            elif 'prefix' in meta_info and 'suffix' not in meta_info:
                to.save(
                    self.q_targ_1,
                    osp.join(self._save_dir,
                             f"{meta_info['prefix']}_target1.pt"))
                to.save(
                    self.q_targ_2,
                    osp.join(self._save_dir,
                             f"{meta_info['prefix']}_target2.pt"))
            elif 'prefix' not in meta_info and 'suffix' in meta_info:
                to.save(
                    self.q_targ_1,
                    osp.join(self._save_dir,
                             f"target1_{meta_info['suffix']}.pt"))
                to.save(
                    self.q_targ_2,
                    osp.join(self._save_dir,
                             f"target2_{meta_info['suffix']}.pt"))
            else:
                raise NotImplementedError

    def load_snapshot(self, load_dir: str = None, meta_info: dict = None):
        # Get the directory to load from
        ld = load_dir if load_dir is not None else self._save_dir
        super().load_snapshot(ld, meta_info)

        if meta_info is None:
            # This algorithm instance is not a subroutine of a meta-algorithm
            self._env = joblib.load(osp.join(ld, 'env.pkl'))
            self.q_targ_1.load_state_dict(
                to.load(osp.join(ld, 'target1.pt')).state_dict())
            self.q_targ_2.load_state_dict(
                to.load(osp.join(ld, 'target2.pt')).state_dict())
        else:
            # This algorithm instance is a subroutine of a meta-algorithm
            if 'prefix' in meta_info and 'suffix' in meta_info:
                self.q_targ_1.load_state_dict(
                    to.load(
                        osp.join(
                            ld,
                            f"{meta_info['prefix']}_target1_{meta_info['suffix']}.pt"
                        )).state_dict())
                self.q_targ_2.load_state_dict(
                    to.load(
                        osp.join(
                            ld,
                            f"{meta_info['prefix']}_target2_{meta_info['suffix']}.pt"
                        )).state_dict())
            elif 'prefix' in meta_info and 'suffix' not in meta_info:
                self.q_targ_1.load_state_dict(
                    to.load(osp.join(
                        ld, f"{meta_info['prefix']}_target1.pt")).state_dict())
                self.q_targ_2.load_state_dict(
                    to.load(osp.join(
                        ld, f"{meta_info['prefix']}_target2.pt")).state_dict())
            elif 'prefix' not in meta_info and 'suffix' in meta_info:
                self.q_targ_1.load_state_dict(
                    to.load(osp.join(
                        ld, f"target1_{meta_info['suffix']}.pt")).state_dict())
                self.q_targ_2.load_state_dict(
                    to.load(osp.join(
                        ld, f"target2_{meta_info['suffix']}.pt")).state_dict())
            else:
                raise NotImplementedError

    def reset(self, seed: int = None):
        # Reset the exploration strategy, internal variables and the random seeds
        super().reset(seed)

        # Re-initialize sampler in case env or policy changed
        self.sampler.reinit()

        # Reset the replay memory
        self._memory.reset()

        # Reset the learning rate schedulers
        if self._lr_scheduler_policy is not None:
            self._lr_scheduler_policy.last_epoch = -1
        if self._lr_scheduler_q_fcn_1 is not None:
            self._lr_scheduler_q_fcn_1.last_epoch = -1
        if self._lr_scheduler_q_fcn_2 is not None:
            self._lr_scheduler_q_fcn_2.last_epoch = -1
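
`SAC.soft_update` in the update loop above refreshes the target critics `q_targ_1`/`q_targ_2`; it presumably performs the Polyak averaging that is standard in SAC. A minimal sketch of such an update (the function name and the convention that `tau` weights the old target parameters are assumptions, not pyrado's verified API):

import torch as to
import torch.nn as nn

def soft_update_(target: nn.Module, source: nn.Module, tau: float = 0.995):
    """ Polyak-average the source parameters into the target network in place. """
    with to.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.data.mul_(tau).add_((1. - tau)*s_param.data)
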
Example #14
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 critic: GAE,
                 max_iter: int,
                 min_rollouts: int = None,
                 min_steps: int = None,
                 value_fcn_coeff: float = 0.5,
                 entropy_coeff: float = 1e-3,
                 batch_size: int = 32,
                 std_init: float = 1.0,
                 max_grad_norm: float = None,
                 num_sampler_envs: int = 4,
                 lr: float = 5e-4,
                 lr_scheduler=None,
                 lr_scheduler_hparam: [dict, None] = None,
                 logger: StepLogger = None):
        r"""
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment which the policy operates
        :param policy: policy to be updated
        :param critic: advantage estimation function $A(s,a) = Q(s,a) - V(s)$
        :param max_iter: number of iterations (policy updates)
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param value_fcn_coeff: weighting factor of the value function term in the combined loss, specific to PPO2
        :param entropy_coeff: weighting factor of the entropy term in the combined loss, specific to PPO2
        :param batch_size: number of samples per policy update batch
        :param std_init: initial standard deviation on the actions for the exploration noise
        :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping
        :param num_sampler_envs: number of environments for parallel sampling
        :param lr: (initial) learning rate for the optimizer, which can be modified by the scheduler.
                   By default, the learning rate is constant.
        :param lr_scheduler: learning rate scheduler that does one step per epoch (pass through the whole data set)
        :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler
        :param logger: logger for every step of the algorithm
        """
        # Call ActorCritic's constructor
        super().__init__(env, policy, critic, save_dir, max_iter, logger)

        # Store the inputs
        self.min_rollouts = min_rollouts
        self.min_steps = min_steps
        self.value_fcn_coeff = value_fcn_coeff
        self.entropy_coeff = entropy_coeff
        self.batch_size = batch_size
        self.max_grad_norm = max_grad_norm

        # Initialize
        self._expl_strat = NormalActNoiseExplStrat(self._policy,
                                                   std_init=std_init)
        self.sampler = ParallelSampler(env,
                                       self.expl_strat,
                                       num_envs=num_sampler_envs,
                                       min_steps=min_steps,
                                       min_rollouts=min_rollouts)
        self.optim = to.optim.RMSprop(
            [{
                'params': self._policy.parameters()
            }, {
                'params': self.expl_strat.noise.parameters()
            }, {
                'params': self._critic.value_fcn.parameters()
            }],
            lr=lr,
            eps=1e-5)
        self._lr_scheduler = lr_scheduler
        self._lr_scheduler_hparam = lr_scheduler_hparam
        if lr_scheduler is not None:
            self._lr_scheduler = lr_scheduler(self.optim,
                                              **lr_scheduler_hparam)
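
According to the docstring, `value_fcn_coeff` and `entropy_coeff` weight the value-function and entropy terms of a combined actor-critic loss. A hedged sketch of how such a loss is typically assembled (the function below is illustrative and not taken from pyrado's update step):

import torch as to

def combined_loss(policy_loss: to.Tensor, value_fcn_loss: to.Tensor, entropy: to.Tensor,
                  value_fcn_coeff: float = 0.5, entropy_coeff: float = 1e-3) -> to.Tensor:
    # The entropy term enters with a negative sign, so higher entropy lowers the loss
    # and thereby encourages exploration
    return policy_loss + value_fcn_coeff*value_fcn_loss - entropy_coeff*entropy

# e.g. combined_loss(to.tensor(0.3), to.tensor(1.2), to.tensor(0.9)) -> tensor(0.8991)
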
Example #15
    plt.show()


if __name__ == '__main__':
    # Set up environment
    dp_gt = dict(m=2., k=20., d=0.8)  # ground truth
    dp_init = dict(m=1.0, k=24., d=0.4)  # initial guess
    dt = 1/50.
    env = OneMassOscillatorSim(dt=dt, max_steps=400)
    env.reset(domain_param=dp_gt)

    # Set up policy
    policy = DummyPolicy(env.spec)

    # Sample
    sampler = ParallelSampler(env, policy, num_envs=1, min_rollouts=50, seed=1)
    ros = sampler.sample()

    # Pyro
    pyro.set_rng_seed(1001)
    pyro.enable_validation(True)

    train(
        SVI(model=model,
            guide=guide,
            optim=optim.Adam({'lr': 0.01}),
            # optim=optim.SGD({'lr': 0.001, 'momentum': 0.1}),
            loss=Trace_ELBO()),
        rollouts=ros, prior=dp_init
    )
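
The `model`, `guide`, and `train` callables used above are defined earlier in the original script and are not shown here. Purely as an illustration of what a mean-field guide over the three domain parameters could look like (all names, the LogNormal choice, and the parameter initialization below are assumptions, not the script's actual code):

import pyro
import pyro.distributions as dist
import torch as to
from torch.distributions import constraints

def guide(rollouts, prior):
    # One positive LogNormal factor per domain parameter (m, k, d)
    for name, init in prior.items():
        loc = pyro.param(f'{name}_loc', to.tensor(float(init)).log())
        scale = pyro.param(f'{name}_scale', to.tensor(0.1), constraint=constraints.positive)
        pyro.sample(name, dist.LogNormal(loc, scale))
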
Example #16
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: DiscrActQValFNNPolicy,
                 memory_size: int,
                 eps_init: float,
                 eps_schedule_gamma: float,
                 gamma: float,
                 max_iter: int,
                 num_batch_updates: int,
                 target_update_intvl: int = 5,
                 min_rollouts: int = None,
                 min_steps: int = None,
                 batch_size: int = 256,
                 num_sampler_envs: int = 4,
                 max_grad_norm: float = 0.5,
                 lr: float = 5e-4,
                 lr_scheduler=None,
                 lr_scheduler_hparam: [dict, None] = None,
                 logger: StepLogger = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: environment which the policy operates
        :param policy: (current) Q-network updated by this algorithm
        :param memory_size: number of transitions in the replay memory buffer
        :param eps_init: initial value for the probability of taking a random action, constant if `eps_schedule_gamma==1`
        :param eps_schedule_gamma: temporal discount factor for the exponential decay of epsilon
        :param gamma: temporal discount factor for the state values
        :param max_iter: number of iterations (policy updates)
        :param num_batch_updates: number of batch updates per algorithm step
        :param target_update_intvl: number of iterations that pass before updating the target network
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param batch_size: number of samples per policy update batch
        :param num_sampler_envs: number of environments for parallel sampling
        :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping
        :param lr: (initial) learning rate for the optimizer, which can be modified by the scheduler.
                   By default, the learning rate is constant.
        :param lr_scheduler: learning rate scheduler that does one step per epoch (pass through the whole data set)
        :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler
        :param logger: logger for every step of the algorithm
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        if not isinstance(policy, DiscrActQValFNNPolicy):
            raise pyrado.TypeErr(given=policy, expected_type=DiscrActQValFNNPolicy)

        if logger is None:
            # Create logger that only logs every 100 steps of the algorithm
            logger = StepLogger(print_interval=100)
            logger.printers.append(ConsolePrinter())
            logger.printers.append(CSVPrinter(osp.join(save_dir, 'progress.csv')))

        # Call Algorithm's constructor
        super().__init__(save_dir, max_iter, policy, logger)

        # Store the inputs
        self._env = env
        self.target = deepcopy(self._policy)
        self.target.eval()  # will not be trained using the optimizer
        self._memory_size = memory_size
        self.eps = eps_init
        self.gamma = gamma
        self.target_update_intvl = target_update_intvl
        self.num_batch_updates = num_batch_updates
        self.batch_size = batch_size
        self.max_grad_norm = max_grad_norm

        # Initialize
        self._expl_strat = EpsGreedyExplStrat(self._policy, eps_init, eps_schedule_gamma)
        self._memory = ReplayMemory(memory_size)
        self.sampler = ParallelSampler(
            env, self._expl_strat,
            num_envs=1,
            min_steps=min_steps,
            min_rollouts=min_rollouts
        )
        self.sampler_eval = ParallelSampler(
            env, self._policy,
            num_envs=num_sampler_envs,
            min_steps=100*env.max_steps,
            min_rollouts=None
        )
        self.optim = to.optim.RMSprop([{'params': self._policy.parameters()}], lr=lr)
        self._lr_scheduler = lr_scheduler
        self._lr_scheduler_hparam = lr_scheduler_hparam
        if lr_scheduler is not None:
            self._lr_scheduler = lr_scheduler(self.optim, **lr_scheduler_hparam)
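
`eps_schedule_gamma` presumably controls an exponential decay of the exploration probability over the iterations, which is why the docstring notes that epsilon stays constant for `eps_schedule_gamma == 1`. A small sketch of such a schedule (an assumption about what `EpsGreedyExplStrat.schedule_eps` computes, not its verified implementation):

def scheduled_eps(eps_init: float, eps_schedule_gamma: float, curr_iter: int) -> float:
    # Exponential decay; reduces to a constant schedule for eps_schedule_gamma == 1
    return eps_init*eps_schedule_gamma**curr_iter

# e.g. eps_init = 1.0 and eps_schedule_gamma = 0.99 give eps ~= 0.605 after 50 iterations
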
Example #17
class DQL(Algorithm):
    """
    Deep Q-Learning (without bells and whistles)

    .. seealso::
        [1] V. Mnih et.al., "Human-level control through deep reinforcement learning", Nature, 2015
    """

    name: str = 'dql'

    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: DiscrActQValFNNPolicy,
                 memory_size: int,
                 eps_init: float,
                 eps_schedule_gamma: float,
                 gamma: float,
                 max_iter: int,
                 num_batch_updates: int,
                 target_update_intvl: int = 5,
                 min_rollouts: int = None,
                 min_steps: int = None,
                 batch_size: int = 256,
                 num_sampler_envs: int = 4,
                 max_grad_norm: float = 0.5,
                 lr: float = 5e-4,
                 lr_scheduler=None,
                 lr_scheduler_hparam: [dict, None] = None,
                 logger: StepLogger = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: environment which the policy operates
        :param policy: (current) Q-network updated by this algorithm
        :param memory_size: number of transitions in the replay memory buffer
        :param eps_init: initial value for the probability of taking a random action, constant if `eps_schedule_gamma==1`
        :param eps_schedule_gamma: temporal discount factor for the exponential decay of epsilon
        :param gamma: temporal discount factor for the state values
        :param max_iter: number of iterations (policy updates)
        :param num_batch_updates: number of batch updates per algorithm step
        :param target_update_intvl: number of iterations that pass before updating the target network
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param batch_size: number of samples per policy update batch
        :param num_sampler_envs: number of environments for parallel sampling
        :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping
        :param lr: (initial) learning rate for the optimizer, which can be modified by the scheduler.
                   By default, the learning rate is constant.
        :param lr_scheduler: learning rate scheduler that does one step per epoch (pass through the whole data set)
        :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler
        :param logger: logger for every step of the algorithm
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        if not isinstance(policy, DiscrActQValFNNPolicy):
            raise pyrado.TypeErr(given=policy, expected_type=DiscrActQValFNNPolicy)

        if logger is None:
            # Create logger that only logs every 100 steps of the algorithm
            logger = StepLogger(print_interval=100)
            logger.printers.append(ConsolePrinter())
            logger.printers.append(CSVPrinter(osp.join(save_dir, 'progress.csv')))

        # Call Algorithm's constructor
        super().__init__(save_dir, max_iter, policy, logger)

        # Store the inputs
        self._env = env
        self.target = deepcopy(self._policy)
        self.target.eval()  # will not be trained using the optimizer
        self._memory_size = memory_size
        self.eps = eps_init
        self.gamma = gamma
        self.target_update_intvl = target_update_intvl
        self.num_batch_updates = num_batch_updates
        self.batch_size = batch_size
        self.max_grad_norm = max_grad_norm

        # Initialize
        self._expl_strat = EpsGreedyExplStrat(self._policy, eps_init, eps_schedule_gamma)
        self._memory = ReplayMemory(memory_size)
        self.sampler = ParallelSampler(
            env, self._expl_strat,
            num_envs=1,
            min_steps=min_steps,
            min_rollouts=min_rollouts
        )
        self.sampler_eval = ParallelSampler(
            env, self._policy,
            num_envs=num_sampler_envs,
            min_steps=100*env.max_steps,
            min_rollouts=None
        )
        self.optim = to.optim.RMSprop([{'params': self._policy.parameters()}], lr=lr)
        self._lr_scheduler = lr_scheduler
        self._lr_scheduler_hparam = lr_scheduler_hparam
        if lr_scheduler is not None:
            self._lr_scheduler = lr_scheduler(self.optim, **lr_scheduler_hparam)

    @property
    def expl_strat(self) -> EpsGreedyExplStrat:
        return self._expl_strat

    @property
    def memory(self) -> ReplayMemory:
        """ Get the replay memory. """
        return self._memory

    def step(self, snapshot_mode: str, meta_info: dict = None):
        # Sample steps and store them in the replay memory
        ros = self.sampler.sample()
        self._memory.push(ros)

        while len(self._memory) < self.memory.capacity:
            # Warm-up phase
            print_cbt('Collecting samples until the replay memory is full.', 'w')
            # Sample steps and store them in the replay memory
            ros = self.sampler.sample()
            self._memory.push(ros)

        # Log return-based metrics
        if self._curr_iter % self.logger.print_interval == 0:
            ros = self.sampler_eval.sample()
            rets = [ro.undiscounted_return() for ro in ros]
            ret_max = np.max(rets)
            ret_med = np.median(rets)
            ret_avg = np.mean(rets)
            ret_min = np.min(rets)
            ret_std = np.std(rets)
        else:
            ret_max, ret_med, ret_avg, ret_min, ret_std = 5*[-pyrado.inf]  # dummy values
        self.logger.add_value('max return', np.round(ret_max, 4))
        self.logger.add_value('median return', np.round(ret_med, 4))
        self.logger.add_value('avg return', np.round(ret_avg, 4))
        self.logger.add_value('min return', np.round(ret_min, 4))
        self.logger.add_value('std return', np.round(ret_std, 4))
        self.logger.add_value('avg rollout length', np.round(np.mean([ro.length for ro in ros]), 2))
        self.logger.add_value('num rollouts', len(ros))
        self.logger.add_value('avg memory reward', np.round(self._memory.avg_reward(), 4))

        # Use data in the memory to update the policy and the target Q-function
        self.update()

        # Save snapshot data
        self.make_snapshot(snapshot_mode, float(ret_avg), meta_info)

    def loss_fcn(self, q_vals: to.Tensor, expected_q_vals: to.Tensor) -> to.Tensor:
        r"""
        The Huber loss function on the one-step TD error $\delta = Q(s,a) - (r + \gamma \max_a Q(s^\prime, a))$.

        :param q_vals: state-action values $Q(s,a)$, from policy network
        :param expected_q_vals: expected state-action values $r + \gamma \max_a Q(s^\prime, a)$, from target network
        :return: loss value
        """
        return nn.functional.smooth_l1_loss(q_vals, expected_q_vals)
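
`smooth_l1_loss` is PyTorch's Huber loss with a default threshold of 1: quadratic for small TD errors and linear for large ones, which limits the influence of outlier transitions. A quick numeric check of the piecewise definition (using the default threshold):

import torch as to
import torch.nn as nn

delta = to.tensor([0.5, 2.0])  # two one-step TD errors
manual = to.where(delta.abs() < 1., 0.5*delta**2, delta.abs() - 0.5)
auto = nn.functional.smooth_l1_loss(delta, to.zeros_like(delta), reduction='none')
assert to.allclose(manual, auto)  # both are tensor([0.1250, 1.5000])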

    def update(self):
        """ Update the policy's and target Q-function's parameters on transitions sampled from the replay memory. """
        losses = to.zeros(self.num_batch_updates)
        policy_grad_norm = to.zeros(self.num_batch_updates)

        for b in tqdm(range(self.num_batch_updates), total=self.num_batch_updates,
                      desc=f'Updating', unit='batches', file=sys.stdout, leave=False):

            # Sample steps and the associated next step from the replay memory
            steps, next_steps = self._memory.sample(self.batch_size)
            steps.torch(data_type=to.get_default_dtype())
            next_steps.torch(data_type=to.get_default_dtype())

            # Create masks for the non-final observations
            not_done = to.tensor(1. - steps.done, dtype=to.get_default_dtype())

            # Compute the state-action values Q(s,a) using the current DQN policy
            q_vals = self.expl_strat.policy.q_values_chosen(steps.observations)

            # Compute the second term of TD-error
            next_v_vals = self.target.q_values_chosen(next_steps.observations).detach()
            expected_q_val = steps.rewards + not_done*self.gamma*next_v_vals

            # Compute the loss, clip the gradients if desired, and do one optimization step
            loss = self.loss_fcn(q_vals, expected_q_val)
            losses[b] = loss.data
            self.optim.zero_grad()
            loss.backward()
            policy_grad_norm[b] = self.clip_grad(self.expl_strat.policy, self.max_grad_norm)
            self.optim.step()

            # Update the target network by copying all weights and biases from the DQN policy
            if (self._curr_iter*self.num_batch_updates + b)%self.target_update_intvl == 0:
                self.target.load_state_dict(self.expl_strat.policy.state_dict())

        # Schedule the exploration parameter epsilon
        self.expl_strat.schedule_eps(self._curr_iter)

        # Update the learning rate if a scheduler has been specified
        if self._lr_scheduler is not None:
            self._lr_scheduler.step()

        # Logging
        with to.no_grad():
            self.logger.add_value('loss after', to.mean(losses).item())
        self.logger.add_value('expl strat eps', self.expl_strat.eps.item())
        self.logger.add_value('avg policy grad norm', to.mean(policy_grad_norm).item())
        if self._lr_scheduler is not None:
            self.logger.add_value('learning rate', self._lr_scheduler.get_lr())

    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        if meta_info is None:
            # This instance is not a subroutine of a meta-algorithm
            joblib.dump(self._env, osp.join(self._save_dir, 'env.pkl'))
            to.save(self.target, osp.join(self._save_dir, 'target.pt'))
        else:
            # This algorithm instance is a subroutine of a meta-algorithm
            if 'prefix' in meta_info and 'suffix' in meta_info:
                to.save(self.target,
                        osp.join(self._save_dir, f"{meta_info['prefix']}_target_{meta_info['suffix']}.pt"))
            elif 'prefix' in meta_info and 'suffix' not in meta_info:
                to.save(self.target, osp.join(self._save_dir, f"{meta_info['prefix']}_target.pt"))
            elif 'prefix' not in meta_info and 'suffix' in meta_info:
                to.save(self.target, osp.join(self._save_dir, f"target_{meta_info['suffix']}.pt"))
            else:
                raise NotImplementedError

    def load_snapshot(self, load_dir: str = None, meta_info: dict = None):
        # Get the directory to load from
        ld = load_dir if load_dir is not None else self._save_dir
        super().load_snapshot(ld, meta_info)

        if meta_info is None:
            # This algorithm instance is not a subroutine of a meta-algorithm
            self._env = joblib.load(osp.join(ld, 'env.pkl'))
            self.target.load_state_dict(to.load(osp.join(ld, 'target.pt')).state_dict())
        else:
            # This algorithm instance is a subroutine of a meta-algorithm
            if 'prefix' in meta_info and 'suffix' in meta_info:
                self.target.load_state_dict(
                    to.load(osp.join(ld, f"{meta_info['prefix']}_target_{meta_info['suffix']}.pt")).state_dict()
                )
            elif 'prefix' in meta_info and 'suffix' not in meta_info:
                self.target.load_state_dict(
                    to.load(osp.join(ld, f"{meta_info['prefix']}_target.pt")).state_dict()
                )
            elif 'prefix' not in meta_info and 'suffix' in meta_info:
                self.target.load_state_dict(
                    to.load(osp.join(ld, f"target_{meta_info['suffix']}.pt")).state_dict()
                )
            else:
                raise NotImplementedError

    def reset(self, seed: int = None):
        # Reset the exploration strategy, internal variables and the random seeds
        super().reset(seed)

        # Re-initialize sampler in case env or policy changed
        self.sampler.reinit()

        # Reset the replay memory
        self._memory.reset()

        # Reset the learning rate scheduler
        if self._lr_scheduler is not None:
            self._lr_scheduler.last_epoch = -1