def test_cuda_sampling_w_dr(default_bob, bob_pert):
    # Add randomizer
    env = DomainRandWrapperLive(default_bob, bob_pert)

    # Use a simple policy
    policy = FNNPolicy(env.spec, hidden_sizes=[8], hidden_nonlin=to.tanh, use_cuda=True)

    # Create the sampler
    sampler = ParallelSampler(env, policy, num_envs=2, min_rollouts=10)

    samples = sampler.sample()
    assert samples is not None
def __init__(self,
             save_dir: str,
             env: Env,
             policy: Policy,
             min_rollouts: int = None,
             min_steps: int = None,
             num_sampler_envs: int = 4,
             logger: StepLogger = None,
             sampler: SamplerBase = None,
             ball_z_dim_mismatch: bool = True):
    """
    Constructor

    :param save_dir: directory to save the snapshots i.e. the results in
    :param env: the environment which the policy operates
    :param policy: policy which this algorithm is creating
    :param min_rollouts: minimum number of rollouts sampled per policy update batch
    :param min_steps: minimum number of state transitions sampled per policy update batch
    :param num_sampler_envs: number of environments for parallel sampling
    :param logger: logger for every step of the algorithm, if `None` the default logger will be created
    :param sampler: sampler used to gather the rollouts, if `None` a `ParallelSampler` is created
    :param ball_z_dim_mismatch: only useful for BallOnPlate5DSim, set to True if the controller does not have
                                the z component (relative position) of the ball in the state vector,
                                i.e. state is 14-dim instead of 16-dim
    """
    if not isinstance(env, Env):
        raise pyrado.TypeErr(given=env, expected_type=Env)
    if not isinstance(policy, LinearPolicy):
        raise pyrado.TypeErr(given=policy, expected_type=LinearPolicy)

    # Call Algorithm's constructor
    super().__init__(save_dir, 1, policy, logger)

    # Store the inputs
    self._env = env
    self.ball_z_dim_mismatch = ball_z_dim_mismatch

    # Initialize variables for checking and evaluating
    if sampler is None:
        sampler = ParallelSampler(env, self._policy, num_envs=num_sampler_envs,
                                  min_steps=min_steps, min_rollouts=min_rollouts)
    self.sampler = sampler
    self.eigvals = np.array([pyrado.inf])  # initialize with something positive
def test_adr_reward_generator(env):
    reference_env = env
    random_env = deepcopy(env)
    reward_generator = RewardGenerator(
        env_spec=random_env.spec,
        batch_size=100,
        reward_multiplier=1,
        logger=None
    )
    policy = FNNPolicy(reference_env.spec, hidden_sizes=[32], hidden_nonlin=to.tanh)
    dr = get_default_randomizer_omo()
    dr.randomize(num_samples=1)
    random_env.domain_param = dr.get_params(format='dict', dtype='numpy')
    reference_sampler = ParallelSampler(reference_env, policy, num_envs=4, min_steps=10000)
    random_sampler = ParallelSampler(random_env, policy, num_envs=4, min_steps=10000)

    losses = []
    for i in range(50):
        reference_traj = StepSequence.concat(reference_sampler.sample())
        random_traj = StepSequence.concat(random_sampler.sample())
        losses.append(reward_generator.train(reference_traj, random_traj, 10))
    assert losses[-1] < losses[0]
def train_and_eval(trial: optuna.Trial, ex_dir: str, seed: [int, None]): """ Objective function for the Optuna `Study` to maximize. .. note:: Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments. :param trial: Optuna Trial object for hyper-parameter optimization :param ex_dir: experiment's directory, i.e. the parent directory for all trials in this study :param seed: seed value for the random number generators, pass `None` for no seeding :return: objective function value """ # Synchronize seeds between Optuna trials pyrado.set_seed(seed) # Environment env = QBallBalancerSim(dt=1/250., max_steps=1500) env = ActNormWrapper(env) # Policy policy = FNNPolicy( spec=env.spec, hidden_sizes=trial.suggest_categorical('hidden_sizes_policy', [[16, 16], [32, 32], [64, 64]]), hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_policy', ['to_tanh', 'to_relu'])), ) # Critic value_fcn = FNN( input_size=env.obs_space.flat_dim, output_size=1, hidden_sizes=trial.suggest_categorical('hidden_sizes_critic', [[16, 16], [32, 32], [64, 64]]), hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_critic', ['to_tanh', 'to_relu'])), ) critic_hparam = dict( gamma=trial.suggest_uniform('gamma_critic', 0.99, 1.), lamda=trial.suggest_uniform('lamda_critic', 0.95, 1.), num_epoch=trial.suggest_int('num_epoch_critic', 1, 10), batch_size=100, lr=trial.suggest_loguniform('lr_critic', 1e-5, 1e-3), standardize_adv=trial.suggest_categorical('standardize_adv_critic', [True, False]), # max_grad_norm=5., # lr_scheduler=scheduler.StepLR, # lr_scheduler_hparam=dict(step_size=10, gamma=0.9) # lr_scheduler=scheduler.ExponentialLR, # lr_scheduler_hparam=dict(gamma=0.99) ) critic = GAE(value_fcn, **critic_hparam) # Algorithm algo_hparam = dict( num_sampler_envs=1, # parallelize via optuna n_jobs max_iter=500, min_steps=25*env.max_steps, num_epoch=trial.suggest_int('num_epoch_algo', 1, 10), eps_clip=trial.suggest_uniform('eps_clip_algo', 0.05, 0.2), batch_size=100, std_init=0.9, lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3), # max_grad_norm=5., # lr_scheduler=scheduler.StepLR, # lr_scheduler_hparam=dict(step_size=10, gamma=0.9) # lr_scheduler=scheduler.ExponentialLR, # lr_scheduler_hparam=dict(gamma=0.99) ) algo = PPO(osp.join(ex_dir, f'trial_{trial.number}'), env, policy, critic, **algo_hparam) # Train without saving the results algo.train(snapshot_mode='latest', seed=seed) # Evaluate min_rollouts = 1000 sampler = ParallelSampler(env, policy, num_envs=20, min_rollouts=min_rollouts) ros = sampler.sample() mean_ret = sum([r.undiscounted_return() for r in ros])/min_rollouts return mean_ret
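# The docstring of train_and_eval() notes that Optuna only passes `trial`, so the extra arguments are bound
# with functools.partial. A minimal sketch of launching the optimization; the experiment directory, trial
# count, job count, and seed below are hypothetical placeholders.
import functools
import os.path as osp

import optuna

ex_dir = osp.join('experiments', 'hpo_qbb_ppo')  # hypothetical experiment directory
study = optuna.create_study(direction='maximize')  # train_and_eval() returns the mean return
study.optimize(functools.partial(train_and_eval, ex_dir=ex_dir, seed=1001),
               n_trials=100, n_jobs=16)
print(study.best_params)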
def __init__(self,
             save_dir: str,
             env: Env,
             policy: Policy,
             critic: GAE,
             max_iter: int,
             min_rollouts: int = None,
             min_steps: int = None,
             num_epoch: int = 3,
             eps_clip: float = 0.1,
             batch_size: int = 64,
             std_init: float = 1.0,
             num_sampler_envs: int = 4,
             max_grad_norm: float = None,
             lr: float = 5e-4,
             lr_scheduler=None,
             lr_scheduler_hparam: [dict, None] = None,
             logger: StepLogger = None):
    """
    Constructor

    :param save_dir: directory to save the snapshots i.e. the results in
    :param env: the environment which the policy operates
    :param policy: policy to be updated
    :param critic: advantage estimation function $A(s,a) = Q(s,a) - V(s)$
    :param max_iter: number of iterations (policy updates)
    :param min_rollouts: minimum number of rollouts sampled per policy update batch
    :param min_steps: minimum number of state transitions sampled per policy update batch
    :param num_epoch: number of iterations over all gathered samples during one policy update
    :param eps_clip: max/min probability ratio, see [1]
    :param batch_size: number of samples per policy update batch
    :param std_init: initial standard deviation on the actions for the exploration noise
    :param num_sampler_envs: number of environments for parallel sampling
    :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping
    :param lr: (initial) learning rate for the optimizer which can be modified by the scheduler.
               By default, the learning rate is constant.
    :param lr_scheduler: learning rate scheduler that does one step per epoch (pass through the whole data set)
    :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler
    :param logger: logger for every step of the algorithm, if `None` the default logger will be created

    .. note::
        The Adam optimizer computes individual learning rates for all parameters. Thus, the learning rate
        scheduler schedules the maximum learning rate.
    """
    if not isinstance(env, Env):
        raise pyrado.TypeErr(given=env, expected_type=Env)
    assert isinstance(policy, Policy)

    # Call ActorCritic's constructor
    super().__init__(env, policy, critic, save_dir, max_iter, logger)

    # Store the inputs
    self.num_epoch = num_epoch
    self.eps_clip = eps_clip
    self.batch_size = batch_size
    self.max_grad_norm = max_grad_norm

    # Initialize
    self.log_loss = True
    self._expl_strat = NormalActNoiseExplStrat(self._policy, std_init=std_init)
    self.sampler = ParallelSampler(env, self._expl_strat,
                                   num_envs=num_sampler_envs,
                                   min_steps=min_steps,
                                   min_rollouts=min_rollouts)
    self.optim = to.optim.Adam(
        [{'params': self._expl_strat.policy.parameters()},
         {'params': self._expl_strat.noise.parameters()}],
        lr=lr, eps=1e-5
    )
    self._lr_scheduler = lr_scheduler
    self._lr_scheduler_hparam = lr_scheduler_hparam
    if lr_scheduler is not None:
        self._lr_scheduler = lr_scheduler(self.optim, **lr_scheduler_hparam)
def train_and_eval(trial: optuna.Trial, ex_dir: str, seed: [int, None]): """ Objective function for the Optuna `Study` to maximize. .. note:: Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments. :param trial: Optuna Trial object for hyper-parameter optimization :param ex_dir: experiment's directory, i.e. the parent directory for all trials in this study :param seed: seed value for the random number generators, pass `None` for no seeding :return: objective function value """ # Synchronize seeds between Optuna trials pyrado.set_seed(seed) # Environment env_hparams = dict(dt=1 / 100., max_steps=600) env = QQubeSim(**env_hparams) env = ActNormWrapper(env) # Policy policy_hparam = dict( shared_hidden_sizes=trial.suggest_categorical( 'shared_hidden_sizes_policy', [[16, 16], [32, 32], [64, 64], [16, 16, 16], [32, 32, 32]]), shared_hidden_nonlin=fcn_from_str( trial.suggest_categorical('shared_hidden_nonlin_policy', ['to_tanh', 'to_relu'])), ) policy = TwoHeadedFNNPolicy(spec=env.spec, **policy_hparam) # Critic q_fcn_hparam = dict( hidden_sizes=trial.suggest_categorical( 'hidden_sizes_critic', [[16, 16], [32, 32], [64, 64], [16, 16, 16], [32, 32, 32]]), hidden_nonlin=fcn_from_str( trial.suggest_categorical('hidden_nonlin_critic', ['to_tanh', 'to_relu'])), ) obsact_space = BoxSpace.cat([env.obs_space, env.act_space]) q_fcn_1 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **q_fcn_hparam) q_fcn_2 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **q_fcn_hparam) # Algorithm algo_hparam = dict( num_sampler_envs=1, # parallelize via optuna n_jobs max_iter=100 * env.max_steps, min_steps=trial.suggest_categorical( 'min_steps_algo', [1]), # , 10, env.max_steps, 10*env.max_steps memory_size=trial.suggest_loguniform('memory_size_algo', 1e2 * env.max_steps, 1e4 * env.max_steps), tau=trial.suggest_uniform('tau_algo', 0.99, 1.), alpha_init=trial.suggest_uniform('alpha_init_algo', 0.1, 0.9), learn_alpha=trial.suggest_categorical('learn_alpha_algo', [True, False]), standardize_rew=trial.suggest_categorical('standardize_rew_algo', [False]), gamma=trial.suggest_uniform('gamma_algo', 0.99, 1.), target_update_intvl=trial.suggest_categorical( 'target_update_intvl_algo', [1, 5]), num_batch_updates=trial.suggest_categorical('num_batch_updates_algo', [1, 5]), batch_size=trial.suggest_categorical('batch_size_algo', [128, 256, 512]), lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3), ) csv_logger = create_csv_step_logger( osp.join(ex_dir, f'trial_{trial.number}')) algo = SAC(ex_dir, env, policy, q_fcn_1, q_fcn_2, **algo_hparam, logger=csv_logger) # Train without saving the results algo.train(snapshot_mode='latest', seed=seed) # Evaluate min_rollouts = 1000 sampler = ParallelSampler( env, policy, num_envs=1, min_rollouts=min_rollouts) # parallelize via optuna n_jobs ros = sampler.sample() mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts return mean_ret
def eval_policy(save_dir: [str, None],
                env_real: [RealEnv, SimEnv, MetaDomainRandWrapper],
                policy: Policy,
                montecarlo_estimator: bool,
                prefix: str,
                num_rollouts: int) -> to.Tensor:
    """
    Evaluate a policy on the target system (real-world platform).
    This method is static to facilitate evaluation of specific policies in hindsight.

    :param save_dir: directory to save the snapshots i.e. the results in, if `None` nothing is saved
    :param env_real: target environment for evaluation, in the sim-2-sim case this is another simulation instance
    :param policy: policy to evaluate
    :param montecarlo_estimator: estimate the return with a sample average (`True`) or a lower confidence
                                 bound (`False`) obtained from bootstrapping
    :param prefix: to control the saving for the evaluation of an initial policy, `None` to deactivate
    :param num_rollouts: number of rollouts to collect on the target system
    :return: estimated return in the target domain
    """
    if isinstance(env_real, RealEnv):
        input('Evaluating in the target domain. Hit any key to continue.')
    if save_dir is not None:
        print_cbt(f'Evaluating {prefix}_policy on the target system ...', 'c', bright=True)

    rets_real = to.zeros(num_rollouts)
    if isinstance(env_real, RealEnv):
        # Evaluate sequentially when conducting a sim-to-real experiment
        for i in range(num_rollouts):
            rets_real[i] = rollout(env_real, policy, eval=True, no_close=False).undiscounted_return()
    elif isinstance(env_real, (SimEnv, MetaDomainRandWrapper)):
        # Create a parallel sampler when conducting a sim-to-sim experiment
        sampler = ParallelSampler(env_real, policy, num_envs=1, min_rollouts=num_rollouts)
        ros = sampler.sample()
        for i in range(num_rollouts):
            rets_real[i] = ros[i].undiscounted_return()
    else:
        raise pyrado.TypeErr(given=env_real, expected_type=[RealEnv, SimEnv, MetaDomainRandWrapper])

    if save_dir is not None:
        # Save the evaluation results
        to.save(rets_real, osp.join(save_dir, f'{prefix}_returns_real.pt'))

        print_cbt('target domain performance', bright=True)
        print(tabulate([['mean return', to.mean(rets_real).item()],
                        ['std return', to.std(rets_real)],
                        ['min return', to.min(rets_real)],
                        ['max return', to.max(rets_real)]]))

    if montecarlo_estimator:
        return to.mean(rets_real)
    else:
        return to.from_numpy(bootstrap_ci(rets_real.numpy(), np.mean,
                                          num_reps=1000, alpha=0.05,
                                          ci_sides=1, studentized=False)[1])
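# eval_policy() above optionally returns a one-sided lower confidence bound on the mean return obtained
# from bootstrapping. A self-contained sketch of that idea using a plain percentile bootstrap in NumPy;
# pyrado's bootstrap_ci may use a different bootstrap scheme, and the sample data below are made up.
import numpy as np


def bootstrap_lower_bound(returns: np.ndarray, num_reps: int = 1000, alpha: float = 0.05,
                          seed: int = 0) -> float:
    """ One-sided percentile-bootstrap lower confidence bound on the mean return (illustrative sketch). """
    rng = np.random.default_rng(seed)
    n = len(returns)
    # Resample with replacement and record the mean of every bootstrap sample
    boot_means = np.array([rng.choice(returns, size=n, replace=True).mean() for _ in range(num_reps)])
    # The alpha-quantile of the bootstrap distribution of the mean serves as the pessimistic estimate
    return float(np.quantile(boot_means, alpha))


rets = np.random.default_rng(1).normal(loc=500., scale=50., size=100)  # fake returns for illustration
print(bootstrap_lower_bound(rets))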
def __init__(self, save_dir: str, env: Env, particle_hparam: dict, max_iter: int, num_particles: int, temperature: float, lr: float, horizon: int, std_init: float = 1.0, min_rollouts: int = None, min_steps: int = 10000, num_sampler_envs: int = 4, serial: bool = True, logger: StepLogger = None): """ Constructor :param save_dir: directory to save the snapshots i.e. the results in :param env: the environment which the policy operates :param particle_hparam: hyper-parameters for particle template construction :param max_iter: number of iterations :param num_particles: number of distinct particles :param temperature: the temperature of the SVGD determines how jointly the training takes place :param lr: the learning rate for the update of the particles :param horizon: horizon for each particle :param std_init: initial standard deviation for the exploration :param min_rollouts: minimum number of rollouts sampled per policy update batch :param min_steps: minimum number of state transitions sampled per policy update batch :param num_sampler_envs: number of environments for parallel sampling :param serial: serial mode can be switched off which can be used to partly control the flow of SVPG from outside :param logger: logger for every step of the algorithm """ if not isinstance(env, Env): raise pyrado.TypeErr(given=env, expected_type=Env) if not isinstance(particle_hparam, dict): raise pyrado.TypeErr(given=particle_hparam, expected_type=dict) if not all([ key in particle_hparam for key in ['actor', 'value_fcn', 'critic'] ]): raise AttributeError # Call Algorithm's constructor super().__init__(save_dir, max_iter, policy=None, logger=logger) # Store the inputs self._env = env self.num_particles = num_particles self.horizon = horizon # TODO @Robin: where is the horizon used?! self.lr = lr self.temperature = temperature self.serial = serial # Prepare placeholders for particles self.particles = [None] * num_particles self.expl_strats = [None] * num_particles self.optimizers = [None] * num_particles self.fixed_particles = [None] * num_particles self.fixed_expl_strats = [None] * num_particles self.samplers = [None] * num_particles self.count = 0 self.updatecount = 0 # Particle factory actor = FNNPolicy(spec=env.spec, **particle_hparam['actor']) value_fcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **particle_hparam['value_fcn']) critic = GAE(value_fcn, **particle_hparam['critic']) particle = SVPGParticle(env.spec, actor, critic) for i in range(self.num_particles): self.particles[i] = deepcopy(particle) self.particles[i].init_param() self.expl_strats[i] = NormalActNoiseExplStrat( self.particles[i].actor, std_init) self.optimizers[i] = to.optim.Adam( self.expl_strats[i].parameters(), lr=self.lr) self.fixed_particles[i] = deepcopy(self.particles[i]) self.fixed_expl_strats[i] = deepcopy(self.expl_strats[i]) if self.serial: self.samplers[i] = ParallelSampler(env, self.expl_strats[i], num_sampler_envs, min_rollouts=min_rollouts, min_steps=min_steps)
class ARPL(Algorithm): """ Adversarially Robust Policy Learning (ARPL) .. seealso:: A. Mandlekar, Y. Zhu, A. Garg, L. Fei-Fei, S. Savarese, "Adversarially Robust Policy Learning: Active Construction of Physically-Plausible Perturbations", IROS, 2017 """ name: str = 'arpl' def __init__(self, save_dir: str, env: [SimEnv, StateAugmentationWrapper], subroutine: Algorithm, policy: Policy, expl_strat: StochasticActionExplStrat, max_iter: int, num_rollouts: int = None, steps_num: int = None, apply_dynamics_noise: bool = False, dyn_eps: float = 0.01, dyn_phi: float = 0.1, halfspan: float = 0.25, apply_proccess_noise: bool = False, proc_eps: float = 0.01, proc_phi: float = 0.05, apply_observation_noise: bool = False, obs_eps: float = 0.01, obs_phi: float = 0.05, torch_observation: bool = True, base_seed: int = None, num_sampler_envs: int = 4, logger: StepLogger = None): """ Constructor :param save_dir: directory to save the snapshots i.e. the results in :param env: the environment in which the agent should be trained :param subroutine: algorithm which performs the policy / value-function optimization :param policy: policy to be updated :param expl_strat: the exploration strategy :param max_iter: the maximum number of iterations :param num_rollouts: the number of rollouts to be performed for each update step :param steps_num: the number of steps to be performed for each update step :param apply_dynamics_noise: whether adversarially generated dynamics noise should be applied :param dyn_eps: the intensity of generated dynamics noise :param dyn_phi: the probability of applying dynamics noise :param halfspan: the halfspan of the uniform random distribution used to sample :param apply_proccess_noise: whether adversarially generated process noise should be applied :param proc_eps: the intensity of generated process noise :param proc_phi: the probability of applying process noise :param apply_observation_noise: whether adversarially generated observation noise should be applied :param obs_eps: the intensity of generated observation noise :param obs_phi: the probability of applying observation noise :param torch_observation: a function to provide a differentiable observation :param base_seed: the random seed :param num_sampler_envs: number of environments for parallel sampling :param logger: the logger """ assert isinstance(subroutine, Algorithm) assert isinstance(max_iter, int) and max_iter > 0 super().__init__(save_dir, max_iter, policy, logger) # Get the randomized environment (recommended to make it the most outer one in the chain) # Initialize adversarial wrappers if apply_dynamics_noise: assert isinstance(env, StateAugmentationWrapper) env = AdversarialDynamicsWrapper(env, self.policy, dyn_eps, dyn_phi, halfspan) if apply_proccess_noise: env = AdversarialStateWrapper(env, self.policy, proc_eps, proc_phi, torch_observation=torch_observation) if apply_observation_noise: env = AdversarialObservationWrapper(env, self.policy, obs_eps, obs_phi) self.num_rollouts = num_rollouts self.sampler = ParallelSampler(env, expl_strat, num_envs=num_sampler_envs, min_steps=steps_num, min_rollouts=num_rollouts, seed=base_seed) self._subroutine = subroutine def step(self, snapshot_mode: str, meta_info: dict = None): rollouts = self.sampler.sample() rets = [ro.undiscounted_return() for ro in rollouts] ret_avg = np.mean(rets) ret_med = np.median(rets) ret_std = np.std(rets) self.logger.add_value('num rollouts', len(rollouts)) self.logger.add_value('avg rollout len', np.mean([ro.length for ro in rollouts])) 
        self.logger.add_value('avg return', ret_avg)
        self.logger.add_value('median return', ret_med)
        self.logger.add_value('std return', ret_std)

        # Sub-routine
        self._subroutine.update(rollouts)
        self._subroutine.logger.record_step()
        self._subroutine.make_snapshot(snapshot_mode, ret_avg.item())
""" Script to sample some rollouts using the ParallelSampler """ from tabulate import tabulate from pyrado.environment_wrappers.action_normalization import ActNormWrapper from pyrado.environments.pysim.ball_on_beam import BallOnBeamSim from pyrado.policies.features import FeatureStack, identity_feat, squared_feat from pyrado.policies.linear import LinearPolicy from pyrado.sampling.parallel_sampler import ParallelSampler if __name__ == '__main__': # Set up environment env = BallOnBeamSim(dt=0.02, max_steps=500) env = ActNormWrapper(env) # Set up policy feats = FeatureStack([identity_feat, squared_feat]) policy = LinearPolicy(env.spec, feats) # Set up sampler sampler = ParallelSampler(env, policy, num_envs=2, min_rollouts=2000) # Sample and print ros = sampler.sample() print( tabulate({ 'StepSequence count': len(ros), 'Step count': sum(map(len, ros)), }.items()))
class SAC(Algorithm):
    """
    Soft Actor-Critic (SAC) variant with stochastic policy and two Q-functions and two Q-targets (no V-function)

    .. seealso::
        [1] T. Haarnoja, A. Zhou, P. Abbeel, S. Levine, "Soft Actor-Critic: Off-Policy Maximum Entropy Deep
        Reinforcement Learning with a Stochastic Actor", ICML, 2018

        [2] This implementation was inspired by https://github.com/pranz24/pytorch-soft-actor-critic
        which seems to be based on https://github.com/vitchyr/rlkit
    """

    name: str = 'sac'

    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: TwoHeadedPolicy,
                 q_fcn_1: Policy,
                 q_fcn_2: Policy,
                 memory_size: int,
                 gamma: float,
                 max_iter: int,
                 num_batch_updates: int,
                 tau: float = 0.995,
                 alpha_init: float = 0.2,
                 learn_alpha: bool = True,
                 target_update_intvl: int = 1,
                 standardize_rew: bool = True,
                 batch_size: int = 500,
                 min_rollouts: int = None,
                 min_steps: int = None,
                 num_sampler_envs: int = 4,
                 max_grad_norm: float = 5.,
                 lr: float = 3e-4,
                 lr_scheduler=None,
                 lr_scheduler_hparam: [dict, None] = None,
                 logger: StepLogger = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment which the policy operates
        :param policy: policy to be updated
        :param q_fcn_1: state-action value function $Q(s,a)$, the associated target Q-function is created as
                        a re-initialized copy of this one
        :param q_fcn_2: state-action value function $Q(s,a)$, the associated target Q-function is created as
                        a re-initialized copy of this one
        :param memory_size: number of transitions in the replay memory buffer, e.g. 1000000
        :param gamma: temporal discount factor for the state values
        :param max_iter: number of iterations (policy updates)
        :param num_batch_updates: number of batch updates per algorithm steps
        :param tau: interpolation factor in averaging for target networks, used for the soft update,
                    a.k.a. polyak update, between 0 and 1
        :param alpha_init: initial weighting factor of the entropy term in the loss function
        :param learn_alpha: adapt the weighting factor of the entropy term
        :param target_update_intvl: number of iterations that pass before updating the target network
        :param standardize_rew: bool to flag if the rewards should be standardized
        :param batch_size: number of samples per policy update batch
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param num_sampler_envs: number of environments for parallel sampling
        :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping
        :param lr: (initial) learning rate for the optimizer which can be modified by the scheduler.
                   By default, the learning rate is constant.
        :param lr_scheduler: learning rate scheduler type for the policy and the Q-functions that does one step
                             per `update()` call
        :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        if typed_env(env, ActNormWrapper) is None:
            raise pyrado.TypeErr(
                msg='SAC requires an environment wrapped by an ActNormWrapper!'
) if not isinstance(q_fcn_1, Policy): raise pyrado.TypeErr(given=q_fcn_1, expected_type=Policy) if not isinstance(q_fcn_2, Policy): raise pyrado.TypeErr(given=q_fcn_2, expected_type=Policy) if logger is None: # Create logger that only logs every 100 steps of the algorithm logger = StepLogger(print_interval=100) logger.printers.append(ConsolePrinter()) logger.printers.append( CSVPrinter(osp.join(save_dir, 'progress.csv'))) # Call Algorithm's constructor super().__init__(save_dir, max_iter, policy, logger) # Store the inputs self._env = env self.q_fcn_1 = q_fcn_1 self.q_fcn_2 = q_fcn_2 self.q_targ_1 = deepcopy(self.q_fcn_1) self.q_targ_2 = deepcopy(self.q_fcn_2) self.q_targ_1.eval() self.q_targ_2.eval() self.gamma = gamma self.tau = tau self.learn_alpha = learn_alpha self.target_update_intvl = target_update_intvl self.standardize_rew = standardize_rew self.num_batch_updates = num_batch_updates self.batch_size = batch_size self.max_grad_norm = max_grad_norm # Initialize self._memory = ReplayMemory(memory_size) if policy.is_recurrent: init_expl_policy = RecurrentDummyPolicy(env.spec, policy.hidden_size) else: init_expl_policy = DummyPolicy(env.spec) self.sampler_init = ParallelSampler( env, init_expl_policy, # samples uniformly random from the action space num_envs=num_sampler_envs, min_steps=memory_size, ) self._expl_strat = SACExplStrat( self._policy, std_init=1.) # std_init will be overwritten by 2nd policy head self.sampler = ParallelSampler( env, self._expl_strat, num_envs=1, min_steps=min_steps, # in [2] this would be 1 min_rollouts=min_rollouts # in [2] this would be None ) self.sampler_eval = ParallelSampler(env, self._policy, num_envs=num_sampler_envs, min_steps=100 * env.max_steps, min_rollouts=None) self._optim_policy = to.optim.Adam([{ 'params': self._policy.parameters() }], lr=lr) self._optim_q_fcn_1 = to.optim.Adam( [{ 'params': self.q_fcn_1.parameters() }], lr=lr) self._optim_q_fcn_2 = to.optim.Adam( [{ 'params': self.q_fcn_2.parameters() }], lr=lr) log_alpha_init = to.log( to.tensor(alpha_init, dtype=to.get_default_dtype())) if learn_alpha: # Automatic entropy tuning self._log_alpha = nn.Parameter(log_alpha_init, requires_grad=True) self._alpha_optim = to.optim.Adam([{ 'params': self._log_alpha }], lr=lr) self.target_entropy = -to.prod(to.tensor(env.act_space.shape)) else: self._log_alpha = log_alpha_init self._lr_scheduler_policy = lr_scheduler self._lr_scheduler_hparam = lr_scheduler_hparam if lr_scheduler is not None: self._lr_scheduler_policy = lr_scheduler(self._optim_policy, **lr_scheduler_hparam) self._lr_scheduler_q_fcn_1 = lr_scheduler(self._optim_q_fcn_1, **lr_scheduler_hparam) self._lr_scheduler_q_fcn_2 = lr_scheduler(self._optim_q_fcn_2, **lr_scheduler_hparam) @property def expl_strat(self) -> SACExplStrat: return self._expl_strat @property def memory(self) -> ReplayMemory: """ Get the replay memory. """ return self._memory @property def alpha(self) -> to.Tensor: """ Get the detached entropy coefficient. 
""" return to.exp(self._log_alpha.detach()) def step(self, snapshot_mode: str, meta_info: dict = None): if self._memory.isempty: # Warm-up phase print_cbt( 'Collecting samples until replay memory contains if full.', 'w') # Sample steps and store them in the replay memory ros = self.sampler_init.sample() self._memory.push(ros) else: # Sample steps and store them in the replay memory ros = self.sampler.sample() self._memory.push(ros) # Log return-based metrics if self._curr_iter % self.logger.print_interval == 0: ros = self.sampler_eval.sample() rets = [ro.undiscounted_return() for ro in ros] ret_max = np.max(rets) ret_med = np.median(rets) ret_avg = np.mean(rets) ret_min = np.min(rets) ret_std = np.std(rets) else: ret_max, ret_med, ret_avg, ret_min, ret_std = 5 * [ -pyrado.inf ] # dummy values self.logger.add_value('max return', np.round(ret_max, 4)) self.logger.add_value('median return', np.round(ret_med, 4)) self.logger.add_value('avg return', np.round(ret_avg, 4)) self.logger.add_value('min return', np.round(ret_min, 4)) self.logger.add_value('std return', np.round(ret_std, 4)) self.logger.add_value('avg rollout length', np.round(np.mean([ro.length for ro in ros]), 2)) self.logger.add_value('num rollouts', len(ros)) self.logger.add_value('avg memory reward', np.round(self._memory.avg_reward(), 4)) # Use data in the memory to update the policy and the Q-functions self.update() # Save snapshot data self.make_snapshot(snapshot_mode, float(ret_avg), meta_info) @staticmethod def soft_update(target: nn.Module, source: nn.Module, tau: float = 0.995): """ Moving average update, a.k.a. Polyak update. Modifies the input argument `target`. :param target: PyTroch module with parameters to be updated :param source: PyTroch module with parameters to update to :param tau: interpolation factor for averaging, between 0 and 1 """ if not 0 < tau < 1: raise pyrado.ValueErr(given=tau, g_constraint='0', l_constraint='1') for targ_param, src_param in zip(target.parameters(), source.parameters()): targ_param.data = targ_param.data * tau + src_param.data * (1. - tau) def update(self): """ Update the policy's and Q-functions' parameters on transitions sampled from the replay memory. """ # Containers for logging policy_losses = to.zeros(self.num_batch_updates) expl_strat_stds = to.zeros(self.num_batch_updates) q_fcn_1_losses = to.zeros(self.num_batch_updates) q_fcn_2_losses = to.zeros(self.num_batch_updates) policy_grad_norm = to.zeros(self.num_batch_updates) q_fcn_1_grad_norm = to.zeros(self.num_batch_updates) q_fcn_2_grad_norm = to.zeros(self.num_batch_updates) for b in tqdm(range(self.num_batch_updates), total=self.num_batch_updates, desc=f'Updating', unit='batches', file=sys.stdout, leave=False): # Sample steps and the associated next step from the replay memory steps, next_steps = self._memory.sample(self.batch_size) steps.torch(data_type=to.get_default_dtype()) next_steps.torch(data_type=to.get_default_dtype()) # Standardize rewards if self.standardize_rew: rewards = standardize(steps.rewards).unsqueeze(1) else: rewards = steps.rewards.unsqueeze(1) rew_scale = 1. rewards *= rew_scale with to.no_grad(): # Create masks for the non-final observations not_done = to.tensor(1. 
- steps.done, dtype=to.get_default_dtype()).unsqueeze(1) # Compute the (next)state-(next)action values Q(s',a') from the target networks if self.policy.is_recurrent: next_act_expl, next_log_probs, _ = self._expl_strat( next_steps.observations, next_steps.hidden_states) else: next_act_expl, next_log_probs = self._expl_strat( next_steps.observations) next_q_val_target_1 = self.q_targ_1( to.cat([next_steps.observations, next_act_expl], dim=1)) next_q_val_target_2 = self.q_targ_2( to.cat([next_steps.observations, next_act_expl], dim=1)) next_q_val_target_min = to.min( next_q_val_target_1, next_q_val_target_2) - self.alpha * next_log_probs next_q_val = rewards + not_done * self.gamma * next_q_val_target_min # Compute the two Q-function losses # E_{(s_t, a_t) ~ D} [1/2 * (Q_i(s_t, a_t) - r_t - gamma * E_{s_{t+1} ~ p} [V(s_{t+1})] )^2] q_val_1 = self.q_fcn_1( to.cat([steps.observations, steps.actions], dim=1)) q_val_2 = self.q_fcn_2( to.cat([steps.observations, steps.actions], dim=1)) q_1_loss = nn.functional.mse_loss(q_val_1, next_q_val) q_2_loss = nn.functional.mse_loss(q_val_2, next_q_val) q_fcn_1_losses[b] = q_1_loss.data q_fcn_2_losses[b] = q_2_loss.data # Compute the policy loss # E_{s_t ~ D, eps_t ~ N} [log( pi( f(eps_t; s_t) ) ) - Q(s_t, f(eps_t; s_t))] if self.policy.is_recurrent: act_expl, log_probs, _ = self._expl_strat( steps.observations, steps.hidden_states) else: act_expl, log_probs = self._expl_strat(steps.observations) q1_pi = self.q_fcn_1(to.cat([steps.observations, act_expl], dim=1)) q2_pi = self.q_fcn_2(to.cat([steps.observations, act_expl], dim=1)) min_q_pi = to.min(q1_pi, q2_pi) policy_loss = to.mean(self.alpha * log_probs - min_q_pi) policy_losses[b] = policy_loss.data expl_strat_stds[b] = to.mean(self._expl_strat.std.data) # Do one optimization step for each optimizer, and clip the gradients if desired # Q-fcn 1 self._optim_q_fcn_1.zero_grad() q_1_loss.backward() q_fcn_1_grad_norm[b] = self.clip_grad(self.q_fcn_1, None) self._optim_q_fcn_1.step() # Q-fcn 2 self._optim_q_fcn_2.zero_grad() q_2_loss.backward() q_fcn_2_grad_norm[b] = self.clip_grad(self.q_fcn_2, None) self._optim_q_fcn_2.step() # Policy self._optim_policy.zero_grad() policy_loss.backward() policy_grad_norm[b] = self.clip_grad(self._expl_strat.policy, self.max_grad_norm) self._optim_policy.step() if self.learn_alpha: # Compute entropy coefficient loss alpha_loss = -to.mean( self._log_alpha * (log_probs.detach() + self.target_entropy)) # Do one optimizer step for the entropy coefficient optimizer self._alpha_optim.zero_grad() alpha_loss.backward() self._alpha_optim.step() # Soft-update the target networks if (self._curr_iter * self.num_batch_updates + b) % self.target_update_intvl == 0: SAC.soft_update(self.q_targ_1, self.q_fcn_1, self.tau) SAC.soft_update(self.q_targ_2, self.q_fcn_2, self.tau) # Update the learning rate if the schedulers have been specified if self._lr_scheduler_policy is not None: self._lr_scheduler_policy.step() self._lr_scheduler_q_fcn_1.step() self._lr_scheduler_q_fcn_2.step() # Logging self.logger.add_value('Q1 loss', to.mean(q_fcn_1_losses).item()) self.logger.add_value('Q2 loss', to.mean(q_fcn_2_losses).item()) self.logger.add_value('policy loss', to.mean(policy_losses).item()) self.logger.add_value('avg policy grad norm', to.mean(policy_grad_norm).item()) self.logger.add_value('avg expl strat std', to.mean(expl_strat_stds).item()) self.logger.add_value('alpha', self.alpha.item()) if self._lr_scheduler_policy is not None: self.logger.add_value('learning rate', 
self._lr_scheduler_policy.get_lr()) def save_snapshot(self, meta_info: dict = None): super().save_snapshot(meta_info) if meta_info is None: # This instance is not a subroutine of a meta-algorithm joblib.dump(self._env, osp.join(self._save_dir, 'env.pkl')) to.save(self.q_targ_1, osp.join(self._save_dir, 'target1.pt')) to.save(self.q_targ_2, osp.join(self._save_dir, 'target2.pt')) else: # This algorithm instance is a subroutine of a meta-algorithm if 'prefix' in meta_info and 'suffix' in meta_info: to.save( self.q_targ_1, osp.join( self._save_dir, f"{meta_info['prefix']}_target1_{meta_info['suffix']}.pt" )) to.save( self.q_targ_2, osp.join( self._save_dir, f"{meta_info['prefix']}_target2_{meta_info['suffix']}.pt" )) elif 'prefix' in meta_info and 'suffix' not in meta_info: to.save( self.q_targ_1, osp.join(self._save_dir, f"{meta_info['prefix']}_target1.pt")) to.save( self.q_targ_2, osp.join(self._save_dir, f"{meta_info['prefix']}_target2.pt")) elif 'prefix' not in meta_info and 'suffix' in meta_info: to.save( self.q_targ_1, osp.join(self._save_dir, f"target1_{meta_info['suffix']}.pt")) to.save( self.q_targ_2, osp.join(self._save_dir, f"target2_{meta_info['suffix']}.pt")) else: raise NotImplementedError def load_snapshot(self, load_dir: str = None, meta_info: dict = None): # Get the directory to load from ld = load_dir if load_dir is not None else self._save_dir super().load_snapshot(ld, meta_info) if meta_info is None: # This algorithm instance is not a subroutine of a meta-algorithm self._env = joblib.load(osp.join(ld, 'env.pkl')) self.q_targ_1.load_state_dict( to.load(osp.join(ld, 'target1.pt')).state_dict()) self.q_targ_2.load_state_dict( to.load(osp.join(ld, 'target2.pt')).state_dict()) else: # This algorithm instance is a subroutine of a meta-algorithm if 'prefix' in meta_info and 'suffix' in meta_info: self.q_targ_1.load_state_dict( to.load( osp.join( ld, f"{meta_info['prefix']}_target1_{meta_info['suffix']}.pt" )).state_dict()) self.q_targ_2.load_state_dict( to.load( osp.join( ld, f"{meta_info['prefix']}_target2_{meta_info['suffix']}.pt" )).state_dict()) elif 'prefix' in meta_info and 'suffix' not in meta_info: self.q_targ_1.load_state_dict( to.load(osp.join( ld, f"{meta_info['prefix']}_target1.pt")).state_dict()) self.q_targ_2.load_state_dict( to.load(osp.join( ld, f"{meta_info['prefix']}_target2.pt")).state_dict()) elif 'prefix' not in meta_info and 'suffix' in meta_info: self.q_targ_1.load_state_dict( to.load(osp.join( ld, f"target1_{meta_info['suffix']}.pt")).state_dict()) self.q_targ_2.load_state_dict( to.load(osp.join( ld, f"target2_{meta_info['suffix']}.pt")).state_dict()) else: raise NotImplementedError def reset(self, seed: int = None): # Reset the exploration strategy, internal variables and the random seeds super().reset(seed) # Re-initialize sampler in case env or policy changed self.sampler.reinit() # Reset the replay memory self._memory.reset() # Reset the learning rate schedulers if self._lr_scheduler_policy is not None: self._lr_scheduler_policy.last_epoch = -1 if self._lr_scheduler_q_fcn_1 is not None: self._lr_scheduler_q_fcn_1.last_epoch = -1 if self._lr_scheduler_q_fcn_2 is not None: self._lr_scheduler_q_fcn_2.last_epoch = -1
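# SAC.soft_update() above implements the Polyak average target <- tau * target + (1 - tau) * source.
# A tiny standalone check of that update on two throwaway linear modules; the modules and the tau value
# here are illustrative only.
import torch as to
import torch.nn as nn

q_fcn = nn.Linear(4, 1)                      # stands in for the online Q-function
q_targ = nn.Linear(4, 1)                     # stands in for its target network
q_targ.load_state_dict(q_fcn.state_dict())   # start from identical parameters

with to.no_grad():
    for p in q_fcn.parameters():
        p.add_(0.1)                          # pretend an optimizer step changed the online network

SAC.soft_update(q_targ, q_fcn, tau=0.995)
# After the update, the target parameters have moved 0.5% of the way towards the online ones
print(next(q_targ.parameters()) - next(q_fcn.parameters()))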
def __init__(self,
             save_dir: str,
             env: Env,
             policy: Policy,
             critic: GAE,
             max_iter: int,
             min_rollouts: int = None,
             min_steps: int = None,
             value_fcn_coeff: float = 0.5,
             entropy_coeff: float = 1e-3,
             batch_size: int = 32,
             std_init: float = 1.0,
             max_grad_norm: float = None,
             num_sampler_envs: int = 4,
             lr: float = 5e-4,
             lr_scheduler=None,
             lr_scheduler_hparam: [dict, None] = None,
             logger: StepLogger = None):
    r"""
    Constructor

    :param save_dir: directory to save the snapshots i.e. the results in
    :param env: the environment which the policy operates
    :param policy: policy to be updated
    :param critic: advantage estimation function $A(s,a) = Q(s,a) - V(s)$
    :param max_iter: number of iterations (policy updates)
    :param min_rollouts: minimum number of rollouts sampled per policy update batch
    :param min_steps: minimum number of state transitions sampled per policy update batch
    :param value_fcn_coeff: weighting factor of the value function term in the combined loss, specific to PPO2
    :param entropy_coeff: weighting factor of the entropy term in the combined loss, specific to PPO2
    :param batch_size: number of samples per policy update batch
    :param std_init: initial standard deviation on the actions for the exploration noise
    :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping
    :param num_sampler_envs: number of environments for parallel sampling
    :param lr: (initial) learning rate for the optimizer which can be modified by the scheduler.
               By default, the learning rate is constant.
    :param lr_scheduler: learning rate scheduler that does one step per epoch (pass through the whole data set)
    :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler
    :param logger: logger for every step of the algorithm
    """
    # Call ActorCritic's constructor
    super().__init__(env, policy, critic, save_dir, max_iter, logger)

    # Store the inputs
    self.min_rollouts = min_rollouts
    self.min_steps = min_steps
    self.value_fcn_coeff = value_fcn_coeff
    self.entropy_coeff = entropy_coeff
    self.batch_size = batch_size
    self.max_grad_norm = max_grad_norm

    # Initialize
    self._expl_strat = NormalActNoiseExplStrat(self._policy, std_init=std_init)
    self.sampler = ParallelSampler(env, self.expl_strat,
                                   num_envs=num_sampler_envs,
                                   min_steps=min_steps,
                                   min_rollouts=min_rollouts)
    self.optim = to.optim.RMSprop(
        [{'params': self._policy.parameters()},
         {'params': self.expl_strat.noise.parameters()},
         {'params': self._critic.value_fcn.parameters()}],
        lr=lr, eps=1e-5
    )
    self._lr_scheduler = lr_scheduler
    self._lr_scheduler_hparam = lr_scheduler_hparam
    if lr_scheduler is not None:
        self._lr_scheduler = lr_scheduler(self.optim, **lr_scheduler_hparam)
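# The value_fcn_coeff and entropy_coeff arguments above weight the value-function and entropy terms in the
# combined loss. A minimal sketch of how such a combined objective is typically assembled; the tensors below
# are random stand-ins, not the algorithm's actual update code.
import torch as to

log_probs = to.randn(32)                                  # log-probabilities of the taken actions
advantages = to.randn(32)                                 # advantage estimates A(s, a) from the critic
v_pred, v_target = to.randn(32), to.randn(32)             # predicted values and regression targets
act_entropy = to.distributions.Normal(0., 1.).entropy()   # entropy of the action distribution

value_fcn_coeff, entropy_coeff = 0.5, 1e-3
policy_loss = -to.mean(log_probs * advantages)                 # policy-gradient term
value_fcn_loss = to.nn.functional.mse_loss(v_pred, v_target)   # critic regression term
# Weighted sum: fit the value function, and discourage a prematurely deterministic policy
combined_loss = policy_loss + value_fcn_coeff * value_fcn_loss - entropy_coeff * act_entropy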
    plt.show()


if __name__ == '__main__':
    # Set up environment
    dp_gt = dict(m=2., k=20., d=0.8)  # ground truth
    dp_init = dict(m=1.0, k=24., d=0.4)  # initial guess
    dt = 1/50.
    env = OneMassOscillatorSim(dt=dt, max_steps=400)
    env.reset(domain_param=dp_gt)

    # Set up policy
    policy = DummyPolicy(env.spec)

    # Sample
    sampler = ParallelSampler(env, policy, num_envs=1, min_rollouts=50, seed=1)
    ros = sampler.sample()

    # Pyro
    pyro.set_rng_seed(1001)
    pyro.enable_validation(True)
    train(
        SVI(model=model,
            guide=guide,
            optim=optim.Adam({'lr': 0.01}),
            # optim=optim.SGD({'lr': 0.001, 'momentum': 0.1}),
            loss=Trace_ELBO()),
        rollouts=ros,
        prior=dp_init
    )
class DQL(Algorithm): """ Deep Q-Learning (without bells and whistles) .. seealso:: [1] V. Mnih et.al., "Human-level control through deep reinforcement learning", Nature, 2015 """ name: str = 'dql' def __init__(self, save_dir: str, env: Env, policy: DiscrActQValFNNPolicy, memory_size: int, eps_init: float, eps_schedule_gamma: float, gamma: float, max_iter: int, num_batch_updates: int, target_update_intvl: int = 5, min_rollouts: int = None, min_steps: int = None, batch_size: int = 256, num_sampler_envs: int = 4, max_grad_norm: float = 0.5, lr: float = 5e-4, lr_scheduler=None, lr_scheduler_hparam: [dict, None] = None, logger: StepLogger = None): """ Constructor :param save_dir: directory to save the snapshots i.e. the results in :param env: environment which the policy operates :param policy: (current) Q-network updated by this algorithm :param memory_size: number of transitions in the replay memory buffer :param eps_init: initial value for the probability of taking a random action, constant if `eps_schedule_gamma==1` :param eps_schedule_gamma: temporal discount factor for the exponential decay of epsilon :param gamma: temporal discount factor for the state values :param max_iter: number of iterations (policy updates) :param num_batch_updates: number of batch updates per algorithm steps :param target_update_intvl: number of iterations that pass before updating the target network :param min_rollouts: minimum number of rollouts sampled per policy update batch :param min_steps: minimum number of state transitions sampled per policy update batch :param batch_size: number of samples per policy update batch :param num_sampler_envs: number of environments for parallel sampling :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping :param lr: (initial) learning rate for the optimizer which can be by modified by the scheduler. By default, the learning rate is constant. 
:param lr_scheduler: learning rate scheduler that does one step per epoch (pass through the whole data set) :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler :param logger: logger for every step of the algorithm """ if not isinstance(env, Env): raise pyrado.TypeErr(given=env, expected_type=Env) if not isinstance(policy, DiscrActQValFNNPolicy): raise pyrado.TypeErr(given=policy, expected_type=DiscrActQValFNNPolicy) if logger is None: # Create logger that only logs every 100 steps of the algorithm logger = StepLogger(print_interval=100) logger.printers.append(ConsolePrinter()) logger.printers.append(CSVPrinter(osp.join(save_dir, 'progress.csv'))) # Call Algorithm's constructor super().__init__(save_dir, max_iter, policy, logger) # Store the inputs self._env = env self.target = deepcopy(self._policy) self.target.eval() # will not be trained using the optimizer self._memory_size = memory_size self.eps = eps_init self.gamma = gamma self.target_update_intvl = target_update_intvl self.num_batch_updates = num_batch_updates self.batch_size = batch_size self.max_grad_norm = max_grad_norm # Initialize self._expl_strat = EpsGreedyExplStrat(self._policy, eps_init, eps_schedule_gamma) self._memory = ReplayMemory(memory_size) self.sampler = ParallelSampler( env, self._expl_strat, num_envs=1, min_steps=min_steps, min_rollouts=min_rollouts ) self.sampler_eval = ParallelSampler( env, self._policy, num_envs=num_sampler_envs, min_steps=100*env.max_steps, min_rollouts=None ) self.optim = to.optim.RMSprop([{'params': self._policy.parameters()}], lr=lr) self._lr_scheduler = lr_scheduler self._lr_scheduler_hparam = lr_scheduler_hparam if lr_scheduler is not None: self._lr_scheduler = lr_scheduler(self.optim, **lr_scheduler_hparam) @property def expl_strat(self) -> EpsGreedyExplStrat: return self._expl_strat @property def memory(self) -> ReplayMemory: """ Get the replay memory. 
""" return self._memory def step(self, snapshot_mode: str, meta_info: dict = None): # Sample steps and store them in the replay memory ros = self.sampler.sample() self._memory.push(ros) while len(self._memory) < self.memory.capacity: # Warm-up phase print_cbt('Collecting samples until replay memory contains if full.', 'w') # Sample steps and store them in the replay memory ros = self.sampler.sample() self._memory.push(ros) # Log return-based metrics if self._curr_iter % self.logger.print_interval == 0: ros = self.sampler_eval.sample() rets = [ro.undiscounted_return() for ro in ros] ret_max = np.max(rets) ret_med = np.median(rets) ret_avg = np.mean(rets) ret_min = np.min(rets) ret_std = np.std(rets) else: ret_max, ret_med, ret_avg, ret_min, ret_std = 5*[-pyrado.inf] # dummy values self.logger.add_value('max return', np.round(ret_max, 4)) self.logger.add_value('median return', np.round(ret_med, 4)) self.logger.add_value('avg return', np.round(ret_avg, 4)) self.logger.add_value('min return', np.round(ret_min, 4)) self.logger.add_value('std return', np.round(ret_std, 4)) self.logger.add_value('avg rollout length', np.round(np.mean([ro.length for ro in ros]), 2)) self.logger.add_value('num rollouts', len(ros)) self.logger.add_value('avg memory reward', np.round(self._memory.avg_reward(), 4)) # Use data in the memory to update the policy and the target Q-function self.update() # Save snapshot data self.make_snapshot(snapshot_mode, float(ret_avg), meta_info) def loss_fcn(self, q_vals: to.Tensor, expected_q_vals: to.Tensor) -> to.Tensor: r""" The Huber loss function on the one-step TD error $\delta = Q(s,a) - (r + \gamma \max_a Q(s^\prime, a))$. :param q_vals: state-action values $Q(s,a)$, from policy network :param expected_q_vals: expected state-action values $r + \gamma \max_a Q(s^\prime, a)$, from target network :return: loss value """ return nn.functional.smooth_l1_loss(q_vals, expected_q_vals) def update(self): """ Update the policy's and target Q-function's parameters on transitions sampled from the replay memory. """ losses = to.zeros(self.num_batch_updates) policy_grad_norm = to.zeros(self.num_batch_updates) for b in tqdm(range(self.num_batch_updates), total=self.num_batch_updates, desc=f'Updating', unit='batches', file=sys.stdout, leave=False): # Sample steps and the associated next step from the replay memory steps, next_steps = self._memory.sample(self.batch_size) steps.torch(data_type=to.get_default_dtype()) next_steps.torch(data_type=to.get_default_dtype()) # Create masks for the non-final observations not_done = to.tensor(1. 
- steps.done, dtype=to.get_default_dtype()) # Compute the state-action values Q(s,a) using the current DQN policy q_vals = self.expl_strat.policy.q_values_chosen(steps.observations) # Compute the second term of TD-error next_v_vals = self.target.q_values_chosen(next_steps.observations).detach() expected_q_val = steps.rewards + not_done*self.gamma*next_v_vals # Compute the loss, clip the gradients if desired, and do one optimization step loss = self.loss_fcn(q_vals, expected_q_val) losses[b] = loss.data self.optim.zero_grad() loss.backward() policy_grad_norm[b] = self.clip_grad(self.expl_strat.policy, self.max_grad_norm) self.optim.step() # Update the target network by copying all weights and biases from the DQN policy if (self._curr_iter*self.num_batch_updates + b)%self.target_update_intvl == 0: self.target.load_state_dict(self.expl_strat.policy.state_dict()) # Schedule the exploration parameter epsilon self.expl_strat.schedule_eps(self._curr_iter) # Update the learning rate if a scheduler has been specified if self._lr_scheduler is not None: self._lr_scheduler.step() # Logging with to.no_grad(): self.logger.add_value('loss after', to.mean(losses).item()) self.logger.add_value('expl strat eps', self.expl_strat.eps.item()) self.logger.add_value('avg policy grad norm', to.mean(policy_grad_norm).item()) if self._lr_scheduler is not None: self.logger.add_value('learning rate', self._lr_scheduler.get_lr()) def save_snapshot(self, meta_info: dict = None): super().save_snapshot(meta_info) if meta_info is None: # This instance is not a subroutine of a meta-algorithm joblib.dump(self._env, osp.join(self._save_dir, 'env.pkl')) to.save(self.target, osp.join(self._save_dir, 'target.pt')) else: # This algorithm instance is a subroutine of a meta-algorithm if 'prefix' in meta_info and 'suffix' in meta_info: to.save(self.target, osp.join(self._save_dir, f"{meta_info['prefix']}_target_{meta_info['suffix']}.pt")) elif 'prefix' in meta_info and 'suffix' not in meta_info: to.save(self.target, osp.join(self._save_dir, f"{meta_info['prefix']}_target.pt")) elif 'prefix' not in meta_info and 'suffix' in meta_info: to.save(self.target, osp.join(self._save_dir, f"target_{meta_info['suffix']}.pt")) else: raise NotImplementedError def load_snapshot(self, load_dir: str = None, meta_info: dict = None): # Get the directory to load from ld = load_dir if load_dir is not None else self._save_dir super().load_snapshot(ld, meta_info) if meta_info is None: # This algorithm instance is not a subroutine of a meta-algorithm self._env = joblib.load(osp.join(ld, 'env.pkl')) self.target.load_state_dict(to.load(osp.join(ld, 'target.pt')).state_dict()) else: # This algorithm instance is a subroutine of a meta-algorithm if 'prefix' in meta_info and 'suffix' in meta_info: self.target.load_state_dict( to.load(osp.join(ld, f"{meta_info['prefix']}_target_{meta_info['suffix']}.pt")).state_dict() ) elif 'prefix' in meta_info and 'suffix' not in meta_info: self.target.load_state_dict( to.load(osp.join(ld, f"{meta_info['prefix']}_target.pt")).state_dict() ) elif 'prefix' not in meta_info and 'suffix' in meta_info: self.target.load_state_dict( to.load(osp.join(ld, f"target_{meta_info['suffix']}.pt")).state_dict() ) else: raise NotImplementedError def reset(self, seed: int = None): # Reset the exploration strategy, internal variables and the random seeds super().reset(seed) # Re-initialize sampler in case env or policy changed self.sampler.reinit() # Reset the replay memory self._memory.reset() # Reset the learning rate scheduler if 
self._lr_scheduler is not None: self._lr_scheduler.last_epoch = -1
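# DQL.loss_fcn() and update() above compute a Huber loss on the one-step TD error
# delta = Q(s,a) - (r + gamma * max_a Q(s',a)). A self-contained numerical sketch with made-up tensors
# standing in for a replay-memory batch:
import torch as to
import torch.nn as nn

q_vals = to.randn(64, requires_grad=True)   # Q(s, a) of the chosen actions, from the online network
next_v_vals = to.randn(64)                  # max_a Q(s', a), from the frozen target network
rewards = to.randn(64)
not_done = to.randint(0, 2, (64,)).float()  # 0 where the episode terminated, else 1
gamma = 0.99

# One-step TD target, masked at terminal states so no bootstrap value leaks past the episode end
expected_q_vals = rewards + not_done * gamma * next_v_vals
loss = nn.functional.smooth_l1_loss(q_vals, expected_q_vals)
loss.backward()  # gradients flow only into q_vals, i.e. the online network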