def eval_policy(save_dir: Optional[str], env: Union[RealEnv, SimEnv, MetaDomainRandWrapper], policy: Policy,
                mc_estimator: bool, prefix: str, num_rollouts: int, num_parallel_envs: int = 1) -> to.Tensor:
    """
    Evaluate a policy on the target system (real-world platform).
    This method is static to facilitate evaluation of specific policies in hindsight.

    :param save_dir: directory to save the snapshots i.e. the results in, if `None` nothing is saved
    :param env: target environment for evaluation, in the sim-2-sim case this is another simulation instance
    :param policy: policy to evaluate
    :param mc_estimator: estimate the return with a sample average (`True`) or a lower confidence bound (`False`)
                         obtained from bootstrapping
    :param prefix: to control the saving for the evaluation of an initial policy, `None` to deactivate
    :param num_rollouts: number of rollouts to collect on the target system
    :param num_parallel_envs: number of environments for the parallel sampler (only used for SimEnv)
    :return: estimated return in the target domain
    """
    if save_dir is not None:
        print_cbt(f'Executing {prefix}_policy ...', 'c', bright=True)

    rets_real = to.zeros(num_rollouts)
    if isinstance(inner_env(env), RealEnv):
        # Evaluate sequentially when conducting a sim-to-real experiment
        for i in range(num_rollouts):
            rets_real[i] = rollout(env, policy, eval=True).undiscounted_return()
            # If a return of -1 is given, skip the remaining evaluation and set all returns to zero
            if rets_real[i] == -1:
                print_cbt('Set all returns for this policy to zero.', color='c')
                rets_real = to.zeros(num_rollouts)
                break
    elif isinstance(inner_env(env), SimEnv):
        # Create a parallel sampler when conducting a sim-to-sim experiment
        sampler = ParallelRolloutSampler(env, policy, num_workers=num_parallel_envs, min_rollouts=num_rollouts)
        ros = sampler.sample()
        for i in range(num_rollouts):
            rets_real[i] = ros[i].undiscounted_return()
    else:
        raise pyrado.TypeErr(given=inner_env(env), expected_type=[RealEnv, SimEnv])

    if save_dir is not None:
        # Save the evaluation results
        to.save(rets_real, osp.join(save_dir, f'{prefix}_returns_real.pt'))

        print_cbt('Target domain performance', bright=True)
        print(tabulate([['mean return', to.mean(rets_real).item()],
                        ['std return', to.std(rets_real)],
                        ['min return', to.min(rets_real)],
                        ['max return', to.max(rets_real)]]))

    if mc_estimator:
        return to.mean(rets_real)
    else:
        return to.from_numpy(
            bootstrap_ci(rets_real.numpy(), np.mean, num_reps=1000, alpha=0.05, ci_sides=1, studentized=False)[1])
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int): """ Objective function for the Optuna `Study` to maximize. .. note:: Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments. :param trial: Optuna Trial object for hyper-parameter optimization :param study_dir: the parent directory for all trials in this study :param seed: seed value for the random number generators, pass `None` for no seeding :return: objective function value """ # Synchronize seeds between Optuna trials pyrado.set_seed(seed) # Environment env_hparams = dict(dt=1 / 250., max_steps=1500) env = QQubeSwingUpSim(**env_hparams) env = ActNormWrapper(env) # Policy policy_hparam = dict(feats=FeatureStack([ identity_feat, sign_feat, abs_feat, squared_feat, cubic_feat, ATan2Feat(1, 2), MultFeat([4, 5]) ])) policy = LinearPolicy(spec=env.spec, **policy_hparam) # Algorithm algo_hparam = dict( num_workers=1, # parallelize via optuna n_jobs max_iter=50, pop_size=trial.suggest_int('pop_size', 50, 200), num_rollouts=trial.suggest_int('num_rollouts', 4, 10), num_is_samples=trial.suggest_int('num_is_samples', 5, 40), expl_std_init=trial.suggest_uniform('expl_std_init', 0.1, 0.5), symm_sampling=trial.suggest_categorical('symm_sampling', [True, False]), ) csv_logger = create_csv_step_logger( osp.join(study_dir, f'trial_{trial.number}')) algo = PoWER(osp.join(study_dir, f'trial_{trial.number}'), env, policy, **algo_hparam, logger=csv_logger) # Train without saving the results algo.train(snapshot_mode='latest', seed=seed) # Evaluate min_rollouts = 1000 sampler = ParallelRolloutSampler( env, policy, num_workers=1, min_rollouts=min_rollouts) # parallelize via optuna n_jobs ros = sampler.sample() mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts return mean_ret
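# Usage sketch for the objective above (not part of the original file): Optuna only passes `trial` to the
# objective, so `study_dir` and `seed` are bound with `functools.partial`, as noted in the docstring. The study
# name, directory, and budgets below are placeholder assumptions.
import functools
import os.path as osp

import optuna

if __name__ == "__main__":
    study_dir = osp.join("experiments", "qq-su_power_hpsearch")  # hypothetical directory
    study = optuna.create_study(study_name="qq-su_power_hpsearch", direction="maximize")
    study.optimize(functools.partial(train_and_eval, study_dir=study_dir, seed=0), n_trials=100, n_jobs=4)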
def eval_policy(
    save_dir: Optional[pyrado.PathLike],
    env: Env,
    policy: Policy,
    prefix: str,
    num_rollouts: int,
    num_workers: int = 1,
) -> to.Tensor:
    """
    Evaluate a policy either in the source or in the target domain.
    This method is static to facilitate evaluation of specific policies in hindsight.

    :param save_dir: directory to save the snapshots i.e. the results in, if `None` nothing is saved
    :param env: target environment for evaluation, in the sim-2-sim case this is another simulation instance
    :param policy: policy to evaluate
    :param prefix: to control the saving for the evaluation of an initial policy, `None` to deactivate
    :param num_rollouts: number of rollouts to collect on the target system
    :param num_workers: number of environments for the parallel sampler (only used for SimEnv)
    :return: estimated return in the target domain
    """
    if save_dir is not None:
        print_cbt(f"Executing {prefix}_policy ...", "c", bright=True)

    if isinstance(inner_env(env), RealEnv):
        # Evaluate sequentially when evaluating on a real-world device
        rets_real = []
        for i in range(num_rollouts):
            rets_real.append(rollout(env, policy, eval=True).undiscounted_return())
    elif isinstance(inner_env(env), SimEnv):
        # Create a parallel sampler when evaluating in a simulation
        sampler = ParallelRolloutSampler(env, policy, num_workers=num_workers, min_rollouts=num_rollouts)
        ros = sampler.sample(eval=True)
        rets_real = [ro.undiscounted_return() for ro in ros]
    else:
        raise pyrado.TypeErr(given=inner_env(env), expected_type=[RealEnv, SimEnv])
    rets_real = to.as_tensor(rets_real, dtype=to.get_default_dtype())

    if save_dir is not None:
        # Save and print the evaluation results
        pyrado.save(rets_real, "returns_real.pt", save_dir, prefix=prefix)
        print_cbt("Target domain performance", bright=True)
        print(
            tabulate([
                ["mean return", to.mean(rets_real).item()],
                ["std return", to.std(rets_real)],
                ["min return", to.min(rets_real)],
                ["max return", to.max(rets_real)],
            ]))

    return to.mean(rets_real)
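# Minimal usage sketch for eval_policy (an illustration, not from the original file): evaluate a policy in
# simulation with the parallel sampler. The untrained FNNPolicy below only stands in for a trained policy.
env = QQubeSwingUpSim(dt=1 / 250.0, max_steps=1500)
policy = FNNPolicy(spec=env.spec, hidden_sizes=[16, 16], hidden_nonlin=to.tanh)  # stand-in for a trained policy
mean_ret = eval_policy(save_dir=None, env=env, policy=policy, prefix="", num_rollouts=10, num_workers=4)
print(f"Estimated mean return: {mean_ret.item()}")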
def test_parallel_rollout_sampler(env: SimEnv, policy: Policy, num_workers: int): min_rollouts = num_workers * 2 # make sure every worker samples at least once sampler = ParallelRolloutSampler(env, policy, num_workers, min_rollouts=min_rollouts) ros = sampler.sample() assert isinstance(ros, list) assert len(ros) >= min_rollouts
def test_cuda_sampling_w_dr(env: SimEnv, policy: Policy, num_workers: int): randomizer = create_default_randomizer(env) env = DomainRandWrapperLive(env, randomizer) sampler = ParallelRolloutSampler(env, policy, num_workers=num_workers, min_rollouts=4) samples = sampler.sample() assert samples is not None
def test_parallel_sampling_deterministic_wo_min_steps(
    env: SimEnv,
    policy: Policy,
    min_rollouts: Optional[int],
    init_states: Optional[int],
    domain_params: Optional[List[dict]],
):
    env.max_steps = 20

    if init_states is not None:
        init_states = [env.spec.state_space.sample_uniform() for _ in range(init_states)]

    nums_workers = (1, 2, 4)

    all_rollouts = []
    for num_workers in nums_workers:
        # Use an exploration strategy to test if that works too (it should, as the policy gets pickled and
        # distributed anyway).
        all_rollouts.append(
            ParallelRolloutSampler(
                env,
                NormalActNoiseExplStrat(policy, std_init=1.0),
                num_workers=num_workers,
                min_rollouts=min_rollouts,
                seed=0,
            ).sample(init_states=init_states, domain_params=domain_params))

    # Test that the rollouts are actually different, i.e., that not the same seed is used for all rollouts.
    for ros in all_rollouts:
        for ro_a, ro_b in [(a, b) for a in ros for b in ros if a is not b]:
            # The idle policy is deterministic and always outputs the zero action. Hence, do not check that the
            # actions are different when using the idle policy.
            if isinstance(policy, IdlePolicy):
                # The Quanser Ball Balancer is a deterministic environment (conditioned on the initial state). As the
                # idle policy is a deterministic policy, this will result in the rollouts being equivalent for each
                # initial state, so do not check for difference if the initial states were set.
                if init_states is None:
                    assert ro_a.rewards != pytest.approx(ro_b.rewards)
                    assert ro_a.observations != pytest.approx(ro_b.observations)
            else:
                assert ro_a.rewards != pytest.approx(ro_b.rewards)
                assert ro_a.observations != pytest.approx(ro_b.observations)
                assert ro_a.actions != pytest.approx(ro_b.actions)

    # Test that the rollouts for all numbers of workers are equal.
    for ros_a, ros_b in [(a, b) for a in all_rollouts for b in all_rollouts]:
        assert len(ros_a) == len(ros_b)
        for ro_a, ro_b in zip(ros_a, ros_b):
            assert ro_a.rewards == pytest.approx(ro_b.rewards)
            assert ro_a.observations == pytest.approx(ro_b.observations)
            assert ro_a.actions == pytest.approx(ro_b.actions)
def __init__(self, save_dir: str, env: SimEnv, policy: Policy, min_rollouts: int = None, min_steps: int = None,
             num_workers: int = 4, logger: StepLogger = None, ball_z_dim_mismatch: bool = True):
    """
    Constructor

    :param save_dir: directory to save the snapshots i.e. the results in
    :param env: the environment in which the policy operates
    :param policy: policy which this algorithm creates
    :param min_rollouts: minimum number of rollouts sampled per policy update batch
    :param min_steps: minimum number of state transitions sampled per policy update batch
    :param num_workers: number of environments for parallel sampling
    :param logger: logger for every step of the algorithm, if `None` the default logger will be created
    :param ball_z_dim_mismatch: only useful for BallOnPlate5DSim, set to True if the controller does not have the
                                z component (relative position) of the ball in the state vector, i.e. the state is
                                14-dim instead of 16-dim
    """
    if not isinstance(env, SimEnv):
        raise pyrado.TypeErr(given=env, expected_type=SimEnv)
    if not isinstance(policy, LinearPolicy):
        raise pyrado.TypeErr(given=policy, expected_type=LinearPolicy)

    # Call Algorithm's constructor
    super().__init__(save_dir, 1, policy, logger)

    # Store the inputs
    self._env = env
    self.ball_z_dim_mismatch = ball_z_dim_mismatch

    self.sampler = ParallelRolloutSampler(
        env, self._policy,
        num_workers=num_workers,
        min_steps=min_steps,
        min_rollouts=min_rollouts
    )
    self.eigvals = np.array([pyrado.inf])  # initialize with something positive
def test_adr_reward_generator(env):
    reference_env = env
    random_env = deepcopy(env)
    reward_generator = RewardGenerator(
        env_spec=random_env.spec,
        batch_size=256,
        reward_multiplier=1,
        lr=5e-3,
    )
    policy = FNNPolicy(reference_env.spec, hidden_sizes=[16, 16], hidden_nonlin=to.tanh)
    dr = create_default_randomizer_omo()
    dr.randomize(num_samples=1)
    random_env.domain_param = dr.get_params(fmt="dict", dtype="numpy")
    reference_sampler = ParallelRolloutSampler(reference_env, policy, num_workers=1, min_steps=1000)
    random_sampler = ParallelRolloutSampler(random_env, policy, num_workers=1, min_steps=1000)

    losses = []
    for i in range(200):
        reference_traj = StepSequence.concat(reference_sampler.sample())
        random_traj = StepSequence.concat(random_sampler.sample())
        losses.append(reward_generator.train(reference_traj, random_traj, 10))
    assert losses[-1] < losses[0]
def test_sequential_equals_parallel(env: SimEnv, policy: Policy, num_simulations: int): # Do the rollouts explicitly sequentially without a sampler # Do not set the init state to check if this was sampled correctly ros_sequential = [] for i in range(num_simulations): ros_sequential.append(rollout(env, policy, eval=True, seed=i)) # Do the rollouts in parallel with a sampler. Create one worker for every rollout # Do not set the init state to check if this was sampled correctly sampler = ParallelRolloutSampler(env, policy, num_workers=num_simulations, min_rollouts=num_simulations, seed=0) ros_parallel = sampler.sample() assert len(ros_parallel) == num_simulations for ro_s in ros_sequential: # The parallel rollouts are not necessarily in the same order as the sequential ones, thus compare to all assert any([ ro_s.observations == pytest.approx(ro_p.observations) for ro_p in ros_parallel ])
def test_sequential_equals_parallel(env: SimEnv, policy: Policy, num_rollouts: int, num_workers: int): # Do the rollouts explicitly sequentially without a sampler. # Do not set the init state to check if this was sampled correctly. ros_sequential = [] for i in range(num_rollouts): ros_sequential.append( rollout(env, policy, eval=True, seed=0, sub_seed=0, sub_sub_seed=i)) # Do the rollouts in parallel with a sampler. # Do not set the init state to check if this was sampled correctly. sampler = ParallelRolloutSampler(env, policy, num_workers=num_workers, min_rollouts=num_rollouts, seed=0) ros_parallel = sampler.sample() assert len(ros_parallel) == num_rollouts for ro_s, ro_p in zip(ros_sequential, ros_parallel): assert ro_s.rewards == pytest.approx(ro_p.rewards) assert ro_s.observations == pytest.approx(ro_p.observations) assert ro_s.actions == pytest.approx(ro_p.actions)
def test_parallel_sampling_deterministic_w_min_steps(
    env: SimEnv,
    policy: Policy,
    min_rollouts: Optional[int],
    min_steps: int,
    domain_params: Optional[List[dict]],
):
    env.max_steps = 20

    nums_workers = (1, 2, 4)

    all_rollouts = []
    for num_workers in nums_workers:
        # Use an exploration strategy to test if that works too (it should, as the policy gets pickled and
        # distributed anyway).
        all_rollouts.append(
            ParallelRolloutSampler(
                env,
                NormalActNoiseExplStrat(policy, std_init=1.0),
                num_workers=num_workers,
                min_rollouts=min_rollouts,
                min_steps=min_steps * env.max_steps,
                seed=0,
            ).sample(domain_params=domain_params))

    # Test that the rollouts are actually different, i.e., that not the same seed is used for all rollouts.
    for ros in all_rollouts:
        for ro_a, ro_b in [(a, b) for a in ros for b in ros if a is not b]:
            # The idle policy is deterministic and always outputs the zero action. Hence, do not check that the
            # actions are different when using the idle policy.
            if not isinstance(policy, IdlePolicy):
                assert ro_a.rewards != pytest.approx(ro_b.rewards)
                assert ro_a.observations != pytest.approx(ro_b.observations)
                assert ro_a.actions != pytest.approx(ro_b.actions)

    # Test that the rollouts for all numbers of workers are equal.
    for ros_a, ros_b in [(a, b) for a in all_rollouts for b in all_rollouts]:
        assert sum([len(ro) for ro in ros_a]) == sum([len(ro) for ro in ros_b])
        assert sum([len(ro) for ro in ros_a]) >= min_steps * env.max_steps
        assert sum([len(ro) for ro in ros_b]) >= min_steps * env.max_steps
        assert len(ros_a) == len(ros_b)
        if min_rollouts is not None:
            assert len(ros_a) >= min_rollouts
            assert len(ros_b) >= min_rollouts
        for ro_a, ro_b in zip(ros_a, ros_b):
            assert ro_a.rewards == pytest.approx(ro_b.rewards)
            assert ro_a.observations == pytest.approx(ro_b.observations)
            assert ro_a.actions == pytest.approx(ro_b.actions)
def __init__(self, save_dir: str, env: Env, policy: DiscreteActQValPolicy, memory_size: int, eps_init: float, eps_schedule_gamma: float, gamma: float, max_iter: int, num_batch_updates: int, target_update_intvl: int = 5, num_init_memory_steps: int = None, min_rollouts: int = None, min_steps: int = None, batch_size: int = 256, num_workers: int = 4, max_grad_norm: float = 0.5, lr: float = 5e-4, lr_scheduler=None, lr_scheduler_hparam: [dict, None] = None, logger: StepLogger = None): """ Constructor :param save_dir: directory to save the snapshots i.e. the results in :param env: the environment which the policy operates :param policy: (current) Q-network updated by this algorithm :param memory_size: number of transitions in the replay memory buffer :param eps_init: initial value for the probability of taking a random action, constant if `eps_schedule_gamma=1` :param eps_schedule_gamma: temporal discount factor for the exponential decay of epsilon :param gamma: temporal discount factor for the state values :param max_iter: number of iterations (policy updates) :param num_batch_updates: number of batch updates per algorithm steps :param target_update_intvl: number of iterations that pass before updating the qfcn_targ network :param num_init_memory_steps: number of samples used to initially fill the replay buffer with, pass `None` to fill the buffer completely :param min_rollouts: minimum number of rollouts sampled per policy update batch :param min_steps: minimum number of state transitions sampled per policy update batch :param batch_size: number of samples per policy update batch :param num_workers: number of environments for parallel sampling :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping :param lr: (initial) learning rate for the optimizer which can be by modified by the scheduler. By default, the learning rate is constant. :param lr_scheduler: learning rate scheduler that does one step per epoch (pass through the whole data set) :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler :param logger: logger for every step of the algorithm, if `None` the default logger will be created """ if not isinstance(policy, DiscreteActQValPolicy): raise pyrado.TypeErr(given=policy, expected_type=DiscreteActQValPolicy) # Call ValueBased's constructor super().__init__(save_dir, env, policy, memory_size, gamma, max_iter, num_batch_updates, target_update_intvl, num_init_memory_steps, min_rollouts, min_steps, batch_size, num_workers, max_grad_norm, logger) self.qfcn_targ = deepcopy( self._policy).eval() # will not be trained using the optimizer self.eps = eps_init # Create sampler for exploration during training self._expl_strat = EpsGreedyExplStrat(self._policy, eps_init, eps_schedule_gamma) self.sampler_trn = ParallelRolloutSampler( self._env, self._expl_strat, num_workers=num_workers if min_steps != 1 else 1, min_steps=min_steps, min_rollouts=min_rollouts) # Q-function optimizer self.optim = to.optim.RMSprop([{ 'params': self._policy.parameters() }], lr=lr) # Learning rate scheduler self._lr_scheduler = lr_scheduler self._lr_scheduler_hparam = lr_scheduler_hparam if lr_scheduler is not None: self._lr_scheduler = lr_scheduler(self.optim, **lr_scheduler_hparam)
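# Sketch of the epsilon schedule implied by `eps_init` and `eps_schedule_gamma` (an assumption about how
# EpsGreedyExplStrat decays epsilon, not taken from this file): epsilon shrinks exponentially with the number
# of policy updates and stays constant for eps_schedule_gamma = 1.
eps_init, eps_schedule_gamma = 1.0, 0.99
eps_after_100_updates = eps_init * eps_schedule_gamma**100  # approx. 0.366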
if __name__ == "__main__":
    # Set up environment
    dp_gt = dict(m=2.0, k=20.0, d=0.8)  # ground truth
    dp_init = dict(m=1.0, k=22.0, d=0.4)  # initial guess
    dt = 1 / 50.0
    env = OneMassOscillatorSim(dt=dt, max_steps=400)
    env.reset(domain_param=dp_gt)

    # Set up policy
    # policy = IdlePolicy(env.spec)
    policy = DummyPolicy(env.spec)

    # Sample
    sampler = ParallelRolloutSampler(env, policy, num_workers=4, min_rollouts=50, seed=1)
    ros = sampler.sample()

    # Create a model for learning the domain parameters
    model = OneMassOscillatorDomainParamEstimator(dt=dt, dp_init=dp_init, num_epoch=50, batch_size=10)

    model.update(ros)

    print_cbt(f"true domain param : {dp_gt}", "g")
    print_cbt(f"initial domain param: {dp_init}", "y")
    print_cbt(f"learned domain param: {model.dp_est.detach().cpu().numpy()}", "c")
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int): """ Objective function for the Optuna `Study` to maximize. .. note:: Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments. :param trial: Optuna Trial object for hyper-parameter optimization :param study_dir: the parent directory for all trials in this study :param seed: seed value for the random number generators, pass `None` for no seeding :return: objective function value """ # Synchronize seeds between Optuna trials pyrado.set_seed(seed) # Environment env_hparams = dict(physicsEngine="Bullet", dt=1 / 100.0, max_steps=500) env = BallOnPlate2DSim(**env_hparams) env = ActNormWrapper(env) # Policy policy_hparam = dict( shared_hidden_sizes=trial.suggest_categorical( "shared_hidden_sizes_policy", [(16, 16), (32, 32), (64, 64), (16, 16, 16), (32, 32, 32)]), shared_hidden_nonlin=fcn_from_str( trial.suggest_categorical("shared_hidden_nonlin_policy", ["to_tanh", "to_relu"])), ) policy = TwoHeadedFNNPolicy(spec=env.spec, **policy_hparam) # Critic qfcn_hparam = dict( hidden_sizes=trial.suggest_categorical("hidden_sizes_critic", [(16, 16), (32, 32), (64, 64), (16, 16, 16), (32, 32, 32)]), hidden_nonlin=fcn_from_str( trial.suggest_categorical("hidden_nonlin_critic", ["to_tanh", "to_relu"])), ) obsact_space = BoxSpace.cat([env.obs_space, env.act_space]) qfcn_1 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **qfcn_hparam) qfcn_2 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **qfcn_hparam) # Algorithm algo_hparam = dict( num_workers=1, # parallelize via optuna n_jobs max_iter=100 * env.max_steps, min_steps=trial.suggest_categorical( "min_steps_algo", [1]), # 10, env.max_steps, 10*env.max_steps memory_size=trial.suggest_loguniform("memory_size_algo", 1e2 * env.max_steps, 1e4 * env.max_steps), tau=trial.suggest_uniform("tau_algo", 0.99, 1.0), ent_coeff_init=trial.suggest_uniform("ent_coeff_init_algo", 0.1, 0.9), learn_ent_coeff=trial.suggest_categorical("learn_ent_coeff_algo", [True, False]), standardize_rew=trial.suggest_categorical("standardize_rew_algo", [False]), gamma=trial.suggest_uniform("gamma_algo", 0.99, 1.0), target_update_intvl=trial.suggest_categorical( "target_update_intvl_algo", [1, 5]), num_updates_per_step=trial.suggest_categorical( "num_batch_updates_algo", [1, 5]), batch_size=trial.suggest_categorical("batch_size_algo", [128, 256, 512]), lr=trial.suggest_loguniform("lr_algo", 1e-5, 1e-3), ) csv_logger = create_csv_step_logger( osp.join(study_dir, f"trial_{trial.number}")) algo = SAC(study_dir, env, policy, qfcn_1, qfcn_2, **algo_hparam, logger=csv_logger) # Train without saving the results algo.train(snapshot_mode="latest", seed=seed) # Evaluate min_rollouts = 1000 sampler = ParallelRolloutSampler( env, policy, num_workers=1, min_rollouts=min_rollouts) # parallelize via optuna n_jobs ros = sampler.sample() mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts return mean_ret
def __init__( self, save_dir: pyrado.PathLike, env: Env, policy: Policy, critic: GAE, max_iter: int, min_rollouts: int = None, min_steps: int = None, num_epoch: int = 3, eps_clip: float = 0.1, batch_size: int = 64, std_init: float = 1.0, num_workers: int = 4, max_grad_norm: Optional[float] = None, lr: float = 5e-4, lr_scheduler=None, lr_scheduler_hparam: [dict, None] = None, logger: StepLogger = None, ): """ Constructor :param save_dir: directory to save the snapshots i.e. the results in :param env: the environment which the policy operates :param policy: policy to be updated :param critic: advantage estimation function $A(s,a) = Q(s,a) - V(s)$ :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs :param min_rollouts: minimum number of rollouts sampled per policy update batch :param min_steps: minimum number of state transitions sampled per policy update batch :param num_epoch: number of iterations over all gathered samples during one policy update :param eps_clip: max/min probability ratio, see [1] :param batch_size: number of samples per policy update batch :param std_init: initial standard deviation on the actions for the exploration noise :param num_workers: number of environments for parallel sampling :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping :param lr: (initial) learning rate for the optimizer which can be by modified by the scheduler. By default, the learning rate is constant. :param lr_scheduler: learning rate scheduler that does one step per epoch (pass through the whole data set) :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler :param logger: logger for every step of the algorithm, if `None` the default logger will be created .. note:: The Adam optimizer computes individual learning rates for all parameters. Thus, the learning rate scheduler schedules the maximum learning rate. """ if not isinstance(env, Env): raise pyrado.TypeErr(given=env, expected_type=Env) assert isinstance(policy, Policy) # Call ActorCritic's constructor super().__init__(env, policy, critic, save_dir, max_iter, logger) # Store the inputs self.num_epoch = num_epoch self.eps_clip = eps_clip self.batch_size = batch_size self.max_grad_norm = max_grad_norm # Initialize self.log_loss = True self._expl_strat = NormalActNoiseExplStrat(self._policy, std_init=std_init) self._sampler = ParallelRolloutSampler(env, self._expl_strat, num_workers=num_workers, min_steps=min_steps, min_rollouts=min_rollouts) self.optim = to.optim.Adam( [{ "params": self._expl_strat.policy.parameters() }, { "params": self._expl_strat.noise.parameters() }], lr=lr, eps=1e-5, ) self._lr_scheduler = lr_scheduler self._lr_scheduler_hparam = lr_scheduler_hparam if lr_scheduler is not None: self._lr_scheduler = lr_scheduler(self.optim, **lr_scheduler_hparam)
from tabulate import tabulate from pyrado.environment_wrappers.action_normalization import ActNormWrapper from pyrado.environments.pysim.ball_on_beam import BallOnBeamSim from pyrado.policies.features import FeatureStack, identity_feat, squared_feat from pyrado.policies.feed_back.linear import LinearPolicy from pyrado.sampling.parallel_rollout_sampler import ParallelRolloutSampler if __name__ == "__main__": # Set up environment env = BallOnBeamSim(dt=0.02, max_steps=500) env = ActNormWrapper(env) # Set up policy feats = FeatureStack(identity_feat, squared_feat) policy = LinearPolicy(env.spec, feats) # Set up sampler sampler = ParallelRolloutSampler(env, policy, num_workers=2, min_rollouts=2000) # Sample and print ros = sampler.sample() print( tabulate({ "StepSequence count": len(ros), "Step count": sum(map(len, ros)), }.items()))
class ARPL(Algorithm): """ Adversarially Robust Policy Learning (ARPL) .. seealso:: A. Mandlekar, Y. Zhu, A. Garg, L. Fei-Fei, S. Savarese, "Adversarially Robust Policy Learning: Active Construction of Physically-Plausible Perturbations", IROS, 2017 """ name: str = 'arpl' def __init__(self, save_dir: str, env: [SimEnv, StateAugmentationWrapper], subrtn: Algorithm, policy: Policy, expl_strat: StochasticActionExplStrat, max_iter: int, num_rollouts: int = None, steps_num: int = None, apply_dynamics_noise: bool = False, dyn_eps: float = 0.01, dyn_phi: float = 0.1, halfspan: float = 0.25, apply_proccess_noise: bool = False, proc_eps: float = 0.01, proc_phi: float = 0.05, apply_observation_noise: bool = False, obs_eps: float = 0.01, obs_phi: float = 0.05, torch_observation: bool = True, num_workers: int = 4, logger: StepLogger = None): """ Constructor :param save_dir: directory to save the snapshots i.e. the results in :param env: the environment in which the agent should be trained :param subrtn: algorithm which performs the policy / value-function optimization :param policy: policy to be updated :param expl_strat: the exploration strategy :param max_iter: the maximum number of iterations :param num_rollouts: the number of rollouts to be performed for each update step :param steps_num: the number of steps to be performed for each update step :param apply_dynamics_noise: whether adversarially generated dynamics noise should be applied :param dyn_eps: the intensity of generated dynamics noise :param dyn_phi: the probability of applying dynamics noise :param halfspan: the halfspan of the uniform random distribution used to sample :param apply_proccess_noise: whether adversarially generated process noise should be applied :param proc_eps: the intensity of generated process noise :param proc_phi: the probability of applying process noise :param apply_observation_noise: whether adversarially generated observation noise should be applied :param obs_eps: the intensity of generated observation noise :param obs_phi: the probability of applying observation noise :param torch_observation: a function to provide a differentiable observation :param num_workers: number of environments for parallel sampling :param logger: logger for every step of the algorithm, if `None` the default logger will be created """ assert isinstance(subrtn, Algorithm) assert isinstance(max_iter, int) and max_iter > 0 super().__init__(save_dir, max_iter, policy, logger) # Initialize adversarial wrappers if apply_dynamics_noise: assert isinstance(env, StateAugmentationWrapper) env = AdversarialDynamicsWrapper(env, self.policy, dyn_eps, dyn_phi, halfspan) if apply_proccess_noise: env = AdversarialStateWrapper(env, self.policy, proc_eps, proc_phi, torch_observation=torch_observation) if apply_observation_noise: env = AdversarialObservationWrapper(env, self.policy, obs_eps, obs_phi) self.num_rollouts = num_rollouts self.sampler = ParallelRolloutSampler( env, expl_strat, num_workers=num_workers, min_steps=steps_num, min_rollouts=num_rollouts, ) # Subroutine self._subrtn = subrtn self._subrtn.save_name = 'subrtn' @property def sample_count(self) -> int: return self._subrtn.sample_count def step(self, snapshot_mode: str, meta_info: dict = None): rollouts = self.sampler.sample() rets = [ro.undiscounted_return() for ro in rollouts] ret_avg = np.mean(rets) ret_med = np.median(rets) ret_std = np.std(rets) self.logger.add_value('avg return', ret_avg) self.logger.add_value('median return', ret_med) self.logger.add_value('std return', ret_std) 
        self.logger.add_value('num total samples', self._cnt_samples)
        self.logger.add_value('avg rollout len', np.mean([ro.length for ro in rollouts]))

        # Sub-routine
        self._subrtn.update(rollouts)
        self._subrtn.logger.record_step()
        self._subrtn.make_snapshot(snapshot_mode, ret_avg.item())

    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        if meta_info is None:
            # This algorithm instance is not a subroutine of a meta-algorithm
            self._subrtn.save_snapshot(meta_info)
        else:
            raise pyrado.ValueErr(msg=f'{self.name} is not supposed to be run as a subroutine!')
class SysIdViaEpisodicRL(Algorithm):
    """
    Wrapper to frame black-box system identification as an episodic reinforcement learning problem

    .. note::
        This algorithm was designed as a subroutine of SimOpt. However, it could also be used independently.
    """

    name: str = "sysiderl"
    iteration_key: str = "sysiderl_iteration"  # logger's iteration key

    def __init__(
        self,
        subrtn: ParameterExploring,
        behavior_policy: Policy,
        num_rollouts_per_distr: int,
        metric: Union[Callable[[np.ndarray], np.ndarray], None],
        obs_dim_weight: Union[list, np.ndarray],
        std_obs_filt: int = 5,
        w_abs: float = 0.5,
        w_sq: float = 1.0,
        num_workers: int = 4,
        base_seed: int = 1001,
    ):
        """
        Constructor

        :param subrtn: wrapped algorithm to fit the domain parameter distribution
        :param behavior_policy: lower level policy used to generate the rollouts
        :param num_rollouts_per_distr: number of rollouts per domain distribution parameter set
        :param metric: functional mapping from differences in observations to value
        :param obs_dim_weight: (diagonal) weight matrix for the different observation dimensions for the default metric
        :param std_obs_filt: number of standard deviations for the Gaussian filter applied to the observations
        :param w_abs: weight for the mean absolute errors for the default metric
        :param w_sq: weight for the mean squared errors for the default metric
        :param num_workers: number of environments for parallel sampling
        :param base_seed: seed to set for the parallel sampler in every iteration
        """
        if not isinstance(subrtn, ParameterExploring):
            raise pyrado.TypeErr(given=subrtn, expected_type=ParameterExploring)
        if not isinstance(subrtn.env, MetaDomainRandWrapper):
            raise pyrado.TypeErr(given=subrtn.env, expected_type=MetaDomainRandWrapper)
        if not isinstance(subrtn.policy, DomainDistrParamPolicy):
            raise pyrado.TypeErr(given=subrtn.policy, expected_type=DomainDistrParamPolicy)
        if not isinstance(behavior_policy, Policy):
            raise pyrado.TypeErr(given=behavior_policy, expected_type=Policy)
        if subrtn.policy.num_param != len(subrtn.env.dp_mapping):
            raise pyrado.ShapeErr(
                msg=f"Number of policy parameters {subrtn.policy.num_param} does not match the "
                f"number of domain distribution parameters {len(subrtn.env.dp_mapping)}!"
            )
        if subrtn.sampler.num_init_states_per_domain != 1:
            # Only sample one rollout in every domain. This is possible since we are synchronizing the init state.
raise pyrado.ValueErr( given=subrtn.sampler.num_init_states_per_domain, eq_constraint="1") if num_rollouts_per_distr < 2: raise pyrado.ValueErr(given=num_rollouts_per_distr, g_constraint="1") if len(obs_dim_weight) != subrtn.env.obs_space.flat_dim: raise pyrado.ShapeErr(given=obs_dim_weight, expected_match=subrtn.env.obs_space) # Call Algorithm's constructor super().__init__(subrtn.save_dir, subrtn.max_iter, subrtn.policy, subrtn.logger) self._subrtn = subrtn self._subrtn.save_name = "subrtn" self._behavior_policy = behavior_policy self.obs_dim_weight = np.diag( obs_dim_weight ) # weighting factor between the different observations self.std_obs_filt = std_obs_filt if metric is None or metric == "None": self.metric = partial(self.weighted_l1_l2_metric, w_abs=w_abs, w_sq=w_sq, obs_dim_weight=self.obs_dim_weight) else: self.metric = metric # Get and optionally clip the observation bounds of the environment elb, eub = subrtn.env.obs_space.bound_lo, subrtn.env.obs_space.bound_up elb, eub = self.override_obs_bounds(elb, eub, subrtn.env.obs_space.labels) self.obs_normalizer = UnitCubeProjector(bound_lo=elb, bound_up=eub) # Create the sampler used to execute the same policy as on the real system in the meta-randomized env self.base_seed = base_seed self.behavior_sampler = ParallelRolloutSampler(self._subrtn.env, self._behavior_policy, num_workers=num_workers, min_rollouts=1, seed=base_seed) self.num_rollouts_per_distr = num_rollouts_per_distr @property def subrtn(self) -> ParameterExploring: """Get the subroutine used for updating the domain parameter distribution.""" return self._subrtn def reset(self, seed: int = None): # Reset internal variables inherited from Algorithm self._curr_iter = 0 self._cnt_samples = 0 self._highest_avg_ret = -pyrado.inf # Forward to subroutine self._subrtn.reset(seed) def step(self, snapshot_mode: str, meta_info: dict = None): if "rollouts_real" not in meta_info: raise pyrado.KeyErr(keys="rollouts_real", container=meta_info) # Extract the initial states from the real rollouts rollouts_real = meta_info["rollouts_real"] init_states_real = [ro.states[0, :] for ro in rollouts_real] # Sample new policy parameters a.k.a domain distribution parameters param_sets = self._subrtn.expl_strat.sample_param_sets( nominal_params=self._subrtn.policy.param_values, num_samples=self._subrtn.pop_size, include_nominal_params=True, ) # Iterate over every domain parameter distribution. 
        # We basically mimic the ParameterExplorationSampler here,
        # but we need to adapt the randomizer (and not just the domain parameters) for every policy param set
        param_samples = []
        loss_hist = []
        for idx_ps, ps in enumerate(param_sets):
            # Update the randomizer to use the new domain distribution parameter values
            new_ddp_vals = self._subrtn.policy.transform_to_ddp_space(ps)
            self._subrtn.env.adapt_randomizer(domain_distr_param_values=new_ddp_vals.detach().cpu().numpy())
            self._subrtn.env.randomizer.randomize(num_samples=self.num_rollouts_per_distr)
            sampled_domain_params = self._subrtn.env.randomizer.get_params()

            # Sample the rollouts
            rollouts_sim = self.behavior_sampler.sample(init_states_real, sampled_domain_params, eval=True)

            # Iterate over the simulated rollouts with the same initial state
            for idx_real, idcs_sim in enumerate(
                    gen_ordered_batch_idcs(self.num_rollouts_per_distr, len(rollouts_sim), sorted=True)):
                # Clip the rollouts, yielding two lists of pairwise equally long rollouts
                ros_real_tr, ros_sim_tr = self.truncate_rollouts(
                    [rollouts_real[idx_real]], rollouts_sim[slice(idcs_sim[0], idcs_sim[-1] + 1)])

                # Check the validity of the initial states. The domain parameters will be different.
                assert len(ros_real_tr) == len(ros_sim_tr) == len(idcs_sim)
                assert check_all_equal([ro.states[0, :] for ro in ros_real_tr])
                assert check_all_equal([ro.states[0, :] for ro in ros_sim_tr])
                assert all([
                    np.allclose(r.states[0, :], s.states[0, :]) for r, s in zip(ros_real_tr, ros_sim_tr)
                ])

                # Compute the losses
                losses = np.asarray([
                    self.loss_fcn(ro_r, ro_s) for ro_r, ro_s in zip(ros_real_tr, ros_sim_tr)
                ])

                if np.all(losses == 0.0):
                    raise pyrado.ValueErr(
                        msg="All SysIdViaEpisodicRL losses are equal to zero! Most likely the domain "
                        "randomization is too extreme, such that every trajectory is done after "
                        "one step. Check the exploration strategy.")

                # Handle zero losses by setting them to the maximum current loss
                losses[losses == 0] = np.max(losses)

                loss_hist.extend(losses)

                # We need to assign the loss value to the simulated rollout, but this one can be of a different
                # length than the real-world rollouts as well as of different length than the original
                # (non-truncated) simulated rollout. Thus, we simply write the loss value into the first step.
for i, l in zip(range(idcs_sim[0], idcs_sim[-1] + 1), losses): rollouts_sim[i].rewards[:] = 0.0 rollouts_sim[i].rewards[0] = -l # Collect the results param_samples.append( ParameterSample(params=ps, rollouts=rollouts_sim)) # Bind the parameter samples and their rollouts in the usual container param_samp_res = ParameterSamplingResult(param_samples) self._cnt_samples += sum( [len(ro) for pss in param_samp_res for ro in pss.rollouts]) # Log metrics computed from the old policy (before the update) loss_hist = np.asarray(loss_hist) self.logger.add_value("min sysid loss", np.min(loss_hist), 6) self.logger.add_value("median sysid loss", np.median(loss_hist), 6) self.logger.add_value("avg sysid loss", np.mean(loss_hist), 6) self.logger.add_value("max sysid loss", np.max(loss_hist), 6) self.logger.add_value("std sysid loss", np.std(loss_hist), 6) # Extract the best policy parameter sample for saving it later self._subrtn.best_policy_param = param_samp_res.parameters[np.argmax( param_samp_res.mean_returns)].clone() # Save snapshot data self.make_snapshot(snapshot_mode, float(np.max(param_samp_res.mean_returns)), meta_info) # Update the wrapped algorithm's update method self._subrtn.update( param_samp_res, ret_avg_curr=param_samp_res[0].mean_undiscounted_return) @staticmethod def override_obs_bounds(bound_lo: np.ndarray, bound_up: np.ndarray, labels: np.ndarray) -> (np.ndarray, np.ndarray): """ Default overriding method for the bounds of an observation space. This is necessary when the observations are scaled with their range, e.g. to compare a deviation over different kinds of observations like position and annular velocity. Thus, infinite bounds are not feasible. :param bound_lo: lower bound of the observation space :param bound_up: upper bound of the observation space :param labels: label for each dimension of the observation space to override :return: clipped lower and upper bound """ bound_lo = ObsNormWrapper.override_bounds(bound_lo, { "theta_dot": -20.0, "alpha_dot": -20.0 }, labels) bound_up = ObsNormWrapper.override_bounds(bound_up, { "theta_dot": 20.0, "alpha_dot": 20.0 }, labels) return bound_lo, bound_up @staticmethod def weighted_l1_l2_metric(err: np.ndarray, w_abs: float, w_sq: float, obs_dim_weight: np.ndarray): """ Compute the weighted linear combination of the observation error's MAE and MSE, averaged over time .. note:: In contrast to [1], we are using the mean absolute error and the mean squared error instead of the L1 and the L2 norm. The reason for this is that longer time series would be punished otherwise. :param err: error signal with time steps along the first dimension :param w_abs: weight for the mean absolute errors :param w_sq: weight for the mean squared errors :param obs_dim_weight: (diagonal) weight matrix for the different observation dimensions :return: weighted linear combination of the error's MAE and MSE, averaged over time """ err_w = np.matmul(err, obs_dim_weight) return w_abs * np.mean(np.abs(err_w), axis=0) + w_sq * np.mean( np.power(err_w, 2), axis=0) def loss_fcn(self, rollout_real: StepSequence, rollout_sim: StepSequence) -> float: """ Compute the discrepancy between two time sequences of observations given metric. Be sure to align and truncate the rollouts beforehand. 
        :param rollout_real: (concatenated) real-world rollout containing the observations
        :param rollout_sim: (concatenated) simulated rollout containing the observations
        :return: discrepancy cost summed over the observation dimensions
        """
        if len(rollout_real) != len(rollout_sim):
            raise pyrado.ShapeErr(given=rollout_real, expected_match=rollout_sim)

        # Extract the observations
        real_obs = rollout_real.get_data_values("observations", truncate_last=True)
        sim_obs = rollout_sim.get_data_values("observations", truncate_last=True)

        # Filter the observations
        real_obs = gaussian_filter1d(real_obs, self.std_obs_filt, axis=0)
        sim_obs = gaussian_filter1d(sim_obs, self.std_obs_filt, axis=0)

        # Normalize the signals
        real_obs_norm = self.obs_normalizer.project_to(real_obs)
        sim_obs_norm = self.obs_normalizer.project_to(sim_obs)

        # Compute loss based on the error
        loss_per_obs_dim = self.metric(real_obs_norm - sim_obs_norm)
        assert len(loss_per_obs_dim) == real_obs.shape[1]
        assert all(loss_per_obs_dim >= 0)
        return sum(loss_per_obs_dim)

    @staticmethod
    def truncate_rollouts(
        rollouts_real: Sequence[StepSequence],
        rollouts_sim: Sequence[StepSequence],
        replicate: bool = True
    ) -> Tuple[Sequence[StepSequence], Sequence[StepSequence]]:
        """
        In case (some of the) rollouts failed or succeeded in one domain, but not in the other, we truncate the
        longer observation sequence. When truncating, we compare each of the M real rollouts to each of the N
        simulated rollouts, thus replicating the real rollouts N times and the simulated rollouts M times.

        :param rollouts_real: M real-world rollouts of different length if `replicate = True`, else K real-world
                              rollouts of different length
        :param rollouts_sim: N simulated rollouts of different length if `replicate = True`, else K simulated
                             rollouts of different length
        :param replicate: if `False` the i-th rollout from `rollouts_real` is (only) compared with the i-th rollout
                          from `rollouts_sim`, in this case the number of rollouts and the initial states have to match
        :return: MxN real-world rollouts and MxN simulated rollouts of equal length if `replicate = True`, else K
                 real-world rollouts and K simulated rollouts of equal length
        """
        if not isinstance(rollouts_real[0], Iterable):
            raise pyrado.TypeErr(given=rollouts_real[0], expected_type=Iterable)
        if not isinstance(rollouts_sim[0], Iterable):
            raise pyrado.TypeErr(given=rollouts_sim[0], expected_type=Iterable)
        if not replicate and len(rollouts_real) != len(rollouts_sim):
            raise pyrado.ShapeErr(
                msg="In case of a one-on-one comparison, the number of rollouts needs to be equal!"
            )

        # Choose the function for creating the comparison of the rollouts
        comp_fcn = product if replicate else zip

        # Go over all combinations of rollouts individually
        rollouts_real_tr = []
        rollouts_sim_tr = []
        for ro_r, ro_s in comp_fcn(rollouts_real, rollouts_sim):
            # Handle rollouts of different length, assuming that they are starting at the same state
            if ro_r.length < ro_s.length:
                rollouts_real_tr.append(ro_r)
                rollouts_sim_tr.append(ro_s[:ro_r.length])
            elif ro_r.length > ro_s.length:
                rollouts_real_tr.append(ro_r[:ro_s.length])
                rollouts_sim_tr.append(ro_s)
            else:
                rollouts_real_tr.append(ro_r)
                rollouts_sim_tr.append(ro_s)

        return rollouts_real_tr, rollouts_sim_tr

    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        # ParameterExploring subroutine saves the best policy (in this case a DomainDistrParamPolicy)
        prefix = meta_info.get("prefix", "")
        if prefix != "":
            self._subrtn.save_snapshot(meta_info=dict(prefix=f"{prefix}_ddp"))  # save iter_X_ddp_policy.pt
        self._subrtn.save_snapshot(meta_info=dict(prefix="ddp"))  # override ddp_policy.pt
        joblib.dump(self._subrtn.env, osp.join(self.save_dir, "env_sim.pkl"))

        # Print the current search distribution's mean
        cpp = self._subrtn.policy.transform_to_ddp_space(self._subrtn.policy.param_values)
        self._subrtn.env.adapt_randomizer(domain_distr_param_values=cpp.detach().cpu().numpy())
        print_cbt(f"Current policy domain parameter distribution\n{self._subrtn.env.randomizer}", "g")

        # Set the randomizer to the best fitted domain distribution
        cbp = self._subrtn.policy.transform_to_ddp_space(self._subrtn.best_policy_param)
        self._subrtn.env.adapt_randomizer(domain_distr_param_values=cbp.detach().cpu().numpy())
        print_cbt(f"Best fitted domain parameter distribution\n{self._subrtn.env.randomizer}", "g")

        if "rollouts_real" not in meta_info:
            raise pyrado.KeyErr(keys="rollouts_real", container=meta_info)
        pyrado.save(meta_info["rollouts_real"], "rollouts_real.pkl", self.save_dir, prefix=prefix)
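# Minimal sketch of the default metric defined above (shapes are illustrative assumptions): for a T x D error
# signal, each observation dimension contributes w_abs * mean_t |err_w| + w_sq * mean_t err_w**2, where err_w
# is the error weighted by obs_dim_weight.
import numpy as np

err = np.random.randn(100, 2)  # T = 100 time steps, D = 2 observation dimensions
loss_per_obs_dim = SysIdViaEpisodicRL.weighted_l1_l2_metric(err, w_abs=0.5, w_sq=1.0, obs_dim_weight=np.eye(2))
assert loss_per_obs_dim.shape == (2,)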
def __init__(self, save_dir: str, env: Env, policy: Policy, critic: GAE, max_iter: int, min_rollouts: int = None, min_steps: int = None, vfcn_coeff: float = 0.5, entropy_coeff: float = 1e-3, batch_size: int = 32, std_init: float = 1.0, max_grad_norm: float = None, num_workers: int = 4, lr: float = 5e-4, lr_scheduler=None, lr_scheduler_hparam: [dict, None] = None, logger: StepLogger = None): r""" Constructor :param save_dir: directory to save the snapshots i.e. the results in :param env: the environment which the policy operates :param policy: policy to be updated :param critic: advantage estimation function $A(s,a) = Q(s,a) - V(s)$ :param max_iter: number of iterations (policy updates) :param min_rollouts: minimum number of rollouts sampled per policy update batch :param min_steps: minimum number of state transitions sampled per policy update batch :param vfcn_coeff: weighting factor of the value function term in the combined loss, specific to PPO2 :param entropy_coeff: weighting factor of the entropy term in the combined loss, specific to PPO2 :param batch_size: number of samples per policy update batch :param std_init: initial standard deviation on the actions for the exploration noise :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping :param num_workers: number of environments for parallel sampling :param lr: (initial) learning rate for the optimizer which can be by modified by the scheduler. By default, the learning rate is constant. :param lr_scheduler: learning rate scheduler that does one step per epoch (pass through the whole data set) :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler :param logger: logger for every step of the algorithm, if `None` the default logger will be created """ # Call ActorCritic's constructor super().__init__(env, policy, critic, save_dir, max_iter, logger) # Store the inputs self.min_rollouts = min_rollouts self.min_steps = min_steps self.vfcn_coeff = vfcn_coeff self.entropy_coeff = entropy_coeff self.batch_size = batch_size self.max_grad_norm = max_grad_norm # Initialize self._expl_strat = NormalActNoiseExplStrat(self._policy, std_init=std_init) self.sampler = ParallelRolloutSampler( env, self.expl_strat, num_workers=num_workers, min_steps=min_steps, min_rollouts=min_rollouts ) self.optim = to.optim.RMSprop( [{'params': self._policy.parameters()}, {'params': self.expl_strat.noise.parameters()}, {'params': self._critic.vfcn.parameters()}], lr=lr, eps=1e-5 ) self._lr_scheduler = lr_scheduler self._lr_scheduler_hparam = lr_scheduler_hparam if lr_scheduler is not None: self._lr_scheduler = lr_scheduler(self.optim, **lr_scheduler_hparam)
class LQR(Algorithm): """ Linear Quadratic Regulator created using the control module """ name: str = 'lqr' def __init__(self, save_dir: str, env: SimEnv, policy: Policy, min_rollouts: int = None, min_steps: int = None, num_workers: int = 4, logger: StepLogger = None, ball_z_dim_mismatch: bool = True): """ Constructor :param save_dir: directory to save the snapshots i.e. the results in :param env: the environment which the policy operates :param policy: policy which this algorithm is creating :param min_rollouts: minimum number of rollouts sampled per policy update batch :param min_steps: minimum number of state transitions sampled per policy update batch :param num_workers: number of environments for parallel sampling :param ball_z_dim_mismatch: only useful for BallOnPlate5DSim, set to True if the controller does not have the z component (relative position) of the ball in the state vector, i.e. state is 14-dim instead of 16-dim """ if not isinstance(env, SimEnv): raise pyrado.TypeErr(given=env, expected_type=SimEnv) if not isinstance(policy, LinearPolicy): raise pyrado.TypeErr(given=policy, expected_type=LinearPolicy) # Call Algorithm's constructor super().__init__(save_dir, 1, policy, logger) # Store the inputs self._env = env self.ball_z_dim_mismatch = ball_z_dim_mismatch self.sampler = ParallelRolloutSampler( env, self._policy, num_workers=num_workers, min_steps=min_steps, min_rollouts=min_rollouts ) self.eigvals = np.array([pyrado.inf]) # initialize with sth positive def step(self, snapshot_mode: str, meta_info: dict = None): if isinstance(inner_env(self._env), BallOnPlate5DSim): ctrl_gains = to.tensor([ [0.1401, 0, 0, 0, -0.09819, -0.1359, 0, 0.545, 0, 0, 0, -0.01417, -0.04427, 0], [0, 0.1381, 0, 0.2518, 0, 0, -0.2142, 0, 0.5371, 0, 0.03336, 0, 0, -0.1262], [0, 0, 0.1414, 0.0002534, 0, 0, -0.0002152, 0, 0, 0.5318, 0, 0, 0, -0.0001269], [0, -0.479, -0.0004812, 39.24, 0, 0, -15.44, 0, -1.988, -0.001934, 9.466, 0, 0, -13.14], [0.3039, 0, 0, 0, 25.13, 15.66, 0, 1.284, 0, 0, 0, 7.609, 6.296, 0] ]) # Compensate for the mismatching different state definition if self.ball_z_dim_mismatch: ctrl_gains = insert_tensor_col(ctrl_gains, 7, to.zeros((5, 1))) # ball z position ctrl_gains = insert_tensor_col(ctrl_gains, -1, to.zeros((5, 1))) # ball z velocity elif isinstance(inner_env(self._env), QBallBalancerSim): # Since the control module can by tricky to install (recommended using anaconda), we only load it if needed import control # System modeling dp = self._env.domain_param dp['J_eq'] = self._env._J_eq dp['B_eq_v'] = self._env._B_eq_v dp['c_kin'] = self._env._c_kin dp['zeta'] = self._env._zeta dp['A_m'] = self._env._A_m A = np.zeros((self._env.obs_space.flat_dim, self._env.obs_space.flat_dim)) A[:self._env.obs_space.flat_dim//2, self._env.obs_space.flat_dim//2:] = \ np.eye(self._env.obs_space.flat_dim//2) A[4, 4] = -dp['B_eq_v']/dp['J_eq'] A[5, 5] = -dp['B_eq_v']/dp['J_eq'] A[6, 0] = dp['c_kin']*dp['m_ball']*dp['g']*dp['r_ball']**2/dp['zeta'] A[6, 6] = -dp['c_kin']*dp['r_ball']**2/dp['zeta'] A[7, 1] = dp['c_kin']*dp['m_ball']*dp['g']*dp['r_ball']**2/dp['zeta'] A[7, 7] = -dp['c_kin']*dp['r_ball']**2/dp['zeta'] B = np.zeros((self._env.obs_space.flat_dim, self._env.act_space.flat_dim)) B[4, 0] = dp['A_m']/dp['J_eq'] B[5, 1] = dp['A_m']/dp['J_eq'] # C = np.zeros((self._env.obs_space.flat_dim // 2, self._env.obs_space.flat_dim)) # C[:self._env.obs_space.flat_dim // 2, :self._env.obs_space.flat_dim // 2] = # np.eye(self._env.obs_space.flat_dim // 2) # D = np.zeros((self._env.obs_space.flat_dim // 2, 
            #     self._env.act_space.flat_dim))

            # Get the weighting matrices from the environment
            if isinstance(self._env.task.rew_fcn, QuadrErrRewFcn):
                # The environment uses a reward function compatible with the LQR
                Q = self._env.task.rew_fcn.Q
                R = self._env.task.rew_fcn.R
            else:
                # The environment does not use a reward function compatible with the LQR, apply some fine tuning
                Q = np.diag([1e2, 1e2, 5e2, 5e2, 1e-2, 1e-2, 5e+0, 5e+0])
                R = np.diag([1e-2, 1e-2])

            # Solve the continuous time Riccati eq
            K, _, self.eigvals = control.lqr(A, B, Q, R)  # use control.dlqr for a discrete-time system
            ctrl_gains = to.from_numpy(K).to(to.get_default_dtype())

        else:
            raise pyrado.TypeErr(given=inner_env(self._env), expected_type=[BallOnPlate5DSim, QBallBalancerSim])

        # Assign the controller gains
        self._policy.init_param(-1*ctrl_gains)  # in classical control it is u = -K*x; here a = psi(s)*s

        # Sample rollouts to evaluate the LQR
        ros = self.sampler.sample()

        # Logging
        rets = [ro.undiscounted_return() for ro in ros]
        self.logger.add_value('max return', np.max(rets), 4)
        self.logger.add_value('median return', np.median(rets), 4)
        self.logger.add_value('min return', np.min(rets), 4)
        self.logger.add_value('avg return', np.mean(rets), 4)
        self.logger.add_value('std return', np.std(rets), 4)
        self.logger.add_value('avg rollout len', np.mean([ro.length for ro in ros]), 4)
        self.logger.add_value('num total samples', self._cnt_samples)
        self.logger.add_value('min mag policy param',
                              self._policy.param_values[to.argmin(abs(self._policy.param_values))])
        self.logger.add_value('max mag policy param',
                              self._policy.param_values[to.argmax(abs(self._policy.param_values))])

        # Save snapshot data
        self.make_snapshot(snapshot_mode, float(np.mean(rets)), meta_info)

    def stopping_criterion_met(self) -> bool:
        """ Checks if all eigenvalues of the closed-loop system are negative. """
        return (self.eigvals < 0).all()

    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        if meta_info is None:
            # This algorithm instance is not a subroutine of another algorithm
            pyrado.save(self._env, 'env', 'pkl', self.save_dir, meta_info)
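# Standalone sketch of the continuous-time LQR solve used in the QBallBalancerSim branch above (assumes the
# python-control package is installed; the double integrator below is only an illustration, not the
# environment's actual dynamics).
import control
import numpy as np

A = np.array([[0.0, 1.0], [0.0, 0.0]])  # double integrator: x_dot = A x + B u
B = np.array([[0.0], [1.0]])
Q, R = np.eye(2), np.eye(1)
K, _, eigvals = control.lqr(A, B, Q, R)  # feedback u = -K x; eigvals are the closed-loop eigenvalues
assert all(ev.real < 0 for ev in eigvals)  # the closed loop A - B K is stable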
def __init__( self, save_dir: pyrado.PathLike, env: Env, particle_hparam: dict, max_iter: int, num_particles: int, temperature: float, lr: float, horizon: int, std_init: float = 1.0, min_rollouts: int = None, min_steps: int = 10000, num_workers: int = 4, serial: bool = True, logger: StepLogger = None, ): """ Constructor :param save_dir: directory to save the snapshots i.e. the results in :param env: the environment which the policy operates :param particle_hparam: hyper-parameters for particle template construction :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs :param num_particles: number of distinct particles :param temperature: the temperature of the SVGD determines how jointly the training takes place :param lr: the learning rate for the update of the particles :param horizon: horizon for each particle :param std_init: initial standard deviation for the exploration :param min_rollouts: minimum number of rollouts sampled per policy update batch :param min_steps: minimum number of state transitions sampled per policy update batch :param num_workers: number of environments for parallel sampling :param serial: serial mode can be switched off which can be used to partly control the flow of SVPG from outside :param logger: logger for every step of the algorithm, if `None` the default logger will be created """ if not isinstance(env, Env): raise pyrado.TypeErr(given=env, expected_type=Env) if not isinstance(particle_hparam, dict): raise pyrado.TypeErr(given=particle_hparam, expected_type=dict) if not all([key in particle_hparam for key in ["actor", "vfcn", "critic"]]): raise AttributeError # Call Algorithm's constructor super().__init__(save_dir, max_iter, policy=None, logger=logger) # Store the inputs self._env = env self.num_particles = num_particles self.horizon = horizon self.lr = lr self.temperature = temperature self.serial = serial # Prepare placeholders for particles self.particles = [None] * num_particles self.particleSteps = [None] * num_particles self.expl_strats = [None] * num_particles self.optimizers = [None] * num_particles self.fixed_particles = [None] * num_particles self.fixed_expl_strats = [None] * num_particles self.samplers = [None] * num_particles self.count = 0 self.update_count = 0 # Particle factory actor = FNNPolicy(spec=env.spec, **particle_hparam["actor"]) vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **particle_hparam["vfcn"]) critic = GAE(vfcn, **particle_hparam["critic"]) self.register_as_logger_parent(critic) particle = SVPGParticle(env.spec, actor, critic) for i in range(self.num_particles): self.particles[i] = deepcopy(particle) self.particles[i].init_param() self.expl_strats[i] = NormalActNoiseExplStrat(self.particles[i].actor, std_init) self.optimizers[i] = to.optim.Adam(self.expl_strats[i].parameters(), lr=self.lr) self.fixed_particles[i] = deepcopy(self.particles[i]) self.fixed_expl_strats[i] = deepcopy(self.expl_strats[i]) self.particleSteps[i] = 0 if self.serial: self.samplers[i] = ParallelRolloutSampler( env, self.expl_strats[i], num_workers, min_rollouts=min_rollouts, min_steps=min_steps )
def __init__(
    self,
    save_dir: str,
    env: Env,
    policy: Policy,
    lr: float = 5e-4,
    std_init: float = 0.15,
    min_steps: int = 1500,
    num_epochs: int = 10,
    max_iter: int = 500,
    num_teachers: int = 8,
    teacher_extra: Optional[dict] = None,
    teacher_policy: Optional[Policy] = None,
    teacher_algo: Optional[callable] = None,
    teacher_algo_hparam: Optional[dict] = None,
    randomizer: Optional[DomainRandomizer] = None,
    logger: Optional[StepLogger] = None,
    num_workers: int = 4,
):
    """
    Constructor

    :param save_dir: directory to save the snapshots i.e. the results in
    :param env: the environment in which the policy operates
    :param policy: policy to be updated
    :param lr: (initial) learning rate for the optimizer which can be modified by the scheduler.
               By default, the learning rate is constant.
    :param std_init: initial standard deviation on the actions for the exploration noise
    :param min_steps: minimum number of state transitions sampled per policy update batch
    :param num_epochs: number of epochs (how often we iterate over the same batch)
    :param max_iter: number of iterations (policy updates)
    :param num_teachers: number of teachers that are used for distillation
    :param teacher_extra: extra dict from PDDRTeachers algo. If provided, teachers are loaded from there
    :param teacher_policy: policy to be updated (is duplicated for each teacher)
    :param teacher_algo: algorithm class to be used for training the teachers
    :param teacher_algo_hparam: hyper-params to be used for teacher_algo
    :param randomizer: randomizer for sampling the teacher domain parameters; if `None`, the environment's default
                       one is used
    :param logger: logger for every step of the algorithm, if `None` the default logger will be created
    :param num_workers: number of environments for parallel sampling
    """
    if not isinstance(env, Env):
        raise pyrado.TypeErr(given=env, expected_type=Env)
    if not isinstance(policy, Policy):
        raise pyrado.TypeErr(given=policy, expected_type=Policy)

    # Call Algorithm's constructor.
super().__init__( num_checkpoints=1, init_checkpoint=-1, save_dir=save_dir, max_iter=max_iter, policy=policy, logger=logger ) # Store the inputs self.env_real = env self.min_steps = min_steps self.num_epochs = num_epochs self.num_teachers = num_teachers self.num_workers = num_workers self.teacher_policies = [] self.teacher_envs = [] self.teacher_expl_strats = [] self.teacher_critics = [] self.teacher_ex_dirs = [] # Teachers if teacher_policy is not None and teacher_algo is not None and teacher_algo_hparam is not None: if not isinstance(teacher_policy, Policy): raise pyrado.TypeErr(given=teacher_policy, expected_type=Policy) if not issubclass(teacher_algo, Algorithm): raise pyrado.TypeErr(given=teacher_algo, expected_type=Algorithm) if randomizer is None: self.randomizer = create_default_randomizer(env) else: assert isinstance(randomizer, DomainRandomizer) self.randomizer = randomizer self.set_random_envs() # Prepare folders self.teacher_ex_dirs = [os.path.join(self.save_dir, f"teachers_{idx}") for idx in range(self.num_teachers)] for idx in range(self.num_teachers): os.makedirs(self.teacher_ex_dirs[idx], exist_ok=True) # Create teacher algos self.algos = [ teacher_algo( save_dir=self.teacher_ex_dirs[idx], env=self.teacher_envs[idx], policy=deepcopy(teacher_policy), logger=None, **deepcopy(teacher_algo_hparam), ) for idx in range(self.num_teachers) ] elif teacher_extra is not None: self.unpack_teachers(teacher_extra) assert self.num_teachers == len(self.teacher_policies) self.reset_checkpoint() else: self.load_teachers() if self.num_teachers < len(self.teacher_policies): print( f"You have loaded {len(self.teacher_policies)} teachers. Only the first {self.num_teachers} will be used!" ) self.prune_teachers() assert self.num_teachers == len(self.teacher_policies) self.reset_checkpoint() # Student self._expl_strat = NormalActNoiseExplStrat(self._policy, std_init=std_init) self.optimizer = to.optim.Adam([{"params": self.policy.parameters()}], lr=lr) # Environments self.samplers = [ ParallelRolloutSampler( self.teacher_envs[t], deepcopy(self._expl_strat), num_workers=self.num_workers, min_steps=self.min_steps, ) for t in range(self.num_teachers) ] self.teacher_weights = np.ones(self.num_teachers) # Distillation loss criterion self.criterion = to.nn.KLDivLoss(log_target=True, reduction="batchmean")
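# Illustration of the distillation criterion created above: with `log_target=True`, torch.nn.KLDivLoss
# expects log-probabilities for both the student input and the teacher target and returns the
# batch-mean KL divergence. The tensors below are random placeholders, not actual policy outputs.
import torch as to

criterion = to.nn.KLDivLoss(log_target=True, reduction="batchmean")
student_logits = to.randn(64, 4, requires_grad=True)  # hypothetical student action scores
teacher_logits = to.randn(64, 4)  # hypothetical teacher action scores
loss = criterion(to.log_softmax(student_logits, dim=-1), to.log_softmax(teacher_logits, dim=-1))
loss.backward()  # gradients only flow into the student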
def __init__(self, save_dir: str, env: [SimEnv, StateAugmentationWrapper], subrtn: Algorithm, policy: Policy, expl_strat: StochasticActionExplStrat, max_iter: int, num_rollouts: int = None, steps_num: int = None, apply_dynamics_noise: bool = False, dyn_eps: float = 0.01, dyn_phi: float = 0.1, halfspan: float = 0.25, apply_proccess_noise: bool = False, proc_eps: float = 0.01, proc_phi: float = 0.05, apply_observation_noise: bool = False, obs_eps: float = 0.01, obs_phi: float = 0.05, torch_observation: bool = True, num_workers: int = 4, logger: StepLogger = None): """ Constructor :param save_dir: directory to save the snapshots i.e. the results in :param env: the environment in which the agent should be trained :param subrtn: algorithm which performs the policy / value-function optimization :param policy: policy to be updated :param expl_strat: the exploration strategy :param max_iter: the maximum number of iterations :param num_rollouts: the number of rollouts to be performed for each update step :param steps_num: the number of steps to be performed for each update step :param apply_dynamics_noise: whether adversarially generated dynamics noise should be applied :param dyn_eps: the intensity of generated dynamics noise :param dyn_phi: the probability of applying dynamics noise :param halfspan: the half-span of the uniform distribution used for sampling in the adversarial dynamics wrapper :param apply_proccess_noise: whether adversarially generated process noise should be applied :param proc_eps: the intensity of generated process noise :param proc_phi: the probability of applying process noise :param apply_observation_noise: whether adversarially generated observation noise should be applied :param obs_eps: the intensity of generated observation noise :param obs_phi: the probability of applying observation noise :param torch_observation: whether the observation is computed with PyTorch, i.e. differentiable (required by the adversarial state wrapper) :param num_workers: number of environments for parallel sampling :param logger: logger for every step of the algorithm, if `None` the default logger will be created """ assert isinstance(subrtn, Algorithm) assert isinstance(max_iter, int) and max_iter > 0 super().__init__(save_dir, max_iter, policy, logger) # Initialize adversarial wrappers if apply_dynamics_noise: assert isinstance(env, StateAugmentationWrapper) env = AdversarialDynamicsWrapper(env, self.policy, dyn_eps, dyn_phi, halfspan) if apply_proccess_noise: env = AdversarialStateWrapper(env, self.policy, proc_eps, proc_phi, torch_observation=torch_observation) if apply_observation_noise: env = AdversarialObservationWrapper(env, self.policy, obs_eps, obs_phi) self.num_rollouts = num_rollouts self.sampler = ParallelRolloutSampler( env, expl_strat, num_workers=num_workers, min_steps=steps_num, min_rollouts=num_rollouts, ) # Subroutine self._subrtn = subrtn self._subrtn.save_name = 'subrtn'
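# Conceptual sketch (not the wrappers' actual implementation) of how an (eps, phi) pair above is
# commonly interpreted in adversarially robust policy learning: with probability `phi`, apply a
# perturbation of intensity `eps` along the sign of a gradient (FGSM-style). Shapes and the source of
# the gradient are illustrative assumptions.
import numpy as np

def maybe_perturb(x: np.ndarray, grad: np.ndarray, eps: float, phi: float) -> np.ndarray:
    """Return x perturbed along the sign of grad with step size eps, applied with probability phi."""
    if np.random.rand() < phi:
        return x + eps * np.sign(grad)
    return x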
def __init__(self, save_dir: str, env: Env, policy: TwoHeadedPolicy, qfcn_1: Policy, qfcn_2: Policy, memory_size: int, gamma: float, max_iter: int, num_batch_updates: Optional[int] = None, tau: float = 0.995, ent_coeff_init: float = 0.2, learn_ent_coeff: bool = True, target_update_intvl: int = 1, num_init_memory_steps: int = None, standardize_rew: bool = True, rew_scale: Union[int, float] = 1., min_rollouts: int = None, min_steps: int = None, batch_size: int = 256, num_workers: int = 4, max_grad_norm: float = 5., lr: float = 3e-4, lr_scheduler=None, lr_scheduler_hparam: Optional[dict] = None, logger: StepLogger = None): r""" Constructor :param save_dir: directory to save the snapshots i.e. the results in :param env: the environment in which the policy operates :param policy: policy to be updated :param qfcn_1: state-action value function $Q(s,a)$, the associated target Q-function is created from a re-initialized copy of this one :param qfcn_2: state-action value function $Q(s,a)$, the associated target Q-function is created from a re-initialized copy of this one :param memory_size: number of transitions in the replay memory buffer, e.g. 1000000 :param gamma: temporal discount factor for the state values :param max_iter: number of iterations (policy updates) :param num_batch_updates: number of (batched) gradient updates per algorithm step :param tau: interpolation factor for the averaging of the target networks, used for the soft update a.k.a. Polyak update, between 0 and 1 :param ent_coeff_init: initial weighting factor of the entropy term in the loss function :param learn_ent_coeff: adapt the weighting factor of the entropy term :param target_update_intvl: number of iterations that pass before updating the target network :param num_init_memory_steps: number of samples used to initially fill the replay buffer with, pass `None` to fill the buffer completely :param standardize_rew: if `True`, the rewards are standardized to be $\sim N(0,1)$ :param rew_scale: scaling factor for the rewards, defaults to no scaling :param min_rollouts: minimum number of rollouts sampled per policy update batch :param min_steps: minimum number of state transitions sampled per policy update batch :param batch_size: number of samples per policy update batch :param num_workers: number of environments for parallel sampling :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping :param lr: (initial) learning rate for the optimizer which can be modified by the scheduler. By default, the learning rate is constant.
:param lr_scheduler: learning rate scheduler type for the policy and the Q-functions that does one step per `update()` call :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler :param logger: logger for every step of the algorithm, if `None` the default logger will be created """ if typed_env(env, ActNormWrapper) is None: raise pyrado.TypeErr(msg='SAC requires an environment wrapped by an ActNormWrapper!') if not isinstance(qfcn_1, Policy): raise pyrado.TypeErr(given=qfcn_1, expected_type=Policy) if not isinstance(qfcn_2, Policy): raise pyrado.TypeErr(given=qfcn_2, expected_type=Policy) # Call ValueBased's constructor super().__init__(save_dir, env, policy, memory_size, gamma, max_iter, num_batch_updates, target_update_intvl, num_init_memory_steps, min_rollouts, min_steps, batch_size, num_workers, max_grad_norm, logger) self.qfcn_1 = qfcn_1 self.qfcn_2 = qfcn_2 self.qfcn_targ_1 = deepcopy(self.qfcn_1).eval() # will not be trained using an optimizer self.qfcn_targ_2 = deepcopy(self.qfcn_2).eval() # will not be trained using an optimizer self.tau = tau self.learn_ent_coeff = learn_ent_coeff self.standardize_rew = standardize_rew self.rew_scale = rew_scale # Create sampler for exploration during training self._expl_strat = SACExplStrat(self._policy) self.sampler_trn = ParallelRolloutSampler( self._env, self._expl_strat, num_workers=num_workers if min_steps != 1 else 1, min_steps=min_steps, # in [2] this would be 1 min_rollouts=min_rollouts, # in [2] this would be None ) # Policy and Q-function optimizers self._optim_policy = to.optim.Adam([{'params': self._policy.parameters()}], lr=lr, eps=1e-5) self._optim_qfcns = to.optim.Adam([{'params': self.qfcn_1.parameters()}, {'params': self.qfcn_2.parameters()}], lr=lr, eps=1e-5) # Automatic entropy tuning log_ent_coeff_init = to.log(to.tensor(ent_coeff_init, device=policy.device, dtype=to.get_default_dtype())) if learn_ent_coeff: self._log_ent_coeff = nn.Parameter(log_ent_coeff_init, requires_grad=True) self._ent_coeff_optim = to.optim.Adam([{'params': self._log_ent_coeff}], lr=lr, eps=1e-5) self.target_entropy = -to.prod(to.tensor(env.act_space.shape)) else: self._log_ent_coeff = log_ent_coeff_init # Learning rate scheduler self._lr_scheduler_policy = lr_scheduler self._lr_scheduler_hparam = lr_scheduler_hparam if lr_scheduler is not None: self._lr_scheduler_policy = lr_scheduler(self._optim_policy, **lr_scheduler_hparam) self._lr_scheduler_qfcns = lr_scheduler(self._optim_qfcns, **lr_scheduler_hparam)
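# A minimal sketch of the soft (Polyak) target update that `tau` parameterizes: the target Q-networks
# are moved slowly towards the trained Q-networks. Generic helper for illustration, not code copied
# from the class body.
import torch as to
from torch import nn

def soft_update(target: nn.Module, source: nn.Module, tau: float = 0.995):
    """Polyak averaging: target <- tau * target + (1 - tau) * source."""
    with to.no_grad():
        for p_targ, p_src in zip(target.parameters(), source.parameters()):
            p_targ.data.mul_(tau).add_((1.0 - tau) * p_src.data)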
def __init__( self, subrtn: ParameterExploring, behavior_policy: Policy, num_rollouts_per_distr: int, metric: Union[Callable[[np.ndarray], np.ndarray], None], obs_dim_weight: Union[list, np.ndarray], std_obs_filt: int = 5, w_abs: float = 0.5, w_sq: float = 1.0, num_workers: int = 4, base_seed: int = 1001, ): """ Constructor :param subrtn: wrapped algorithm to fit the domain parameter distribution :param behavior_policy: lower level policy used to generate the rollouts :param num_rollouts_per_distr: number of rollouts per domain distribution parameter set :param metric: functional mapping from differences in observations to value :param obs_dim_weight: (diagonal) weight matrix for the different observation dimensions for the default metric :param std_obs_filt: number of standard deviations for the Gaussian filter applied to the observations :param w_abs: weight for the mean absolute errors for the default metric :param w_sq: weight for the mean squared errors for the default metric :param num_workers: number of environments for parallel sampling :param base_seed: seed to set for the parallel sampler in every iteration """ if not isinstance(subrtn, ParameterExploring): raise pyrado.TypeErr(given=subrtn, expected_type=ParameterExploring) if not isinstance(subrtn.env, MetaDomainRandWrapper): raise pyrado.TypeErr(given=subrtn.env, expected_type=MetaDomainRandWrapper) if not isinstance(subrtn.policy, DomainDistrParamPolicy): raise pyrado.TypeErr(given=subrtn.policy, expected_type=DomainDistrParamPolicy) if not isinstance(behavior_policy, Policy): raise pyrado.TypeErr(given=behavior_policy, expected_type=Policy) if subrtn.policy.num_param != len(subrtn.env.dp_mapping): raise pyrado.ShapeErr( msg=f"Number of policy parameters {subrtn.policy.num_param} does not match the " f"number of domain distribution parameters {len(subrtn.env.dp_mapping)}!" ) if subrtn.sampler.num_init_states_per_domain != 1: # Only sample one rollout in every domain. This is possible since we are synchronizing the init state. raise pyrado.ValueErr( given=subrtn.sampler.num_init_states_per_domain, eq_constraint="1") if num_rollouts_per_distr < 2: raise pyrado.ValueErr(given=num_rollouts_per_distr, g_constraint="1") if len(obs_dim_weight) != subrtn.env.obs_space.flat_dim: raise pyrado.ShapeErr(given=obs_dim_weight, expected_match=subrtn.env.obs_space) # Call Algorithm's constructor super().__init__(subrtn.save_dir, subrtn.max_iter, subrtn.policy, subrtn.logger) self._subrtn = subrtn self._subrtn.save_name = "subrtn" self._behavior_policy = behavior_policy self.obs_dim_weight = np.diag( obs_dim_weight ) # weighting factor between the different observations self.std_obs_filt = std_obs_filt if metric is None or metric == "None": self.metric = partial(self.weighted_l1_l2_metric, w_abs=w_abs, w_sq=w_sq, obs_dim_weight=self.obs_dim_weight) else: self.metric = metric # Get and optionally clip the observation bounds of the environment elb, eub = subrtn.env.obs_space.bound_lo, subrtn.env.obs_space.bound_up elb, eub = self.override_obs_bounds(elb, eub, subrtn.env.obs_space.labels) self.obs_normalizer = UnitCubeProjector(bound_lo=elb, bound_up=eub) # Create the sampler used to execute the same policy as on the real system in the meta-randomized env self.base_seed = base_seed self.behavior_sampler = ParallelRolloutSampler(self._subrtn.env, self._behavior_policy, num_workers=num_workers, min_rollouts=1, seed=base_seed) self.num_rollouts_per_distr = num_rollouts_per_distr
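# Illustrative stand-in for the default `weighted_l1_l2_metric` referenced above; the class's actual
# static method may differ in signature and shape handling. It combines per-dimension weighted mean
# absolute and mean squared observation errors, mirroring the documented roles of `w_abs`, `w_sq`,
# and `obs_dim_weight`.
import numpy as np

def weighted_l1_l2_metric(err: np.ndarray, w_abs: float, w_sq: float, obs_dim_weight: np.ndarray) -> float:
    """err: observation differences of shape (num_steps, obs_dim); obs_dim_weight: diagonal weight matrix."""
    w = np.diag(obs_dim_weight)  # per-dimension weights from the diagonal
    return float(w_abs * np.mean(np.abs(err) @ w) + w_sq * np.mean((err ** 2) @ w))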
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int): """ Objective function for the Optuna `Study` to maximize. .. note:: Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments. :param trial: Optuna Trial object for hyper-parameter optimization :param study_dir: the parent directory for all trials in this study :param seed: seed value for the random number generators, pass `None` for no seeding :return: objective function value """ # Synchronize seeds between Optuna trials pyrado.set_seed(seed) # Environment env = QBallBalancerSim(dt=1 / 250.0, max_steps=1500) env = ActNormWrapper(env) # Learning rate scheduler lrs_gamma = trial.suggest_categorical("exp_lr_scheduler_gamma", [None, 0.99, 0.995, 0.999]) if lrs_gamma is not None: lr_sched = lr_scheduler.ExponentialLR lr_sched_hparam = dict(gamma=lrs_gamma) else: lr_sched, lr_sched_hparam = None, dict() # Policy policy = FNNPolicy( spec=env.spec, hidden_sizes=trial.suggest_categorical("hidden_sizes_policy", [(16, 16), (32, 32), (64, 64)]), hidden_nonlin=fcn_from_str( trial.suggest_categorical("hidden_nonlin_policy", ["to_tanh", "to_relu"])), ) # Critic vfcn = FNN( input_size=env.obs_space.flat_dim, output_size=1, hidden_sizes=trial.suggest_categorical("hidden_sizes_critic", [(16, 16), (32, 32), (64, 64)]), hidden_nonlin=fcn_from_str( trial.suggest_categorical("hidden_nonlin_critic", ["to_tanh", "to_relu"])), ) critic_hparam = dict( batch_size=250, gamma=trial.suggest_uniform("gamma_critic", 0.99, 1.0), lamda=trial.suggest_uniform("lamda_critic", 0.95, 1.0), num_epoch=trial.suggest_int("num_epoch_critic", 1, 10), lr=trial.suggest_loguniform("lr_critic", 1e-5, 1e-3), standardize_adv=trial.suggest_categorical("standardize_adv_critic", [True, False]), max_grad_norm=trial.suggest_categorical("max_grad_norm_critic", [None, 1.0, 5.0]), lr_scheduler=lr_sched, lr_scheduler_hparam=lr_sched_hparam, ) critic = GAE(vfcn, **critic_hparam) # Algorithm algo_hparam = dict( num_workers=1, # parallelize via optuna n_jobs max_iter=300, batch_size=250, min_steps=trial.suggest_int("num_rollouts_algo", 10, 30) * env.max_steps, num_epoch=trial.suggest_int("num_epoch_algo", 1, 10), eps_clip=trial.suggest_uniform("eps_clip_algo", 0.05, 0.2), std_init=trial.suggest_uniform("std_init_algo", 0.5, 1.0), lr=trial.suggest_loguniform("lr_algo", 1e-5, 1e-3), max_grad_norm=trial.suggest_categorical("max_grad_norm_algo", [None, 1.0, 5.0]), lr_scheduler=lr_sched, lr_scheduler_hparam=lr_sched_hparam, ) algo = PPO(osp.join(study_dir, f"trial_{trial.number}"), env, policy, critic, **algo_hparam) # Train without saving the results algo.train(snapshot_mode="latest", seed=seed) # Evaluate min_rollouts = 1000 sampler = ParallelRolloutSampler(env, policy, num_workers=1, min_rollouts=min_rollouts) ros = sampler.sample() mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts return mean_ret
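# As the docstring's note says, the extra arguments are bound with functools.partial before handing
# the objective to Optuna. A minimal launch sketch; study name, directory, trial count, and n_jobs
# are placeholders.
import functools
import optuna

study_dir = "/tmp/qbb_ppo_fnn_study"  # hypothetical location
study = optuna.create_study(study_name="qbb_ppo_fnn", direction="maximize")
study.optimize(functools.partial(train_and_eval, study_dir=study_dir, seed=1001), n_trials=100, n_jobs=8)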
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int): """ Objective function for the Optuna `Study` to maximize. .. note:: Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments. :param trial: Optuna Trial object for hyper-parameter optimization :param study_dir: the parent directory for all trials in this study :param seed: seed value for the random number generators, pass `None` for no seeding :return: objective function value """ # Synchronize seeds between Optuna trials pyrado.set_seed(seed) # Environment env_hparams = dict(dt=1/100., max_steps=600) env = QQubeSwingUpSim(**env_hparams) env = ActNormWrapper(env) # Learning rate scheduler lrs_gamma = trial.suggest_categorical('exp_lr_scheduler_gamma', [None, 0.995, 0.999]) if lrs_gamma is not None: lr_sched = lr_scheduler.ExponentialLR lr_sched_hparam = dict(gamma=lrs_gamma) else: lr_sched, lr_sched_hparam = None, dict() # Policy policy_hparam = dict( hidden_sizes=trial.suggest_categorical('hidden_sizes_policy', [(16, 16), (32, 32), (64, 64)]), hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_policy', ['to_tanh', 'to_relu'])), ) # FNN # policy_hparam = dict( # hidden_size=trial.suggest_categorical('hidden_size_policy', [16, 32, 64]), # num_recurrent_layers=trial.suggest_categorical('num_recurrent_layers_policy', [1, 2]), # ) # LSTM & GRU policy = FNNPolicy(spec=env.spec, **policy_hparam) # policy = GRUPolicy(spec=env.spec, **policy_hparam) # Critic vfcn_hparam = dict( hidden_sizes=trial.suggest_categorical('hidden_sizes_critic', [(16, 16), (32, 32), (64, 64)]), hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_critic', ['to_tanh', 'to_relu'])), ) # vfcn_hparam = dict( # hidden_size=trial.suggest_categorical('hidden_size_critic', [16, 32, 64]), # num_recurrent_layers=trial.suggest_categorical('num_recurrent_layers_critic', [1, 2]), # ) # LSTM & GRU vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam) # vfcn = GRUPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam) critic_hparam = dict( batch_size=500, gamma=trial.suggest_uniform('gamma_critic', 0.98, 1.), lamda=trial.suggest_uniform('lamda_critic', 0.95, 1.), num_epoch=trial.suggest_int('num_epoch_critic', 1, 10), lr=trial.suggest_loguniform('lr_critic', 1e-5, 1e-3), standardize_adv=trial.suggest_categorical('standardize_adv_critic', [False]), max_grad_norm=trial.suggest_categorical('max_grad_norm_critic', [None, 1., 5.]), lr_scheduler=lr_sched, lr_scheduler_hparam=lr_sched_hparam ) critic = GAE(vfcn, **critic_hparam) # Algorithm algo_hparam = dict( num_workers=1, # parallelize via optuna n_jobs max_iter=250, batch_size=500, min_steps=trial.suggest_int('num_rollouts_algo', 10, 30)*env.max_steps, num_epoch=trial.suggest_int('num_epoch_algo', 1, 10), eps_clip=trial.suggest_uniform('eps_clip_algo', 0.05, 0.2), std_init=trial.suggest_uniform('std_init_algo', 0.5, 1.0), lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3), max_grad_norm=trial.suggest_categorical('max_grad_norm_algo', [None, 1., 5.]), lr_scheduler=lr_sched, lr_scheduler_hparam=lr_sched_hparam ) csv_logger = create_csv_step_logger(osp.join(study_dir, f'trial_{trial.number}')) algo = PPO(osp.join(study_dir, f'trial_{trial.number}'), env, policy, critic, **algo_hparam, logger=csv_logger) # Train without saving the results algo.train(snapshot_mode='latest', seed=seed) # Evaluate min_rollouts = 1000 sampler = ParallelRolloutSampler(env, policy, num_workers=1, min_rollouts=min_rollouts) # 
parallelize via optuna n_jobs ros = sampler.sample() mean_ret = sum([r.undiscounted_return() for r in ros])/min_rollouts return mean_ret
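# The commented-out hyper-parameter blocks above sketch a recurrent variant of the same study. This is
# roughly what that swap would look like; the hidden_size and num_recurrent_layers values are
# placeholders taken from the commented suggestions, not tuned settings.
policy_hparam = dict(hidden_size=32, num_recurrent_layers=1)
policy = GRUPolicy(spec=env.spec, **policy_hparam)
vfcn_hparam = dict(hidden_size=32, num_recurrent_layers=1)
vfcn = GRUPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam)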
class ValueBased(Algorithm, ABC): """Base class of all value-based algorithms""" def __init__( self, save_dir: pyrado.PathLike, env: Env, policy: Union[Policy, TwoHeadedPolicy], memory_size: int, gamma: float, max_iter: int, num_updates_per_step: int, target_update_intvl: int, num_init_memory_steps: int, min_rollouts: int, min_steps: int, batch_size: int, eval_intvl: int, max_grad_norm: float, num_workers: int, logger: StepLogger, ): r""" Constructor :param save_dir: directory to save the snapshots i.e. the results in :param env: the environment which the policy operates :param policy: policy to be updated :param memory_size: number of transitions in the replay memory buffer, e.g. 1000000 :param gamma: temporal discount factor for the state values :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs :param num_updates_per_step: number of (batched) gradient updates per algorithm step :param target_update_intvl: number of iterations that pass before updating the target network :param num_init_memory_steps: number of samples used to initially fill the replay buffer with, pass `None` to fill the buffer completely :param min_rollouts: minimum number of rollouts sampled per policy update batch :param min_steps: minimum number of state transitions sampled per policy update batch :param batch_size: number of samples per policy update batch :param eval_intvl: interval in which the evaluation rollouts are collected, also the interval in which the logger prints the summary statistics :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping :param num_workers: number of environments for parallel sampling :param logger: logger for every step of the algorithm, if `None` the default logger will be created """ if not isinstance(env, Env): raise pyrado.TypeErr(given=env, expected_type=Env) if not isinstance(memory_size, int): raise pyrado.TypeErr(given=memory_size, expected_type=int) if not (num_init_memory_steps is None or isinstance(num_init_memory_steps, int)): raise pyrado.TypeErr(given=num_init_memory_steps, expected_type=int) if logger is None: # Create logger that only logs every logger_print_intvl steps of the algorithm logger = StepLogger(print_intvl=eval_intvl) logger.printers.append(ConsolePrinter()) logger.printers.append( CSVPrinter(osp.join(save_dir, "progress.csv"))) logger.printers.append(TensorBoardPrinter(osp.join(save_dir, "tb"))) # Call Algorithm's constructor super().__init__(save_dir, max_iter, policy, logger) self._env = env self._memory = ReplayMemory(memory_size) self.gamma = gamma self.target_update_intvl = target_update_intvl self.batch_size = batch_size self.max_grad_norm = max_grad_norm if num_init_memory_steps is None: self.num_init_memory_steps = memory_size else: self.num_init_memory_steps = max( min(num_init_memory_steps, memory_size), batch_size) # Heuristic for number of gradient updates per step if num_updates_per_step is None: self.num_batch_updates = ceil( min_steps / env.max_steps) if min_steps is not None else min_rollouts else: self.num_batch_updates = num_updates_per_step # Create sampler for initial filling of the replay memory if policy.is_recurrent: self.init_expl_policy = RecurrentDummyPolicy( env.spec, policy.hidden_size) else: self.init_expl_policy = DummyPolicy(env.spec) self.sampler_init = ParallelRolloutSampler( self._env, self.init_expl_policy, num_workers=num_workers, min_steps=self.num_init_memory_steps, ) # Create sampler for initial filling of the replay 
memory and evaluation self.sampler_eval = ParallelRolloutSampler( self._env, self._policy, num_workers=num_workers, min_steps=None, min_rollouts=100, show_progress_bar=True, ) self._expl_strat = None # must be implemented by subclass self._sampler = None # must be implemented by subclass @property def expl_strat(self) -> Union[SACExplStrat, EpsGreedyExplStrat]: return self._expl_strat @property def memory(self) -> ReplayMemory: """Get the replay memory.""" return self._memory def step(self, snapshot_mode: str, meta_info: dict = None): if self._memory.isempty: # Warm-up phase print_cbt_once("Collecting samples until replay memory is full.", "w") # Sample steps and store them in the replay memory ros = self.sampler_init.sample() self._memory.push(ros) else: # Sample steps and store them in the replay memory ros = self.sampler.sample() self._memory.push(ros) self._cnt_samples += sum([ro.length for ro in ros]) # don't count the evaluation samples # Log metrics computed from the old policy (before the update) if self._curr_iter % self.logger.print_intvl == 0: ros = self.sampler_eval.sample() rets = [ro.undiscounted_return() for ro in ros] ret_max = np.max(rets) ret_med = np.median(rets) ret_avg = np.mean(rets) ret_min = np.min(rets) ret_std = np.std(rets) else: ret_max, ret_med, ret_avg, ret_min, ret_std = 5 * [-pyrado.inf] # dummy values self.logger.add_value("max return", ret_max, 4) self.logger.add_value("median return", ret_med, 4) self.logger.add_value("avg return", ret_avg, 4) self.logger.add_value("min return", ret_min, 4) self.logger.add_value("std return", ret_std, 4) self.logger.add_value("avg memory reward", self._memory.avg_reward(), 4) self.logger.add_value("avg rollout length", np.mean([ro.length for ro in ros]), 4) self.logger.add_value("num total samples", self._cnt_samples) # Save snapshot data self.make_snapshot(snapshot_mode, float(ret_avg), meta_info) # Use data in the memory to update the policy and the Q-functions self.update() @abstractmethod def update(self): raise NotImplementedError def reset(self, seed: Optional[int] = None): # Reset the exploration strategy, internal variables and the random seeds super().reset(seed) # Re-initialize samplers in case env or policy changed self.sampler_init.reinit(self._env, self.init_expl_policy) self.sampler.reinit(self._env, self._expl_strat) self.sampler_eval.reinit(self._env, self._policy) # Reset the replay memory self._memory.reset() def save_snapshot(self, meta_info: dict = None): super().save_snapshot(meta_info) if meta_info is None: # This algorithm instance is not a subroutine of another algorithm pyrado.save(self._env, "env.pkl", self.save_dir) pyrado.save(self._expl_strat.policy, "policy.pt", self.save_dir, use_state_dict=True) else: pyrado.save( self._expl_strat.policy, "policy.pt", self.save_dir, prefix=meta_info.get("prefix", ""), suffix=meta_info.get("suffix", ""), use_state_dict=True, )
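# Minimal skeleton of what a concrete subclass must provide on top of ValueBased: an exploration
# strategy, a training sampler, and an `update()` that consumes batches from the replay memory.
# The exploration-strategy arguments and the replay-memory access are assumptions for illustration;
# the loss computation is deliberately left out.
class MyValueBasedAlgo(ValueBased):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._expl_strat = EpsGreedyExplStrat(self._policy)  # type taken from the expl_strat property hint
        self._sampler = ParallelRolloutSampler(self._env, self._expl_strat, num_workers=4, min_steps=self.batch_size)

    def update(self):
        for _ in range(self.num_batch_updates):
            # batch = self._memory.sample(self.batch_size)  # assumed replay-memory API
            # ... compute the TD loss on the batch, step the optimizer, and update the target
            # network every self.target_update_intvl iterations ...
            pass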
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int): """ Objective function for the Optuna `Study` to maximize. .. note:: Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments. :param trial: Optuna Trial object for hyper-parameter optimization :param study_dir: the parent directory for all trials in this study :param seed: seed value for the random number generators, pass `None` for no seeding :return: objective function value """ # Synchronize seeds between Optuna trials pyrado.set_seed(seed) # Environments env_hparams = dict(dt=1 / 100., max_steps=600) env_real = QQubeSwingUpSim(**env_hparams) env_real.domain_param = dict( Mr=0.095 * 0.9, # 0.095*0.9 = 0.0855 Mp=0.024 * 1.1, # 0.024*1.1 = 0.0264 Lr=0.085 * 0.9, # 0.085*0.9 = 0.0765 Lp=0.129 * 1.1, # 0.129*1.1 = 0.1419 ) env_sim = QQubeSwingUpSim(**env_hparams) randomizer = DomainRandomizer( NormalDomainParam(name='Mr', mean=0., std=1e6, clip_lo=1e-3), NormalDomainParam(name='Mp', mean=0., std=1e6, clip_lo=1e-3), NormalDomainParam(name='Lr', mean=0., std=1e6, clip_lo=1e-3), NormalDomainParam(name='Lp', mean=0., std=1e6, clip_lo=1e-3), ) env_sim = DomainRandWrapperLive(env_sim, randomizer) dp_map = { 0: ('Mr', 'mean'), 1: ('Mr', 'std'), 2: ('Mp', 'mean'), 3: ('Mp', 'std'), 4: ('Lr', 'mean'), 5: ('Lr', 'std'), 6: ('Lp', 'mean'), 7: ('Lp', 'std') } trafo_mask = [True] * 8 env_sim = MetaDomainRandWrapper(env_sim, dp_map) # Subroutine for policy improvement behav_policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh) behav_policy = FNNPolicy(spec=env_sim.spec, **behav_policy_hparam) vfcn_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh) vfcn = FNNPolicy(spec=EnvSpec(env_sim.obs_space, ValueFunctionSpace), **vfcn_hparam) critic_hparam = dict( gamma=0.9885, lamda=0.9648, num_epoch=2, batch_size=500, standardize_adv=False, lr=5.792e-4, max_grad_norm=1., ) critic = GAE(vfcn, **critic_hparam) subrtn_policy_hparam = dict( max_iter=200, min_steps=3 * 23 * env_sim.max_steps, num_epoch=7, eps_clip=0.0744, batch_size=500, std_init=0.9074, lr=3.446e-04, max_grad_norm=1., num_workers=1, ) subrtn_policy = PPO(study_dir, env_sim, behav_policy, critic, **subrtn_policy_hparam) # Subroutine for system identification prior_std_denom = trial.suggest_uniform('prior_std_denom', 5, 20) prior = DomainRandomizer( NormalDomainParam(name='Mr', mean=0.095, std=0.095 / prior_std_denom), NormalDomainParam(name='Mp', mean=0.024, std=0.024 / prior_std_denom), NormalDomainParam(name='Lr', mean=0.085, std=0.085 / prior_std_denom), NormalDomainParam(name='Lp', mean=0.129, std=0.129 / prior_std_denom), ) ddp_policy = DomainDistrParamPolicy( mapping=dp_map, trafo_mask=trafo_mask, prior=prior, scale_params=trial.suggest_categorical('ddp_policy_scale_params', [True, False]), ) subsubrtn_distr_hparam = dict( max_iter=trial.suggest_categorical('subsubrtn_distr_max_iter', [20]), pop_size=trial.suggest_int('pop_size', 50, 500), num_rollouts=1, num_is_samples=trial.suggest_int('num_is_samples', 5, 20), expl_std_init=trial.suggest_loguniform('expl_std_init', 1e-3, 1e-1), expl_std_min=trial.suggest_categorical('expl_std_min', [1e-4]), extra_expl_std_init=trial.suggest_loguniform('extra_expl_std_init', 1e-3, 1e-1), extra_expl_decay_iter=trial.suggest_int('extra_expl_decay_iter', 0, 10), num_workers=1, ) csv_logger = create_csv_step_logger( osp.join(study_dir, f'trial_{trial.number}')) subsubrtn_distr = CEM(study_dir, env_sim, ddp_policy, **subsubrtn_distr_hparam, logger=csv_logger) obs_vel_weight = 
trial.suggest_loguniform('obs_vel_weight', 1, 100) subrtn_distr_hparam = dict( metric=None, obs_dim_weight=[1, 1, 1, 1, obs_vel_weight, obs_vel_weight], num_rollouts_per_distr=trial.suggest_int('num_rollouts_per_distr', 20, 100), num_workers=1, ) subrtn_distr = SysIdViaEpisodicRL(subsubrtn_distr, behav_policy, **subrtn_distr_hparam) # Algorithm algo_hparam = dict( max_iter=trial.suggest_categorical('algo_max_iter', [10]), num_eval_rollouts=trial.suggest_categorical('algo_num_eval_rollouts', [5]), warmstart=trial.suggest_categorical('algo_warmstart', [True]), thold_succ_subrtn=trial.suggest_categorical('algo_thold_succ_subrtn', [50]), subrtn_snapshot_mode='latest', ) algo = SimOpt(study_dir, env_sim, env_real, subrtn_policy, subrtn_distr, **algo_hparam, logger=csv_logger) # Jeeeha algo.train(seed=seed) # Evaluate min_rollouts = 1000 sampler = ParallelRolloutSampler( env_real, algo.policy, num_workers=1, min_rollouts=min_rollouts) # parallelize via optuna n_jobs ros = sampler.sample() mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts return mean_ret
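# Hypothetical follow-up once an Optuna study over this objective has finished: inspect the best
# SimOpt hyper-parameters. `study` is assumed to be the optuna.Study whose objective is the
# train_and_eval function above.
print(f"best trial: {study.best_trial.number}, mean return: {study.best_value:.3f}")
for name, value in study.best_params.items():
    print(f"    {name}: {value}")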