def test_magic_function_implementation_or():
    a = CustomStoppingCriterion(None, "A")
    b = CustomStoppingCriterion(None, "B")
    for criterion, expected_str in [
        (a | a, "(A or A)"),
        (b | b, "(B or B)"),
        (a | b, "(A or B)"),
        (b | a, "(B or A)"),
    ]:
        assert isinstance(criterion, _OrStoppingCriterion)
        assert str(criterion) == expected_str
def test_magic_function_implementation_and():
    a = CustomStoppingCriterion(None, "A")
    b = CustomStoppingCriterion(None, "B")
    for criterion, expected_str in [
        (a & a, "(A and A)"),
        (b & b, "(B and B)"),
        (a & b, "(A and B)"),
        (b & a, "(B and A)"),
    ]:
        assert isinstance(criterion, _AndStoppingCriterion)
        assert str(criterion) == expected_str
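# Illustrative sketch (not pyrado source): the two tests above exercise operator
# overloading on the criterion base class. The mechanism presumably looks like this
# minimal reimplementation, where `|` / `&` build composites whose `is_met` delegates
# to both children; all `Sketch*` names are hypothetical.
class SketchStoppingCriterion:
    def is_met(self, algo) -> bool:
        raise NotImplementedError

    def __or__(self, other: "SketchStoppingCriterion") -> "SketchOrStoppingCriterion":
        return SketchOrStoppingCriterion(self, other)

    def __and__(self, other: "SketchStoppingCriterion") -> "SketchAndStoppingCriterion":
        return SketchAndStoppingCriterion(self, other)


class SketchOrStoppingCriterion(SketchStoppingCriterion):
    def __init__(self, lhs: SketchStoppingCriterion, rhs: SketchStoppingCriterion):
        self.lhs, self.rhs = lhs, rhs

    def is_met(self, algo) -> bool:
        # Mirrors Python's `or`: the right-hand side is only queried if needed
        return self.lhs.is_met(algo) or self.rhs.is_met(algo)

    def __str__(self) -> str:
        return f"({self.lhs} or {self.rhs})"


class SketchAndStoppingCriterion(SketchStoppingCriterion):
    def __init__(self, lhs: SketchStoppingCriterion, rhs: SketchStoppingCriterion):
        self.lhs, self.rhs = lhs, rhs

    def is_met(self, algo) -> bool:
        return self.lhs.is_met(algo) and self.rhs.is_met(algo)

    def __str__(self) -> str:
        return f"({self.lhs} and {self.rhs})"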
def test_criterion_custom(is_met_expected):
    # Assigning to a variable inside the closure would create a new local variable,
    # so use a one-element list as a mutable holder instead.
    was_called = [False]
    algo_expected = "ABC"

    def criterion_fn(algo):
        was_called[0] = True
        assert algo == algo_expected
        return is_met_expected

    criterion = CustomStoppingCriterion(criterion_fn, "Name")
    assert str(criterion) == "Name"
    assert criterion.is_met(algo_expected) == is_met_expected
    assert was_called[0]
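# Usage sketch: this is the same composition pattern the algorithm constructors
# below use, OR-ing a custom predicate onto the inherited stopping criterion.
# `_stop_on_flag` and the `stop_requested` attribute are hypothetical, for
# illustration only.
def _stop_on_flag(algo) -> bool:
    return getattr(algo, "stop_requested", False)


flag_criterion = CustomStoppingCriterion(_stop_on_flag, "StopRequested")
# combined = algo.stopping_criterion | flag_criterion
# combined.is_met(algo) is True as soon as either criterion is met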
def __init__(
    self,
    save_dir: pyrado.PathLike,
    env_sim: MetaDomainRandWrapper,
    env_real: Union[RealEnv, EnvWrapper],
    subrtn: Algorithm,
    ddp_space: BoxSpace,
    max_iter: int,
    acq_fc: str,
    acq_restarts: int,
    acq_samples: int,
    acq_param: dict = None,
    num_init_cand: int = 5,
    mc_estimator: bool = True,
    num_eval_rollouts_real: int = 5,
    num_eval_rollouts_sim: int = 50,
    thold_succ: float = pyrado.inf,
    thold_succ_subrtn: float = -pyrado.inf,
    warmstart: bool = True,
    policy_param_init: Optional[to.Tensor] = None,
    valuefcn_param_init: Optional[to.Tensor] = None,
    subrtn_snapshot_mode: str = "best",
    num_workers: int = 4,
    logger: Optional[StepLogger] = None,
):
    r"""
    Constructor

    .. note::
        If you want to continue an experiment, use the `load_dir` argument for the `train` call. If you want to
        initialize every policy with pre-trained policy parameters, use `policy_param_init`.

    :param save_dir: directory to save the snapshots i.e. the results in
    :param env_sim: randomized simulation environment a.k.a. source domain
    :param env_real: real-world environment a.k.a. target domain
    :param subrtn: algorithm which performs the policy / value-function optimization
    :param ddp_space: space holding the boundaries for the domain distribution parameters
    :param max_iter: maximum number of iterations
    :param acq_fc: acquisition function
                   'UCB': Upper Confidence Bound (default $\beta = 0.1$)
                   'EI': Expected Improvement
                   'PI': Probability of Improvement
    :param acq_restarts: number of restarts for optimizing the acquisition function
    :param acq_samples: number of initial samples for optimizing the acquisition function
    :param acq_param: hyper-parameter for the acquisition function, e.g. $\beta$ for UCB
    :param num_init_cand: number of initial policies to train, ignored if `init_dir` is provided
    :param mc_estimator: estimate the return with a sample average (`True`) or a lower confidence bound (`False`)
                         obtained from bootstrapping
    :param num_eval_rollouts_real: number of rollouts in the target domain to estimate the return
    :param num_eval_rollouts_sim: number of rollouts in simulation to estimate the return after training
    :param thold_succ: success threshold on the real system's return for BayRn, stop the algorithm if exceeded
    :param thold_succ_subrtn: success threshold on the simulated system's return for the subroutine, repeat the
                              subroutine until the threshold is exceeded or for a given number of iterations
    :param warmstart: initialize the policy (and value function) parameters with the ones of the previous iteration.
                      This behavior can also be overruled by passing `policy_param_init` (and `valuefcn_param_init`)
                      explicitly.
    :param policy_param_init: initial policy parameter values for the subroutine, set `None` to be random
    :param valuefcn_param_init: initial value function parameter values for the subroutine, set `None` to be random
    :param subrtn_snapshot_mode: snapshot mode for saving during training of the subroutine
    :param num_workers: number of environments for parallel sampling
    :param logger: logger for every step of the algorithm, if `None` the default logger will be created
    """
    if typed_env(env_sim, MetaDomainRandWrapper) is None:
        raise pyrado.TypeErr(given=env_sim, expected_type=MetaDomainRandWrapper)
    if not isinstance(subrtn, Algorithm):
        raise pyrado.TypeErr(given=subrtn, expected_type=Algorithm)
    if not isinstance(ddp_space, BoxSpace):
        raise pyrado.TypeErr(given=ddp_space, expected_type=BoxSpace)
    if num_init_cand < 1:
        raise pyrado.ValueErr(given=num_init_cand, ge_constraint="1")

    # Call InterruptableAlgorithm's constructor without specifying the policy
    super().__init__(
        num_checkpoints=2,
        init_checkpoint=-2,
        save_dir=save_dir,
        max_iter=max_iter,
        policy=subrtn.policy,
        logger=logger,
    )

    self._env_sim = env_sim
    self._env_real = env_real
    self._subrtn = subrtn
    self._subrtn.save_name = "subrtn"
    self.ddp_space = ddp_space
    self.ddp_projector = UnitCubeProjector(
        to.from_numpy(self.ddp_space.bound_lo).to(dtype=to.get_default_dtype()),
        to.from_numpy(self.ddp_space.bound_up).to(dtype=to.get_default_dtype()),
    )
    self.cands = None  # called x in the context of GPs
    self.cands_values = None  # called y in the context of GPs
    self.argmax_cand = to.Tensor()
    self.acq_fcn_type = acq_fc.upper()
    self.acq_restarts = acq_restarts
    self.acq_samples = acq_samples
    self.acq_param = acq_param
    self.num_init_cand = num_init_cand
    self.mc_estimator = mc_estimator
    self.policy_param_init = policy_param_init
    self.valuefcn_param_init = valuefcn_param_init.detach() if valuefcn_param_init is not None else None
    self.warmstart = warmstart
    self.num_eval_rollouts_real = num_eval_rollouts_real
    self.num_eval_rollouts_sim = num_eval_rollouts_sim
    self.subrtn_snapshot_mode = subrtn_snapshot_mode
    self.thold_succ = to.tensor([thold_succ], dtype=to.get_default_dtype())
    self.thold_succ_subrtn = to.tensor([thold_succ_subrtn], dtype=to.get_default_dtype())
    self.max_subrtn_rep = 3  # number of tries to exceed thold_succ_subrtn during training in simulation
    self.curr_cand_value = -pyrado.inf  # for the stopping criterion
    self.num_workers = int(num_workers)

    if self.policy_param_init is not None:
        if to.is_tensor(self.policy_param_init):
            # Re-assign, since detach() returns a new tensor and is a no-op on its own
            self.policy_param_init = self.policy_param_init.detach()
        else:
            self.policy_param_init = to.tensor(self.policy_param_init)

    # Save initial environments and the domain distribution parameter space
    self.save_snapshot(meta_info=None)
    pyrado.save(self.ddp_space, "ddp_space.pkl", self.save_dir)

    self.stopping_criterion = self.stopping_criterion | CustomStoppingCriterion(self._custom_stopping_criterion)
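# Sketch (assumption; the predicate's body is not part of this excerpt): the
# constructor above tracks `curr_cand_value` "for the stopping criterion" next to
# the success threshold `thold_succ`, so the custom criterion plausibly compares
# the two, e.g.
#
#     @staticmethod
#     def _custom_stopping_criterion(algo) -> bool:
#         return algo.curr_cand_value >= algo.thold_succ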
def __init__(
    self,
    save_dir: pyrado.PathLike,
    env: Env,
    policy: Policy,
    max_iter: int,
    num_init_states_per_domain: int,
    num_domains: int,
    pop_size: Optional[int] = None,
    num_workers: int = 4,
    logger: Optional[StepLogger] = None,
):
    """
    Constructor

    :param save_dir: directory to save the snapshots i.e. the results in
    :param env: the environment in which the policy operates
    :param policy: policy to be updated
    :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
    :param num_init_states_per_domain: number of rollouts to cover the variance over initial states
    :param num_domains: number of rollouts due to the variance over domain parameters
    :param pop_size: number of solutions in the population, pass `None` to use a default that scales
                     logarithmically with the number of policy parameters
    :param num_workers: number of environments for parallel sampling
    :param logger: logger for every step of the algorithm, if `None` the default logger will be created
    """
    if not isinstance(env, Env):
        raise pyrado.TypeErr(given=env, expected_type=Env)
    if not (isinstance(pop_size, int) or pop_size is None):
        raise pyrado.TypeErr(given=pop_size, expected_type=int)
    if isinstance(pop_size, int) and pop_size <= 0:
        raise pyrado.ValueErr(given=pop_size, g_constraint="0")

    # Call Algorithm's constructor
    super().__init__(save_dir, max_iter, policy, logger)

    self._env = env

    # Auto-select population size if needed
    if pop_size is None:
        pop_size = 4 + int(3 * np.log(policy.num_param))
        print_cbt(f"Initialized population size to {pop_size}.", "y")
    self.pop_size = pop_size

    # Create sampler
    self._sampler = ParameterExplorationSampler(
        env,
        policy,
        num_init_states_per_domain=num_init_states_per_domain,
        num_domains=num_domains,
        num_workers=num_workers,
    )

    # Stopping criterion
    self.ret_avg_stack = 1e3 * np.random.randn(20)  # stack size = 20
    self.thold_ret_std = 1e-1  # algorithm terminates if below for multiple iterations

    # Saving the best policy (this is not the mean for policy parameter exploration)
    self.best_policy_param = policy.param_values.clone()

    # Set this in subclasses
    self._expl_strat = None

    self.stopping_criterion = self.stopping_criterion | CustomStoppingCriterion(self._custom_stopping_criterion)
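# Worked example of the population size default above: the CMA-ES-style heuristic
# 4 + int(3 * ln(n)) grows only logarithmically with the number of policy
# parameters, so even large policies keep small populations.
import numpy as np  # already imported in this module; repeated for self-containment

for n in (10, 100, 1000, 100000):
    print(n, 4 + int(3 * np.log(n)))  # -> 10, 17, 24, 38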
def __init__(
    self,
    save_dir: pyrado.PathLike,
    inputs: to.Tensor,
    targets: to.Tensor,
    policy: Policy,
    max_iter: int,
    max_iter_no_improvement: int = 30,
    optim_class=optim.Adam,
    optim_hparam: dict = None,
    loss_fcn=nn.MSELoss(),
    batch_size: int = 256,
    ratio_train: float = 0.8,
    max_grad_norm: Optional[float] = None,
    lr_scheduler=None,
    lr_scheduler_hparam: Optional[dict] = None,
    logger: StepLogger = None,
):
    """
    Constructor

    :param save_dir: directory to save the snapshots i.e. the results in
    :param inputs: input data set, where the samples are along the first dimension
    :param targets: target data set, where the samples are along the first dimension
    :param policy: Pyrado policy (subclass of PyTorch's Module) to train
    :param max_iter: maximum number of iterations
    :param max_iter_no_improvement: if the performance on the validation set did not improve for this many
                                    iterations, the policy is considered to have converged, i.e. training stops
    :param optim_class: PyTorch optimizer class
    :param optim_hparam: hyper-parameters for the PyTorch optimizer
    :param loss_fcn: loss function for training, by default `torch.nn.MSELoss()`
    :param batch_size: number of samples per policy update batch
    :param ratio_train: ratio of the training samples w.r.t. the total sample count
    :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping
    :param lr_scheduler: learning rate scheduler that does one step per epoch (pass through the whole data set)
    :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler
    :param logger: logger for every step of the algorithm, if `None` the default logger will be created
    """
    if not isinstance(inputs, to.Tensor):
        raise pyrado.TypeErr(given=inputs, expected_type=to.Tensor)
    if not isinstance(targets, to.Tensor):
        raise pyrado.TypeErr(given=targets, expected_type=to.Tensor)
    if not isinstance(ratio_train, float):
        raise pyrado.TypeErr(given=ratio_train, expected_type=float)
    if not (0 < ratio_train < 1):
        raise pyrado.ValueErr(given=ratio_train, g_constraint="0", l_constraint="1")

    # Call Algorithm's constructor
    super().__init__(save_dir, max_iter, policy, logger)

    # Construct the dataset (samples along rows)
    inputs = to.atleast_2d(inputs).T if inputs.ndimension() == 1 else inputs
    targets = to.atleast_2d(targets).T if targets.ndimension() == 1 else targets
    if inputs.shape[0] != targets.shape[0]:
        raise pyrado.ShapeErr(given=targets, expected_match=inputs)
    num_samples_all = inputs.shape[0]
    dataset = TensorDataset(inputs, targets)  # shared for training and validation loaders

    # Create training and validation loader
    idcs_all = to.randperm(num_samples_all)
    num_samples_trn = int(ratio_train * num_samples_all)
    num_samples_val = num_samples_all - num_samples_trn
    idcs_trn, idcs_val = idcs_all[:num_samples_trn], idcs_all[num_samples_trn:]
    self.loader_trn = DataLoader(
        dataset,
        batch_size=min(batch_size, num_samples_trn),
        drop_last=True,
        sampler=SubsetRandomSampler(idcs_trn),
    )
    self.loader_val = DataLoader(
        dataset,
        batch_size=min(batch_size, num_samples_val),
        drop_last=True,
        sampler=SubsetRandomSampler(idcs_val),
    )

    # Set defaults which can be overwritten by passing optim_hparam, and create the optimizer
    optim_hparam = merge_dicts([dict(lr=5e-3, eps=1e-8, weight_decay=1e-4), optim_hparam])
    self.optim = optim_class([{"params": self._policy.parameters()}], **optim_hparam)
    self.batch_size = batch_size
    self.ratio_train = ratio_train
    self.loss_fcn = loss_fcn
    self.max_grad_norm = max_grad_norm
    self._lr_scheduler = lr_scheduler
    self._lr_scheduler_hparam = lr_scheduler_hparam
    if lr_scheduler is not None and lr_scheduler_hparam is not None:
        self._lr_scheduler = lr_scheduler(self.optim, **lr_scheduler_hparam)

    # Stopping criterion
    self._curr_loss_val = pyrado.inf
    self._best_loss_val = pyrado.inf
    self._cnt_iter_no_improvement = 0
    self._max_iter_no_improvement = max_iter_no_improvement

    self.stopping_criterion = self.stopping_criterion | CustomStoppingCriterion(self._custom_stopping_criterion)
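# Sketch (assumption; the predicate's body is not part of this excerpt): the
# bookkeeping above suggests classic early stopping with patience, along the
# lines of
#
#     @staticmethod
#     def _custom_stopping_criterion(algo) -> bool:
#         return algo._cnt_iter_no_improvement >= algo._max_iter_no_improvement
#
# where `_cnt_iter_no_improvement` is reset whenever `_curr_loss_val` improves on
# `_best_loss_val`, and incremented otherwise.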
def step(self, snapshot_mode: str, meta_info: dict = None):
    if isinstance(inner_env(self._env), BallOnPlate5DSim):
        ctrl_gains = to.tensor(
            [
                [0.1401, 0, 0, 0, -0.09819, -0.1359, 0, 0.545, 0, 0, 0, -0.01417, -0.04427, 0],
                [0, 0.1381, 0, 0.2518, 0, 0, -0.2142, 0, 0.5371, 0, 0.03336, 0, 0, -0.1262],
                [0, 0, 0.1414, 0.0002534, 0, 0, -0.0002152, 0, 0, 0.5318, 0, 0, 0, -0.0001269],
                [0, -0.479, -0.0004812, 39.24, 0, 0, -15.44, 0, -1.988, -0.001934, 9.466, 0, 0, -13.14],
                [0.3039, 0, 0, 0, 25.13, 15.66, 0, 1.284, 0, 0, 0, 7.609, 6.296, 0],
            ]
        )

        # Compensate for the mismatching state definitions
        if self.ball_z_dim_mismatch:
            ctrl_gains = insert_tensor_col(ctrl_gains, 7, to.zeros((5, 1)))  # ball z position
            ctrl_gains = insert_tensor_col(ctrl_gains, -1, to.zeros((5, 1)))  # ball z velocity

    elif isinstance(inner_env(self._env), QBallBalancerSim):
        # Since the control module can be tricky to install (Anaconda is recommended), we only load it if needed
        import control

        # System modeling
        dp = self._env.domain_param
        dp["J_eq"] = self._env._J_eq
        dp["B_eq_v"] = self._env._B_eq_v
        dp["c_kin"] = self._env._c_kin
        dp["zeta"] = self._env._zeta
        dp["A_m"] = self._env._A_m

        A = np.zeros((self._env.obs_space.flat_dim, self._env.obs_space.flat_dim))
        A[: self._env.obs_space.flat_dim // 2, self._env.obs_space.flat_dim // 2 :] = np.eye(
            self._env.obs_space.flat_dim // 2
        )
        A[4, 4] = -dp["B_eq_v"] / dp["J_eq"]
        A[5, 5] = -dp["B_eq_v"] / dp["J_eq"]
        A[6, 0] = dp["c_kin"] * dp["ball_mass"] * dp["gravity_const"] * dp["ball_radius"] ** 2 / dp["zeta"]
        A[6, 6] = -dp["c_kin"] * dp["ball_radius"] ** 2 / dp["zeta"]
        A[7, 1] = dp["c_kin"] * dp["ball_mass"] * dp["gravity_const"] * dp["ball_radius"] ** 2 / dp["zeta"]
        A[7, 7] = -dp["c_kin"] * dp["ball_radius"] ** 2 / dp["zeta"]
        B = np.zeros((self._env.obs_space.flat_dim, self._env.act_space.flat_dim))
        B[4, 0] = dp["A_m"] / dp["J_eq"]
        B[5, 1] = dp["A_m"] / dp["J_eq"]
        # C = np.zeros((self._env.obs_space.flat_dim // 2, self._env.obs_space.flat_dim))
        # C[: self._env.obs_space.flat_dim // 2, : self._env.obs_space.flat_dim // 2] = np.eye(
        #     self._env.obs_space.flat_dim // 2
        # )
        # D = np.zeros((self._env.obs_space.flat_dim // 2, self._env.act_space.flat_dim))

        # Get the weighting matrices from the environment
        if isinstance(self._env.task.rew_fcn, QuadrErrRewFcn):
            # The environment uses a reward function compatible with the LQR
            Q = self._env.task.rew_fcn.Q
            R = self._env.task.rew_fcn.R
        else:
            # The environment does not use a reward function compatible with the LQR, apply some fine tuning
            Q = np.diag([1e2, 1e2, 5e2, 5e2, 1e-2, 1e-2, 5e0, 5e0])
            R = np.diag([1e-2, 1e-2])

        # Solve the continuous-time Riccati equation (for a discrete system, pass dt)
        K, _, self.eigvals = control.lqr(A, B, Q, R)
        ctrl_gains = to.from_numpy(K).to(to.get_default_dtype())

    else:
        raise pyrado.TypeErr(given=inner_env(self._env), expected_type=[BallOnPlate5DSim, QBallBalancerSim])

    # Assign the controller gains
    self._policy.init_param(-1 * ctrl_gains)  # in classical control it is u = -K*x; here a = psi(s)*s

    # Sample rollouts to evaluate the LQR
    ros = self.sampler.sample()

    # Logging
    rets = [ro.undiscounted_return() for ro in ros]
    self.logger.add_value("max return", np.max(rets), 4)
    self.logger.add_value("median return", np.median(rets), 4)
    self.logger.add_value("min return", np.min(rets), 4)
    self.logger.add_value("avg return", np.mean(rets), 4)
    self.logger.add_value("std return", np.std(rets), 4)
    self.logger.add_value("avg rollout len", np.mean([ro.length for ro in ros]), 4)
    self.logger.add_value("num total samples", self._cnt_samples)
    self.logger.add_value(
        "min mag policy param", self._policy.param_values[to.argmin(abs(self._policy.param_values))]
    )
    self.logger.add_value(
        "max mag policy param", self._policy.param_values[to.argmax(abs(self._policy.param_values))]
    )

    # Save snapshot data
    self.make_snapshot(snapshot_mode, float(np.mean(rets)), meta_info)

    # Note: composing the criterion here (instead of in the constructor) assumes
    # step() is only called once; repeated calls would nest the criterion
    self.stopping_criterion = self.stopping_criterion | CustomStoppingCriterion(self._custom_stopping_criterion)
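# Standalone sketch of the `control.lqr` call used above, on a toy double
# integrator. This only illustrates the library's return convention and the
# feedback law u = -K @ x, which is why the gains are negated before being
# assigned to the policy.
import numpy as np
import control

A = np.array([[0.0, 1.0], [0.0, 0.0]])  # state: [position, velocity]
B = np.array([[0.0], [1.0]])  # the input acts on the velocity
Q = np.diag([1.0, 1.0])  # state cost
R = np.array([[1.0]])  # actuation cost
K, S, eigvals = control.lqr(A, B, Q, R)  # gains, Riccati solution, closed-loop eigenvalues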
def __init__(
    self,
    save_dir: pyrado.PathLike,
    env: DomainRandWrapperBuffer,
    subrtn_cand: Algorithm,
    subrtn_refs: Algorithm,
    max_iter: int,
    alpha: float,
    beta: float,
    nG: int,
    nJ: int,
    ntau: int,
    nc_init: int,
    nr_init: int,
    sequence_cand: callable,
    sequence_refs: callable,
    warmstart_cand: bool = False,
    warmstart_refs: bool = True,
    cand_policy_param_init: Optional[to.Tensor] = None,
    cand_critic_param_init: Optional[to.Tensor] = None,
    num_bs_reps: int = 1000,
    studentized_ci: bool = False,
    base_seed: int = None,
    logger: Optional[StepLogger] = None,
):
    """
    Constructor

    :param save_dir: directory to save the snapshots i.e. the results in
    :param env: the environment in which the policy operates
    :param subrtn_cand: the algorithm that is called at every iteration of SPOTA to yield a candidate policy
    :param subrtn_refs: the algorithm that is called at every iteration of SPOTA to yield reference policies
    :param max_iter: maximum number of iterations that the SPOTA algorithm runs. Each of these iterations includes
                     multiple iterations of the subroutine.
    :param alpha: confidence level for the upper confidence bound (UCBOG)
    :param beta: optimality gap threshold for training
    :param nG: number of reference solutions
    :param nJ: number of samples for the Monte Carlo approximation of the optimality gap
    :param ntau: number of rollouts per domain parameter set
    :param nc_init: initial number of domains for training the candidate solution
    :param nr_init: initial number of domains for training the reference solutions
    :param sequence_cand: mathematical sequence for the number of domains for training the candidate solution
    :param sequence_refs: mathematical sequence for the number of domains for training the reference solutions
    :param warmstart_cand: flag if the next candidate solution should be initialized with the previous one
    :param warmstart_refs: flag if the reference solutions should be initialized with the current candidate
    :param cand_policy_param_init: initial policy parameter values for the candidate, set `None` to be random
    :param cand_critic_param_init: initial critic parameter values for the candidate, set `None` to be random
    :param num_bs_reps: number of replications for the statistical bootstrap
    :param studentized_ci: flag if a Student's t-distribution should be applied for the confidence interval
    :param base_seed: seed added to all other seeds in order to make the experiments distinct but repeatable
    :param logger: logger for every step of the algorithm, if `None` the default logger will be created
    """
    if not typed_env(env, DomainRandWrapperBuffer):  # there must be a domain randomization wrapper
        raise pyrado.TypeErr(msg="There must be a DomainRandWrapperBuffer in the environment chain.")
    if not isinstance(subrtn_cand, Algorithm):
        raise pyrado.TypeErr(given=subrtn_cand, expected_type=Algorithm)
    if not isinstance(subrtn_refs, Algorithm):
        raise pyrado.TypeErr(given=subrtn_refs, expected_type=Algorithm)

    # Call InterruptableAlgorithm's constructor without specifying the policy
    super().__init__(num_checkpoints=2, save_dir=save_dir, max_iter=max_iter, policy=None, logger=logger)

    # Get the randomized environment (recommended to make it the outermost one in the chain)
    self.env_dr = typed_env(env, DomainRandWrapperBuffer)

    # Candidate and reference solutions, and the optimality gap
    self.Gn_diffs = None
    self.ucbog = pyrado.inf  # upper confidence bound on the optimality gap
    self._subrtn_cand = subrtn_cand
    self._subrtn_refs = subrtn_refs
    assert id(self._subrtn_cand) != id(self._subrtn_refs)
    assert id(self._subrtn_cand.policy) != id(self._subrtn_refs.policy)
    assert id(self._subrtn_cand.expl_strat) != id(self._subrtn_refs.expl_strat)
    self._subrtn_cand.save_name = "subrtn_cand"
    self._subrtn_refs.save_name = "subrtn_refs"

    self.alpha = alpha
    self.beta = beta
    self.warmstart_cand = warmstart_cand
    self.warmstart_refs = warmstart_refs
    self.cand_policy_param_init = cand_policy_param_init.detach() if cand_policy_param_init is not None else None
    self.cand_critic_param_init = cand_critic_param_init.detach() if cand_critic_param_init is not None else None
    self.nG = nG
    self.nJ = nJ
    self.ntau = ntau
    self.nc_init = nc_init
    self.nr_init = nr_init
    self.seq_cand = sequence_cand
    self.seq_ref = sequence_refs
    self.num_bs_reps = num_bs_reps
    self.studentized_ci = studentized_ci
    self.base_seed = np.random.randint(low=10000) if base_seed is None else base_seed

    # Save initial environment and randomizer
    self.save_snapshot(meta_info=None)

    self.stopping_criterion = self.stopping_criterion | CustomStoppingCriterion(self._custom_stopping_criterion)
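# Sketch (assumption; the predicate's body is not part of this excerpt): SPOTA
# maintains `ucbog`, the upper confidence bound on the optimality gap, next to the
# threshold `beta`, so its custom criterion plausibly stops once the bound drops
# below the threshold, e.g.
#
#     @staticmethod
#     def _custom_stopping_criterion(algo) -> bool:
#         return algo.ucbog < algo.beta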