def __setstate__(self, state):
    """
    Restore the algorithm from a pickled state and rebuild the parts that can not be restored directly.

    After delegating to the parent class, the sbi simulator is set up again using the stored prior and the recorded
    real rollouts (if any can be loaded), the tensorboard writer of the restored logger is handed to the sbi
    subroutine, and the subroutine's density estimator builder is recreated from the posterior hyper-parameters.

    :param state: state dict, must contain the keys `_sbi_prior` and `_logger`
    """
    super().__setstate__(state)

    # Try the rollouts of the current iteration first, then the ones of the previous iteration
    file_name = "rollouts_real.pkl"
    try:
        recorded_rollouts = pyrado.load(file_name, self._save_dir, prefix=f"iter_{self._curr_iter}")
    except FileNotFoundError:
        try:
            recorded_rollouts = pyrado.load(file_name, self._save_dir, prefix=f"iter_{self._curr_iter - 1}")
        except (FileNotFoundError, RuntimeError, pyrado.PathErr, pyrado.TypeErr, pyrado.ValueErr):
            # Fall back to setting up sbi without recorded rollouts
            recorded_rollouts = None

    # The prior can be used as it was stored
    self._setup_sbi(state["_sbi_prior"], recorded_rollouts)

    # Reuse the writer of this algorithm's (restored) third printer for the sbi subroutine
    tb_writer = state["_logger"].printers[2].writer
    assert isinstance(tb_writer, SummaryWriter)
    sbi_subrtn = self.__dict__["_subrtn_sbi"]
    sbi_subrtn._summary_writer = tb_writer

    # Recreate sbi's internal network construction callable from the predefined posterior hyper-parameters
    sbi_subrtn._build_neural_net = sbiutils.posterior_nn(**self.posterior_hparam)
def _load_experiment(ex_dir: pyrado.PathLike):
    """
    Load the algorithm, the prior, the real-world data, and the posteriors of all sbi rounds from an experiment.

    :param ex_dir: experiment's directory to load from
    :return: the loaded algorithm, the prior, the data (one row per loaded posterior), and the list of posteriors
    :raises pyrado.TypeErr: if the loaded algorithm is neither an `NPDR` nor a `BayesSim` instance
    """
    # Load the algorithm
    algo = Algorithm.load_snapshot(ex_dir)
    if not isinstance(algo, (NPDR, BayesSim)):
        raise pyrado.TypeErr(given=algo, expected_type=(NPDR, BayesSim))

    # Load the prior and the data
    prior = pyrado.load("prior.pt", ex_dir)
    data_real = pyrado.load("data_real.pt", ex_dir)

    # Load the posteriors, one per sbi round
    posteriors = [
        SBIBase.load_posterior(ex_dir, idx_round=i, verbose=True)
        for i in range(algo.num_sbi_rounds)
    ]
    posteriors = remove_none_from_list(
        posteriors)  # in case the algorithm terminated early

    # Drop the data sets for which no posterior could be loaded
    if data_real.shape[0] > len(posteriors):
        print_cbt(
            f"Found {data_real.shape[0]} data sets but {len(posteriors)} posteriors. Truncated the superfluous data.",
            "y",
        )
        data_real = data_real[:len(posteriors), :]

    # Artificially repeat the data (which was the same for every round) to later be able to use the same code
    # NOTE(review): if the truncation above was applied, repeating yields len(posteriors)**2 rows and the assertion
    # below only holds for a single posterior -- confirm that data_real.pt is expected to hold exactly one row here
    data_real = data_real.repeat(len(posteriors), 1)
    assert data_real.shape[0] == len(posteriors)

    return algo, prior, data_real, posteriors
def train_argmax_policy(
    load_dir: pyrado.PathLike,
    env_sim: MetaDomainRandWrapper,
    subrtn: Algorithm,
    num_restarts: int,
    num_samples: int,
    policy_param_init: to.Tensor = None,
    valuefcn_param_init: to.Tensor = None,
    subrtn_snapshot_mode: str = "best",
) -> Policy:
    """
    Train a policy based on the maximizer of the posterior mean.

    :param load_dir: directory to load from, must contain `candidates.pt`, `candidates_values.pt`, and
                     `ddp_space.pkl`
    :param env_sim: simulation environment
    :param subrtn: algorithm which performs the policy / value-function optimization
    :param num_restarts: number of restarts for the optimization of the acquisition function
    :param num_samples: number of samples for the optimization of the acquisition function
    :param policy_param_init: initial policy parameter values for the subroutine, set `None` to be random
    :param valuefcn_param_init: initial value function parameter values for the subroutine, set `None` to be random
    :param subrtn_snapshot_mode: snapshot mode for saving during training of the subroutine
    :return: the final BayRn policy
    """
    # Load the required data
    cands = pyrado.load("candidates.pt", load_dir)
    cands_values = pyrado.load("candidates_values.pt", load_dir).unsqueeze(1)
    ddp_space = pyrado.load("ddp_space.pkl", load_dir)

    # There can be more candidates than evaluations, only the evaluated ones are usable for the argmax
    if cands.shape[0] > cands_values.shape[0]:
        print_cbt(
            # The trailing space separates the two adjacent string literals in the printed message
            f"There are {cands.shape[0]} candidates but only {cands_values.shape[0]} evaluations. Ignoring the "
            f"candidates without evaluation for computing the argmax.",
            "y",
        )
        cands = cands[: cands_values.shape[0], :]

    # Find the maximizer
    argmax_cand = BayRn.argmax_posterior_mean(cands, cands_values, ddp_space, num_restarts, num_samples)

    # Set the domain randomizer
    env_sim.adapt_randomizer(argmax_cand.numpy())

    # Reset the subroutine algorithm which includes resetting the exploration
    subrtn.reset()

    # Do a warm start
    subrtn.init_modules(
        warmstart=True, policy_param_init=policy_param_init, valuefcn_param_init=valuefcn_param_init
    )

    subrtn.train(snapshot_mode=subrtn_snapshot_mode, meta_info=dict(suffix="argmax"))

    return subrtn.policy
def eval_init_policies(self):
    """
    Evaluate all initial policies found in the experiment's directory and store the estimated return per candidate.

    The top level of `self.save_dir` is scanned for `init_*_policy.pt` and `init_*_candidate.pt` files. Every found
    policy is evaluated on `self._env_real` via `eval_policy()`. The resulting values are saved as
    `candidates_values`, and the candidates as well as their values are stored in `self.cands` and
    `self.cands_values`. The number of evaluated policies equals the number of found candidates.

    :raises pyrado.ValueErr: if the number of policies and candidates differs, or if none were found
    """
    # Scan only the top level of the experiment's directory
    for _, sub_dirs, file_names in os.walk(self.save_dir):
        sub_dirs.clear()  # keeps walk() out of the subdirectories
        found_policies = [f for f in file_names if f.startswith('init_') and f.endswith('_policy.pt')]
        found_cands = [f for f in file_names if f.startswith('init_') and f.endswith('_candidate.pt')]

    if len(found_policies) != len(found_cands):
        raise pyrado.ValueErr(msg='Found a different number of initial policies than candidates!')
    if len(found_policies) == 0:
        raise pyrado.ValueErr(msg='No policies or candidates found!')

    # Collect all candidates in one tensor, the (natural) order of the files determines the order of the rows
    sorted_cands = natural_sort(found_cands)
    cands = to.stack([to.load(osp.join(self.save_dir, name)) for name in sorted_cands])

    # Evaluate the policies learned from the random candidates
    num_init_cand = len(found_cands)
    cands_values = to.empty(num_init_cand)
    for idx in range(num_init_cand):
        init_prefix = f'init_{idx}'
        policy = pyrado.load(self.policy, 'policy', 'pt', self.save_dir, meta_info=dict(prefix=init_prefix))
        cands_values[idx] = self.eval_policy(
            self.save_dir, self._env_real, policy, self.mc_estimator,
            prefix=init_prefix, num_rollouts=self.num_eval_rollouts_real
        )

    # Only the returns are saved here, the candidates themselves are not written again
    pyrado.save(cands_values, 'candidates_values', 'pt', self.save_dir, meta_info=None)
    self.cands, self.cands_values = cands, cands_values
def init_modules(self, warmstart: bool, suffix: str = '', prefix: str = None, **kwargs):
    """
    Initialize the policy (done by the parent class) as well as the target Q-function.

    :param warmstart: if `True`, start from the given parameters or, if none are given and this is not the first
                      iteration, from the target Q-function saved in the previous iteration
    :param suffix: suffix of the file to load the target Q-function from
    :param prefix: prefix of the file to load the target Q-function from, by default the previous iteration
    :param kwargs: keyword arguments for initialization, here `target_param_init` is used
    """
    # The parent class takes care of the policy
    super().init_modules(warmstart, suffix, prefix, **kwargs)

    if prefix is None:
        prefix = f'iter_{self._curr_iter - 1}'

    target_init = kwargs.get('target_param_init', None)

    if warmstart and target_init is not None:
        # Start from the explicitly given parameters
        self.qfcn_targ.init_param(target_init)
        return

    if warmstart and self._curr_iter > 0:
        # Continue from the result of the previous iteration
        load_info = dict(prefix=prefix, suffix=suffix)
        self.qfcn_targ = pyrado.load(self.qfcn_targ, 'qfcn_target', 'pt', self.save_dir, meta_info=load_info)
        return

    # No warm start possible or desired, reset the target Q-function
    self.qfcn_targ.init_param()
def _handle_neg_samples(self, cand_rets: np.ndarray, refs_rets: np.ndarray, k: int, i: int) -> np.ndarray: """ Process negative optimality gap samples by Looking at the other Reference Solutions :param cand_rets: array of the candidate's return values :param refs_rets: array of the references' return values :param k: index of the reference solution :param i: index of the domain :return refs_rets: if a better reference has been round the associated value will be overwritten """ if refs_rets[k, i] < cand_rets[k, i]: print_cbt( f'\nReference {k + 1} is worse than the candidate on domain realization {i + 1}.\n' # 1-based index 'Trying to replace this reference at this realization with a different one', 'y') for other_k in range(self.nG): if other_k == k: # Do nothing for the bad solution that brought us here continue else: # Load a reference solution different from the the k-th other_ref = pyrado.load( self._subrtn_refs._policy, 'policy', 'pt', self.save_dir, dict(prefix=f'iter_{self._curr_iter}', suffix=f'ref_{other_k}')) other_ref_ret = 0 for r in range(self.nJ): # Set the same random seed pyrado.set_seed(self.base_seed + i * self.nJ + r) # Set the circular index for the particular realization self.env_dr.ring_idx = i # Do the rollout and collect the return ro_other_ref = rollout(self.env_dr, other_ref, eval=True) other_ref_ret += ro_other_ref.undiscounted_return( ) / self.nJ # average over nJ seeds # Store the value if value is better if other_ref_ret > refs_rets[k, i]: refs_rets[k, i] = other_ref_ret # If a better one was found, do not iterate over the remaining reference solutions break if refs_rets[k, i] > cand_rets[k, i]: # Found a different reference that achieves a higher return that the candidate print_cbt('Successfully handled a negative OG sample', 'g') else: refs_rets[k, i] = cand_rets[ k, i] # forces optimality gap sample to be 0 print_cbt( 'Unsuccessfully handled a negative OG sample: Set the value to 0', 'r') else: # Everything is as it should be pass 
return refs_rets
def init_modules(self, warmstart: bool, suffix: str = '', prefix: str = None, **kwargs):
    """
    Initialize the policy and the critic's value function.

    :param warmstart: if `True`, start from the given parameters (both must be given) or, if no policy parameters
                      are given and this is not the first iteration, from the modules saved in the previous iteration
    :param suffix: suffix of the files to load the modules from
    :param prefix: prefix of the files to load the modules from, by default the previous iteration
    :param kwargs: keyword arguments for initialization, here `policy_param_init` and `valuefcn_param_init` are used
    """
    if prefix is None:
        prefix = f'iter_{self._curr_iter - 1}'

    policy_init = kwargs.get('policy_param_init', None)
    vfcn_init = kwargs.get('valuefcn_param_init', None)

    if warmstart and policy_init is not None and vfcn_init is not None:
        # Start from the explicitly given parameters
        self._policy.init_param(policy_init)
        self._critic.vfcn.init_param(vfcn_init)
        print_cbt('Learning given an fixed parameter initialization.', 'w')
        return

    if warmstart and policy_init is None and self._curr_iter > 0:
        # Continue from the results of the previous iteration
        self._policy = pyrado.load(
            self._policy, 'policy', 'pt', self.save_dir, meta_info=dict(prefix=prefix, suffix=suffix)
        )
        self._critic.vfcn = pyrado.load(
            self._critic.vfcn, 'vfcn', 'pt', self.save_dir, meta_info=dict(prefix=prefix, suffix=suffix)
        )
        print_cbt(f'Learning given the results from iteration {self._curr_iter - 1}', 'w')
        return

    # Neither of the warm start cases applies, reset the policy and the value function
    self._policy.init_param()
    self._critic.vfcn.init_param()
    print_cbt('Learning from scratch.', 'w')
def init_modules(self, warmstart: bool, suffix: str = "", prefix: str = None, **kwargs):
    """
    Initialize the policy (done by the parent class) as well as the two target Q-functions.

    :param warmstart: if `True`, start from the given parameters (both must be given) or, if at least one is missing
                      and this is not the first iteration, from the target Q-functions saved in the previous
                      iteration
    :param suffix: suffix of the files to load the target Q-functions from
    :param prefix: prefix of the files to load the target Q-functions from, by default the previous iteration
    :param kwargs: keyword arguments for initialization, here `target1_param_init` and `target2_param_init` are used
    """
    # Initialize the policy
    super().init_modules(warmstart, suffix, prefix, **kwargs)

    if prefix is None:
        prefix = f"iter_{self._curr_iter - 1}"

    t1pi = kwargs.get("target1_param_init", None)
    t2pi = kwargs.get("target2_param_init", None)

    # Check for None by identity. A membership test on the tuple would fall back to `==`, which is evaluated
    # element-wise for array-like parameters.
    both_given = t1pi is not None and t2pi is not None

    if warmstart and both_given:
        self.qfcn_targ_1.init_param(t1pi)
        self.qfcn_targ_2.init_param(t2pi)

    elif warmstart and self._curr_iter > 0:
        # At least one initialization is missing, continue from the results of the previous iteration
        self.qfcn_targ_1 = pyrado.load(
            "qfcn_target1.pt", self.save_dir, prefix=prefix, suffix=suffix, obj=self.qfcn_targ_1
        )
        self.qfcn_targ_2 = pyrado.load(
            "qfcn_target2.pt", self.save_dir, prefix=prefix, suffix=suffix, obj=self.qfcn_targ_2
        )

    else:
        # Reset the target Q-functions
        self.qfcn_targ_1.init_param()
        self.qfcn_targ_2.init_param()
def init_modules(self, warmstart: bool, suffix: str = "", prefix: str = None, **kwargs):
    """
    Initialize the policy and the critic's value function.

    :param warmstart: if `True`, start from the given parameters (both must be given) or, if no policy parameters
                      are given and this is not the first iteration, from the modules saved in the previous iteration
    :param suffix: suffix of the files to load the modules from
    :param prefix: prefix of the files to load the modules from, by default the previous iteration
    :param kwargs: keyword arguments for initialization, here `policy_param_init` and `valuefcn_param_init` are used
    """
    if prefix is None:
        prefix = f"iter_{self._curr_iter - 1}"

    policy_init = kwargs.get("policy_param_init", None)
    vfcn_init = kwargs.get("valuefcn_param_init", None)

    if warmstart and policy_init is not None and vfcn_init is not None:
        # Start from the explicitly given parameters
        self._policy.init_param(policy_init)
        self._critic.vfcn.init_param(vfcn_init)
        print_cbt("Learning given an fixed parameter initialization.", "w")
        return

    if warmstart and policy_init is None and self._curr_iter > 0:
        # Continue from the results of the previous iteration, loading into the existing modules
        self._policy = pyrado.load("policy.pt", self.save_dir, prefix=prefix, suffix=suffix, obj=self._policy)
        self._critic.vfcn = pyrado.load(
            "vfcn.pt", self.save_dir, prefix=prefix, suffix=suffix, obj=self._critic.vfcn
        )
        print_cbt(f"Learning given the results from iteration {self._curr_iter - 1}", "w")
        return

    # Neither of the warm start cases applies, reset the policy and the value function
    self._policy.init_param()
    self._critic.vfcn.init_param()
    print_cbt("Learning from scratch.", "w")
def get_latest_proposal_prev_iter(self) -> Union[sbiutils.BoxUniform, DirectPosterior]:
    """
    Get the proposal to start the current iteration with.

    This is the prior in the first iteration, or if the attribute `reset_proposal_each_iter` exists and is set.
    Otherwise, it is the posterior saved in the last round of the previous iteration.

    :return: proposal for simulating with sbi
    """
    if self._curr_iter == 0:
        # There is no previous iteration yet
        return self._sbi_prior

    if getattr(self, "reset_proposal_each_iter", False):
        # Explicitly requested to start every iteration from the prior
        return self._sbi_prior

    # Load the posterior from the last round of the previous iteration
    last_round_prefix = f"iter_{self._curr_iter - 1}_round_{self.num_sbi_rounds - 1}"
    return pyrado.load("posterior.pt", self._save_dir, prefix=last_round_prefix)
def load_rollouts_from_dir(
    ex_dir: str, key: Optional[str] = "rollout", file_exts: Tuple[str] = ("pt", "pkl")
) -> Tuple[List[StepSequence], List[str]]:
    """
    Crawl through the given directory, sort the files, and load all rollouts, i.e. all files that include the key.

    :param ex_dir: directory, e.g. an experiment folder
    :param key: word or part of a word that needs to be in the name of a file for it to be loaded
    :param file_exts: file extensions to be considered for loading
    :return: list of loaded rollouts, and list of file names without extension
    :raises pyrado.PathErr: if `ex_dir` is not a directory
    :raises pyrado.TypeErr: if `key` is not a string, if `file_exts` is not iterable, or if only some of the loaded
                            files contain lists of rollouts
    :raises pyrado.ValueErr: if no rollouts have been found
    """
    if not osp.isdir(ex_dir):
        raise pyrado.PathErr(given=ex_dir)
    if not isinstance(key, str):
        raise pyrado.TypeErr(given=key, expected_type=str)
    if not is_iterable(file_exts):
        raise pyrado.TypeErr(given=file_exts, expected_type=Iterable)

    rollouts = []
    names = []
    for root, dirs, files in os.walk(ex_dir):
        dirs.clear()  # prevents walk() from going into subdirectories
        # The sorted list is the return value of natural_sort(), hence iterate over that instead of `files`
        for f in natural_sort(files):
            f_ext = f[f.rfind(".") + 1:]
            if key in f and f_ext in file_exts:
                name = f[:f.rfind(".")]
                names.append(name)
                rollouts.append(pyrado.load(f"{name}.{f_ext}", load_dir=root))

    if not rollouts:
        raise pyrado.ValueErr(msg="No rollouts have been found!")

    if isinstance(rollouts[0], list):
        if not check_all_types_equal(rollouts):
            raise pyrado.TypeErr(msg="Some rollout savings contain lists of rollouts, others don't!")
        # The rollout files contain lists of rollouts, flatten them
        rollouts = list(itertools.chain(*rollouts))

    return rollouts, names
def __init__(
    self,
    rollouts_dir: str,
    embedding: Embedding,
    num_segments: int = None,
    len_segments: int = None,
    rand_init_rollout: bool = True,
):
    """
    Constructor

    :param rollouts_dir: directory where to find the pre-recorded rollouts, only files at its top level whose names
                         start with "rollout" are loaded
    :param embedding: embedding used for pre-processing the data before (later) passing it to the posterior
    :param num_segments: number of segments in which the rollouts are split into. For every segment, the initial
                         state of the simulation is reset, and thus for every set the features of the trajectories
                         are computed separately. Either specify `num_segments` or `len_segments`.
    :param len_segments: length of the segments in which the rollouts are split into. For every segment, the initial
                         state of the simulation is reset, and thus for every set the features of the trajectories
                         are computed separately. Either specify `num_segments` or `len_segments`.
    :param rand_init_rollout: if `True`, choose the first rollout at random, and then cycle through the list
    :raises pyrado.PathErr: if `rollouts_dir` is not a directory
    :raises pyrado.ValueErr: if no rollouts have been found
    """
    if not os.path.isdir(rollouts_dir):
        raise pyrado.PathErr(given=rollouts_dir)

    # Must be called before any further local variable is created, since locals() is passed on
    Serializable._init(self, locals())

    # No environment and no policy are passed to the parent class
    super().__init__(None, None, embedding, num_segments, len_segments)

    # Crawl through the directory and load every file that starts with the word rollout
    rollouts_rec = []
    for root, dirs, files in os.walk(rollouts_dir):
        dirs.clear()  # prevents walk() from going into subdirectories
        rollouts_rec = [pyrado.load(name=f, load_dir=root) for f in files if f.startswith("rollout")]
    # NOTE(review): the result of this call is not used -- confirm that the function raises on unequal lengths
    check_all_lengths_equal(rollouts_rec)
    if not rollouts_rec:
        raise pyrado.ValueErr(msg="No rollouts have been found!")

    self.rollouts_dir = rollouts_dir
    self.rollouts_rec = rollouts_rec
    # Index of the rollout to start with
    self._ring_idx = np.random.randint(0, len(rollouts_rec)) if rand_init_rollout else 0
    self._set_action_field(self.rollouts_rec)
def init_modules(self, warmstart: bool, suffix: str = '', prefix: str = None, **kwargs):
    """
    Initialize the algorithm's learnable modules, e.g. a policy or value function.
    Overwrite this method if the algorithm uses a learnable module aside the policy, e.g. a value function.

    :param warmstart: if `True`, the algorithm starts learning with an initialization. This can either be a fixed
                      parameter vector, or the result of the previous iteration
    :param suffix: keyword for `meta_info` when loading from the previous iteration
    :param prefix: keyword for `meta_info` when loading from the previous iteration, by default `iter_` followed by
                   the index of the previous iteration
    :param kwargs: keyword arguments for initialization, e.g. `policy_param_init` or `valuefcn_param_init`
    """
    if prefix is None:
        prefix = f'iter_{self._curr_iter - 1}'

    policy_init = kwargs.get('policy_param_init', None)

    if warmstart and policy_init is not None:
        # Start from the explicitly given parameters
        self._policy.init_param(policy_init)
        print_cbt('Learning given an fixed parameter initialization.', 'w')
        return

    if warmstart and self._curr_iter > 0:
        # Continue from the result of the previous iteration
        load_info = dict(prefix=prefix, suffix=suffix)
        self._policy = pyrado.load(self._policy, 'policy', 'pt', self.save_dir, meta_info=load_info)
        print_cbt(f'Learning given the results from iteration {self._curr_iter - 1}', 'w')
        return

    # No warm start possible or desired, reset the policy
    self._policy.init_param()
    print_cbt('Learning from scratch.', 'w')
from pyrado.plotting.distribution import draw_posterior_pairwise_scatter from pyrado.utils.argparser import get_argparser if __name__ == "__main__": # Parse command line arguments args = get_argparser().parse_args() plt.rc("text", usetex=args.use_tex) if not isinstance(args.num_samples, int) or args.num_samples < 1: raise pyrado.ValueErr(given=args.num_samples, ge_constraint="1") # NPDR ex_dir_npdr = os.path.join(pyrado.TEMP_DIR, "mg-ik", "npdr_time", "") algo = Algorithm.load_snapshot(ex_dir_npdr) if not isinstance(algo, NPDR): raise pyrado.TypeErr(given=algo, expected_type=NPDR) env_sim = inner_env(pyrado.load("env_sim.pkl", ex_dir_npdr)) prior_npdr = pyrado.load("prior.pt", ex_dir_npdr) posterior_npdr = algo.load_posterior(ex_dir_npdr, idx_iter=0, idx_round=6, obj=None, verbose=True) # CHOICE data_real_npdr = pyrado.load(f"data_real.pt", ex_dir_npdr, prefix="iter_0", verbose=True) # CHOICE domain_params_npdr, log_probs = SBIBase.eval_posterior( posterior_npdr, data_real_npdr, args.num_samples, normalize_posterior=False, # not necessary here
args.policy_name = f"iter_{args.iter}_policy" if args.init: args.policy_name = "init_policy" env_sim, policy, extra = load_experiment(ex_dir, args) # Create the domain parameter mapping dp_mapping = dict() if extra is not None: dp_counter = 0 for key in sorted(extra["hparams"]["dp_mapping"].keys()): dp = extra["hparams"]["dp_mapping"][key] if dp in extra["hparams"]["dp_selection"]: dp_mapping[dp_counter] = dp dp_counter += 1 pyrado.load(f"{args.policy_name}.pt", ex_dir, obj=policy) # Reset the policy's domain parameter if desired prior, posterior = None, None if args.src_domain_param == "ml": ml_domain_param = pyrado.load("ml_domain_param.pkl", ex_dir, prefix=f"iter_{args.iter}") policy.reset(**dict(domain_param=ml_domain_param)) elif args.src_domain_param == "posterior": prefix_str = "" if args.iter == -1 and args.round == -1 else f"iter_{args.iter}_round_{args.round}" posterior = pyrado.load("posterior.pt", ex_dir, prefix=prefix_str) elif args.src_domain_param == "prior": prior = pyrado.load("prior.pt", ex_dir) elif args.src_domain_param == "nominal": policy.reset(**dict(domain_param=env_sim.get_nominal_domain_param()))
def train_policy_sim(self, domain_params: to.Tensor, prefix: str, cnt_rep: int, use_rec_init_states: bool = True) -> float:
    """
    Train a policy in simulation for given hyper-parameters from the domain randomizer.

    :param domain_params: domain parameters sampled from the posterior [shape N x D where N is the number of
                          samples and D is the number of domain parameters]
    :param prefix: set a prefix to the saved file name, use "" for no prefix
    :param cnt_rep: current repetition count, coming from the wrapper function
    :param use_rec_init_states: if `True`, the previous rollout will be loaded to extract the initial states, and
                                sync them with the recorded ones
    :return: estimated return of the trained policy, evaluated in the training simulation environment
    :raises pyrado.ShapeErr: if the domain parameters or the recorded initial states have an unexpected shape
    :raises pyrado.KeyErr: if the policy optimization subroutine has no `sampler` attribute
    """
    num_dp = len(self.dp_mapping)
    if domain_params.ndim != 2 or domain_params.shape[1] != num_dp:
        raise pyrado.ShapeErr(given=domain_params, expected_match=(-1, num_dp))

    # Insert the domain parameters into the wrapped environment's buffer
    self.fill_domain_param_buffer(self._env_sim_trn, self.dp_mapping, domain_params)

    if use_rec_init_states:
        # Make the simulation start from the initial states that have been observed in the recorded rollouts
        rollouts_real = pyrado.load("rollouts_real.pkl", self._save_dir, prefix=prefix)
        init_states_real = np.stack([ro.states[0, :] for ro in rollouts_real])
        expected_shape = (len(rollouts_real), self._env_sim_trn.state_space.flat_dim)
        if init_states_real.shape != expected_shape:
            raise pyrado.ShapeErr(given=init_states_real, expected_match=expected_shape)
        inner_env(self._env_sim_trn).init_space = DiscreteSpace(init_states_real)
        print_cbt("The simulation environment's initial states have been set to the recorded ones.", "w")

    # Book-keep the samples before resetting the subroutine, which includes resetting the exploration
    self._cnt_samples += self._subrtn_policy.sample_count
    self._subrtn_policy.reset()

    # Propagate the updated training environment to the SamplerPool's workers
    if not hasattr(self._subrtn_policy, "sampler"):
        raise pyrado.KeyErr(keys="sampler", container=self._subrtn_policy)
    self._subrtn_policy.sampler.reinit(env=self._env_sim_trn)

    # Do a warm start, but randomly reset the policy parameters if training failed once
    self._subrtn_policy.init_modules(self.warmstart and cnt_rep == 0)

    # Train a policy in simulation using the subroutine
    self._subrtn_policy.train(snapshot_mode=self._subrtn_policy_snapshot_mode, meta_info=dict(prefix=prefix))

    # Evaluate on the same domains as trained on, thus only rewind the buffer instead of resetting it
    assert len(self._env_sim_trn.buffer) == self.num_eval_samples
    self._env_sim_trn.ring_idx = 0
    avg_ret_sim = self.eval_policy(
        None, self._env_sim_trn, self._subrtn_policy.policy, prefix, self.num_eval_samples
    )
    return float(avg_ret_sim)
.. seealso::
    [1] https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html
    [2] https://pytorch.org/tutorials/advanced/cpp_export.html
    [3] https://pytorch.org/docs/stable/jit.html
"""
import pyrado
from pyrado.logger.experiment import ask_for_experiment
from pyrado.utils.argparser import get_argparser
from pyrado.utils.experiments import cpp_export, load_experiment


if __name__ == "__main__":
    # Parse command line arguments
    args = get_argparser().parse_args()

    # Get the experiment's directory to load from, only ask interactively if no directory was passed
    ex_dir = ask_for_experiment(hparam_list=args.show_hparams) if args.dir is None else args.dir

    # Load the policy (trained in simulation)
    try:
        # First try to load a "proper" experiment
        env, policy, _ = load_experiment(ex_dir)
    except (pyrado.PathErr, FileNotFoundError):
        # Try to load the policy and environment directly
        policy = pyrado.load("policy.pt", ex_dir, verbose=True)  # no state_dict loading
        env = pyrado.load("env.pkl", ex_dir, verbose=True)

    # Export the policy to C++ and the experiment's config
    cpp_export(ex_dir, policy, env)
def collect_data_real(
    save_dir: Optional[pyrado.PathLike],
    env: Union[Env, str],
    policy: Policy,
    embedding: Embedding,
    num_rollouts: int,
    num_segments: int = None,
    len_segments: int = None,
    prefix: str = "",
) -> Tuple[to.Tensor, List[StepSequence]]:
    """
    Roll-out a (behavioral) policy on the target system for later use with the sbi module, and save the data
    computed from the recorded rollouts.
    This method is static to facilitate evaluation of specific policies in hindsight.

    If `save_dir` already contains data and rollouts for the given prefix, these are loaded and only the missing
    rollouts are collected.

    :param save_dir: directory to save the snapshots i.e. the results in, if `None` nothing is saved
    :param env: target environment for evaluation, in the sim-2-sim case this is another simulation instance,
                in case you want to use pre-recorded rollouts pass the path to the parent folder as string
    :param policy: policy to evaluate
    :param embedding: embedding used for pre-processing the data before passing it to the posterior
    :param num_rollouts: number of rollouts to collect on the target system
    :param num_segments: number of segments in which the rollouts are split into. For every segment, the initial
                         state of the simulation is reset, and thus for every set the features of the trajectories
                         are computed separately. Either specify `num_segments` or `len_segments`.
    :param len_segments: length of the segments in which the rollouts are split into. For every segment, the initial
                         state of the simulation is reset, and thus for every set the features of the trajectories
                         are computed separately. Either specify `num_segments` or `len_segments`.
    :param prefix: prefix for the names of the loaded and saved files, use "" for no prefix
    :return: data from the real-world rollouts a.k.a. set of $x_o$ of shape
             [1, num_rollouts * embedding.dim_output], and the real-world rollouts
    :raises pyrado.TypeErr: if the environment is neither a `RealEnv`, a `SimEnv`, nor a string
    :raises pyrado.ShapeErr: if previously stored data does not match the stored rollouts, or if the final data
                             does not have the expected shape
    """
    # Check for the string first, such that a path is never passed to inner_env()
    if not (isinstance(env, str) or isinstance(inner_env(env), (RealEnv, SimEnv))):
        raise pyrado.TypeErr(given=inner_env(env), expected_type=[RealEnv, SimEnv, str])

    # Evaluate sequentially (necessary for sim-to-real experiments)
    if isinstance(env, str):
        rollout_worker = RecRolloutSamplerForSBI(env, embedding, num_segments, len_segments, rand_init_rollout=False)
    else:
        rollout_worker = RealRolloutSamplerForSBI(env, policy, embedding, num_segments, len_segments)

    # Initialize data containers
    data_real = None
    rollouts_real = None
    num_found_rollouts = 0
    if save_dir is not None:
        try:
            data_real = pyrado.load("data_real.pt", save_dir, prefix=prefix)
            rollouts_real = pyrado.load("rollouts_real.pkl", save_dir, prefix=prefix)
            # The data of all rollouts is concatenated along the second dimension (see below), thus the first
            # dimension can not be compared to the number of rollouts
            shape_found = (1, len(rollouts_real) * embedding.dim_output)
            if data_real.shape != shape_found:
                raise pyrado.ShapeErr(
                    msg=f"Found data of shape {tuple(data_real.shape)} in data_real.pt, which does not match the "
                    f"{len(rollouts_real)} rollouts in rollouts_real.pkl!"
                )
            num_found_rollouts = len(rollouts_real)
            print_cbt(f"Found {num_found_rollouts} rollout(s) in {save_dir}.", "w")
        except FileNotFoundError:
            pass  # in the first attempt no files can be found

    collect_str = "Collecting data" if prefix == "" else f"Collecting data using {prefix}_policy"
    for _ in tqdm(
        range(num_found_rollouts, num_rollouts),
        total=num_rollouts,
        initial=num_found_rollouts,  # account for the rollouts that were found, such that the bar can complete
        desc=Fore.CYAN + Style.BRIGHT + collect_str + Style.RESET_ALL,
        unit="rollouts",
        file=sys.stdout,
    ):
        # Do the rollout
        data, rollout = rollout_worker()

        # Fill data container
        if data_real is None or rollouts_real is None:
            data_real = data  # data is of shape [1, dim_feat]
            rollouts_real = [rollout]
        else:
            data_real = to.cat([data_real, data], dim=1)  # stack to final shape [1, num_rollouts * dim_feat]
            rollouts_real.append(rollout)

        # Optionally save the data (do this at every iteration to continue)
        if save_dir is not None:
            pyrado.save(data_real, "data_real.pt", save_dir, prefix=prefix)
            pyrado.save(rollouts_real, "rollouts_real.pkl", save_dir, prefix=prefix)

    if data_real.shape != (1, num_rollouts * embedding.dim_output):
        raise pyrado.ShapeErr(given=data_real, expected_match=(1, num_rollouts * embedding.dim_output))

    return data_real, rollouts_real
def load_posterior(
    load_dir: pyrado.PathLike,
    idx_iter: int = -1,
    idx_round: int = -1,
    obj: Optional[Any] = None,
    verbose: bool = False,
) -> Optional[DirectPosterior]:
    """
    Load the posterior of a given iteration (and round).

    The indices are read from file names of the form `iter_<i>_posterior.pt` or `iter_<i>_round_<r>_posterior.pt`
    at the top level of the directory. Indices with more than one digit are supported. The latest round is
    determined among the files of the selected iteration.

    :param load_dir: experiment's directory to crawl through
    :param idx_iter: iteration to load, to load the latest pass -1
    :param idx_round: round to load, to load the latest pass -1, ignored if the experiment was not multi-round
    :param obj: object for state dict loading, forwarded to `pyrado.load()`, by default no state dict loading
    :param verbose: if `True`, print the path of what has been loaded, forwarded to `pyrado.load()`
    :return: loaded sbi posterior, or `None` if there is no posterior with the given iteration / round index
    :raises pyrado.PathErr: if `load_dir` is not a directory
    :raises pyrado.TypeErr: if `idx_iter` or `idx_round` is not an integer
    """
    import re  # function-scope import, the regular expressions are only needed here

    if not os.path.isdir(load_dir):
        raise pyrado.PathErr(given=load_dir)
    if not isinstance(idx_iter, int):
        raise pyrado.TypeErr(given=idx_iter, expected_type=int)
    if not isinstance(idx_round, int):
        raise pyrado.TypeErr(given=idx_round, expected_type=int)

    def _latest_index(pattern: str, file_names: list) -> int:
        """Return the largest integer captured by the pattern among the file names, or -1 if nothing matches."""
        matches = (re.search(pattern, f) for f in file_names)
        return max((int(m.group(1)) for m in matches if m is not None), default=-1)

    # Only consider the files at the top level, i.e. do not go into subdirectories
    files = next(os.walk(load_dir), (load_dir, [], []))[2]
    posterior_files = [f for f in files if f.endswith("_posterior.pt")]

    if idx_iter == -1:
        # Check what is the latest iteration
        idx_iter = _latest_index(r"^iter_(\d+)", posterior_files)

    # Check if the experiment was run in a multi-round setting
    multi_round_setting = any(f.startswith("iter_") and "round" in f for f in files)

    if multi_round_setting and idx_round == -1:
        # Check what is the latest round of the selected iteration
        files_curr_iter = [f for f in posterior_files if f.startswith(f"iter_{idx_iter}_")]
        idx_round = _latest_index(r"round_(\d+)", files_curr_iter)

    # Check before loading, and print a warning message if there can not be a posterior with the obtained indices
    if idx_iter == -1:
        print_cbt(f"Invalid iteration index {idx_iter}! Check if there is a posterior in {load_dir}.", "r")
    if idx_round == -1 and multi_round_setting:
        print_cbt(f"Invalid round index {idx_round}! Check if there is a posterior in {load_dir}.", "r")

    # Load the current posterior
    str_round = f"_round_{idx_round}" if multi_round_setting else ""
    try:
        posterior = pyrado.load(
            name=f"iter_{idx_iter}{str_round}_posterior.pt", load_dir=load_dir, obj=obj, verbose=verbose
        )
    except FileNotFoundError:
        print_cbt("No posterior was loaded.", "y")
        posterior = None

    return posterior
# Experiment ex_dir = setup_experiment(TSPred.name, LSTMPolicy.name) # Set seed if desired pyrado.set_seed(args.seed, verbose=True) # Load the data if data_set_name == "skyline": dt = 0.01 _, vals = skyline(dt=dt, t_end=20.0, t_intvl_space=BoxSpace(0.5, 3, shape=(1, )), val_space=BoxSpace(-2.0, 3.0, shape=(1, ))) data = to.from_numpy(vals).to(dtype=to.get_default_dtype()).view(-1, 1) elif "qq-su" in data_set_name: data = pyrado.load("rollout_real_2021-04-14_18-34-53.pkl", osp.join(pyrado.EVAL_DIR, "qq-su_ectrl_250Hz")) assert isinstance(data, StepSequence) assert hasattr(data, "states") states = to.from_numpy(data.states).to(dtype=to.get_default_dtype()) actions = to.from_numpy(data.actions).to(dtype=to.get_default_dtype()) data = to.cat([states[:-1], actions], dim=1) # truncate final state else: data = pd.read_csv( osp.join(pyrado.PERMA_DIR, "misc", f"{data_set_name}.csv")) if data_set_name == "daily_min_temperatures": data = to.tensor(data["Temp"].values, dtype=to.get_default_dtype()).view(-1, 1) elif data_set_name == "monthly_sunspots": data = to.tensor(data["Sunspots"].values, dtype=to.get_default_dtype()).view(-1, 1) elif "oscillation" in data_set_name:
def _estimate_ucbog(self, nr: int):
    """
    Collect the returns with synchronized random seeds and estimate the pessimistic and optimistic bound.

    The resulting upper confidence bound on the optimality gap (UCBOG) is stored in `self.ucbog`, and the differences
    between the references' and the candidate's returns are stored in `self.Gn_diffs`. One row is written to
    `OG_log.csv` in the save directory. The visible code does not return a value.

    :param nr: number of domains used for training the reference solutions
    """
    # Init containers, one row per reference solution and one column per domain
    cand_rets = np.zeros((self.nG, nr))
    refs_rets = np.zeros((self.nG, nr))

    # Loop over all reference solutions
    for k in range(self.nG):
        print_cbt(
            f'Estimating the UCBOG | Reference {k + 1} of {self.nG} ...',
            'c')
        # Load the domain parameters corresponding to the k-th reference solution
        env_params_ref = joblib.load(
            osp.join(self.save_dir,
                     f'iter_{self._curr_iter}_env_params_ref_{k}.pkl'))
        self.env_dr.buffer = env_params_ref

        # Load the policies (makes a difference for snapshot_mode = best)
        self._subrtn_cand._policy = pyrado.load(
            self._subrtn_cand._policy, 'policy', 'pt', self.save_dir,
            dict(prefix=f'iter_{self._curr_iter}', suffix='cand'))
        self._subrtn_refs._policy = pyrado.load(
            self._subrtn_refs._policy, 'policy', 'pt', self.save_dir,
            dict(prefix=f'iter_{self._curr_iter}', suffix=f'ref_{k}'))

        # Loop over all domain realizations of the reference solutions
        for i in tqdm(range(nr),
                      total=nr,
                      desc=f'Reference {k + 1}',
                      unit='domains',
                      file=sys.stdout,
                      leave=False):
            # Evaluate solutions
            cand_rets[k, i], refs_rets[
                k, i] = self._eval_cand_and_ref_one_domain(i)

            # Process negative optimality samples
            refs_rets = self._handle_neg_samples(cand_rets, refs_rets, k,
                                                 i)

    # --------------
    # Optimality Gap
    # --------------

    # This is similar to the difference of the means that is used to calculate the optimality gap in eq. (9) in [2]
    self.Gn_diffs = np.subtract(
        refs_rets,
        cand_rets)  # optimistic bound - pessimistic bound; dim = nG x nr
    Gn_samples = np.mean(self.Gn_diffs, axis=1)  # mean over the domains, i.e. one value per reference
    Gn_est = np.mean(
        Gn_samples
    )  # sample mean of the original (non-bootstrapped) samples

    ratio_neg_diffs = 1 - np.count_nonzero(
        self.Gn_diffs
    ) / self.Gn_diffs.size  # assuming zero come from clipping

    print_cbt(f'diffs (optimistic - pessimistic bound):\n{self.Gn_diffs}',
              'y')
    print_cbt(
        f'\n{100*ratio_neg_diffs}% of the diffs would have been negative and were set to 0\n',
        'r',
        bright=True)

    if ratio_neg_diffs == 1:
        # All diffs are zero, i.e. all of them are assumed to have been negative before clipping
        ci_bs = [
            0, float('inf')
        ]  # such that the UCBOG comparison in stopping_criterion_met() does not break
        # NOTE(review): the ratio is logged as NaN here although it is known to be 1 -- confirm this is intended
        log_dict = {
            'Gn_est': np.NaN,
            'UCBOG': np.NaN,
            'ratio_neg_diffs': np.NaN
        }
    else:
        # Apply bootstrapping
        m_bs, ci_bs = bootstrap_ci(np.ravel(self.Gn_diffs), np.mean,
                                   self.num_bs_reps, self.alpha, 1,
                                   self.studentized_ci)
        print(f'm_bs: {m_bs}, ci_bs: {ci_bs}')
        print_cbt(f'\nOG (point estimate): {Gn_est} \nUCBOG: {ci_bs[1]}\n',
                  'y',
                  bright=True)
        log_dict = {
            'Gn_est': Gn_est,
            'UCBOG': ci_bs[1],
            'ratio_neg_diffs': ratio_neg_diffs
        }

    # Log the optimality gap data, start a new file with a header in the first iteration, append afterwards
    mode = 'w' if self.curr_iter == 0 else 'a'
    with open(osp.join(self.save_dir, 'OG_log.csv'), mode,
              newline='') as csvfile:
        fieldnames = list(log_dict.keys())
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if self.curr_iter == 0:
            writer.writeheader()
        writer.writerow(log_dict)

    # Store the current UCBOG estimated from all samples
    self.ucbog = ci_bs[1]
def step(self, snapshot_mode: str = "latest", meta_info: dict = None):
    """
    Perform one iteration of the algorithm, organized as a sequence of resumable checkpoints.

    Every stage below is only executed if `self.curr_checkpoint` holds the matching value, and every stage ends
    with a call to `self.reached_checkpoint()`. The stages are

    - checkpoint -1: optionally train an initial policy on domain parameters sampled from the prior,
    - checkpoint 0: load or collect the target domain data and set up the sbi simulator,
    - checkpoint 1: run the multi-round sbi procedure and save the posteriors,
    - checkpoint 2: evaluate the posterior, log, and save the most likely domain parameters,
    - checkpoint 3: train the policy with the posterior samples, or just save the policy if there is no subroutine.

    :param snapshot_mode: forwarded to `make_snapshot()` at the end of the step
    :param meta_info: forwarded to `make_snapshot()` at the end of the step
    """
    # Save snapshot to save the correct iteration count
    self.save_snapshot()

    if self.curr_checkpoint == -1:
        if self._subrtn_policy is not None and self._train_initial_policy:
            # Add dummy values of variables that are logged later
            self.logger.add_value("avg log prob", -pyrado.inf)

            # Train the behavioral policy using the samples obtained from the prior.
            # Repeat the training if the resulting policy did not exceed the success threshold.
            domain_params = self._sbi_prior.sample(sample_shape=(self.num_eval_samples, ))
            print_cbt("Training the initial policy using domain parameter sets sampled from prior.", "c")
            wrapped_trn_fcn = until_thold_exceeded(self.thold_succ_subrtn,
                                                   self.max_subrtn_rep)(self.train_policy_sim)
            wrapped_trn_fcn(domain_params, prefix="init", use_rec_init_states=False)  # overrides policy.pt

        self.reached_checkpoint()  # setting counter to 0

    if self.curr_checkpoint == 0:
        # Check if the rollout files already exist
        # NOTE(review): the third check looks for "rollouts_real.pkl" without the iteration prefix, while the file
        # is loaded with the prefix f"iter_{self._curr_iter}" further below -- confirm that this is intended.
        if (osp.isfile(osp.join(self._save_dir, f"iter_{self.curr_iter}_data_real.pt"))
                and osp.isfile(osp.join(self._save_dir, "data_real.pt"))
                and osp.isfile(osp.join(self._save_dir, "rollouts_real.pkl"))):
            # Rollout files do exist (can be when continuing a previous experiment)
            self._curr_data_real = pyrado.load("data_real.pt", self._save_dir, prefix=f"iter_{self.curr_iter}")
            print_cbt(f"Loaded existing rollout data for iteration {self.curr_iter}.", "w")

        else:
            # If the policy depends on the domain-parameters, reset the policy with the
            # most likely dp-params from the previous round.
            # The policy is loaded in-place into self._policy (from the "init" snapshot in the first iteration).
            pyrado.load(
                "policy.pt",
                self._save_dir,
                prefix=f"iter_{self._curr_iter - 1}" if self.curr_iter != 0 else "init",
                obj=self._policy,
            )
            if self.curr_iter != 0:
                ml_domain_param = pyrado.load("ml_domain_param.pkl", self.save_dir,
                                              prefix=f"iter_{self._curr_iter - 1}")
                self._policy.reset(**dict(domain_param=ml_domain_param))

            # Rollout files do not exist yet (usual case)
            self._curr_data_real, _ = SBIBase.collect_data_real(
                self.save_dir,
                self._env_real,
                self._policy,
                self._embedding,
                prefix=f"iter_{self._curr_iter}",
                num_rollouts=self.num_real_rollouts,
                num_segments=self.num_segments,
                len_segments=self.len_segments,
            )

            # Save the target domain data
            if self._curr_iter == 0:
                # Append the first set of data
                pyrado.save(self._curr_data_real, "data_real.pt", self._save_dir)
            else:
                # Append and save all data
                prev_data = pyrado.load("data_real.pt", self._save_dir)
                data_real_hist = to.cat([prev_data, self._curr_data_real], dim=0)
                pyrado.save(data_real_hist, "data_real.pt", self._save_dir)

        # Initialize sbi simulator and prior
        self._setup_sbi(
            prior=self._sbi_prior,
            rollouts_real=pyrado.load("rollouts_real.pkl", self._save_dir, prefix=f"iter_{self._curr_iter}"),
        )

        self.reached_checkpoint()  # setting counter to 1

    if self.curr_checkpoint == 1:
        # Instantiate the sbi subroutine to retrain from scratch each iteration
        if self.reset_sbi_routine_each_iter:
            self._initialize_subrtn_sbi(subrtn_sbi_class=SNPE_A, num_components=self._num_components)

        # Initialize the proposal with the prior
        proposal = self._sbi_prior

        # Multi-round sbi
        for idx_r in range(self.num_sbi_rounds):
            # Sample parameters proposal, and simulate these parameters to obtain the data
            domain_param, data_sim = simulate_for_sbi(
                simulator=self._sbi_simulator,
                proposal=proposal,
                num_simulations=self.num_sim_per_round,
                simulation_batch_size=self.simulation_batch_size,
                num_workers=self.num_workers,
            )
            # Keep track of the total number of simulated time steps
            self._cnt_samples += self.num_sim_per_round * self._env_sim_sbi.max_steps

            # Append simulations and proposals for sbi
            self._subrtn_sbi.append_simulations(
                domain_param,
                data_sim,
                proposal=proposal,  # do not pass proposal arg for SNLE or SNRE
            )

            # Train the posterior
            density_estimator = self._subrtn_sbi.train(
                final_round=idx_r == self.num_sbi_rounds - 1,
                component_perturbation=self._component_perturbation,
                **self.subrtn_sbi_training_hparam,
            )
            posterior = self._subrtn_sbi.build_posterior(density_estimator=density_estimator,
                                                         **self.subrtn_sbi_sampling_hparam)

            # Save the posterior of this iteration before tailoring it to the data (when it is still amortized)
            if idx_r == 0:
                pyrado.save(
                    posterior,
                    "posterior.pt",
                    self._save_dir,
                    prefix=f"iter_{self._curr_iter}",
                )

            # Set proposal of the next round to focus on the next data set.
            # set_default_x() expects dim [1, num_rollouts * data_samples]
            proposal = posterior.set_default_x(self._curr_data_real)

            # Save the posterior tailored to each round
            pyrado.save(
                posterior,
                "posterior.pt",
                self._save_dir,
                prefix=f"iter_{self._curr_iter}_round_{idx_r}",
            )

            # Override the latest posterior
            pyrado.save(posterior, "posterior.pt", self._save_dir)

        self.reached_checkpoint()  # setting counter to 2

    if self.curr_checkpoint == 2:
        # Logging (the evaluation can be time-intensive)
        # The posterior is re-loaded from disk, such that this stage also works when resuming from this checkpoint
        posterior = pyrado.load("posterior.pt", self._save_dir)
        self._curr_domain_param_eval, log_probs = SBIBase.eval_posterior(
            posterior,
            self._curr_data_real,
            self.num_eval_samples,
            calculate_log_probs=True,
            normalize_posterior=self.normalize_posterior,
            subrtn_sbi_sampling_hparam=self.subrtn_sbi_sampling_hparam,
        )
        self.logger.add_value("avg log prob", to.mean(log_probs), 4)
        self.logger.add_value("num total samples", self._cnt_samples)

        # Extract the most likely domain parameter set out of all target domain data sets
        current_domain_param = self._env_sim_sbi.domain_param
        # The flat argmax index is split into a (data set, sample) index pair
        # (presumably log_probs is of shape [num_data_sets, num_eval_samples] -- TODO confirm)
        idx_ml = to.argmax(log_probs).item()
        dp_vals = self._curr_domain_param_eval[idx_ml // self.num_eval_samples,
                                               idx_ml % self.num_eval_samples, :]
        dp_vals = to.atleast_1d(dp_vals).numpy()
        ml_domain_param = dict(zip(self.dp_mapping.values(), dp_vals.tolist()))

        # Update the unchanged domain parameters with the most likely ones obtained from the posterior
        current_domain_param.update(ml_domain_param)
        pyrado.save(current_domain_param, "ml_domain_param.pkl", self.save_dir, prefix=f"iter_{self._curr_iter}")

        self.reached_checkpoint()  # setting counter to 3

    if self.curr_checkpoint == 3:
        # Policy optimization
        if self._subrtn_policy is not None:
            # Load the previous policy (or the initial one in the first iteration) in-place into self._policy
            pyrado.load(
                "policy.pt",
                self._save_dir,
                prefix=f"iter_{self._curr_iter - 1}" if self.curr_iter != 0 else "init",
                obj=self._policy,
            )
            # Train the behavioral policy using the posterior samples obtained before.
            # Repeat the training if the resulting policy did not exceed the success threshold.
            print_cbt("Training the next policy using domain parameter sets sampled from the current posterior.",
                      "c")
            wrapped_trn_fcn = until_thold_exceeded(self.thold_succ_subrtn,
                                                   self.max_subrtn_rep)(self.train_policy_sim)
            wrapped_trn_fcn(self._curr_domain_param_eval.squeeze(0),
                            prefix=f"iter_{self._curr_iter}",
                            use_rec_init_states=True)
        else:
            # save prefixed policy either way
            pyrado.save(self.policy,
                        "policy.pt",
                        self.save_dir,
                        prefix=f"iter_{self._curr_iter}",
                        use_state_dict=True)

        self.reached_checkpoint()  # setting counter to 0

    # Save snapshot data
    self.make_snapshot(snapshot_mode, None, meta_info)
def step(self, snapshot_mode: str = 'latest', meta_info: dict = None):
    """
    Run one iteration, split into three resumable stages that are selected via `self.curr_checkpoint`.

    - checkpoint 0: pick the current domain distribution parameters and train the behavioral policy in simulation,
    - checkpoint 1: evaluate the behavioral policy in the target domain,
    - checkpoint 2: train the domain distribution parameter policy, then extend and save the candidates as well
      as their values.

    :param snapshot_mode: not used within this method
    :param meta_info: passed on when saving the candidates and the candidates' values
    """
    # Store the snapshot first, such that the iteration count on disk is correct
    self.save_snapshot()

    if self.curr_checkpoint == 0:
        if self._curr_iter != 0:
            # Continue with the most recent domain distribution parameter set
            assert isinstance(self.cands, to.Tensor)
            cand = self.cands[-1, :].clone()
        else:
            # In the first iteration, start from the policy parameters (initialized from a prior)
            cand = self._subrtn_distr.policy.transform_to_ddp_space(self._subrtn_distr.policy.param_values)
            self.cands = cand.unsqueeze(0)
        print_cbt(f'Current domain distribution parameters: {cand.detach().cpu().numpy()}', 'g')

        # Train and evaluate the behavioral policy, and repeat as long as the success threshold is not exceeded
        repeat_until_success = until_thold_exceeded(self.thold_succ_subrtn.item(), self.max_subrtn_rep)
        train_policy = repeat_until_success(self.train_policy_sim)
        train_policy(cand, prefix=f'iter_{self._curr_iter}')

        # Keep the most recent behavioral policy on disk
        self._subrtn_policy.save_snapshot()

        self.reached_checkpoint()  # setting counter to 1

    if self.curr_checkpoint == 1:
        # Evaluate the policy of this iteration in the target domain
        iter_prefix = f'iter_{self._curr_iter}'
        policy = pyrado.load(self.policy, 'policy', 'pt', self.save_dir, meta_info=dict(prefix=iter_prefix))
        self.eval_behav_policy(self.save_dir, self._env_real, policy, iter_prefix, self.num_eval_rollouts, None)
        # (The additional evaluation of the random initialization in the first iteration is disabled.)

        self.reached_checkpoint()  # setting counter to 2

    if self.curr_checkpoint == 2:
        # Train and evaluate the policy which represents the domain parameter distribution
        iter_prefix = f'iter_{self._curr_iter}'
        rollouts_real = pyrado.load(None, 'rollouts_real', 'pkl', self.save_dir,
                                    meta_info=dict(prefix=iter_prefix))
        curr_cand_value = self.train_ddp_policy(rollouts_real, prefix=iter_prefix)

        # Start the value history in the first iteration, extend it afterwards
        new_value = to.tensor(curr_cand_value).unsqueeze(0)
        if self._curr_iter == 0:
            self.cands_values = new_value
        else:
            self.cands_values = to.cat([self.cands_values, new_value], dim=0)
        pyrado.save(self.cands_values, 'candidates_values', 'pt', self.save_dir, meta_info)

        # The next candidate is the current search distribution and not the best policy parameter set (is saved)
        ddp_policy = self._subrtn_distr.policy
        next_cand = ddp_policy.transform_to_ddp_space(ddp_policy.param_values)
        self.cands = to.cat([self.cands, next_cand.unsqueeze(0)], dim=0)
        pyrado.save(self.cands, 'candidates', 'pt', self.save_dir, meta_info)

        # Keep the most recent domain distribution parameter policy on disk
        ddp_meta_info = dict(prefix='ddp', rollouts_real=rollouts_real)
        self._subrtn_distr.save_snapshot(meta_info=ddp_meta_info)

        self.reached_checkpoint()  # setting counter to 0
if __name__ == "__main__": # Parse command line arguments parser = get_argparser() parser.set_defaults( animation=True) # different default value for this script args = parser.parse_args() if not isinstance(args.num_samples, int) or args.num_samples < 1: raise pyrado.ValueErr(given=args.num_samples, ge_constraint="1") # Get the experiment's directory to load from ex_dir = ask_for_experiment( hparam_list=args.show_hparams) if args.dir is None else args.dir # Load the experiment env, policy, kwout = load_experiment(ex_dir, args) env_real = pyrado.load("env_real.pkl", ex_dir) data_real = kwout["data_real"] if args.iter == -1: # This script is not made to evaluate multiple iterations at once, thus we always select the data one iteration data_real = to.atleast_2d(data_real[args.iter]) # Override the time step size if specified if args.dt is not None: env.dt = args.dt # Use the environments number of steps in case of the default argument (inf) max_steps = env.max_steps if args.max_steps == pyrado.inf else args.max_steps # Check which algorithm was used in the experiment algo = Algorithm.load_snapshot(load_dir=ex_dir, load_name="algo") if not isinstance(algo, (NPDR, BayesSim)):
if isinstance(inner_env(env_real), SimEnv): # Use actual ground truth domain param if sim-2-sim setting domain_params = env_real.domain_param else: # Use nominal domain param if sim-2-real setting domain_params = inner_env(env_sim).get_nominal_domain_param() for dp_name, dp_val in domain_params.items(): if dp_name in labels_sel_dims[0]: gt_val_x = dp_val try: if dp_name == labels_sel_dims[1]: gt_val_y = dp_val except Exception: gt_val_y = None cands = pyrado.load("candidates.pt", ex_dir) cands_values = pyrado.load("candidates_values.pt", ex_dir).unsqueeze(1) ddp_space = pyrado.load("ddp_space.pkl", ex_dir) dim_cand = cands.shape[1] # number of domain distribution parameters if dim_cand % 2 != 0: raise pyrado.ShapeErr(msg="The dimension of domain distribution parameters must be a multiple of 2!") # Select dimensions to plot (ignored for 1D mode) if len(args.idcs) == 1: # Plot 1D x_label = labels_sel_dims[0] # could override manually here y_label = r"$\hat{J}^{\textrm{real}}$" fig, ax = plt.subplots(1, figsize=(6, 4), constrained_layout=True) elif len(args.idcs) == 2:
plt.rc("text", usetex=args.use_tex) if not isinstance(args.num_samples, int) or args.num_samples < 1: raise pyrado.ValueErr(given=args.num_samples, ge_constraint="1") # Get the experiment's directory to load from ex_dir = ask_for_experiment( hparam_list=args.show_hparams) if args.dir is None else args.dir # Load the algorithm algo = Algorithm.load_snapshot(ex_dir) if not isinstance(algo, (NPDR, BayesSim)): raise pyrado.TypeErr(given=algo, expected_type=(NPDR, BayesSim)) # Load the environments, the policy, and the posterior env_sim, policy, kwout = load_experiment(ex_dir, args) env_real = pyrado.load("env_real.pkl", ex_dir) prior = kwout["prior"] posterior = kwout["posterior"] data_real = kwout["data_real"] if args.mode.lower() == "evolution-round" and args.iter == -1: args.iter = algo.curr_iter print_cbt( "Set the evaluation iteration to the latest iteration of the algorithm.", "y") # Load the sequence of posteriors if desired if args.mode.lower() == "evolution-iter": posterior = [ SBIBase.load_posterior(ex_dir, idx_iter=i, verbose=True) for i in range(algo.max_iter)
def load_experiment(
        ex_dir: str, args: Any = None) -> (Union[SimEnv, EnvWrapper], Policy, dict):
    """
    Load the (training) environment and the policy.
    This helper function first tries to read the hyper-parameters yaml-file in the experiment's directory to infer
    which entities should be loaded. If no file was found, we fall back to some heuristic and hope for the best.

    :param ex_dir: experiment's parent directory
    :param args: arguments from the argument parser, pass `None` to fall back to the values from the default argparser
    :return: environment, policy, and optional output (e.g. valuefcn); environment and policy can be `None`
    :raises pyrado.TypeErr: if the loaded algorithm is not supported, if a randomized environment is not of the
                            expected wrapper type, or if one of the loaded entities has an unexpected type
    :raises NotImplementedError: if the algorithm is value-based, but neither DQL nor SAC
    """
    env, policy, extra = None, None, dict()

    if args is None:
        # Fall back to default arguments. By passing [], we ignore the command line arguments
        args = get_argparser().parse_args([])

    # Hyper-parameters (best effort, the loading continues without them)
    hparams_file_name = 'hyperparams.yaml'
    try:
        hparams = load_dict_from_yaml(osp.join(ex_dir, hparams_file_name))
        extra['hparams'] = hparams
    except (pyrado.PathErr, FileNotFoundError, KeyError):
        print_cbt(
            f'Did not find {hparams_file_name} in {ex_dir} or could not crawl the loaded hyper-parameters.',
            'y',
            bright=True)

    # Algorithm specific
    algo = Algorithm.load_snapshot(load_dir=ex_dir, load_name='algo')

    if isinstance(algo, BayRn):
        # Environment
        env = pyrado.load(None, 'env_sim', 'pkl', ex_dir, None)
        print_cbt(f"Loaded {osp.join(ex_dir, 'env_sim.pkl')}.", 'g')
        if hasattr(env, 'randomizer'):
            # Set the randomizer to the latest candidate
            last_cand = to.load(osp.join(ex_dir, 'candidates.pt'))[-1, :]
            env.adapt_randomizer(last_cand.numpy())
            print_cbt(f'Loaded the domain randomizer\n{env.randomizer}', 'w')
        else:
            print_cbt('Loaded environment has no randomizer.', 'r')
        # Policy
        policy = pyrado.load(algo.policy, f'{args.policy_name}', 'pt', ex_dir, None)
        print_cbt(f"Loaded {osp.join(ex_dir, f'{args.policy_name}.pt')}", 'g')
        # Extra (value function)
        if isinstance(algo.subroutine, ActorCritic):
            extra['vfcn'] = pyrado.load(algo.subroutine.critic.vfcn, f'{args.vfcn_name}', 'pt', ex_dir, None)
            print_cbt(f"Loaded {osp.join(ex_dir, f'{args.vfcn_name}.pt')}", 'g')

    elif isinstance(algo, SPOTA):
        # Environment
        env = pyrado.load(None, 'env', 'pkl', ex_dir, None)
        print_cbt(f"Loaded {osp.join(ex_dir, 'env.pkl')}.", 'g')
        if hasattr(env, 'randomizer'):
            # Check the environment (wrapper) itself, and not its randomizer, which is not a wrapper
            if not isinstance(env, DomainRandWrapperBuffer):
                raise pyrado.TypeErr(given=env, expected_type=DomainRandWrapperBuffer)
            typed_env(env, DomainRandWrapperBuffer).fill_buffer(100)
            print_cbt(
                f"Loaded {osp.join(ex_dir, 'env.pkl')} and filled it with 100 random instances.",
                'g')
        else:
            print_cbt('Loaded environment has no randomizer.', 'r')
        # Policy
        policy = pyrado.load(algo.subroutine_cand.policy, f'{args.policy_name}', 'pt', ex_dir, None)
        print_cbt(f"Loaded {osp.join(ex_dir, f'{args.policy_name}.pt')}", 'g')
        # Extra (value function)
        if isinstance(algo.subroutine_cand, ActorCritic):
            extra['vfcn'] = pyrado.load(algo.subroutine_cand.critic.vfcn, f'{args.vfcn_name}', 'pt', ex_dir, None)
            print_cbt(f"Loaded {osp.join(ex_dir, f'{args.vfcn_name}.pt')}", 'g')

    elif isinstance(algo, SimOpt):
        # Environment
        env = pyrado.load(None, 'env_sim', 'pkl', ex_dir, None)
        print_cbt(f"Loaded {osp.join(ex_dir, 'env_sim.pkl')}.", 'g')
        if hasattr(env, 'randomizer'):
            # Set the randomizer to the latest candidate
            last_cand = to.load(osp.join(ex_dir, 'candidates.pt'))[-1, :]
            env.adapt_randomizer(last_cand.numpy())
            print_cbt(f'Loaded the domain randomizer\n{env.randomizer}', 'w')
        else:
            print_cbt('Loaded environment has no randomizer.', 'r')
        # Policy
        policy = pyrado.load(algo.subroutine_policy.policy, f'{args.policy_name}', 'pt', ex_dir, None)
        print_cbt(f"Loaded {osp.join(ex_dir, f'{args.policy_name}.pt')}", 'g')
        # Extra (domain parameter distribution policy)
        extra['ddp_policy'] = pyrado.load(algo.subroutine_distr.policy, 'ddp_policy', 'pt', ex_dir, None)

    elif isinstance(algo, (EPOpt, UDR)):
        # Environment
        env = pyrado.load(None, 'env_sim', 'pkl', ex_dir, None)
        if hasattr(env, 'randomizer'):
            # Check the environment (wrapper) itself, and not its randomizer, which is not a wrapper
            if not isinstance(env, DomainRandWrapperLive):
                raise pyrado.TypeErr(given=env, expected_type=DomainRandWrapperLive)
            # Report the file that was actually loaded (env_sim.pkl)
            print_cbt(
                f"Loaded {osp.join(ex_dir, 'env_sim.pkl')} with DomainRandWrapperLive randomizer.",
                'g')
        else:
            print_cbt('Loaded environment has no randomizer.', 'y')
        # Policy
        policy = pyrado.load(algo.policy, f'{args.policy_name}', 'pt', ex_dir, None)
        print_cbt(f"Loaded {osp.join(ex_dir, f'{args.policy_name}.pt')}", 'g')
        # Extra (value function)
        if isinstance(algo.subroutine, ActorCritic):
            extra['vfcn'] = pyrado.load(algo.subroutine.critic.vfcn, f'{args.vfcn_name}', 'pt', ex_dir, None)
            print_cbt(f"Loaded {osp.join(ex_dir, f'{args.vfcn_name}.pt')}", 'g')

    elif isinstance(algo, ActorCritic):
        # Environment
        env = pyrado.load(None, 'env', 'pkl', ex_dir, None)
        # Policy
        policy = pyrado.load(algo.policy, f'{args.policy_name}', 'pt', ex_dir, None)
        print_cbt(f"Loaded {osp.join(ex_dir, f'{args.policy_name}.pt')}", 'g')
        # Extra (value function)
        extra['vfcn'] = pyrado.load(algo.critic.vfcn, f'{args.vfcn_name}', 'pt', ex_dir, None)
        print_cbt(f"Loaded {osp.join(ex_dir, f'{args.vfcn_name}.pt')}", 'g')

    elif isinstance(algo, ParameterExploring):
        # Environment
        env = pyrado.load(None, 'env', 'pkl', ex_dir, None)
        # Policy
        policy = pyrado.load(algo.policy, f'{args.policy_name}', 'pt', ex_dir, None)
        print_cbt(f"Loaded {osp.join(ex_dir, f'{args.policy_name}.pt')}", 'g')

    elif isinstance(algo, ValueBased):
        # Environment
        env = pyrado.load(None, 'env', 'pkl', ex_dir, None)
        # Policy
        policy = pyrado.load(algo.policy, f'{args.policy_name}', 'pt', ex_dir, None)
        print_cbt(f"Loaded {osp.join(ex_dir, f'{args.policy_name}.pt')}", 'g')
        # Target value functions
        if isinstance(algo, DQL):
            extra['qfcn_target'] = pyrado.load(algo.qfcn_targ, 'qfcn_target', 'pt', ex_dir, None)
            print_cbt(f"Loaded {osp.join(ex_dir, 'qfcn_target.pt')}", 'g')
        elif isinstance(algo, SAC):
            extra['qfcn_target1'] = pyrado.load(algo.qfcn_targ_1, 'qfcn_target1', 'pt', ex_dir, None)
            extra['qfcn_target2'] = pyrado.load(algo.qfcn_targ_2, 'qfcn_target2', 'pt', ex_dir, None)
            print_cbt(
                f"Loaded {osp.join(ex_dir, 'qfcn_target1.pt')} and {osp.join(ex_dir, 'qfcn_target2.pt')}",
                'g')
        else:
            raise NotImplementedError

    elif isinstance(algo, SVPG):
        # Environment
        env = pyrado.load(None, 'env', 'pkl', ex_dir, None)
        # Policy
        policy = pyrado.load(algo.policy, f'{args.policy_name}', 'pt', ex_dir, None)
        print_cbt(f"Loaded {osp.join(ex_dir, f'{args.policy_name}.pt')}", 'g')
        # Extra (particles)
        for idx, particle in enumerate(algo.particles):
            extra[f'particle{idx}'] = pyrado.load(particle, f'particle_{idx}', 'pt', ex_dir, None)

    elif isinstance(algo, TSPred):
        # Dataset
        extra['dataset'] = to.load(osp.join(ex_dir, 'dataset.pt'))
        # Policy
        policy = pyrado.load(algo.policy, f'{args.policy_name}', 'pt', ex_dir, None)

    else:
        raise pyrado.TypeErr(
            msg='No matching algorithm name found during loading the experiment!')

    # Check if the return types are correct. They can be None, too.
    if env is not None and not isinstance(env, (SimEnv, EnvWrapper)):
        raise pyrado.TypeErr(given=env, expected_type=[SimEnv, EnvWrapper])
    if policy is not None and not isinstance(policy, Policy):
        raise pyrado.TypeErr(given=policy, expected_type=Policy)
    if extra is not None and not isinstance(extra, dict):
        raise pyrado.TypeErr(given=extra, expected_type=dict)

    return env, policy, extra
def load_experiment(
    ex_dir: str, args: Any = None
) -> Tuple[Optional[Union[SimEnv, EnvWrapper]], Optional[Policy], Optional[dict]]:
    """
    Load the (training) environment and the policy.
    This helper function first tries to read the hyper-parameters yaml-file in the experiment's directory to infer
    which entities should be loaded. If no file was found, we fall back to some heuristic and hope for the best.

    :param ex_dir: experiment's parent directory
    :param args: arguments from the argument parser, pass `None` to fall back to the values from the default argparser
    :return: environment, policy, and optional output (e.g. valuefcn); environment and policy can be `None`
    :raises pyrado.TypeErr: if the algorithm's name is not supported, if a randomized environment is not of the
                            expected wrapper type, or if one of the loaded entities has an unexpected type
    """
    env, policy, extra = None, None, dict()

    if args is None:
        # Fall back to default arguments. By passing [], we ignore the command line arguments
        args = get_argparser().parse_args([])

    # Hyper-parameters
    extra["hparams"] = load_hyperparameters(ex_dir)

    # Algorithm specific
    algo = Algorithm.load_snapshot(load_dir=ex_dir, load_name="algo")

    if algo.name == "spota":
        # Environment
        env = pyrado.load("env.pkl", ex_dir)
        if getattr(env, "randomizer", None) is not None:
            if not isinstance(env, DomainRandWrapperBuffer):
                raise pyrado.TypeErr(given=env, expected_type=DomainRandWrapperBuffer)
            typed_env(env, DomainRandWrapperBuffer).fill_buffer(10)
            print_cbt(
                f"Loaded the domain randomizer\n{env.randomizer}\nand filled it with 10 random instances.",
                "w")
        else:
            print_cbt("Loaded environment has no randomizer, or it is None.", "r")
        # Policy (file name first, the object to load into is passed via obj, like in all other branches)
        policy = pyrado.load(f"{args.policy_name}.pt", ex_dir, obj=algo.subroutine_cand.policy, verbose=True)
        # Extra (value function)
        if isinstance(algo.subroutine_cand, ActorCritic):
            extra["vfcn"] = pyrado.load(f"{args.vfcn_name}.pt",
                                        ex_dir,
                                        obj=algo.subroutine_cand.critic.vfcn,
                                        verbose=True)

    elif algo.name == "bayrn":
        # Environment
        env = pyrado.load("env_sim.pkl", ex_dir)
        # Also skip the adaption if the randomizer exists but is None, as announced by the message below
        if getattr(env, "randomizer", None) is not None:
            last_cand = to.load(osp.join(ex_dir, "candidates.pt"))[-1, :]
            env.adapt_randomizer(last_cand.numpy())
            print_cbt(f"Loaded the domain randomizer\n{env.randomizer}", "w")
        else:
            print_cbt("Loaded environment has no randomizer, or it is None.", "r")
        # Policy
        policy = pyrado.load(f"{args.policy_name}.pt", ex_dir, obj=algo.policy, verbose=True)
        # Extra (value function)
        if isinstance(algo.subroutine, ActorCritic):
            extra["vfcn"] = pyrado.load(f"{args.vfcn_name}.pt",
                                        ex_dir,
                                        obj=algo.subroutine.critic.vfcn,
                                        verbose=True)

    elif algo.name == "simopt":
        # Environment
        env = pyrado.load("env_sim.pkl", ex_dir)
        if getattr(env, "randomizer", None) is not None:
            last_cand = to.load(osp.join(ex_dir, "candidates.pt"))[-1, :]
            env.adapt_randomizer(last_cand.numpy())
            print_cbt(f"Loaded the domain randomizer\n{env.randomizer}", "w")
        else:
            print_cbt("Loaded environment has no randomizer, or it is None.", "r")
        # Policy
        policy = pyrado.load(f"{args.policy_name}.pt", ex_dir, obj=algo.subroutine_policy.policy, verbose=True)
        # Extra (domain parameter distribution policy)
        extra["ddp_policy"] = pyrado.load("ddp_policy.pt", ex_dir, obj=algo.subroutine_distr.policy, verbose=True)

    elif algo.name in ["epopt", "udr"]:
        # Environment
        env = pyrado.load("env_sim.pkl", ex_dir)
        if getattr(env, "randomizer", None) is not None:
            if not isinstance(env, DomainRandWrapperLive):
                raise pyrado.TypeErr(given=env, expected_type=DomainRandWrapperLive)
            print_cbt(f"Loaded the domain randomizer\n{env.randomizer}", "w")
        else:
            print_cbt("Loaded environment has no randomizer, or it is None.", "y")
        # Policy
        policy = pyrado.load(f"{args.policy_name}.pt", ex_dir, obj=algo.policy, verbose=True)
        # Extra (value function)
        if isinstance(algo.subroutine, ActorCritic):
            extra["vfcn"] = pyrado.load(f"{args.vfcn_name}.pt",
                                        ex_dir,
                                        obj=algo.subroutine.critic.vfcn,
                                        verbose=True)

    elif algo.name in ["bayessim", "npdr"]:
        # Environment
        env = pyrado.load("env_sim.pkl", ex_dir)
        if getattr(env, "randomizer", None) is not None:
            if not isinstance(env, DomainRandWrapperBuffer):
                raise pyrado.TypeErr(given=env, expected_type=DomainRandWrapperBuffer)
            typed_env(env, DomainRandWrapperBuffer).fill_buffer(10)
            print_cbt(
                f"Loaded the domain randomizer\n{env.randomizer}\nand filled it with 10 random instances.",
                "w")
        else:
            print_cbt("Loaded environment has no randomizer, or it is None.", "y")
            # NOTE(review): nesting reconstructed from a whitespace-mangled source; the wrappers are assumed to
            # be removed only if there is no usable randomizer -- confirm against the original file.
            env = remove_all_dr_wrappers(env, verbose=True)
        # Policy
        policy = pyrado.load(f"{args.policy_name}.pt", ex_dir, obj=algo.policy, verbose=True)
        # Extra (prior, posterior, data)
        extra["prior"] = pyrado.load("prior.pt", ex_dir, verbose=True)
        # By default load the latest posterior (latest iteration and the last round)
        try:
            extra["posterior"] = algo.load_posterior(ex_dir, args.iter, args.round, obj=None, verbose=True)
            # Load the complete data or the data of the given iteration
            prefix = "" if args.iter == -1 else f"iter_{args.iter}"
            extra["data_real"] = pyrado.load("data_real.pt", ex_dir, prefix=prefix, verbose=True)
        except FileNotFoundError:
            # Best effort: the experiment might have been stopped before these files were written.
            # Do not fail, but tell the user that the corresponding entries will be missing.
            print_cbt("Could not find the posterior and / or the real-world data, thus skipped loading them.", "y")

    elif algo.name in ["a2c", "ppo", "ppo2"]:
        # Environment
        env = pyrado.load("env.pkl", ex_dir)
        # Policy
        policy = pyrado.load(f"{args.policy_name}.pt", ex_dir, obj=algo.policy, verbose=True)
        # Extra (value function)
        extra["vfcn"] = pyrado.load(f"{args.vfcn_name}.pt", ex_dir, obj=algo.critic.vfcn, verbose=True)

    elif algo.name in ["hc", "pepg", "power", "cem", "reps", "nes"]:
        # Environment
        env = pyrado.load("env.pkl", ex_dir)
        # Policy
        policy = pyrado.load(f"{args.policy_name}.pt", ex_dir, obj=algo.policy, verbose=True)

    elif algo.name in ["dql", "sac"]:
        # Environment
        env = pyrado.load("env.pkl", ex_dir)
        # Policy
        policy = pyrado.load(f"{args.policy_name}.pt", ex_dir, obj=algo.policy, verbose=True)
        # Target value functions
        if algo.name == "dql":
            extra["qfcn_target"] = pyrado.load("qfcn_target.pt", ex_dir, obj=algo.qfcn_targ, verbose=True)
        elif algo.name == "sac":
            extra["qfcn_target1"] = pyrado.load("qfcn_target1.pt", ex_dir, obj=algo.qfcn_targ_1, verbose=True)
            extra["qfcn_target2"] = pyrado.load("qfcn_target2.pt", ex_dir, obj=algo.qfcn_targ_2, verbose=True)
        else:
            raise NotImplementedError

    elif algo.name == "svpg":
        # Environment
        env = pyrado.load("env.pkl", ex_dir)
        # Policy
        policy = pyrado.load(f"{args.policy_name}.pt", ex_dir, obj=algo.policy, verbose=True)
        # Extra (particles)
        for idx, particle in enumerate(algo.particles):
            extra[f"particle{idx}"] = pyrado.load(f"particle_{idx}.pt", ex_dir, obj=particle, verbose=True)

    elif algo.name == "tspred":
        # Dataset
        extra["dataset"] = to.load(osp.join(ex_dir, "dataset.pt"))
        # Policy
        policy = pyrado.load(f"{args.policy_name}.pt", ex_dir, obj=algo.policy, verbose=True)

    elif algo.name == "sprl":
        # Environment
        env = pyrado.load("env.pkl", ex_dir)
        print_cbt(f"Loaded {osp.join(ex_dir, 'env.pkl')}.", "g")
        # Policy
        policy = pyrado.load(f"{args.policy_name}.pt", ex_dir, obj=algo.policy)
        print_cbt(f"Loaded {osp.join(ex_dir, f'{args.policy_name}.pt')}", "g")
        # Extra (value function)
        if isinstance(algo._subroutine, ActorCritic):
            extra["vfcn"] = pyrado.load(f"{args.vfcn_name}.pt",
                                        ex_dir,
                                        obj=algo._subroutine.critic.vfcn,
                                        verbose=True)

    elif algo.name == "pddr":
        # Environment
        env = pyrado.load("env.pkl", ex_dir)
        # Policy
        policy = pyrado.load(f"{args.policy_name}.pt", ex_dir, obj=algo.policy, verbose=True)
        # Teachers
        extra["teacher_policies"] = algo.teacher_policies
        extra["teacher_envs"] = algo.teacher_envs
        extra["teacher_expl_strats"] = algo.teacher_expl_strats
        extra["teacher_critics"] = algo.teacher_critics
        extra["teacher_ex_dirs"] = algo.teacher_ex_dirs

    else:
        raise pyrado.TypeErr(
            msg="No matching algorithm name found during loading the experiment!")

    # Check if the return types are correct. They can be None, too.
    if env is not None and not isinstance(env, (SimEnv, EnvWrapper)):
        raise pyrado.TypeErr(given=env, expected_type=[SimEnv, EnvWrapper])
    if policy is not None and not isinstance(policy, Policy):
        raise pyrado.TypeErr(given=policy, expected_type=Policy)
    if extra is not None and not isinstance(extra, dict):
        raise pyrado.TypeErr(given=extra, expected_type=dict)

    return env, policy, extra
def step(self, snapshot_mode: str = 'latest', meta_info: dict = None):
    """
    Run one iteration, split into resumable stages that are selected via `self.curr_checkpoint`.

    - checkpoint -2: train the initial policies in the source domain,
    - checkpoint -1: evaluate the initial policies in the target domain,
    - checkpoint 0: fit a GP to the candidates and their values, and get the next candidate by optimizing the
      acquisition function,
    - checkpoint 1: train a policy in simulation using the latest candidate,
    - checkpoint 2: evaluate this policy in the target domain and update the argmax of the posterior mean.

    :param snapshot_mode: not used within this method
    :param meta_info: passed on when saving the candidates, their values, and the argmax candidates
    """
    # Store the snapshot first, such that the iteration count on disk is correct
    self.save_snapshot()

    if self.curr_checkpoint == -2:
        # Initial policies: training in the source domain
        self.train_init_policies()
        self.reached_checkpoint()  # setting counter to -1

    if self.curr_checkpoint == -1:
        # Initial policies: evaluation in the target domain
        self.eval_init_policies()
        self.reached_checkpoint()  # setting counter to 0

    if self.curr_checkpoint == 0:
        # The GP works on normalized inputs and standardized outputs
        cands_norm = self.ddp_projector.project_to(self.cands)
        cands_values_stdized = standardize(self.cands_values).unsqueeze(1)

        # Set up the GP model and fit it
        gp = SingleTaskGP(cands_norm, cands_values_stdized)
        gp.likelihood.noise_covar.register_constraint('raw_noise', GreaterThan(1e-5))
        mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
        fit_gpytorch_model(mll)
        print_cbt('Fitted the GP.', 'g')

        # Select the acquisition function
        acq_type = self.acq_fcn_type
        if acq_type == 'UCB':
            acq_fcn = UpperConfidenceBound(gp, beta=self.acq_param.get('beta', 0.1), maximize=True)
        elif acq_type in ('EI', 'PI'):
            # Both improvement-based acquisition functions are constructed from the best standardized value
            improvement_cls = ExpectedImprovement if acq_type == 'EI' else ProbabilityOfImprovement
            acq_fcn = improvement_cls(gp, best_f=cands_values_stdized.max().item(), maximize=True)
        else:
            raise pyrado.ValueErr(given=self.acq_fcn_type, eq_constraint="'UCB', 'EI', 'PI'")

        # Optimize the acquisition function within the unit box to get the new candidate point
        num_ddp = self.ddp_space.flat_dim
        unit_box = to.stack([to.zeros(num_ddp), to.ones(num_ddp)])
        cand_norm, _ = optimize_acqf(
            acq_function=acq_fcn,
            bounds=unit_box,
            q=1,
            num_restarts=self.acq_restarts,
            raw_samples=self.acq_samples,
        )

        # Project the candidate back to the original space, then append and save it
        next_cand = self.ddp_projector.project_back(cand_norm)
        print_cbt(f'Found the next candidate: {next_cand.numpy()}', 'g')
        self.cands = to.cat([self.cands, next_cand], dim=0)
        pyrado.save(self.cands, 'candidates', 'pt', self.save_dir, meta_info)
        self.reached_checkpoint()  # setting counter to 1

    if self.curr_checkpoint == 1:
        # Train and evaluate a new policy, and repeat as long as the success threshold is not exceeded
        repeat_until_success = until_thold_exceeded(self.thold_succ_subrtn.item(), self.max_subrtn_rep)
        train_policy = repeat_until_success(self.train_policy_sim)
        train_policy(self.cands[-1, :], prefix=f'iter_{self._curr_iter}')
        self.reached_checkpoint()  # setting counter to 2

    if self.curr_checkpoint == 2:
        # Evaluate the policy of this iteration in the target domain
        iter_prefix = f'iter_{self._curr_iter}'
        policy = pyrado.load(self.policy, 'policy', 'pt', self.save_dir, meta_info=dict(prefix=iter_prefix))
        self.curr_cand_value = self.eval_policy(
            self.save_dir,
            self._env_real,
            policy,
            self.mc_estimator,
            iter_prefix,
            self.num_eval_rollouts_real,
        )
        new_value = self.curr_cand_value.view(1)
        self.cands_values = to.cat([self.cands_values, new_value], dim=0)
        pyrado.save(self.cands_values, 'candidates_values', 'pt', self.save_dir, meta_info)

        # Determine and store the argmax of the posterior mean after training and evaluating
        curr_argmax_cand = BayRn.argmax_posterior_mean(
            self.cands,
            self.cands_values.unsqueeze(1),
            self.ddp_space,
            self.acq_restarts,
            self.acq_samples,
        )
        self.argmax_cand = to.cat([self.argmax_cand, curr_argmax_cand], dim=0)
        pyrado.save(self.argmax_cand, 'candidates_argmax', 'pt', self.save_dir, meta_info)
        self.reached_checkpoint()  # setting counter to 0
if isinstance(inner_env(env_real), SimEnv): # Use actual ground truth domain param if sim-2-sim setting domain_params = env_real.domain_param else: # Use nominal domain param if sim-2-real setting domain_params = inner_env(env_sim).get_nominal_domain_param() for dp_name, dp_val in domain_params.items(): if dp_name in labels_sel_dims[0]: gt_val_x = dp_val try: if dp_name == labels_sel_dims[1]: gt_val_y = dp_val except Exception: gt_val_y = None cands = pyrado.load(None, 'candidates', 'pt', ex_dir) cands_values = pyrado.load(None, 'candidates_values', 'pt', ex_dir).unsqueeze(1) ddp_space = pyrado.load(None, 'ddp_space', 'pkl', ex_dir) dim_cand = cands.shape[1] # number of domain distribution parameters if dim_cand % 2 != 0: raise pyrado.ShapeErr( msg= 'The dimension of domain distribution parameters must be a multiple of 2!' ) # Select dimensions to plot (ignored for 1D mode) if len(args.idcs) == 1: # Plot 1D x_label = labels_sel_dims[0] # could override manually here