Example #1
    def eval_init_policies(self):
        """
        Execute the trained initial policies on the target device and store the estimated return per candidate.
        The number of initial policies to evaluate is the number of found policies.
        """
        # Crawl through the experiment's directory
        for root, dirs, files in os.walk(self.save_dir):
            dirs.clear()  # prevents walk() from going into subdirectories
            found_policies = [p for p in files if p.startswith('init_') and p.endswith('_policy.pt')]
            found_cands = [c for c in files if c.startswith('init_') and c.endswith('_candidate.pt')]
        if len(found_policies) != len(found_cands):
            raise pyrado.ValueErr(msg='Found a different number of initial policies than candidates!')
        elif len(found_policies) == 0:
            raise pyrado.ValueErr(msg='No policies or candidates found!')

        num_init_cand = len(found_cands)
        cands_values = to.empty(num_init_cand)

        # Load all found candidates to save them into a single tensor
        found_cands = natural_sort(found_cands)  # the order is important since it determines the rows of the tensor
        cands = to.stack([to.load(osp.join(self.save_dir, c)) for c in found_cands])

        # Evaluate learned policies from random candidates on the target environment (real-world) system
        for i in range(num_init_cand):
            policy = pyrado.load(self.policy, 'policy', 'pt', self.save_dir, meta_info=dict(prefix=f'init_{i}'))
            cands_values[i] = self.eval_policy(self.save_dir, self._env_real, policy, self.mc_estimator,
                                               prefix=f'init_{i}', num_rollouts=self.num_eval_rollouts_real)

        # Save the candidates and their returns as tensors (the policy is saved during training or exists already)
        # pyrado.save(cands, 'candidates', 'pt', self._save_dir, meta_info)
        pyrado.save(cands_values, 'candidates_values', 'pt', self.save_dir, meta_info=None)
        self.cands, self.cands_values = cands, cands_values
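The stacked candidates and their estimated returns can be inspected offline once they have been written to disk. A minimal sketch, assuming the experiment directory (hypothetical path below) contains the files under the names used above and in Example #2, i.e. "candidates.pt" and "candidates_values.pt":

import os.path as osp

import torch as to

save_dir = "/path/to/experiment"  # hypothetical experiment directory
cands = to.load(osp.join(save_dir, "candidates.pt"))  # one candidate (domain distribution parameter set) per row
cands_values = to.load(osp.join(save_dir, "candidates_values.pt"))  # one estimated return per candidate
best = to.argmax(cands_values).item()
print(f"Best initial candidate #{best}: {cands[best].numpy()} with estimated return {cands_values[best].item():.3f}")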
Example #2
    def train_init_policies(self):
        """
        Initialize the algorithm with a number of random distribution parameter sets a.k.a. candidates specified by
        the user. Train a policy for every candidate. Finally, store the policies and candidates.
        """
        cands = to.empty(self.num_init_cand, self.ddp_space.shape[0])
        for i in range(self.num_init_cand):
            print_cbt(
                f"Generating initial domain instance and policy {i + 1} of {self.num_init_cand} ...",
                "g",
                bright=True)
            # Sample random domain distribution parameters
            cands[i, :] = to.from_numpy(self.ddp_space.sample_uniform())

            # Train a policy for each candidate, repeat if the resulting policy did not exceed the success threshold
            print_cbt(
                f"Randomly sampled the next candidate: {cands[i].numpy()}",
                "g")
            wrapped_trn_fcn = until_thold_exceeded(
                self.thold_succ_subrtn.item(),
                self.max_subrtn_rep)(self.train_policy_sim)
            wrapped_trn_fcn(cands[i], prefix=f"init_{i}")

        # Save candidates into a single tensor (policy is saved during training or exists already)
        pyrado.save(cands, "candidates.pt", self.save_dir)
        self.cands = cands
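The `until_thold_exceeded` wrapper used above repeats the training function until its return exceeds the success threshold, at most `max_subrtn_rep` times, and forwards the current repetition count (compare `cnt_rep` in Example #14). A minimal sketch of such a decorator factory, written for illustration only and not claimed to be pyrado's actual implementation:

from functools import wraps
from typing import Callable


def until_thold_exceeded_sketch(thold: float, max_rep: int) -> Callable:
    """Repeat the wrapped training function until its return exceeds `thold`, at most `max_rep` times."""

    def decorator(trn_fcn: Callable) -> Callable:
        @wraps(trn_fcn)
        def wrapper(*args, **kwargs):
            ret = -float("inf")
            for cnt_rep in range(max_rep):
                ret = trn_fcn(*args, cnt_rep=cnt_rep, **kwargs)  # forward the repetition count to the subroutine
                if ret >= thold:
                    break
            return ret

        return wrapper

    return decorator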
Example #3
    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        pyrado.save(self.policy, "policy.pt", self.save_dir)

        if meta_info is None:
            # This algorithm instance is not a subroutine of another algorithm
            pyrado.save(self.env_real, "env.pkl", self.save_dir)
Example #4
    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        pyrado.save(self._expl_strat.policy, 'policy', 'pt', self.save_dir, meta_info)

        if meta_info is None:
            # This algorithm instance is not a subroutine of another algorithm
            pyrado.save(self._env, 'env', 'pkl', self.save_dir, meta_info)
Example #5
    def eval_policy(
        save_dir: Optional[pyrado.PathLike],
        env: Env,
        policy: Policy,
        prefix: str,
        num_rollouts: int,
        num_workers: int = 1,
    ) -> to.Tensor:
        """
        Evaluate a policy either in the source or in the target domain.
        This method is static to facilitate evaluation of specific policies in hindsight.

        :param save_dir: directory to save the snapshots i.e. the results in, if `None` nothing is saved
        :param env: target environment for evaluation, in the sim-2-sim case this is another simulation instance
        :param policy: policy to evaluate
        :param prefix: to control the saving for the evaluation of an initial policy, `None` to deactivate
        :param num_rollouts: number of rollouts to collect on the target system
        :param num_workers: number of environments for the parallel sampler (only used for SimEnv)
        :return: estimated return in the target domain
        """
        if save_dir is not None:
            print_cbt(f"Executing {prefix}_policy ...", "c", bright=True)

        if isinstance(inner_env(env), RealEnv):
            # Evaluate sequentially when evaluating on a real-world device
            rets_real = []
            for i in range(num_rollouts):
                rets_real.append(
                    rollout(env, policy, eval=True).undiscounted_return())

        elif isinstance(inner_env(env), SimEnv):
            # Create a parallel sampler when evaluating in a simulation
            sampler = ParallelRolloutSampler(env,
                                             policy,
                                             num_workers=num_workers,
                                             min_rollouts=num_rollouts)
            ros = sampler.sample(eval=True)
            rets_real = [ro.undiscounted_return() for ro in ros]
        else:
            raise pyrado.TypeErr(given=inner_env(env),
                                 expected_type=[RealEnv, SimEnv])

        rets_real = to.as_tensor(rets_real, dtype=to.get_default_dtype())

        if save_dir is not None:
            # Save and print the evaluation results
            pyrado.save(rets_real, "returns_real.pt", save_dir, prefix=prefix)
            print_cbt("Target domain performance", bright=True)
            print(
                tabulate([
                    ["mean return", to.mean(rets_real).item()],
                    ["std return", to.std(rets_real)],
                    ["min return", to.min(rets_real)],
                    ["max return", to.max(rets_real)],
                ]))

        return to.mean(rets_real)
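Since the method is static, a stored policy can be re-evaluated in hindsight. A hedged usage fragment; `MyAlgo` stands for the algorithm class defining `eval_policy`, and `env` and `policy` are assumed to have been loaded from an existing experiment:

# Hypothetical hindsight evaluation; passing save_dir=None skips saving the returns
ret_hat = MyAlgo.eval_policy(save_dir=None, env=env, policy=policy, prefix="final", num_rollouts=10, num_workers=4)
print(f"Estimated return in the target domain: {ret_hat.item():.2f}")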
Example #6
    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        for idx, p in enumerate(self.particles):
            pyrado.save(p, f'particle_{idx}', 'pt', self.save_dir, meta_info)

        if meta_info is None:
            # This algorithm instance is not a subroutine of another algorithm
            pyrado.save(self._env, 'env', 'pkl', self.save_dir, meta_info)
Example #7
    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        if meta_info is None:
            # This algorithm instance is not a subrtn of another algorithm
            pyrado.save(self.env, "env.pkl", self.save_dir)
            self.svpg.save_snapshot(meta_info)
        else:
            raise pyrado.ValueErr(
                msg=f"{self.name} is not supposed be run as a subrtn!")
Example #8
    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        # The subroutines are saving their snapshots during their training
        if meta_info is None:
            # This algorithm instance is not a subroutine of another algorithm
            pyrado.save(self._env_sim, "env_sim.pkl", self._save_dir)
        else:
            raise pyrado.ValueErr(
                msg=f"{self.name} is not supposed be run as a subrtn!")
Example #9
    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        # Save the best element of the current population
        best_policy = deepcopy(self._policy)
        best_policy.param_values = self.best_policy_param
        pyrado.save(best_policy, 'policy', 'pt', self.save_dir, meta_info)

        if meta_info is None:
            # This algorithm instance is not a subroutine of another algorithm
            pyrado.save(self._env, 'env', 'pkl', self.save_dir, meta_info)
Example #10
    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        # Policies of every iteration are saved by the subroutine in train_policy_sim()
        if meta_info is None:
            # This algorithm instance is not a subroutine of another algorithm
            joblib.dump(self._env_sim, osp.join(self.save_dir, 'env_sim.pkl'))
            joblib.dump(self._env_real, osp.join(self.save_dir, 'env_real.pkl'))
            pyrado.save(self.policy, 'policy', 'pt', self.save_dir, None)
        else:
            raise pyrado.ValueErr(msg=f'{self.name} is not supposed to be run as a subroutine!')
Example #11
    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        if meta_info is None:
            # This algorithm instance is not a subroutine of another algorithm
            pyrado.save(self.env_dr, 'env', 'pkl', self.save_dir, meta_info)
            pyrado.save(self.env_dr.randomizer, 'randomizer', 'pkl',
                        self.save_dir, meta_info)
        else:
            raise pyrado.ValueErr(
                msg=f'{self.name} is not supposed to be run as a subroutine!')
Example #12
    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        if meta_info is None:
            # This algorithm instance is not a subroutine of another algorithm
            if self._subrtn_policy is None:
                # The policy is not being updated by a policy optimization subroutine
                pyrado.save(self._policy,
                            "policy.pt",
                            self.save_dir,
                            use_state_dict=True)
            else:
                self._subrtn_policy.save_snapshot()

        else:
            raise pyrado.ValueErr(
                msg=f"{self.name} is not supposed be run as a subroutine!")
Example #13
    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        if meta_info is None:
            # This algorithm instance is not a subroutine of another algorithm
            pyrado.save(self._env, "env.pkl", self.save_dir)
            pyrado.save(self._expl_strat.policy, "policy.pt", self.save_dir, use_state_dict=True)
            pyrado.save(self._critic.vfcn, "vfcn.pt", self.save_dir, use_state_dict=True)

        else:
            # This algorithm instance is a subroutine of another algorithm
            prefix = meta_info.get("prefix", "")
            suffix = meta_info.get("suffix", "")
            pyrado.save(
                self._expl_strat.policy, "policy.pt", self.save_dir, prefix=prefix, suffix=suffix, use_state_dict=True
            )
            pyrado.save(self._critic.vfcn, "vfcn.pt", self.save_dir, prefix=prefix, suffix=suffix, use_state_dict=True)
Example #14
    def train_policy_sim(self, cand: to.Tensor, prefix: str,
                         cnt_rep: int) -> float:
        """
        Train a policy in simulation for given hyper-parameters from the domain randomizer.

        :param cand: hyper-parameters for the domain parameter distribution (need to be compatible with the randomizer)
        :param prefix: set a prefix to the saved file name, use "" for no prefix
        :param cnt_rep: current repetition count, coming from the wrapper function
        :return: estimated return of the trained policy in the target domain
        """
        # Save the current candidate
        pyrado.save(cand.view(-1),
                    "candidate.pt",
                    self.save_dir,
                    prefix=prefix)

        # Set the domain randomizer
        self._env_sim.adapt_randomizer(cand.detach().cpu().numpy())

        # Reset the subroutine algorithm which includes resetting the exploration
        self._cnt_samples += self._subrtn_policy.sample_count
        self._subrtn_policy.reset()

        # Do a warm start if desired, but randomly reset the policy parameters if training failed once
        self._subrtn_policy.init_modules(
            self.warmstart and cnt_rep == 0,
            policy_param_init=self.policy_param_init,
            valuefcn_param_init=self.valuefcn_param_init,
        )

        # Train a policy in simulation using the subroutine
        self._subrtn_policy.train(snapshot_mode=self.subrtn_snapshot_mode,
                                  meta_info=dict(prefix=prefix))

        # Return the estimated return of the trained policy in simulation
        ros = self.eval_behav_policy(None, self._env_sim,
                                     self._subrtn_policy.policy, prefix,
                                     self.num_eval_rollouts)
        avg_ret_sim = to.mean(to.tensor([r.undiscounted_return()
                                         for r in ros]))
        return float(avg_ret_sim)
Example #15
    def save_snapshot(self, meta_info: Optional[dict] = None):
        super().save_snapshot(meta_info)

        # Save the best element of the current population
        best_policy = deepcopy(self._policy)
        best_policy.param_values = self.best_policy_param

        if meta_info is not None:
            # This algorithm instance is a subroutine of another algorithm
            pyrado.save(
                best_policy,
                "policy.pt",
                self.save_dir,
                prefix=meta_info.get("prefix", ""),
                suffix=meta_info.get("suffix", ""),
                use_state_dict=True,
            )

        if meta_info is None:
            # This algorithm instance is not a subroutine of another algorithm
            pyrado.save(best_policy,
                        "policy.pt",
                        self.save_dir,
                        use_state_dict=True)
            pyrado.save(self._env, "env.pkl", self.save_dir)
Example #16
    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        # ParameterExploring subroutine saves the best policy (in this case a DomainDistrParamPolicy)
        prefix = meta_info.get("prefix", "")
        if prefix != "":
            self._subrtn.save_snapshot(meta_info=dict(
                prefix=f"{prefix}_ddp"))  # save iter_X_ddp_policy.pt
        self._subrtn.save_snapshot(
            meta_info=dict(prefix="ddp"))  # override ddp_policy.pt

        joblib.dump(self._subrtn.env, osp.join(self.save_dir, "env_sim.pkl"))

        # Print the current search distribution's mean
        cpp = self._subrtn.policy.transform_to_ddp_space(
            self._subrtn.policy.param_values)
        self._subrtn.env.adapt_randomizer(
            domain_distr_param_values=cpp.detach().cpu().numpy())
        print_cbt(
            f"Current policy domain parameter distribution\n{self._subrtn.env.randomizer}",
            "g")

        # Set the randomizer to best fitted domain distribution
        cbp = self._subrtn.policy.transform_to_ddp_space(
            self._subrtn.best_policy_param)
        self._subrtn.env.adapt_randomizer(
            domain_distr_param_values=cbp.detach().cpu().numpy())
        print_cbt(
            f"Best fitted domain parameter distribution\n{self._subrtn.env.randomizer}",
            "g")

        if "rollouts_real" not in meta_info:
            raise pyrado.KeyErr(keys="rollouts_real", container=meta_info)
        pyrado.save(meta_info["rollouts_real"],
                    "rollouts_real.pkl",
                    self.save_dir,
                    prefix=prefix)
Example #17
    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        if meta_info is None:
            # This algorithm instance is not a subroutine of another algorithm
            pyrado.save(self.qfcn_targ_1, "qfcn_target1.pt", self.save_dir, use_state_dict=True)
            pyrado.save(self.qfcn_targ_2, "qfcn_target2.pt", self.save_dir, use_state_dict=True)
        else:
            # This algorithm instance is a subroutine of another algorithm
            prefix = meta_info.get("prefix", "")
            suffix = meta_info.get("suffix", "")
            pyrado.save(
                self.qfcn_targ_1, "qfcn_target1.pt", self.save_dir, prefix=prefix, suffix=suffix, use_state_dict=True
            )
            pyrado.save(
                self.qfcn_targ_2, "qfcn_target2.pt", self.save_dir, prefix=prefix, suffix=suffix, use_state_dict=True
            )
Example #18
    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        # Policies of every iteration are saved by the subroutine in train_policy_sim()
        if meta_info is None:
            # This algorithm instance is not a subroutine of another algorithm
            pyrado.save(self._env_sim, "env_sim.pkl", self._save_dir)
            pyrado.save(self._env_real, "env_real.pkl", self._save_dir)
            pyrado.save(self.policy,
                        "policy.pt",
                        self.save_dir,
                        use_state_dict=True)
        else:
            raise pyrado.ValueErr(
                msg=f"{self.name} is not supposed be run as a subroutine!")
Example #19
    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        if meta_info is None:
            # This algorithm instance is not a subroutine of another algorithm
            pyrado.save(self._policy, "policy.pt", self.save_dir, use_state_dict=False)
            pyrado.save(self.dataset, "dataset.pt", self.save_dir)
        else:
            # This algorithm instance is a subroutine of another algorithm
            pyrado.save(
                self._policy,
                "policy.pt",
                self.save_dir,
                prefix=meta_info.get("prefix", ""),
                suffix=meta_info.get("suffix", ""),
                use_state_dict=True,
            )
            pyrado.save(
                self.dataset,
                "dataset.pt",
                self.save_dir,
                prefix=meta_info.get("prefix", ""),
                suffix=meta_info.get("suffix", ""),
            )
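When an algorithm runs as a subroutine, the `prefix` and `suffix` taken from `meta_info` become part of the snapshot file names (Example #1 globs for `init_*_policy.pt`, and Example #16 mentions `iter_X_ddp_policy.pt`). The hypothetical helper below, which is not part of pyrado's API, illustrates that naming convention under the assumption that prefix, stem, and suffix are joined with underscores:

def snapshot_file_name(name: str, prefix: str = "", suffix: str = "") -> str:
    """Assemble a snapshot file name, e.g. 'policy.pt' with prefix 'init_0' becomes 'init_0_policy.pt'."""
    stem, ext = name.rsplit(".", 1)
    parts = [p for p in (prefix, stem, suffix) if p]
    return "_".join(parts) + "." + ext


assert snapshot_file_name("policy.pt", prefix="init_0") == "init_0_policy.pt"
assert snapshot_file_name("dataset.pt", suffix="final") == "dataset_final.pt"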
Example #20
    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        if meta_info is None:
            # This algorithm instance is not a subroutine of another algorithm
            pyrado.save(self._env, "env.pkl", self.save_dir)
            for idx, p in enumerate(self.particles):
                pyrado.save(p, f"particle_{idx}.pt", self.save_dir, use_state_dict=True)
        else:
            # This algorithm instance is a subroutine of another algorithm
            for idx, p in enumerate(self.particles):
                pyrado.save(
                    p,
                    f"particle_{idx}.pt",
                    self.save_dir,
                    prefix=meta_info.get("prefix", ""),
                    suffix=meta_info.get("suffix", ""),
                    use_state_dict=True,
                )
Example #21
    def __init__(self,
                 save_dir: str,
                 env_sim: MetaDomainRandWrapper,
                 env_real: [RealEnv, EnvWrapper],
                 subrtn: Algorithm,
                 ddp_space: BoxSpace,
                 max_iter: int,
                 acq_fc: str,
                 acq_restarts: int,
                 acq_samples: int,
                 acq_param: dict = None,
                 num_init_cand: int = 5,
                 mc_estimator: bool = True,
                 num_eval_rollouts_real: int = 5,
                 num_eval_rollouts_sim: int = 50,
                 thold_succ: float = pyrado.inf,
                 thold_succ_subrtn: float = -pyrado.inf,
                 warmstart: bool = True,
                 policy_param_init: Optional[to.Tensor] = None,
                 valuefcn_param_init: Optional[to.Tensor] = None,
                 subrtn_snapshot_mode: str = 'best',
                 logger: Optional[StepLogger] = None):
        """
        Constructor

        .. note::
            If you want to continue an experiment, use the `load_dir` argument for the `train` call. If you want to
            initialize each of the policies with pre-trained policy parameters, use `policy_param_init`.

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env_sim: randomized simulation environment a.k.a. source domain
        :param env_real: real-world environment a.k.a. target domain
        :param subrtn: algorithm which performs the policy / value-function optimization
        :param ddp_space: space holding the boundaries for the domain distribution parameters
        :param max_iter: maximum number of iterations
        :param acq_fc: Acquisition Function
                       'UCB': Upper Confidence Bound (default $\beta = 0.1$)
                       'EI': Expected Improvement
                       'PI': Probability of Improvement
        :param acq_restarts: number of restarts for optimizing the acquisition function
        :param acq_samples: number of initial samples for optimizing the acquisition function
        :param acq_param: hyper-parameter for the acquisition function, e.g. $\beta$ for UCB
        :param num_init_cand: number of initial policies to train, ignored if `init_dir` is provided
        :param mc_estimator: estimate the return with a sample average (`True`) or a lower confidence
                                     bound (`False`) obtained from bootstrapping
        :param num_eval_rollouts_real: number of rollouts in the target domain to estimate the return
        :param num_eval_rollouts_sim: number of rollouts in simulation to estimate the return after training
        :param thold_succ: success threshold on the real system's return for BayRn, stop the algorithm if exceeded
        :param thold_succ_subrtn: success threshold on the simulated system's return for the subroutine, repeat the
                                      subroutine until the threshold is exceeded or for a given number of iterations
        :param warmstart: initialize the policy parameters with those of the previous iteration. This option has no
                          effect for initial policies and can be overruled by passing init policy params explicitly.
        :param policy_param_init: initial policy parameter values for the subroutine, set `None` to be random
        :param valuefcn_param_init: initial value function parameter values for the subroutine, set `None` to be random
        :param subrtn_snapshot_mode: snapshot mode for saving during training of the subroutine
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if typed_env(env_sim, MetaDomainRandWrapper) is None:
            raise pyrado.TypeErr(given=env_sim, expected_type=MetaDomainRandWrapper)
        if not isinstance(subrtn, Algorithm):
            raise pyrado.TypeErr(given=subrtn, expected_type=Algorithm)
        if not isinstance(ddp_space, BoxSpace):
            raise pyrado.TypeErr(given=ddp_space, expected_type=BoxSpace)
        if num_init_cand < 1:
            raise pyrado.ValueErr(given=num_init_cand, ge_constraint='1')

        # Call InterruptableAlgorithm's constructor without specifying the policy
        super().__init__(num_checkpoints=2, init_checkpoint=-2, save_dir=save_dir, max_iter=max_iter,
                         policy=subrtn.policy, logger=logger)

        self._env_sim = env_sim
        self._env_real = env_real
        self._subrtn = subrtn
        self._subrtn.save_name = 'subrtn'
        self.ddp_space = ddp_space
        self.ddp_projector = UnitCubeProjector(to.from_numpy(self.ddp_space.bound_lo),
                                               to.from_numpy(self.ddp_space.bound_up))
        self.cands = None  # called x in the context of GPs
        self.cands_values = None  # called y in the context of GPs
        self.argmax_cand = to.Tensor()
        self.acq_fcn_type = acq_fc.upper()
        self.acq_restarts = acq_restarts
        self.acq_samples = acq_samples
        self.acq_param = acq_param
        self.num_init_cand = num_init_cand
        self.mc_estimator = mc_estimator
        self.policy_param_init = policy_param_init
        self.valuefcn_param_init = valuefcn_param_init.detach() if valuefcn_param_init is not None else None
        self.warmstart = warmstart
        self.num_eval_rollouts_real = num_eval_rollouts_real
        self.num_eval_rollouts_sim = num_eval_rollouts_sim
        self.subrtn_snapshot_mode = subrtn_snapshot_mode
        self.thold_succ = to.tensor([thold_succ])
        self.thold_succ_subrtn = to.tensor([thold_succ_subrtn])
        self.max_subrtn_rep = 3  # number of tries to exceed thold_succ_subrtn during training in simulation
        self.curr_cand_value = -pyrado.inf  # for the stopping criterion

        if self.policy_param_init is not None:
            if to.is_tensor(self.policy_param_init):
                self.policy_param_init = self.policy_param_init.detach()
            else:
                self.policy_param_init = to.tensor(self.policy_param_init)

        # Save initial environments and the domain distribution parameter space
        self.save_snapshot(meta_info=None)
        pyrado.save(self.ddp_space, 'ddp_space', 'pkl', self.save_dir)
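The `UnitCubeProjector` constructed above is used in Example #23 to map the candidates into the unit cube before fitting the GP (`project_to`) and to map the optimizer's result back (`project_back`). A minimal sketch under the assumption that this is a plain affine rescaling between the box bounds and [0, 1]; it is not pyrado's actual class:

import torch as to


class UnitCubeProjectorSketch:
    """Illustrative affine mapping between a box [bound_lo, bound_up] and the unit cube."""

    def __init__(self, bound_lo: to.Tensor, bound_up: to.Tensor):
        self.bound_lo, self.bound_up = bound_lo, bound_up

    def project_to(self, x: to.Tensor) -> to.Tensor:
        # Normalize to the unit cube
        return (x - self.bound_lo) / (self.bound_up - self.bound_lo)

    def project_back(self, x_norm: to.Tensor) -> to.Tensor:
        # Rescale from the unit cube back to the original box
        return x_norm * (self.bound_up - self.bound_lo) + self.bound_lo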
Example #22
def evaluate_policy(args, ex_dir):
    """Helper function to evaluate the policy from an experiment in the associated environment."""
    env, policy, _ = load_experiment(ex_dir, args)

    # Create multi-dim evaluation grid
    param_spec = dict()
    param_spec_dim = None

    if isinstance(inner_env(env), BallOnPlateSim):
        param_spec["ball_radius"] = np.linspace(0.02, 0.08, num=2, endpoint=True)
        param_spec["ball_rolling_friction_coefficient"] = np.linspace(0.0295, 0.9, num=2, endpoint=True)

    elif isinstance(inner_env(env), QQubeSwingUpSim):
        eval_num = 200
        # Use nominal values for all other parameters.
        for param, nominal_value in env.get_nominal_domain_param().items():
            param_spec[param] = nominal_value
        # param_spec["gravity_const"] = np.linspace(5.0, 15.0, num=eval_num, endpoint=True)
        param_spec["damping_pend_pole"] = np.linspace(0.0, 0.0001, num=eval_num, endpoint=True)
        param_spec["damping_rot_pole"] = np.linspace(0.0, 0.0006, num=eval_num, endpoint=True)
        param_spec_dim = 2

    elif isinstance(inner_env(env), QBallBalancerSim):
        # param_spec["gravity_const"] = np.linspace(7.91, 11.91, num=11, endpoint=True)
        # param_spec["ball_mass"] = np.linspace(0.003, 0.3, num=11, endpoint=True)
        # param_spec["ball_radius"] = np.linspace(0.01, 0.1, num=11, endpoint=True)
        param_spec["plate_length"] = np.linspace(0.275, 0.275, num=11, endpoint=True)
        param_spec["arm_radius"] = np.linspace(0.0254, 0.0254, num=11, endpoint=True)
        # param_spec["load_inertia"] = np.linspace(5.2822e-5*0.5, 5.2822e-5*1.5, num=11, endpoint=True)
        # param_spec["motor_inertia"] = np.linspace(4.6063e-7*0.5, 4.6063e-7*1.5, num=11, endpoint=True)
        # param_spec["gear_ratio"] = np.linspace(60, 80, num=11, endpoint=True)
        # param_spec["gear_efficiency"] = np.linspace(0.6, 1.0, num=11, endpoint=True)
        # param_spec["motor_efficiency"] = np.linspace(0.49, 0.89, num=11, endpoint=True)
        # param_spec["motor_back_emf"] = np.linspace(0.006, 0.066, num=11, endpoint=True)
        # param_spec["motor_resistance"] = np.linspace(2.6*0.5, 2.6*1.5, num=11, endpoint=True)
        # param_spec["combined_damping"] = np.linspace(0.0, 0.05, num=11, endpoint=True)
        # param_spec["friction_coeff"] = np.linspace(0, 0.015, num=11, endpoint=True)
        # param_spec["voltage_thold_x_pos"] = np.linspace(0.0, 1.0, num=11, endpoint=True)
        # param_spec["voltage_thold_x_neg"] = np.linspace(-1., 0.0, num=11, endpoint=True)
        # param_spec["voltage_thold_y_pos"] = np.linspace(0.0, 1.0, num=11, endpoint=True)
        # param_spec["voltage_thold_y_neg"] = np.linspace(-1.0, 0, num=11, endpoint=True)
        # param_spec["offset_th_x"] = np.linspace(-5/180*np.pi, 5/180*np.pi, num=11, endpoint=True)
        # param_spec["offset_th_y"] = np.linspace(-5/180*np.pi, 5/180*np.pi, num=11, endpoint=True)

    else:
        raise NotImplementedError

    # Always add an action delay wrapper (with 0 delay by default)
    if typed_env(env, ActDelayWrapper) is None:
        env = ActDelayWrapper(env)
    # param_spec['act_delay'] = np.linspace(0, 30, num=11, endpoint=True, dtype=int)

    add_info = "-".join(param_spec.keys())

    # Create multidimensional results grid and ensure right number of rollouts
    param_list = param_grid(param_spec)
    param_list *= args.num_rollouts_per_config

    # Fix initial state (set to None if it should not be fixed)
    init_state = np.array([0.0, 0.0, 0.0, 0.0])

    # Create sampler
    pool = SamplerPool(args.num_workers)
    if args.seed is not None:
        pool.set_seed(args.seed)
        print_cbt(f"Set the random number generators' seed to {args.seed}.", "w")
    else:
        print_cbt("No seed was set", "y")

    # Sample rollouts
    ros = eval_domain_params(pool, env, policy, param_list, init_state)

    # Compute metrics
    lod = []
    for ro in ros:
        d = dict(**ro.rollout_info["domain_param"], ret=ro.undiscounted_return(), len=ro.length)
        # Simply remove the observation noise from the domain parameters
        try:
            d.pop("obs_noise_mean")
            d.pop("obs_noise_std")
        except KeyError:
            pass
        lod.append(d)

    df = pd.DataFrame(lod)
    metrics = dict(
        avg_len=df["len"].mean(),
        avg_ret=df["ret"].mean(),
        median_ret=df["ret"].median(),
        min_ret=df["ret"].min(),
        max_ret=df["ret"].max(),
        std_ret=df["ret"].std(),
    )
    pprint(metrics, indent=4)

    # Create subfolder and save
    timestamp = datetime.datetime.now()
    add_info = timestamp.strftime(pyrado.timestamp_format) + "--" + add_info
    save_dir = osp.join(ex_dir, "eval_domain_grid", add_info)
    os.makedirs(save_dir, exist_ok=True)

    save_dicts_to_yaml(
        {"ex_dir": str(ex_dir)},
        {"varied_params": list(param_spec.keys())},
        {"num_rpp": args.num_rollouts_per_config, "seed": args.seed},
        {"metrics": dict_arraylike_to_float(metrics)},
        save_dir=save_dir,
        file_name="summary",
    )
    pyrado.save(df, f"df_sp_grid_{len(param_spec) if param_spec_dim is None else param_spec_dim}d.pkl", save_dir)
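The evaluation grid above appears to be the Cartesian product of the per-parameter value lists, which is then repeated `num_rollouts_per_config` times via list multiplication. A stand-in for `param_grid` under that assumption (the real pyrado helper may differ in details); scalar entries such as the nominal values are treated as single-element axes:

import itertools
from typing import Mapping, Sequence

import numpy as np


def param_grid_sketch(param_spec: Mapping[str, Sequence]) -> list:
    """Return one domain-parameter dict per point of the Cartesian product of the given value lists."""
    axes = {k: np.atleast_1d(v) for k, v in param_spec.items()}  # scalars become single-element axes
    keys = list(axes.keys())
    return [dict(zip(keys, combo)) for combo in itertools.product(*(axes[k] for k in keys))]


# Two varied parameters with 3 and 2 values yield 6 configurations
grid = param_grid_sketch({"gravity_const": np.linspace(9.0, 10.0, num=3), "ball_mass": [0.05, 0.06]})
assert len(grid) == 6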
Example #23
    def step(self, snapshot_mode: str = 'latest', meta_info: dict = None):
        # Save snapshot to save the correct iteration count
        self.save_snapshot()

        if self.curr_checkpoint == -2:
            # Train the initial policies in the source domain
            self.train_init_policies()
            self.reached_checkpoint()  # setting counter to -1

        if self.curr_checkpoint == -1:
            # Evaluate the initial policies in the target domain
            self.eval_init_policies()
            self.reached_checkpoint()  # setting counter to 0

        if self.curr_checkpoint == 0:
            # Normalize the input data and standardize the output data
            cands_norm = self.ddp_projector.project_to(self.cands)
            cands_values_stdized = standardize(self.cands_values).unsqueeze(1)

            # Create and fit the GP model
            gp = SingleTaskGP(cands_norm, cands_values_stdized)
            gp.likelihood.noise_covar.register_constraint('raw_noise', GreaterThan(1e-5))
            mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
            fit_gpytorch_model(mll)
            print_cbt('Fitted the GP.', 'g')

            # Acquisition functions
            if self.acq_fcn_type == 'UCB':
                acq_fcn = UpperConfidenceBound(gp, beta=self.acq_param.get('beta', 0.1), maximize=True)
            elif self.acq_fcn_type == 'EI':
                acq_fcn = ExpectedImprovement(gp, best_f=cands_values_stdized.max().item(), maximize=True)
            elif self.acq_fcn_type == 'PI':
                acq_fcn = ProbabilityOfImprovement(gp, best_f=cands_values_stdized.max().item(), maximize=True)
            else:
                raise pyrado.ValueErr(given=self.acq_fcn_type, eq_constraint="'UCB', 'EI', 'PI'")

            # Optimize acquisition function and get new candidate point
            cand_norm, acq_value = optimize_acqf(
                acq_function=acq_fcn,
                bounds=to.stack([to.zeros(self.ddp_space.flat_dim), to.ones(self.ddp_space.flat_dim)]),
                q=1,
                num_restarts=self.acq_restarts,
                raw_samples=self.acq_samples
            )
            next_cand = self.ddp_projector.project_back(cand_norm)
            print_cbt(f'Found the next candidate: {next_cand.numpy()}', 'g')
            self.cands = to.cat([self.cands, next_cand], dim=0)
            pyrado.save(self.cands, 'candidates', 'pt', self.save_dir, meta_info)
            self.reached_checkpoint()  # setting counter to 1

        if self.curr_checkpoint == 1:
            # Train and evaluate a new policy, repeat if the resulting policy did not exceed the success threshold
            wrapped_trn_fcn = until_thold_exceeded(
                self.thold_succ_subrtn.item(), self.max_subrtn_rep
            )(self.train_policy_sim)
            wrapped_trn_fcn(self.cands[-1, :], prefix=f'iter_{self._curr_iter}')
            self.reached_checkpoint()  # setting counter to 2

        if self.curr_checkpoint == 2:
            # Evaluate the current policy in the target domain
            policy = pyrado.load(self.policy, 'policy', 'pt', self.save_dir,
                                        meta_info=dict(prefix=f'iter_{self._curr_iter}'))
            self.curr_cand_value = self.eval_policy(
                self.save_dir, self._env_real, policy, self.mc_estimator, f'iter_{self._curr_iter}',
                self.num_eval_rollouts_real
            )
            self.cands_values = to.cat([self.cands_values, self.curr_cand_value.view(1)], dim=0)
            pyrado.save(self.cands_values, 'candidates_values', 'pt', self.save_dir, meta_info)

            # Store the argmax after training and evaluating
            curr_argmax_cand = BayRn.argmax_posterior_mean(
                self.cands, self.cands_values.unsqueeze(1), self.ddp_space, self.acq_restarts, self.acq_samples
            )
            self.argmax_cand = to.cat([self.argmax_cand, curr_argmax_cand], dim=0)
            pyrado.save(self.argmax_cand, 'candidates_argmax', 'pt', self.save_dir, meta_info)
            self.reached_checkpoint()  # setting counter to 0
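The checkpoint counter drives the interruptible workflow above: the negative checkpoints (-2, -1) are passed exactly once while training and evaluating the initial policies, after which the counter cycles through 0, 1, 2. A minimal illustration of that pattern, written as a sketch and not the actual `InterruptableAlgorithm` code:

class CyclicCheckpointCounterSketch:
    """Negative checkpoints run once (initialization); non-negative ones cycle up to `final_checkpoint`."""

    def __init__(self, final_checkpoint: int, init_checkpoint: int = 0):
        self._final = final_checkpoint
        self._curr = init_checkpoint

    @property
    def curr_checkpoint(self) -> int:
        return self._curr

    def reached_checkpoint(self):
        # Advance, wrapping back to 0 once the final checkpoint has been completed
        self._curr = self._curr + 1 if self._curr < self._final else 0


ctr = CyclicCheckpointCounterSketch(final_checkpoint=2, init_checkpoint=-2)
visited = []
for _ in range(8):
    visited.append(ctr.curr_checkpoint)
    ctr.reached_checkpoint()
assert visited == [-2, -1, 0, 1, 2, 0, 1, 2]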
Example #24
if __name__ == "__main__":
    # Parse command line arguments
    args = get_argparser().parse_args()
    if not osp.isfile(args.file):
        raise pyrado.PathErr(given=args.file)
    if args.dir is None:
        # Use the file's directory by default
        args.dir = osp.dirname(args.file)
    elif not osp.isdir(args.dir):
        raise pyrado.PathErr(given=args.dir)

    df = pd.read_csv(args.file)

    if args.env_name == MiniGolfIKSim.name:
        env = MiniGolfIKSim()
    elif args.env_name == MiniGolfJointCtrlSim.name:
        env = MiniGolfJointCtrlSim()
    else:
        raise NotImplementedError

    # Cast the rollout from a DataFrame to a StepSequence
    reconstructed = StepSequence.from_pandas(df, env.spec, task=env.task)

    if args.dir is not None:
        suffix = args.file[args.file.rfind("/") + 1:-4]
        pyrado.save(reconstructed,
                    f"rollout_{suffix}.pkl",
                    args.dir,
                    verbose=True)
Example #25
    def __init__(
        self,
        save_dir: pyrado.PathLike,
        env_sim: SimEnv,
        env_real: Union[Env, str],
        policy: Policy,
        dp_mapping: Mapping[int, str],
        prior: Distribution,
        embedding: Embedding,
        num_checkpoints: int,
        init_checkpoint: int,
        max_iter: int,
        num_real_rollouts: int,
        num_sim_per_round: int,
        num_segments: int = None,
        len_segments: int = None,
        stop_on_done: bool = True,
        use_rec_act: bool = True,
        num_sbi_rounds: int = 1,
        reset_sbi_routine_each_iter: bool = False,
        reset_proposal_each_iter: bool = False,
        num_eval_samples: Optional[int] = None,
        posterior_hparam: Optional[dict] = None,
        subrtn_sbi_training_hparam: Optional[dict] = None,
        subrtn_sbi_sampling_hparam: Optional[dict] = None,
        simulation_batch_size: int = 1,
        normalize_posterior: bool = True,
        subrtn_policy: Optional[Algorithm] = None,
        subrtn_policy_snapshot_mode: str = "latest",
        train_initial_policy: bool = True,
        thold_succ_subrtn: float = -pyrado.inf,
        warmstart: bool = True,
        num_workers: int = 4,
        logger: Optional[StepLogger] = None,
    ):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env_sim: randomized simulation environment a.k.a. source domain
        :param env_real: real-world environment a.k.a. target domain, this can be a `RealEnv` (sim-to-real setting), a
                         `SimEnv` (sim-to-sim setting), or a directory to load a pre-recorded set of rollouts from
        :param policy: policy used for sampling the rollout, if subrtn_policy is not `None` this policy is not only used
                       for generating the target domain rollouts, but also optimized in simulation
        :param dp_mapping: mapping from subsequent integers (starting at 0) to domain parameter names (e.g. mass)
        :param prior: distribution used by sbi as a prior
        :param embedding: embedding used for pre-processing the data before passing it to the posterior
        :param num_checkpoints: total number of checkpoints
        :param init_checkpoint: initial value of the cyclic counter, defaults to 0, negative values can be used to mark
                                sections that should only be executed once
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param num_real_rollouts: number of real-world rollouts received by sbi, i.e. from every rollout exactly one
                                  data set is computed
        :param num_sim_per_round: number of simulations done by sbi per round (i.e. iteration over the same target domain data set)
        :param num_segments: number of segments into which the rollouts are split. For every segment, the initial
                            state of the simulation is reset, and thus for every set the features of the trajectories
                            are computed separately. Either specify `num_segments` or `len_segments`.
        :param len_segments: length of the segments into which the rollouts are split. For every segment, the initial
                             state of the simulation is reset, and thus for every set the features of the trajectories
                             are computed separately. Either specify `num_segments` or `len_segments`.
        :param stop_on_done: if `True`, the rollouts are stopped as soon as they hit the state or observation space
                             boundaries. This behavior is safe, but can lead to short trajectories which are eventually
                             padded with zeroes. Choose `False` to ignore the boundaries (dangerous on the real system).
        :param use_rec_act: if `True` the recorded actions from the target domain are used to generate the rollout
                            during simulation (feed-forward). If `False` the policy is used to generate (potentially)
                            state-dependent actions (feed-back).
        :param reset_sbi_routine_each_iter: if `True` the sbi subroutine instance is recreated every iteration.
                                            Use this flag to train the posterior each iteration from scratch.
        :param num_sbi_rounds: set to an integer > 1 to use multi-round sbi. This way the posteriors (saved as
                               `..._round_NUMBER...`) will be tailored to the data of that round, where `NUMBER`
                               counts up each round (modulo `num_real_rollouts`). If `num_sbi_rounds` = 1, the posterior
                               is called amortized (it has never seen any target domain data).
        :param num_eval_samples: number of samples for evaluating the posterior in `eval_posterior()`
        :param posterior_hparam: hyper-parameters for creating the posterior's density estimator
        :param subrtn_sbi_training_hparam: dict forwarded to sbi's `PosteriorEstimator.train()` function like
                                           `training_batch_size`, `learning_rate`, `retrain_from_scratch_each_round`, etc.
        :param simulation_batch_size: batch size forwarded to the sbi toolbox, requires batched simulator
        :param normalize_posterior: if `True` the normalization of the posterior density is enforced by sbi
        :param subrtn_policy: algorithm which performs the optimization of the behavioral policy (and value-function)
        :param subrtn_policy_snapshot_mode: snapshot mode for saving during policy optimization
        :param train_initial_policy: choose if a policy should be pretrained in the first iteration before collecting
                                     real rollouts. Choose `False`, if you want to use a pre-defined policy.
        :param thold_succ_subrtn: success threshold on the simulated system's return for the subroutine, repeat the
                                  subroutine until the threshold is exceeded or for a given number of iterations
        :param warmstart: initialize the policy (and value function) parameters with those of the previous iteration.
                          This behavior can also be overruled by passing `init_policy_params` (and
                          `valuefcn_param_init`) explicitly.
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(inner_env(env_sim), SimEnv) or (
                isinstance(env_sim, DomainRandWrapper)
                and not isinstance(env_sim, ActDelayWrapper)):
            raise pyrado.TypeErr(
                msg=
                "The given env_sim must be a non-randomized simulation environment, "
                "except for wrappers that add a domain parameter!")
        if isinstance(prior, Normal):
            raise pyrado.TypeErr(
                msg=
                "The sbi framework requires MultivariateNormal instead of Normal distributions for the prior."
            )
        if not prior.event_shape[0] == len(dp_mapping):
            raise pyrado.ShapeErr(given=prior.event_shape,
                                  expected_match=dp_mapping)
        if posterior_hparam is None:
            posterior_hparam = dict()
        elif not isinstance(posterior_hparam, dict):
            raise pyrado.TypeErr(given=posterior_hparam, expected_type=dict)
        if subrtn_sbi_training_hparam is None:
            subrtn_sbi_training_hparam = dict()
        elif not isinstance(subrtn_sbi_training_hparam, dict):
            raise pyrado.TypeErr(given=subrtn_sbi_training_hparam,
                                 expected_type=dict)

        # Call InterruptableAlgorithm's constructor
        super().__init__(
            num_checkpoints=num_checkpoints,
            init_checkpoint=init_checkpoint,
            save_dir=save_dir,
            max_iter=max_iter,
            policy=policy,
            logger=logger,
        )

        self._env_sim_sbi = env_sim  # will be randomized explicitly by sbi
        self._env_sim_trn = DomainRandWrapperBuffer(copy.deepcopy(env_sim),
                                                    randomizer=None,
                                                    selection="cyclic")
        self._env_real = env_real
        self.dp_mapping = dp_mapping
        self._embedding = embedding
        self.num_sim_per_round = num_sim_per_round
        self.num_real_rollouts = num_real_rollouts
        self.num_segments = num_segments
        self.len_segments = len_segments
        self.stop_on_done = stop_on_done
        self.use_rec_act = use_rec_act
        self.reset_sbi_routine_each_iter = reset_sbi_routine_each_iter
        self.reset_proposal_each_iter = reset_proposal_each_iter
        self.num_sbi_rounds = num_sbi_rounds
        self.num_eval_samples = num_eval_samples or 10 * 2**len(dp_mapping)
        self.simulation_batch_size = simulation_batch_size
        self.normalize_posterior = normalize_posterior
        self._subrtn_sbi = None
        self.subrtn_sbi_training_hparam = subrtn_sbi_training_hparam or dict()
        self.posterior_hparam = posterior_hparam or dict()
        self.thold_succ_subrtn = float(thold_succ_subrtn)
        self.max_subrtn_rep = 3  # number of tries to exceed thold_succ_subrtn during training in simulation
        self.warmstart = warmstart
        self.num_workers = int(num_workers)

        # Temporary containers
        self._curr_data_real = None
        self._curr_domain_param_eval = None

        # Initialize sbi simulator and prior
        self._sbi_simulator = None  # to be set in step()
        self._sbi_prior = None  # to be set in step()
        self._setup_sbi(prior=prior)

        # Optional policy optimization subroutine
        self._subrtn_policy = subrtn_policy
        if isinstance(self._subrtn_policy, Algorithm):
            self._subrtn_policy_snapshot_mode = subrtn_policy_snapshot_mode
            self._subrtn_policy.save_name = "subrtn_policy"
            self._train_initial_policy = train_initial_policy
            # Check that the behavioral policy is the one that is being updated
            if self._subrtn_policy.policy is not self.policy:
                raise pyrado.ValueErr(
                    msg=
                    "The policy is the policy subroutine is not the same as the one used by "
                    "the system identification (sbi) subroutine!")

        # Save initial environments, the embedding, and the prior
        pyrado.save(self._env_sim_trn, "env_sim.pkl", self._save_dir)
        pyrado.save(self._env_real, "env_real.pkl", self._save_dir)
        pyrado.save(embedding, "embedding.pt", self._save_dir)
        pyrado.save(prior, "prior.pt", self._save_dir)
        pyrado.save(policy,
                    "init_policy.pt",
                    self._save_dir,
                    use_state_dict=True)
Example #26
    def step(self,
             snapshot_mode: str,
             meta_info: dict = None,
             parallel: bool = True):
        rand_trajs = []
        ref_trajs = []
        ros = []
        visited = []
        for i in range(self.svpg.num_particles):
            done = False
            svpg_env = self.svpg_wrapper
            state = svpg_env.reset()
            states = []
            actions = []
            rewards = []
            infos = []
            rand_trajs_now = []
            if parallel:
                with to.no_grad():
                    for t in range(10):
                        action = (self.svpg.expl_strats[i](to.as_tensor(
                            state, dtype=to.get_default_dtype())).detach().cpu(
                            ).numpy())
                        state = svpg_env.lite_step(action)
                        states.append(state)
                        actions.append(action)
                    visited.append(states)
                    rewards, rand_trajs_now, ref_trajs_now = svpg_env.eval_states(
                        states)
                    rand_trajs += rand_trajs_now
                    ref_trajs += ref_trajs_now
                    ros.append(
                        StepSequence(observations=states,
                                     actions=actions,
                                     rewards=rewards))
            else:
                with to.no_grad():
                    while not done:
                        action = (self.svpg.expl_strats[i](to.as_tensor(
                            state, dtype=to.get_default_dtype())).detach().cpu(
                            ).numpy())
                        state, reward, done, info = svpg_env.step(action)
                        print(self.params.array_to_dict(state), " => ", reward)
                        states.append(state)
                        rewards.append(reward)
                        actions.append(action)
                        infos.append(info)
                        rand_trajs += info["rand"]
                        ref_trajs += info["ref"]
                    ros.append(
                        StepSequence(observations=states,
                                     actions=actions,
                                     rewards=rewards))
            self.logger.add_value(f"SVPG_agent_{i}_mean_reward",
                                  np.mean(rewards))
            ros[i].torch(data_type=to.DoubleTensor)
            for rt in rand_trajs_now:
                rt.torch(data_type=to.double)
                rt.observations = rt.observations.double().detach()
                rt.actions = rt.actions.double().detach()
            self._subrtn.update(rand_trajs_now)

        # Logging
        rets = [ro.undiscounted_return() for ro in rand_trajs]
        ret_avg = np.mean(rets)
        ret_med = np.median(rets)
        ret_std = np.std(rets)
        self.logger.add_value("avg rollout len",
                              np.mean([ro.length for ro in rand_trajs]))
        self.logger.add_value("avg return", ret_avg)
        self.logger.add_value("median return", ret_med)
        self.logger.add_value("std return", ret_std)

        # Flatten and combine all randomized and reference trajectories for discriminator
        flattened_randomized = StepSequence.concat(rand_trajs)
        flattened_randomized.torch(data_type=to.double)
        flattened_reference = StepSequence.concat(ref_trajs)
        flattened_reference.torch(data_type=to.double)
        self.reward_generator.train(flattened_reference, flattened_randomized,
                                    self.num_discriminator_epoch)
        pyrado.save(self.reward_generator.discriminator,
                    "discriminator.pt",
                    self.save_dir,
                    prefix="adr",
                    use_state_dict=True)

        if self.curr_time_step > self.warm_up_time:
            # Update the particles
            # List of lists to comply with interface
            self.svpg.update(list(map(lambda x: [x], ros)))
        flattened_randomized.torch(data_type=to.double)
        flattened_randomized.observations = flattened_randomized.observations.double(
        ).detach()
        flattened_randomized.actions = flattened_randomized.actions.double(
        ).detach()

        # np.save(f'{self.save_dir}actions{self.curr_iter}', flattened_randomized.actions)
        self.make_snapshot(snapshot_mode, float(ret_avg), meta_info)
        self._subrtn.make_snapshot(snapshot_mode="best",
                                   curr_avg_ret=float(ret_avg))
        self.curr_time_step += 1
Example #27
    def collect_data_real(
        save_dir: Optional[pyrado.PathLike],
        env: Union[Env, str],
        policy: Policy,
        embedding: Embedding,
        num_rollouts: int,
        num_segments: int = None,
        len_segments: int = None,
        prefix: str = "",
    ) -> Tuple[to.Tensor, List[StepSequence]]:
        """
        Roll-out a (behavioral) policy on the target system for later use with the sbi module, and save the data
        computed from the recorded rollouts.
        This method is static to facilitate evaluation of specific policies in hindsight.

        :param save_dir: directory to save the snapshots i.e. the results in, if `None` nothing is saved
        :param env: target environment for evaluation, in the sim-2-sim case this is another simulation instance,
                    in case you want to use pre-recorded rollouts pass the path to the parent folder as string
        :param policy: policy to evaluate
        :param embedding: embedding used for pre-processing the data before passing it to the posterior
        :param num_rollouts: number of rollouts to collect on the target system
        :param num_segments: number of segments into which the rollouts are split. For every segment, the initial
                             state of the simulation is reset, and thus for every set the features of the trajectories
                             are computed separately. Either specify `num_segments` or `len_segments`.
        :param len_segments: length of the segments into which the rollouts are split. For every segment, the initial
                             state of the simulation is reset, and thus for every set the features of the trajectories
                             are computed separately. Either specify `num_segments` or `len_segments`.
        :param prefix: to control the saving for the evaluation of an initial policy, `None` to deactivate
        :return: data from the real-world rollouts a.k.a. set of $x_o$ of shape [num_iter, num_rollouts_per_iter,
                 time_series_length, dim_data], and the real-world rollouts
        """
        if not (isinstance(inner_env(env), RealEnv)
                or isinstance(inner_env(env), SimEnv) or isinstance(env, str)):
            raise pyrado.TypeErr(given=inner_env(env),
                                 expected_type=[RealEnv, SimEnv, str])

        # Evaluate sequentially (necessary for sim-to-real experiments)
        if isinstance(env, str):
            rollout_worker = RecRolloutSamplerForSBI(env,
                                                     embedding,
                                                     num_segments,
                                                     len_segments,
                                                     rand_init_rollout=False)
        else:
            rollout_worker = RealRolloutSamplerForSBI(env, policy, embedding,
                                                      num_segments,
                                                      len_segments)

        # Initialize data containers
        data_real = None
        rollouts_real = None
        num_found_rollouts = 0
        if save_dir is not None:
            try:
                data_real = pyrado.load("data_real.pt",
                                        save_dir,
                                        prefix=prefix)
                rollouts_real = pyrado.load("rollouts_real.pkl",
                                            save_dir,
                                            prefix=prefix)
                if not data_real.shape[0] == len(rollouts_real):
                    raise pyrado.ShapeErr(
                        msg=
                        f"Found {data_real.shape[0]} entries in data_real.pt, but {len(rollouts_real)} rollouts in "
                        f"rollouts_real.pkl!")
                num_found_rollouts = len(rollouts_real)
                print_cbt(
                    f"Found {num_found_rollouts} rollout(s) in {save_dir}.",
                    "w")
            except FileNotFoundError:
                pass  # in the first attempt no files can be found

        collect_str = f"Collecting data" if prefix == "" else f"Collecting data using {prefix}_policy"
        for _ in tqdm(
                range(num_found_rollouts, num_rollouts),
                total=num_rollouts,
                desc=Fore.CYAN + Style.BRIGHT + collect_str + Style.RESET_ALL,
                unit="rollouts",
                file=sys.stdout,
        ):
            # Do the rollout
            data, rollout = rollout_worker()

            # Fill data container
            if data_real is None or rollouts_real is None:
                data_real = data  # data is of shape [1, dim_feat]
                rollouts_real = [rollout]
            else:
                data_real = to.cat(
                    [data_real, data],
                    dim=1)  # concatenate to final shape [1, num_rollouts * dim_feat]
                rollouts_real.append(rollout)

            # Optionally save the data (do this at every iteration to continue)
            if save_dir is not None:
                pyrado.save(data_real, "data_real.pt", save_dir, prefix=prefix)
                pyrado.save(rollouts_real,
                            "rollouts_real.pkl",
                            save_dir,
                            prefix=prefix)

        if data_real.shape != (1, num_rollouts * embedding.dim_output):
            raise pyrado.ShapeErr(given=data_real,
                                  expected_match=(1, num_rollouts *
                                                  embedding.dim_output))

        return data_real, rollouts_real
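
A minimal usage sketch for the data-collection routine above. The owning class is not shown in this snippet, so the class name `NPDR`, the method name `collect_data_real`, and the placeholder objects `env_real`, `policy`, and `embedding` are assumptions for illustration:

# Hypothetical call of the data-collection routine above; all names below are placeholders.
data_real, rollouts_real = NPDR.collect_data_real(
    save_dir=None,          # None: do not persist data_real.pt / rollouts_real.pkl
    env=env_real,           # RealEnv or SimEnv instance, or a str pointing to recorded rollouts
    policy=policy,
    embedding=embedding,
    prefix="",
    num_rollouts=5,
    num_segments=10,        # specify either num_segments or len_segments, not both
)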
Example #28
0
    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        pyrado.save(self.qfcn_targ, 'qfcn_target', 'pt', self.save_dir,
                    meta_info)
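
A short, hedged sketch of how the saved target Q-function could be restored later. It assumes `pyrado.load` mirrors the positional signature of the `pyrado.save` call above, which may vary between pyrado versions; `algo` is a placeholder for the algorithm instance:

# Hypothetical restore of the snapshot written above; the load signature is assumed to
# mirror the save call and is not taken from this snippet.
algo.qfcn_targ = pyrado.load(algo.qfcn_targ, 'qfcn_target', 'pt', algo.save_dir, meta_info=None)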
Example #29
0
    def eval_policy(
        save_dir: Optional[pyrado.PathLike],
        env: Union[RealEnv, SimEnv, MetaDomainRandWrapper],
        policy: Policy,
        mc_estimator: bool,
        prefix: str,
        num_rollouts: int,
        num_workers: int = 4,
    ) -> to.Tensor:
        """
        Evaluate a policy either in the source or in the target domain.
        This method is static to facilitate evaluation of specific policies in hindsight.

        :param save_dir: directory to save the snapshots i.e. the results in, if `None` nothing is saved
        :param env: target environment for evaluation, in the sim-2-sim case this is another simulation instance
        :param policy: policy to evaluate
        :param mc_estimator: estimate the return with a sample average (`True`) or a lower confidence
                             bound (`False`) obtained from bootstrapping
        :param prefix: to control the saving for the evaluation of an initial policy, `None` to deactivate
        :param num_rollouts: number of rollouts to collect on the target system
        :param num_workers: number of environments for the parallel sampler (only used for a `SimEnv`)
        :return: estimated return in the target domain
        """
        if save_dir is not None:
            print_cbt(f"Executing {prefix}_policy ...", "c", bright=True)

        rets_real = to.zeros(num_rollouts)
        if isinstance(inner_env(env), RealEnv):
            # Evaluate sequentially when conducting a sim-to-real experiment
            for i in range(num_rollouts):
                rets_real[i] = rollout(env, policy,
                                       eval=True).undiscounted_return()
                # If a return of -1 is given, skip the remaining evaluations and set all returns to zero
                if rets_real[i] == -1:
                    print_cbt("Set all returns for this policy to zero.",
                              color="c")
                    rets_real = to.zeros(num_rollouts)
                    break
        elif isinstance(inner_env(env), SimEnv):
            # Create a parallel sampler when conducting a sim-to-sim experiment
            sampler = ParallelRolloutSampler(env,
                                             policy,
                                             num_workers=num_workers,
                                             min_rollouts=num_rollouts)
            ros = sampler.sample()
            for i in range(num_rollouts):
                rets_real[i] = ros[i].undiscounted_return()
        else:
            raise pyrado.TypeErr(given=inner_env(env),
                                 expected_type=[RealEnv, SimEnv])

        if save_dir is not None:
            # Save and print the evaluation results
            pyrado.save(rets_real, "returns_real.pt", save_dir, prefix=prefix)
            print_cbt("Target domain performance", bright=True)
            print(
                tabulate([
                    ["mean return", to.mean(rets_real).item()],
                    ["std return", to.std(rets_real)],
                    ["min return", to.min(rets_real)],
                    ["max return", to.max(rets_real)],
                ]))

        if mc_estimator:
            return to.mean(rets_real)
        else:
            _, ci_lo, _ = bootstrap_ci(rets_real.numpy(),
                                       np.mean,
                                       num_reps=1000,
                                       alpha=0.05,
                                       ci_sides=1,
                                       studentized=False)
            return to.from_numpy(ci_lo)
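
A usage sketch for `eval_policy` above. The docstring states it is a static method, so in the full class it would be called on the owning class; `env_target` and `trained_policy` are placeholders:

# Hypothetical call; mc_estimator=False would instead return the bootstrapped one-sided
# lower confidence bound of the mean return.
ret_estimate = eval_policy(
    save_dir=None,          # skip saving returns_real.pt and printing the summary table
    env=env_target,         # SimEnv for sim-to-sim, RealEnv for sim-to-real evaluation
    policy=trained_policy,
    mc_estimator=True,      # sample-average estimate of the return
    prefix="",
    num_rollouts=10,
    num_workers=4,
)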
Example #30
0
    def __init__(
        self,
        save_dir: pyrado.PathLike,
        env: Union[SimEnv, StateAugmentationWrapper],
        subrtn: Algorithm,
        policy: Policy,
        expl_strat: StochasticActionExplStrat,
        max_iter: int,
        num_rollouts: Optional[int] = None,
        steps_num: Optional[int] = None,
        apply_dynamics_noise: bool = False,
        dyn_eps: float = 0.01,
        dyn_phi: float = 0.1,
        halfspan: float = 0.25,
        apply_proccess_noise: bool = False,
        proc_eps: float = 0.01,
        proc_phi: float = 0.05,
        apply_observation_noise: bool = False,
        obs_eps: float = 0.01,
        obs_phi: float = 0.05,
        torch_observation: bool = True,
        logger: StepLogger = None,
    ):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment in which the agent should be trained
        :param subrtn: algorithm which performs the policy / value-function optimization
        :param policy: policy to be updated
        :param expl_strat: the exploration strategy
        :param max_iter: the maximum number of iterations
        :param num_rollouts: the number of rollouts to be performed for each update step
        :param steps_num: the number of steps to be performed for each update step
        :param apply_dynamics_noise: whether adversarially generated dynamics noise should be applied
        :param dyn_eps: the intensity of generated dynamics noise
        :param dyn_phi: the probability of applying dynamics noise
        :param halfspan: halfspan of the uniform distribution from which the adversarial dynamics perturbations are sampled
        :param apply_proccess_noise: whether adversarially generated process noise should be applied
        :param proc_eps: the intensity of generated process noise
        :param proc_phi: the probability of applying process noise
        :param apply_observation_noise: whether adversarially generated observation noise should be applied
        :param obs_eps: the intensity of generated observation noise
        :param obs_phi: the probability of applying observation noise
        :param torch_observation: whether the observations should be provided as differentiable PyTorch tensors
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        assert isinstance(subrtn, Algorithm)
        assert isinstance(max_iter, int) and max_iter > 0

        super().__init__(save_dir, max_iter, policy, logger)

        # Initialize adversarial wrappers
        if apply_dynamics_noise:
            assert isinstance(env, StateAugmentationWrapper)
            env = AdversarialDynamicsWrapper(env, self.policy, dyn_eps,
                                             dyn_phi, halfspan)
        if apply_proccess_noise:
            env = AdversarialStateWrapper(env,
                                          self.policy,
                                          proc_eps,
                                          proc_phi,
                                          torch_observation=torch_observation)
        if apply_observation_noise:
            env = AdversarialObservationWrapper(env, self.policy, obs_eps,
                                                obs_phi)
        self._env = env
        # TODO @Robin: how do you make sure that the newly wrapped env is used by the subroutine?

        # Subroutine
        self._subrtn = subrtn
        self._subrtn.save_name = "subrtn"

        pyrado.save(self._env, "env.pkl", self.save_dir)
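
A hypothetical instantiation of the constructor above. The class name `ARPL`, the subroutine `ppo`, and the simplified `StateAugmentationWrapper` call are placeholders for illustration and are not taken from the snippet:

# Hypothetical setup; `ARPL`, `env_sim`, `ppo`, and `save_dir` are placeholders, and the
# wrapper's constructor arguments are simplified. Applying adversarial dynamics noise
# requires the env to already be a StateAugmentationWrapper (see the assert above).
algo = ARPL(
    save_dir=save_dir,
    env=StateAugmentationWrapper(env_sim),
    subrtn=ppo,                  # e.g. a policy-gradient subroutine operating on the wrapped env
    policy=ppo.policy,
    expl_strat=ppo.expl_strat,
    max_iter=50,
    num_rollouts=10,
    apply_dynamics_noise=True,
    dyn_eps=0.01,
    dyn_phi=0.1,
    halfspan=0.25,
)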