Example #1
def plot_actions(ro: StepSequence, env: Env):
    """
    Plot all action trajectories of the given rollout.

    :param ro: input rollout
    :param env: environment (used for getting the clipped action values)
    """
    if hasattr(ro, 'actions'):
        if not isinstance(ro.actions, np.ndarray):
            raise pyrado.TypeErr(given=ro.actions, expected_type=np.ndarray)

        dim_act = ro.actions.shape[1]
        # Use recorded time stamps if possible
        if hasattr(ro, 'env_infos'):
            t = ro.env_infos.get('t', np.arange(0, ro.length))
        else:
            t = np.arange(0, ro.length)

        fig, axs = plt.subplots(dim_act, figsize=(8, 12))
        fig.suptitle('Actions over Time')
        colors = plt.get_cmap('tab20')(np.linspace(0, 1, dim_act))

        act_norm_wrapper = typed_env(env, ActNormWrapper)
        if act_norm_wrapper is not None:
            lb, ub = inner_env(env).act_space.bounds
            act_denorm = lb + (ro.actions[:] + 1.) * (ub - lb) / 2
            act_clipped = np.array(
                [inner_env(env).limit_act(a) for a in act_denorm])
        else:
            act_denorm = ro.actions
            act_clipped = np.array([env.limit_act(a) for a in ro.actions[:]])

        if dim_act == 1:
            axs.plot(t, act_denorm, label=_get_act_label(ro, 0) + ' (to env)')
            axs.plot(t,
                     act_clipped,
                     label=_get_act_label(ro, 0) + ' (clipped)',
                     c='k',
                     ls='--')
            axs.legend(bbox_to_anchor=(0, 1.0, 1, -0.1),
                       loc='lower left',
                       mode='expand',
                       ncol=2)
        else:
            for i in range(dim_act):
                axs[i].plot(t,
                            act_denorm[:, i],
                            label=_get_act_label(ro, i) + ' (to env)',
                            c=colors[i])
                axs[i].plot(t,
                            act_clipped[:, i],
                            label=_get_act_label(ro, i) + ' (clipped)',
                            c='k',
                            ls='--')
                axs[i].legend(bbox_to_anchor=(0, 1.0, 1, -0.1),
                              loc='lower left',
                              mode='expand',
                              ncol=2)

        plt.subplots_adjust(hspace=1.2)
        plt.show()
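A minimal usage sketch for `plot_actions`: record one evaluation rollout and hand it to the function above. The names `env` and `policy` are assumed to exist already (e.g. loaded via `load_experiment`), and the import path for `rollout` follows the usual pyrado layout, so verify it against the installed version.

from pyrado.sampling.rollout import rollout  # import path assumed

# `env` (possibly wrapped) and a matching `policy` are assumed to be available
ro = rollout(env, policy, eval=True)  # collect one evaluation rollout
plot_actions(ro, env)  # plots the denormalized and the clipped action trajectories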
Example #2
    def eval_policy(save_dir: Optional[str],
                    env: Union[RealEnv, SimEnv, MetaDomainRandWrapper],
                    policy: Policy,
                    mc_estimator: bool,
                    prefix: str,
                    num_rollouts: int,
                    num_parallel_envs: int = 1) -> to.Tensor:
        """
        Evaluate a policy on the target system (real-world platform).
        This method is static to facilitate evaluation of specific policies in hindsight.

        :param save_dir: directory to save the snapshots i.e. the results in, if `None` nothing is saved
        :param env: target environment for evaluation, in the sim-2-sim case this is another simulation instance
        :param policy: policy to evaluate
        :param mc_estimator: estimate the return with a sample average (`True`) or a lower confidence
                             bound (`False`) obtained from bootstrapping
        :param prefix: to control the saving for the evaluation of an initial policy, `None` to deactivate
        :param num_rollouts: number of rollouts to collect on the target system
        :param num_parallel_envs: number of environments for the parallel sampler (only used for SimEnv)
        :return: estimated return in the target domain
        """
        if save_dir is not None:
            print_cbt(f'Executing {prefix}_policy ...', 'c', bright=True)

        rets_real = to.zeros(num_rollouts)
        if isinstance(inner_env(env), RealEnv):
            # Evaluate sequentially when conducting a sim-to-real experiment
            for i in range(num_rollouts):
                rets_real[i] = rollout(env, policy, eval=True).undiscounted_return()
                # If a reward of -1 is given, skip evaluation ahead and set all returns to zero
                if rets_real[i] == -1:
                    print_cbt('Set all returns for this policy to zero.', color='c')
                    rets_real = to.zeros(num_rollouts)
                    break
        elif isinstance(inner_env(env), SimEnv):
            # Create a parallel sampler when conducting a sim-to-sim experiment
            sampler = ParallelRolloutSampler(env, policy, num_workers=num_parallel_envs, min_rollouts=num_rollouts)
            ros = sampler.sample()
            for i in range(num_rollouts):
                rets_real[i] = ros[i].undiscounted_return()
        else:
            raise pyrado.TypeErr(given=inner_env(env), expected_type=[RealEnv, SimEnv])

        if save_dir is not None:
            # Save the evaluation results
            to.save(rets_real, osp.join(save_dir, f'{prefix}_returns_real.pt'))

            print_cbt('Target domain performance', bright=True)
            print(tabulate([['mean return', to.mean(rets_real).item()],
                            ['std return', to.std(rets_real)],
                            ['min return', to.min(rets_real)],
                            ['max return', to.max(rets_real)]]))

        if mc_estimator:
            return to.mean(rets_real)
        else:
            return to.from_numpy(bootstrap_ci(rets_real.numpy(), np.mean,
                                              num_reps=1000, alpha=0.05, ci_sides=1, studentized=False)[1])
Example #3
def cpp_export(
    save_dir: pyrado.PathLike,
    policy: Policy,
    env: Optional[SimEnv] = None,
    policy_export_name: str = "policy_export",
    write_policy_node: bool = True,
    policy_node_name: str = "policy",
):
    """
    Convenience function to export the policy using PyTorch's scripting or tracing, and the experiment's XML
    configuration if the environment is from RcsPySim.

    :param save_dir: directory to save in
    :param policy: (trained) policy
    :param env: environment the policy was trained in
    :param policy_export_name: name of the exported policy file without the file type ending
    :param write_policy_node: if `True`, write the PyTorch-based control policy into the experiment's XML configuration.
                              This requires the experiment's XML configuration to be exported beforehand.
    :param policy_node_name: name of the control policies node in the XML file, e.g. 'policy' or 'preStrikePolicy'
    """
    if not osp.isdir(save_dir):
        raise pyrado.PathErr(given=save_dir)
    if not isinstance(policy, Policy):
        raise pyrado.TypeErr(given=policy, expected_type=Policy)
    if not isinstance(policy_export_name, str):
        raise pyrado.TypeErr(given=policy_export_name, expected_type=str)

    # Use torch.jit.trace / torch.jit.script (the latter if recurrent) to generate a torch.jit.ScriptModule
    ts_module = policy.double().script()  # can be evaluated like a regular PyTorch module

    # Serialize the script module to a file and save it in the same directory we loaded the policy from
    policy_export_file = osp.join(save_dir, f"{policy_export_name}.pt")
    ts_module.save(policy_export_file)  # former .zip, and before that .pth
    print_cbt(f"Exported the loaded policy to {policy_export_file}",
              "g",
              bright=True)

    # Export the experiment config for C++
    exp_export_file = osp.join(save_dir, "ex_config_export.xml")
    if env is not None and isinstance(inner_env(env), RcsSim):
        inner_env(env).save_config_xml(exp_export_file)
        print_cbt(f"Exported experiment configuration to {exp_export_file}",
                  "g",
                  bright=True)

    # Open the XML file again to add the policy node
    if write_policy_node and osp.isfile(exp_export_file):
        tree = et.parse(exp_export_file)
        root = tree.getroot()
        policy_node = et.Element(policy_node_name)
        policy_node.set("type", "torch")
        policy_node.set("file", f"{policy_export_name}.pt")
        root.append(policy_node)
        tree.write(exp_export_file)
        print_cbt(
            f"Added {policy_export_name}.pt to the experiment configuration.",
            "g")
Example #4
    def eval_policy(
        save_dir: Optional[pyrado.PathLike],
        env: Env,
        policy: Policy,
        prefix: str,
        num_rollouts: int,
        num_workers: int = 1,
    ) -> to.Tensor:
        """
        Evaluate a policy either in the source or in the target domain.
        This method is static to facilitate evaluation of specific policies in hindsight.

        :param save_dir: directory to save the snapshots i.e. the results in, if `None` nothing is saved
        :param env: target environment for evaluation, in the sim-2-sim case this is another simulation instance
        :param policy: policy to evaluate
        :param prefix: to control the saving for the evaluation of an initial policy, `None` to deactivate
        :param num_rollouts: number of rollouts to collect on the target system
        :param num_workers: number of environments for the parallel sampler (only used for SimEnv)
        :return: estimated return in the target domain
        """
        if save_dir is not None:
            print_cbt(f"Executing {prefix}_policy ...", "c", bright=True)

        if isinstance(inner_env(env), RealEnv):
            # Evaluate sequentially when evaluating on a real-world device
            rets_real = []
            for i in range(num_rollouts):
                rets_real.append(
                    rollout(env, policy, eval=True).undiscounted_return())

        elif isinstance(inner_env(env), SimEnv):
            # Create a parallel sampler when evaluating in a simulation
            sampler = ParallelRolloutSampler(env,
                                             policy,
                                             num_workers=num_workers,
                                             min_rollouts=num_rollouts)
            ros = sampler.sample(eval=True)
            rets_real = [ro.undiscounted_return() for ro in ros]
        else:
            raise pyrado.TypeErr(given=inner_env(env),
                                 expected_type=[RealEnv, SimEnv])

        rets_real = to.as_tensor(rets_real, dtype=to.get_default_dtype())

        if save_dir is not None:
            # Save and print the evaluation results
            pyrado.save(rets_real, "returns_real.pt", save_dir, prefix=prefix)
            print_cbt("Target domain performance", bright=True)
            print(
                tabulate([
                    ["mean return", to.mean(rets_real).item()],
                    ["std return", to.std(rets_real)],
                    ["min return", to.min(rets_real)],
                    ["max return", to.max(rets_real)],
                ]))

        return to.mean(rets_real)
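Since `eval_policy` is a static method, it can be called directly on the algorithm class that defines it. In the sketch below `Algo` is only a placeholder for that class, and `env_real` and `policy` are assumed to exist; none of these names are part of the API.

# `Algo` stands in for the algorithm class containing the static method above (placeholder)
rets = Algo.eval_policy(save_dir=None,  # pass None to skip saving and printing
                        env=env_real,
                        policy=policy,
                        prefix="",
                        num_rollouts=5,
                        num_workers=4)
print(f"Estimated return: {rets.item():.3f}")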
Example #5
 def step(self, act: np.ndarray) -> tuple:
     obs, reward, done, info = self.wrapped_env.step(act)
     saw = typed_env(self.wrapped_env, StateAugmentationWrapper)
     nonobserved = to.from_numpy(obs[saw.offset :])
     adversarial = self.get_arpl_grad(self.state, nonobserved)
     if self.decide_apply():
         self.state += adversarial.view(-1).numpy()
     if saw:
         obs[: saw.offset] = inner_env(self).observe(self.state)
     else:
         obs = inner_env(self).observe(self.state)
     return obs, reward, done, info
Example #6
def eval_damping():
    """ Plot joint trajectories for different joint damping parameters """
    # Load experiment and remove possible randomization wrappers
    ex_dir = ask_for_experiment()
    env, policy, _ = load_experiment(ex_dir)
    env = inner_env(env)
    env.domain_param = WAMBallInCupSim.get_nominal_domain_param()

    data = []
    t = []
    dampings = [0., 1e-2, 1e-1, 1e0]
    print_cbt(f'Run policy for damping coefficients: {dampings}')
    for d in dampings:
        env.reset(domain_param=dict(joint_damping=d))
        ro = rollout(env,
                     policy,
                     render_mode=RenderMode(video=False),
                     eval=True)
        t.append(ro.env_infos['t'])
        data.append(ro.env_infos['qpos'])

    fig, ax = plt.subplots(3, sharex='all')
    ls = ['k-', 'b--', 'g-.', 'r:']  # line style setting for better visibility
    for i, idx in enumerate([1, 3, 5]):
        for j in range(len(dampings)):
            ax[i].plot(t[j],
                       data[j][:, idx],
                       ls[j],
                       label=f'damping: {dampings[j]}')
            if i == 0:
                ax[i].legend()
        ax[i].set_ylabel(f'joint {idx} pos [rad]')
    ax[2].set_xlabel('time [s]')
    plt.suptitle('Evaluation of joint damping coefficient')
    plt.show()
Example #7
 def _params_as_tensor(self):
     if self.fixed:
         return self._nominal
     else:
         return np.array([
             inner_env(self.wrapped_env).domain_param[k]
             for k in self._params
         ])
Example #8
def test_domain_param_transforms(env: SimEnv, trafo_class: Type):
    pyrado.set_seed(0)

    # Create a mask for a random domain parameter
    offset = 1
    idx = random.randint(0, len(env.supported_domain_param) - 1)
    sel_dp_change = list(env.supported_domain_param)[idx]
    sel_dp_fix = list(
        env.supported_domain_param)[(idx + offset) %
                                    len(env.supported_domain_param)]
    while (offset == 1 or any([
            item in sel_dp_change for item in VORTEX_ONLY_DOMAIN_PARAM_LIST
    ]) or any([item in sel_dp_fix for item in VORTEX_ONLY_DOMAIN_PARAM_LIST])):
        idx = random.randint(0, len(env.supported_domain_param) - 1)
        sel_dp_change = list(env.supported_domain_param)[idx]
        sel_dp_fix = list(
            env.supported_domain_param)[(idx + offset) %
                                        len(env.supported_domain_param)]
        offset += 1

    mask = (sel_dp_change, )
    wenv = trafo_class(env, mask)
    assert isinstance(wenv, DomainParamTransform)

    # Check 5 random values
    for _ in range(5):
        # Change the selected domain parameter
        new_dp_val = random.random() * env.get_nominal_domain_param()[sel_dp_change]
        new_dp_val = abs(new_dp_val) + 1e-6  # due to the domain of the new params
        transformed_new_dp_val = wenv.forward(new_dp_val)
        wenv.domain_param = {sel_dp_change: transformed_new_dp_val}  # calls inverse transform
        if not isinstance(inner_env(wenv), SimPyEnv):
            # The RcsPySim and MujocoSim classes need to be reset to apply the new domain param
            wenv.reset()

        # Test the actual domain param and the getters
        assert inner_env(wenv)._domain_param[sel_dp_change] == pytest.approx(
            new_dp_val, abs=1e-5)
        assert wenv.domain_param[sel_dp_change] == pytest.approx(new_dp_val,
                                                                 abs=1e-5)
        assert wenv.domain_param[sel_dp_fix] != pytest.approx(new_dp_val)
Example #9
 def __init__(self, env, params: Sequence[str] = None):
     self._params = None
     if isinstance(params, list) and len(params) == 0:
         params = None
     self._all_nominal = inner_env(env).get_nominal_domain_param()
     if params is not None:
         self.params = params
     else:
         self.params = self._all_nominal.keys()
Example #10
    def __init__(self, wrapped_env: Env, domain_param: Sequence[str] = None, fixed: bool = False):
        """
        Constructor

        :param wrapped_env: the environment to be wrapped
        :param domain_param: list of domain parameter names to include in the observation, pass `None` to select all
        :param fixed: fix the parameters
        """
        Serializable._init(self, locals())

        EnvWrapper.__init__(self, wrapped_env)
        if domain_param is not None:
            self._params = domain_param
        else:
            self._params = list(inner_env(self.wrapped_env).domain_param.keys())
        self._nominal = inner_env(self.wrapped_env).get_nominal_domain_param()
        self._nominal = np.array([self._nominal[k] for k in self._params])
        self.fixed = fixed
Example #11
    def __init__(self, wrapped_env: Env, params=None, fixed=False):
        """
        Constructor

        :param wrapped_env: the environment to be wrapped
        :param params: list of domain parameter names to include in the observation, pass `None` to select all
        :param fixed: fix the parameters
        """
        Serializable._init(self, locals())

        EnvWrapper.__init__(self, wrapped_env)
        if params is not None:
            self._params = params
        else:
            self._params = list(
                inner_env(self.wrapped_env).domain_param.keys())
        self._nominal = inner_env(self.wrapped_env).get_nominal_domain_param()
        self._nominal['dt'] = 1 / 100.  # TODO ATTENTION: this overrides the nominal time step and can be dangerous
        self._nominal = np.array([self._nominal[k] for k in self._params])
        self.fixed = fixed
Example #12
def test_combination():
    env = QCartPoleSwingUpSim(dt=1/50., max_steps=20)

    randomizer = create_default_randomizer(env)
    env_r = DomainRandWrapperBuffer(env, randomizer)
    env_r.fill_buffer(num_domains=3)

    dp_before = []
    dp_after = []
    for i in range(4):
        dp_before.append(env_r.domain_param)
        rollout(env_r, DummyPolicy(env_r.spec), eval=True, seed=0, render_mode=RenderMode())
        dp_after.append(env_r.domain_param)
        assert dp_after[i] != dp_before[i]
    assert dp_after[0] == dp_after[3]

    env_rn = ActNormWrapper(env)
    elb = {'x_dot': -213., 'theta_dot': -42.}
    eub = {'x_dot': 213., 'theta_dot': 42., 'x': 0.123}
    env_rn = ObsNormWrapper(env_rn, explicit_lb=elb, explicit_ub=eub)
    alb, aub = env_rn.act_space.bounds
    assert all(alb == -1)
    assert all(aub == 1)
    olb, oub = env_rn.obs_space.bounds
    assert all(olb == -1)
    assert all(oub == 1)

    ro_r = rollout(env_r, DummyPolicy(env_r.spec), eval=True, seed=0, render_mode=RenderMode())
    ro_rn = rollout(env_rn, DummyPolicy(env_rn.spec), eval=True, seed=0, render_mode=RenderMode())
    assert np.allclose(env_rn._process_obs(ro_r.observations), ro_rn.observations)

    env_rnp = ObsPartialWrapper(env_rn, idcs=['x_dot', r'cos_theta'])
    ro_rnp = rollout(env_rnp, DummyPolicy(env_rnp.spec), eval=True, seed=0, render_mode=RenderMode())

    env_rnpa = GaussianActNoiseWrapper(env_rnp,
                                       noise_mean=0.5*np.ones(env_rnp.act_space.shape),
                                       noise_std=0.1*np.ones(env_rnp.act_space.shape))
    ro_rnpa = rollout(env_rnpa, DummyPolicy(env_rnpa.spec), eval=True, seed=0, render_mode=RenderMode())
    assert np.allclose(ro_rnp.actions, ro_rnpa.actions)
    assert not np.allclose(ro_rnp.observations, ro_rnpa.observations)

    env_rnpd = ActDelayWrapper(env_rnp, delay=3)
    ro_rnpd = rollout(env_rnpd, DummyPolicy(env_rnpd.spec), eval=True, seed=0, render_mode=RenderMode())
    assert np.allclose(ro_rnp.actions, ro_rnpd.actions)
    assert not np.allclose(ro_rnp.observations, ro_rnpd.observations)

    assert isinstance(inner_env(env_rnpd), QCartPoleSwingUpSim)
    assert typed_env(env_rnpd, ObsPartialWrapper) is not None
    assert isinstance(env_rnpd, ActDelayWrapper)
    env_rnpdr = remove_env(env_rnpd, ActDelayWrapper)
    assert not isinstance(env_rnpdr, ActDelayWrapper)
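The wrapper-chain helpers exercised in the test above, distilled into a short sketch. It only reuses names that already appear in the test; the import paths are assumptions and may need to be adapted to the installed pyrado version.

from pyrado.environment_wrappers.action_delay import ActDelayWrapper  # import paths assumed
from pyrado.environment_wrappers.action_normalization import ActNormWrapper
from pyrado.environment_wrappers.utils import inner_env, remove_env, typed_env
from pyrado.environments.pysim.quanser_cartpole import QCartPoleSwingUpSim

# Build a small wrapper chain and query it with the helper functions
base_env = QCartPoleSwingUpSim(dt=1 / 50., max_steps=20)
env = ActDelayWrapper(ActNormWrapper(base_env), delay=2)

assert isinstance(inner_env(env), QCartPoleSwingUpSim)  # unwraps down to the innermost env
assert typed_env(env, ActNormWrapper) is not None  # finds the first wrapper of the given type
assert not isinstance(remove_env(env, ActDelayWrapper), ActDelayWrapper)  # strips one wrapper type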
Example #13
def test_bayrn_power(ex_dir, env: SimEnv, bayrn_hparam: dict):
    pyrado.set_seed(0)

    # Environments and domain randomization
    env_real = deepcopy(env)
    env_sim = DomainRandWrapperLive(env, create_zero_var_randomizer(env))
    dp_map = create_default_domain_param_map_qq()
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)
    env_real.domain_param = dict(mass_pend_pole=0.024 * 1.1,
                                 mass_rot_pole=0.095 * 1.1)
    env_real = wrap_like_other_env(env_real, env_sim)

    # Policy and subroutine
    policy_hparam = dict(energy_gain=0.587, ref_energy=0.827)
    policy = QQubeSwingUpAndBalanceCtrl(env_sim.spec, **policy_hparam)
    subrtn_hparam = dict(
        max_iter=1,
        pop_size=8,
        num_init_states_per_domain=1,
        num_is_samples=4,
        expl_std_init=0.1,
        num_workers=1,
    )
    subrtn = PoWER(ex_dir, env_sim, policy, **subrtn_hparam)

    # Set the boundaries for the GP
    dp_nom = inner_env(env_sim).get_nominal_domain_param()
    ddp_space = BoxSpace(
        bound_lo=np.array([
            0.8 * dp_nom["mass_pend_pole"], 1e-8,
            0.8 * dp_nom["mass_rot_pole"], 1e-8
        ]),
        bound_up=np.array([
            1.2 * dp_nom["mass_pend_pole"], 1e-7,
            1.2 * dp_nom["mass_rot_pole"], 1e-7
        ]),
    )

    # Create algorithm and train
    algo = BayRn(ex_dir,
                 env_sim,
                 env_real,
                 subrtn,
                 ddp_space,
                 **bayrn_hparam,
                 num_workers=1)
    algo.train()

    assert algo.curr_iter == algo.max_iter or algo.stopping_criterion_met()
Example #14
 def get_arpl_grad(self, state, nonobserved):
     if isinstance(state, np.ndarray):
         state_tensor = to.tensor(state, requires_grad=True)
     elif isinstance(state, to.Tensor):
         state_tensor = state
     else:
         raise ValueError("state could not be converted to a torch tensor")
     if self.torch_observation:
         observation = inner_env(self).observe(state_tensor, dtype=to.Tensor)
     else:
         observation = state_tensor
     mean_arpl = self._policy.forward(to.cat((observation, nonobserved)))
     l2_norm_mean = -to.norm(mean_arpl, p=2, dim=0)
     l2_norm_mean.backward()
     state_grad = state_tensor.grad
     return self._eps * to.sign(state_grad)
Example #15
    def __init__(self,
                 env: Env,
                 policy: Policy,
                 num_workers: int,
                 num_rollouts_per_param: int,
                 seed: int = None):
        """
        Constructor

        :param env: environment to sample from
        :param policy: policy used for sampling
        :param num_workers: number of parallel samplers
        :param num_rollouts_per_param: number of rollouts per policy parameter set (and init state if specified)
        :param seed: seed value for the random number generators, pass `None` for no seeding
        """
        if not isinstance(num_rollouts_per_param, int):
            raise pyrado.TypeErr(given=num_rollouts_per_param,
                                 expected_type=int)
        if num_rollouts_per_param < 1:
            raise pyrado.ValueErr(given=num_rollouts_per_param,
                                  ge_constraint='1')

        Serializable._init(self, locals())

        # Check environment for domain randomization wrappers (stops after finding the outermost)
        self._dr_wrapper = typed_env(env, DomainRandWrapper)
        if self._dr_wrapper is not None:
            assert isinstance(inner_env(env), SimEnv)
            # Remove them all from the env chain since we sample the domain parameter later explicitly
            env = remove_all_dr_wrappers(env)

        self.env, self.policy = env, policy
        self.num_rollouts_per_param = num_rollouts_per_param

        # Create parallel pool. We use one thread per environment because it's easier.
        self.pool = SamplerPool(num_workers)

        # Set all rngs' seeds
        if seed is not None:
            self.pool.set_seed(seed)

        # Distribute environments. We use pickle to make sure a copy is created for n_envs = 1
        self.pool.invoke_all(_pes_init, pickle.dumps(self.env),
                             pickle.dumps(self.policy))
Example #16
def create_default_randomizer(env: Union[SimEnv, EnvWrapper]) -> DomainRandomizer:
    """
    Create the default randomizer depending on the passed environment.

    :param env: (wrapped) environment that should be perturbed
    :return: default randomizer
    """
    env_type = type(inner_env(env))

    # Try all env base types. This is more or less equivalent to isinstance
    for cand_type in env_type.__mro__:
        env_module = cand_type.__module__
        env_class = cand_type.__name__
        # Try to get it
        dp = default_randomizer_registry.get((env_module, env_class))
        if dp:
            return dp()
    else:
        raise ValueError(f'No default randomizer settings for env of type {env_type}!')
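A usage sketch for the registry lookup above, following the pattern of `test_combination` earlier in this collection: create the default randomizer for a simulation environment and attach it via a domain randomization wrapper. The environment class, its constructor arguments, and the import paths are assumptions.

from pyrado.domain_randomization.default_randomizers import create_default_randomizer  # paths assumed
from pyrado.environment_wrappers.domain_randomization import DomainRandWrapperBuffer
from pyrado.environments.pysim.quanser_cartpole import QCartPoleSwingUpSim

env = QCartPoleSwingUpSim(dt=1 / 100., max_steps=600)
randomizer = create_default_randomizer(env)  # registry lookup via the inner env's MRO
env_r = DomainRandWrapperBuffer(env, randomizer)
env_r.fill_buffer(num_domains=10)  # pre-sample 10 domain parameter sets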
Example #17
    def __init__(self, wrapped_env: Union[SimEnv, EnvWrapper], randomizer: Optional[DomainRandomizer]):
        """
        Constructor

        :param wrapped_env: environment to wrap
        :param randomizer: `DomainRandomizer` object holding the probability distribution of all randomizable
                            domain parameters, pass `None` if you want to subclass wrapping another `DomainRandWrapper`
                            and use its randomizer
        """
        if not isinstance(inner_env(wrapped_env), SimEnv):
            raise pyrado.TypeErr(given=wrapped_env, expected_type=SimEnv)
        if not isinstance(randomizer, DomainRandomizer) and randomizer is not None:
            raise pyrado.TypeErr(given=randomizer, expected_type=DomainRandomizer)

        Serializable._init(self, locals())

        # Invoke EnvWrapper's constructor
        super().__init__(wrapped_env)

        self._randomizer = randomizer
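The constructor above is shared by the concrete domain randomization wrappers. A typical instantiation, mirroring the BayRn test earlier in this collection, wraps a simulation environment together with a live randomizer; the names below are taken from that test and the corresponding imports are assumed to be in place.

# `env` must be a SimEnv (or a wrapper around one), as enforced by the type check above
env_sim = DomainRandWrapperLive(env, create_zero_var_randomizer(env))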
Example #18
from pyrado.plotting.distribution import draw_posterior_pairwise_scatter
from pyrado.utils.argparser import get_argparser

if __name__ == "__main__":
    # Parse command line arguments
    args = get_argparser().parse_args()
    plt.rc("text", usetex=args.use_tex)
    if not isinstance(args.num_samples, int) or args.num_samples < 1:
        raise pyrado.ValueErr(given=args.num_samples, ge_constraint="1")

    # NPDR
    ex_dir_npdr = os.path.join(pyrado.TEMP_DIR, "mg-ik", "npdr_time", "")
    algo = Algorithm.load_snapshot(ex_dir_npdr)
    if not isinstance(algo, NPDR):
        raise pyrado.TypeErr(given=algo, expected_type=NPDR)
    env_sim = inner_env(pyrado.load("env_sim.pkl", ex_dir_npdr))
    prior_npdr = pyrado.load("prior.pt", ex_dir_npdr)
    posterior_npdr = algo.load_posterior(ex_dir_npdr,
                                         idx_iter=0,
                                         idx_round=6,
                                         obj=None,
                                         verbose=True)  # CHOICE
    data_real_npdr = pyrado.load(f"data_real.pt",
                                 ex_dir_npdr,
                                 prefix="iter_0",
                                 verbose=True)  # CHOICE
    domain_params_npdr, log_probs = SBIBase.eval_posterior(
        posterior_npdr,
        data_real_npdr,
        args.num_samples,
        normalize_posterior=False,  # not necessary here
Example #19
    def train_policy_sim(self,
                         domain_params: to.Tensor,
                         prefix: str,
                         cnt_rep: int,
                         use_rec_init_states: bool = True) -> float:
        """
        Train a policy in simulation for given hyper-parameters from the domain randomizer.

        :param domain_params: domain parameters sampled from the posterior [shape N x D where N is the number of
                              samples and D is the number of domain parameters]
        :param prefix: set a prefix to the saved file name, use "" for no prefix
        :param cnt_rep: current repetition count, coming from the wrapper function
        :param use_rec_init_states: if `True`, the previous rollout will be loaded to extract the initial states, and
                                    sync them with the recorded ones
        :return: estimated return of the trained policy in the target domain
        """
        if not (domain_params.ndim == 2
                and domain_params.shape[1] == len(self.dp_mapping)):
            raise pyrado.ShapeErr(given=domain_params,
                                  expected_match=(-1, len(self.dp_mapping)))

        # Insert the domain parameters into the wrapped environment's buffer
        self.fill_domain_param_buffer(self._env_sim_trn, self.dp_mapping,
                                      domain_params)

        # Set the initial state spaces of the simulation environment to match the observed initial states
        if use_rec_init_states:
            rollouts_real = pyrado.load("rollouts_real.pkl",
                                        self._save_dir,
                                        prefix=prefix)
            init_states_real = np.stack(
                [ro.states[0, :] for ro in rollouts_real])
            if not init_states_real.shape == (
                    len(rollouts_real),
                    self._env_sim_trn.state_space.flat_dim):
                raise pyrado.ShapeErr(
                    given=init_states_real,
                    expected_match=(len(rollouts_real),
                                    self._env_sim_trn.state_space.flat_dim))
            inner_env(
                self._env_sim_trn).init_space = DiscreteSpace(init_states_real)
            print_cbt(
                "The simulation environment's initial states have been set to the recorded ones.",
                "w")

        # Reset the subroutine algorithm which includes resetting the exploration
        self._cnt_samples += self._subrtn_policy.sample_count
        self._subrtn_policy.reset()

        # Propagate the updated training environment to the SamplerPool's workers
        if hasattr(self._subrtn_policy, "sampler"):
            self._subrtn_policy.sampler.reinit(env=self._env_sim_trn)
        else:
            raise pyrado.KeyErr(keys="sampler", container=self._subrtn_policy)

        # Do a warm start, but randomly reset the policy parameters if training failed once
        self._subrtn_policy.init_modules(self.warmstart and cnt_rep == 0)

        # Train a policy in simulation using the subroutine
        self._subrtn_policy.train(
            snapshot_mode=self._subrtn_policy_snapshot_mode,
            meta_info=dict(prefix=prefix))

        # Return the estimated return of the trained policy in simulation
        assert len(self._env_sim_trn.buffer) == self.num_eval_samples
        self._env_sim_trn.ring_idx = 0  # don't reset the buffer to eval on the same domains as trained
        avg_ret_sim = self.eval_policy(None, self._env_sim_trn,
                                       self._subrtn_policy.policy, prefix,
                                       self.num_eval_samples)
        return float(avg_ret_sim)
Example #20
    def __init__(
        self,
        save_dir: pyrado.PathLike,
        env_sim: SimEnv,
        env_real: Union[Env, str],
        policy: Policy,
        dp_mapping: Mapping[int, str],
        prior: Distribution,
        embedding: Embedding,
        num_checkpoints: int,
        init_checkpoint: int,
        max_iter: int,
        num_real_rollouts: int,
        num_sim_per_round: int,
        num_segments: int = None,
        len_segments: int = None,
        stop_on_done: bool = True,
        use_rec_act: bool = True,
        num_sbi_rounds: int = 1,
        reset_sbi_routine_each_iter: bool = False,
        reset_proposal_each_iter: bool = False,
        num_eval_samples: Optional[int] = None,
        posterior_hparam: Optional[dict] = None,
        subrtn_sbi_training_hparam: Optional[dict] = None,
        subrtn_sbi_sampling_hparam: Optional[dict] = None,
        simulation_batch_size: int = 1,
        normalize_posterior: bool = True,
        subrtn_policy: Optional[Algorithm] = None,
        subrtn_policy_snapshot_mode: str = "latest",
        train_initial_policy: bool = True,
        thold_succ_subrtn: float = -pyrado.inf,
        warmstart: bool = True,
        num_workers: int = 4,
        logger: Optional[StepLogger] = None,
    ):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env_sim: randomized simulation environment a.k.a. source domain
        :param env_real: real-world environment a.k.a. target domain, this can be a `RealEnv` (sim-to-real setting), a
                         `SimEnv` (sim-to-sim setting), or a directory to load a pre-recorded set of rollouts from
        :param policy: policy used for sampling the rollout, if subrtn_policy is not `None` this policy is not only used
                       for generating the target domain rollouts, but also optimized in simulation
        :param dp_mapping: mapping from subsequent integers (starting at 0) to domain parameter names (e.g. mass)
        :param prior: distribution used by sbi as a prior
        :param embedding: embedding used for pre-processing the data before passing it to the posterior
        :param num_checkpoints: total number of checkpoints
        :param init_checkpoint: initial value of the cyclic counter, defaults to 0; negative values can be used to mark
                                sections that should only be executed once
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param num_real_rollouts: number of real-world rollouts received by sbi, i.e. from every rollout exactly one
                                  data set is computed
        :param num_sim_per_round: number of simulations done by sbi per round (i.e. iteration over the same target domain data set)
        :param num_segments: number of segments into which the rollouts are split. For every segment, the initial
                            state of the simulation is reset, and thus for every set the features of the trajectories
                            are computed separately. Either specify `num_segments` or `len_segments`.
        :param len_segments: length of the segments into which the rollouts are split. For every segment, the initial
                             state of the simulation is reset, and thus for every set the features of the trajectories
                             are computed separately. Either specify `num_segments` or `len_segments`.
        :param stop_on_done: if `True`, the rollouts are stopped as soon as they hit the state or observation space
                             boundaries. This behavior is safe, but can lead to short trajectories which are eventually
                             padded with zeroes. Choose `False` to ignore the boundaries (dangerous on the real system).
        :param use_rec_act: if `True` the recorded actions from the target domain are used to generate the rollout
                            during simulation (feed-forward). If `False` the policy is used to generate (potentially)
                            state-dependent actions (feed-back).
        :param reset_sbi_routine_each_iter: if `True` the sbi subroutine instance is recreated every iteration.
                                            Use this flag to train the posterior each iteration from scratch.
        :param num_sbi_rounds: set to an integer > 1 to use multi-round sbi. This way the posteriors (saved as
                               `..._round_NUMBER...` will be tailored to the data of that round, where `NUMBER`
                               counts up each round (modulo `num_real_rollouts`). If `num_sbi_rounds` = 1, the posterior
                               is called amortized (it has never seen any target domain data).
        :param num_eval_samples: number of samples for evaluating the posterior in `eval_posterior()`
        :param posterior_hparam: hyper-parameters for creating the posterior's density estimator
        :param subrtn_sbi_training_hparam: dict forwarded to sbi's `PosteriorEstimator.train()` function like
                                           `training_batch_size`, `learning_rate`, `retrain_from_scratch_each_round`, etc.
        :param simulation_batch_size: batch size forwarded to the sbi toolbox, requires batched simulator
        :param normalize_posterior: if `True` the normalization of the posterior density is enforced by sbi
        :param subrtn_policy: algorithm which performs the optimization of the behavioral policy (and value-function)
        :param subrtn_policy_snapshot_mode: snapshot mode for saving during policy optimization
        :param train_initial_policy: choose if a policy should be pretrained in the first iteration before collecting
                                     real rollouts. Choose `False`, if you want to use a pre-defined policy.
        :param thold_succ_subrtn: success threshold on the simulated system's return for the subroutine, repeat the
                                  subroutine until the threshold is exceeded or for a given number of iterations
        :param warmstart: initialize the policy (and value function) parameters with the one of the previous iteration.
                          This behavior can also be overruled by passing `init_policy_params` (and
                          `valuefcn_param_init`) explicitly.
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(inner_env(env_sim), SimEnv) or (
                isinstance(env_sim, DomainRandWrapper)
                and not isinstance(env_sim, ActDelayWrapper)):
            raise pyrado.TypeErr(
                msg=
                "The given env_sim must be a non-randomized simulation environment, "
                "except for wrappers that add a domain parameter!")
        if isinstance(prior, Normal):
            raise pyrado.TypeErr(
                msg=
                "The sbi framework requires MultivariateNormal instead of Normal distributions for the prior."
            )
        if not prior.event_shape[0] == len(dp_mapping):
            raise pyrado.ShapeErr(given=prior.event_shape,
                                  expected_match=dp_mapping)
        if posterior_hparam is None:
            posterior_hparam = dict()
        elif not isinstance(posterior_hparam, dict):
            raise pyrado.TypeErr(given=posterior_hparam, expected_type=dict)
        if subrtn_sbi_training_hparam is None:
            subrtn_sbi_training_hparam = dict()
        elif not isinstance(subrtn_sbi_training_hparam, dict):
            raise pyrado.TypeErr(given=subrtn_sbi_training_hparam,
                                 expected_type=dict)

        # Call InterruptableAlgorithm's constructor
        super().__init__(
            num_checkpoints=num_checkpoints,
            init_checkpoint=init_checkpoint,
            save_dir=save_dir,
            max_iter=max_iter,
            policy=policy,
            logger=logger,
        )

        self._env_sim_sbi = env_sim  # will be randomized explicitly by sbi
        self._env_sim_trn = DomainRandWrapperBuffer(copy.deepcopy(env_sim),
                                                    randomizer=None,
                                                    selection="cyclic")
        self._env_real = env_real
        self.dp_mapping = dp_mapping
        self._embedding = embedding
        self.num_sim_per_round = num_sim_per_round
        self.num_real_rollouts = num_real_rollouts
        self.num_segments = num_segments
        self.len_segments = len_segments
        self.stop_on_done = stop_on_done
        self.use_rec_act = use_rec_act
        self.reset_sbi_routine_each_iter = reset_sbi_routine_each_iter
        self.reset_proposal_each_iter = reset_proposal_each_iter
        self.num_sbi_rounds = num_sbi_rounds
        self.num_eval_samples = num_eval_samples or 10 * 2**len(dp_mapping)
        self.simulation_batch_size = simulation_batch_size
        self.normalize_posterior = normalize_posterior
        self._subrtn_sbi = None
        self.subrtn_sbi_training_hparam = subrtn_sbi_training_hparam or dict()
        self.posterior_hparam = posterior_hparam or dict()
        self.thold_succ_subrtn = float(thold_succ_subrtn)
        self.max_subrtn_rep = 3  # number of tries to exceed thold_succ_subrtn during training in simulation
        self.warmstart = warmstart
        self.num_workers = int(num_workers)

        # Temporary containers
        self._curr_data_real = None
        self._curr_domain_param_eval = None

        # Initialize sbi simulator and prior
        self._sbi_simulator = None  # to be set in step()
        self._sbi_prior = None  # to be set in step()
        self._setup_sbi(prior=prior)

        # Optional policy optimization subroutine
        self._subrtn_policy = subrtn_policy
        if isinstance(self._subrtn_policy, Algorithm):
            self._subrtn_policy_snapshot_mode = subrtn_policy_snapshot_mode
            self._subrtn_policy.save_name = "subrtn_policy"
            self._train_initial_policy = train_initial_policy
            # Check that the behavioral policy is the one that is being updated
            if self._subrtn_policy.policy is not self.policy:
                raise pyrado.ValueErr(
                    msg=
                    "The policy of the policy subroutine is not the same as the one used by "
                    "the system identification (sbi) subroutine!")

        # Save initial environments, the embedding, and the prior
        pyrado.save(self._env_sim_trn, "env_sim.pkl", self._save_dir)
        pyrado.save(self._env_real, "env_real.pkl", self._save_dir)
        pyrado.save(embedding, "embedding.pt", self._save_dir)
        pyrado.save(prior, "prior.pt", self._save_dir)
        pyrado.save(policy,
                    "init_policy.pt",
                    self._save_dir,
                    use_state_dict=True)
Example #21
    def collect_data_real(
        save_dir: Optional[pyrado.PathLike],
        env: Union[Env, str],
        policy: Policy,
        embedding: Embedding,
        num_rollouts: int,
        num_segments: int = None,
        len_segments: int = None,
        prefix: str = "",
    ) -> Tuple[to.Tensor, List[StepSequence]]:
        """
        Roll-out a (behavioral) policy on the target system for later use with the sbi module, and save the data
        computed from the recorded rollouts.
        This method is static to facilitate evaluation of specific policies in hindsight.

        :param save_dir: directory to save the snapshots i.e. the results in, if `None` nothing is saved
        :param env: target environment for evaluation, in the sim-2-sim case this is another simulation instance,
                    in case you want to use pre-recorded rollouts pass the path to the parent folder as string
        :param policy: policy to evaluate
        :param embedding: embedding used for pre-processing the data before passing it to the posterior
        :param num_rollouts: number of rollouts to collect on the target system
        :param num_segments: number of segments into which the rollouts are split. For every segment, the initial
                             state of the simulation is reset, and thus for every set the features of the trajectories
                             are computed separately. Either specify `num_segments` or `len_segments`.
        :param len_segments: length of the segments into which the rollouts are split. For every segment, the initial
                             state of the simulation is reset, and thus for every set the features of the trajectories
                             are computed separately. Either specify `num_segments` or `len_segments`.
        :param prefix: to control the saving for the evaluation of an initial policy, `None` to deactivate
        :return: data from the real-world rollouts a.k.a. set of $x_o$ of shape [num_iter, num_rollouts_per_iter,
                 time_series_length, dim_data], and the real-world rollouts
        """
        if not (isinstance(inner_env(env), RealEnv)
                or isinstance(inner_env(env), SimEnv) or isinstance(env, str)):
            raise pyrado.TypeErr(given=inner_env(env),
                                 expected_type=[RealEnv, SimEnv, str])

        # Evaluate sequentially (necessary for sim-to-real experiments)
        if isinstance(env, str):
            rollout_worker = RecRolloutSamplerForSBI(env,
                                                     embedding,
                                                     num_segments,
                                                     len_segments,
                                                     rand_init_rollout=False)
        else:
            rollout_worker = RealRolloutSamplerForSBI(env, policy, embedding,
                                                      num_segments,
                                                      len_segments)

        # Initialize data containers
        data_real = None
        rollouts_real = None
        num_found_rollouts = 0
        if save_dir is not None:
            try:
                data_real = pyrado.load("data_real.pt",
                                        save_dir,
                                        prefix=prefix)
                rollouts_real = pyrado.load("rollouts_real.pkl",
                                            save_dir,
                                            prefix=prefix)
                if not data_real.shape[0] == len(rollouts_real):
                    raise pyrado.ShapeErr(
                        msg=
                        f"Found {data_real.shape[0]} entries in data_real.pt, but {len(rollouts_real)} rollouts in "
                        f"rollouts_real.pkl!")
                num_found_rollouts = len(rollouts_real)
                print_cbt(
                    f"Found {num_found_rollouts} rollout(s) in {save_dir}.",
                    "w")
            except FileNotFoundError:
                pass  # in the first attempt no files can be found

        collect_str = f"Collecting data" if prefix == "" else f"Collecting data using {prefix}_policy"
        for _ in tqdm(
                range(num_found_rollouts, num_rollouts),
                total=num_rollouts,
                desc=Fore.CYAN + Style.BRIGHT + collect_str + Style.RESET_ALL,
                unit="rollouts",
                file=sys.stdout,
        ):
            # Do the rollout
            data, rollout = rollout_worker()

            # Fill data container
            if data_real is None or rollouts_real is None:
                data_real = data  # data is of shape [1, dim_feat]
                rollouts_real = [rollout]
            else:
                data_real = to.cat(
                    [data_real, data],
                    dim=1)  # stack to final shape [1, num_rollouts * dim_feat]
                rollouts_real.append(rollout)

            # Optionally save the data (do this at every iteration to continue)
            if save_dir is not None:
                pyrado.save(data_real, "data_real.pt", save_dir, prefix=prefix)
                pyrado.save(rollouts_real,
                            "rollouts_real.pkl",
                            save_dir,
                            prefix=prefix)

        if data_real.shape != (1, num_rollouts * embedding.dim_output):
            raise pyrado.ShapeErr(given=data_real,
                                  expected_match=(1, num_rollouts *
                                                  embedding.dim_output))

        return data_real, rollouts_real
Example #22
        # init_state = np.array([1/np.sqrt(2), -1/np.sqrt(2)])
        # init_state = np.array([0., -1.])
        # init_state = np.array([-1/np.sqrt(2), -1/np.sqrt(2)])
        # init_state = np.array([-1., 0.])
        # init_state = np.array([-1/np.sqrt(2), 1/np.sqrt(2)])
        init_state = np.array([0., 1.])
        init_state *= 0.103125  # distance scaling [m]
    pdctrl.reset(state_des=init_state)
    print_cbt(
        f'Set up the PD-controller for the QBallBalancerReal environment.\nDesired state: {init_state}',
        'c')

    ros = []
    for r in range(args.num_runs):
        # Run PD-controller on the device to get the ball into position
        env_real = inner_env(env_real)  # since we are reusing it
        print_cbt('Running the PD-controller ...', 'c', bright=True)
        rollout(env_real,
                pdctrl,
                eval=True,
                max_steps=2000,
                render_mode=RenderMode())
        env_real.reset()

        # Wrap the real environment in the same way as done during training (do this after the PD controller finished)
        env_real = wrap_like_other_env(env_real, env_sim)

        # Run learned policy on the device
        print_cbt('Running the evaluation policy ...', 'c', bright=True)
        ros.append(
            rollout(env_real,
Example #23
from pyrado.utils.experiments import wrap_like_other_env, load_experiment
from pyrado.utils.input_output import print_cbt
from pyrado.utils.argparser import get_argparser

if __name__ == '__main__':
    # Parse command line arguments
    args = get_argparser().parse_args()

    # Get the experiment's directory to load from
    ex_dir = ask_for_experiment()

    # Load the policy (trained in simulation) and the environment (for constructing the real-world counterpart)
    env_sim, policy, _ = load_experiment(ex_dir)

    # Detect the correct real-world counterpart and create it
    if isinstance(inner_env(env_sim), QBallBalancerSim):
        env_real = QBallBalancerReal(dt=args.dt, max_steps=args.max_steps)
    elif isinstance(inner_env(env_sim), QCartPoleSim):
        env_real = QCartPoleReal(dt=args.dt, max_steps=args.max_steps)
    elif isinstance(inner_env(env_sim), QQubeSim):
        env_real = QQubeReal(dt=args.dt, max_steps=args.max_steps)
    else:
        raise pyrado.TypeErr(
            given=env_sim,
            expected_type=[QBallBalancerSim, QCartPoleSim, QQubeSim])
    print_cbt(f'Set up env {env_real.name}.', 'c')

    # Finally, wrap the env in the same way as done during training
    env_real = wrap_like_other_env(env_real, env_sim)

    # Run on device
Example #24
 def set_adv(self, params):
     for key, value in zip(self._params, params):
         inner_env(self.wrapped_env).domain_param[key] = self._nominal[key] + value
Example #25
from pyrado.utils.experiments import wrap_like_other_env, load_experiment
from pyrado.utils.input_output import print_cbt
from pyrado.utils.argparser import get_argparser

if __name__ == '__main__':
    # Parse command line arguments
    args = get_argparser().parse_args()

    # Get the experiment's directory to load from
    ex_dir = ask_for_experiment()

    # Load the policy (trained in simulation) and the environment (for constructing the real-world counterpart)
    env_sim, policy, _ = load_experiment(ex_dir)

    # Detect the correct real-world counterpart and create it
    if isinstance(inner_env(env_sim), WAMBallInCupSim):
        # If `max_steps` (or `dt`) are not explicitly set using `args`, use the same as in the simulation
        max_steps = args.max_steps if args.max_steps < pyrado.inf else env_sim.max_steps
        dt = args.dt if args.dt is not None else env_sim.dt
        env_real = WAMBallInCupReal(dt=dt, max_steps=max_steps)
    else:
        raise pyrado.TypeErr(given=env_sim, expected_type=WAMBallInCupSim)

    # Finally, wrap the env in the same way as done during training
    env_real = wrap_like_other_env(env_real, env_sim)

    # Run on device
    done = False
    while not done:
        ro = rollout(env_real, policy, eval=True)
        print_cbt(f'Return: {ro.undiscounted_return()}', 'g', bright=True)
Example #26
def plot_actions(ro: StepSequence, env: Env):
    """
    Plot all action trajectories of the given rollout.

    :param ro: input rollout
    :param env: environment (used for getting the clipped action values)
    """
    if hasattr(ro, "actions"):
        if not isinstance(ro.actions, np.ndarray):
            raise pyrado.TypeErr(given=ro.actions, expected_type=np.ndarray)

        dim_act = ro.actions.shape[1]
        # Use recorded time stamps if possible
        t = getattr(ro, "time", np.arange(0, ro.length + 1))[:-1]

        num_rows, num_cols = num_rows_cols_from_length(dim_act,
                                                       transposed=True)
        fig, axs = plt.subplots(num_rows,
                                num_cols,
                                figsize=(10, 8),
                                tight_layout=True)
        fig.canvas.manager.set_window_title("Actions over Time")
        axs = np.atleast_2d(axs)
        axs = correct_atleast_2d(axs)
        colors = plt.get_cmap("tab20")(np.linspace(0, 1, dim_act))

        act_norm_wrapper = typed_env(env, ActNormWrapper)
        if act_norm_wrapper is not None:
            lb, ub = inner_env(env).act_space.bounds
            act_denorm = lb + (ro.actions + 1.0) * (ub - lb) / 2
            act_clipped = np.array(
                [inner_env(env).limit_act(a) for a in act_denorm])
        else:
            act_denorm = ro.actions
            act_clipped = np.array([env.limit_act(a) for a in ro.actions])

        if dim_act == 1:
            axs[0, 0].plot(t, act_denorm, label="to env")
            axs[0, 0].plot(t, act_clipped, label="clipped", c="k", ls="--")
            axs[0, 0].legend(ncol=2)
            axs[0, 0].set_ylabel(_get_act_label(ro, 0))
        else:
            for idx_a in range(dim_act):
                axs[idx_a // num_cols,
                    idx_a % num_cols].plot(t,
                                           act_denorm[:, idx_a],
                                           label="to env",
                                           c=colors[idx_a])
                axs[idx_a // num_cols,
                    idx_a % num_cols].plot(t,
                                           act_clipped[:, idx_a],
                                           label="clipped",
                                           c="k",
                                           ls="--")
                axs[idx_a // num_cols, idx_a % num_cols].legend(ncol=2)
                axs[idx_a // num_cols,
                    idx_a % num_cols].set_ylabel(_get_act_label(ro, idx_a))

        # Put legends to the right of the plot
        if dim_act < 8:  # otherwise it gets too cluttered
            for a in fig.get_axes():
                a.legend(ncol=2)

        plt.subplots_adjust(hspace=0.2)
Example #27
def experiment_w_distruber(env_real: RealEnv, env_sim: SimEnv):
    # Wrap the environment in the same way as done during training
    env_real = wrap_like_other_env(env_real, env_sim)

    # Run learned policy on the device
    print_cbt('Running the evaluation policy ...', 'c', bright=True)
    ro1 = rollout(env_real,
                  policy,
                  eval=True,
                  max_steps=args.max_steps // 3,
                  render_mode=RenderMode(),
                  no_reset=True,
                  no_close=True)

    # Run disturber
    env_real = inner_env(env_real)  # since we are reusing it
    print_cbt('Running the 1st disturber ...', 'c', bright=True)
    rollout(env_real,
            disturber_pos,
            eval=True,
            max_steps=steps_disturb,
            render_mode=RenderMode(),
            no_reset=True,
            no_close=True)

    # Wrap the environment in the same way as done during training
    env_real = wrap_like_other_env(env_real, env_sim)

    # Run learned policy on the device
    print_cbt('Running the evaluation policy ...', 'c', bright=True)
    ro2 = rollout(env_real,
                  policy,
                  eval=True,
                  max_steps=args.max_steps // 3,
                  render_mode=RenderMode(),
                  no_reset=True,
                  no_close=True)

    # Run disturber
    env_real = inner_env(env_real)  # since we are reusing it
    print_cbt('Running the 2nd disturber ...', 'c', bright=True)
    rollout(env_real,
            disturber_neg,
            eval=True,
            max_steps=steps_disturb,
            render_mode=RenderMode(),
            no_reset=True,
            no_close=True)

    # Wrap the environment in the same way as done during training
    env_real = wrap_like_other_env(env_real, env_sim)

    # Run learned policy on the device
    print_cbt('Running the evaluation policy ...', 'c', bright=True)
    ro3 = rollout(env_real,
                  policy,
                  eval=True,
                  max_steps=args.max_steps // 3,
                  render_mode=RenderMode(),
                  no_reset=True,
                  no_close=True)

    return StepSequence.concat([ro1, ro2, ro3])
Example #28
    # Parse command line arguments
    parser = get_argparser()
    parser.add_argument("--render3D", action="store_true", default=False, help="render the GP in 3D")
    args = parser.parse_args()
    plt.rc("text", usetex=args.use_tex)

    # Get the experiment's directory to load from
    ex_dir = ask_for_experiment(hparam_list=args.show_hparams) if args.dir is None else args.dir

    env_sim = joblib.load(osp.join(ex_dir, "env_sim.pkl"))
    if not typed_env(env_sim, MetaDomainRandWrapper):
        raise pyrado.TypeErr(given_name=env_sim, expected_type=MetaDomainRandWrapper)
    labels_sel_dims = [env_sim.dp_mapping[args.idcs[i]][0] for i in range(len(args.idcs))]

    env_real = joblib.load(osp.join(ex_dir, "env_real.pkl"))
    if isinstance(inner_env(env_real), SimEnv):
        # Use actual ground truth domain param if sim-2-sim setting
        domain_params = env_real.domain_param
    else:
        # Use nominal domain param if sim-2-real setting
        domain_params = inner_env(env_sim).get_nominal_domain_param()
    for dp_name, dp_val in domain_params.items():
        if dp_name in labels_sel_dims[0]:
            gt_val_x = dp_val
        try:
            if dp_name == labels_sel_dims[1]:
                gt_val_y = dp_val
        except Exception:
            gt_val_y = None

    cands = pyrado.load("candidates.pt", ex_dir)
Example #29
from pyrado.environments.rcspysim.base import RcsSim
from pyrado.logger.experiment import ask_for_experiment
from pyrado.utils.argparser import get_argparser
from pyrado.utils.experiments import load_experiment
from pyrado.utils.input_output import print_cbt


if __name__ == '__main__':
    # Parse command line arguments
    args = get_argparser().parse_args()

    # Get the experiment's directory to load from
    ex_dir = ask_for_experiment() if args.ex_dir is None else args.ex_dir

    # Load the policy (trained in simulation)
    env, policy, _ = load_experiment(ex_dir)

    # Use torch.jit.trace / torch.jit.script (the latter if recurrent) to generate a torch.jit.ScriptModule
    ts_module = policy.script()  # can be evaluated like a regular PyTorch module

    # Serialize the script module to a file and save it in the same directory we loaded the policy from
    policy_export_file = osp.join(ex_dir, 'policy_export.pt')
    ts_module.save(policy_export_file)  # former .zip, and before that .pth
    print_cbt(f'Exported the loaded policy to\n{policy_export_file}', 'g', bright=True)

    # Export the experiment config for C++
    if isinstance(inner_env(env), RcsSim):
        exp_export_file = osp.join(ex_dir, f'ex_{env.name}_export.xml')
        inner_env(env).save_config_xml(exp_export_file)
        print_cbt(f'Exported experiment configuration to\n{exp_export_file}', 'g', bright=True)
Example #30
    def __init__(
        self,
        env: Union[SimEnv, EnvWrapper],
        policy: Policy,
        num_init_states_per_domain: int,
        num_domains: int,
        num_workers: int,
        seed: Optional[int] = NO_SEED_PASSED,
    ):
        """
        Constructor

        :param env: environment to sample from
        :param policy: policy used for sampling
        :param num_init_states_per_domain: number of rollouts to cover the variance over initial states
        :param num_domains: number of rollouts due to the variance over domain parameters
        :param num_workers: number of parallel samplers
        :param seed: seed value for the random number generators, pass `None` for no seeding; defaults to the last seed
                     that was set with `pyrado.set_seed`
        """
        if not isinstance(num_init_states_per_domain, int):
            raise pyrado.TypeErr(given=num_init_states_per_domain,
                                 expected_type=int)
        if num_init_states_per_domain < 1:
            raise pyrado.ValueErr(given=num_init_states_per_domain,
                                  ge_constraint="1")
        if not isinstance(num_domains, int):
            raise pyrado.TypeErr(given=num_domains, expected_type=int)
        if num_domains < 1:
            raise pyrado.ValueErr(given=num_domains, ge_constraint="1")

        Serializable._init(self, locals())

        # Check environment for domain randomization wrappers (stops after finding the outermost)
        self._dr_wrapper = typed_env(env, DomainRandWrapper)
        if self._dr_wrapper is not None:
            assert isinstance(inner_env(env), SimEnv)
            # Remove them all from the env chain since we sample the domain parameter later explicitly
            env = remove_all_dr_wrappers(env)

        self.env, self.policy = env, policy
        self.num_init_states_per_domain = num_init_states_per_domain
        self.num_domains = num_domains

        # Set method to spawn if using cuda
        if mp.get_start_method(allow_none=True) != "spawn":
            mp.set_start_method("spawn", force=True)

        # Create parallel pool. We use one thread per environment because it's easier.
        self.pool = SamplerPool(num_workers)

        if seed is NO_SEED_PASSED:
            seed = pyrado.get_base_seed()
        self._seed = seed
        # Initialize with -1 such that we start with the 0-th sample. Incrementing after sampling may cause issues when
        # the sampling crashes and the sample count is not incremented.
        self._sample_count = -1

        # Distribute environments. We use pickle to make sure a copy is created for n_envs = 1
        self.pool.invoke_all(_pes_init, pickle.dumps(self.env),
                             pickle.dumps(self.policy))