def test_truncate_rollouts(env, num_real_ros, num_sim_ros, max_real_steps, max_sim_steps):
    policy = DummyPolicy(env.spec)
    ros_real = []
    ros_sim = []

    # Create the rollout data
    for _ in range(num_real_ros):
        ros_real.append(rollout(env, policy, eval=True, max_steps=max_real_steps, stop_on_done=True))
    for _ in range(num_sim_ros):
        ros_sim.append(rollout(env, policy, eval=True, max_steps=max_sim_steps, stop_on_done=True))

    # Truncate them
    ros_real_tr, ros_sim_tr = SysIdViaEpisodicRL.truncate_rollouts(ros_real, ros_sim)

    # Obtained the right number of rollouts
    assert len(ros_real_tr) == len(ros_sim_tr)

    for ro_r, ro_s in zip(ros_real_tr, ros_sim_tr):
        # All individual truncated rollouts have the correct length
        assert ro_r.length == ro_s.length
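# Hedged illustration (an assumption about the semantics checked above, not the actual
# SysIdViaEpisodicRL.truncate_rollouts implementation): pairwise truncation can be pictured as
# cutting every paired sequence to the length of the shorter one, sketched here on plain sequences.
def _truncate_pairwise_sketch(seqs_a, seqs_b):
    """Hypothetical helper cutting paired sequences (e.g. lists of states) to their common length."""
    truncated_a, truncated_b = [], []
    for a, b in zip(seqs_a, seqs_b):
        n = min(len(a), len(b))
        truncated_a.append(a[:n])
        truncated_b.append(b[:n])
    return truncated_a, truncated_b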
def test_training_parameter_exploring(ex_dir, env: SimEnv, algo, algo_hparam):
    # Environment and policy
    env = ActNormWrapper(env)
    policy_hparam = dict(feats=FeatureStack([const_feat, identity_feat]))
    policy = LinearPolicy(spec=env.spec, **policy_hparam)

    # Get the initial return for comparison
    rets_before = np.zeros(5)
    for i in range(rets_before.size):
        rets_before[i] = rollout(env, policy, eval=True, seed=i).undiscounted_return()

    # Create the algorithm and train
    algo_hparam['num_workers'] = 1
    algo = algo(ex_dir, env, policy, **algo_hparam)
    algo.train()
    policy.param_values = algo.best_policy_param  # mimic saving and loading

    # Compare the returns before and after training for max_iter iterations
    rets_after = np.zeros_like(rets_before)
    for i in range(rets_before.size):
        rets_after[i] = rollout(env, policy, eval=True, seed=i).undiscounted_return()

    assert all(rets_after > rets_before)
def _eval_cand_and_ref_one_domain(self, i: int) -> tuple:
    """
    Evaluate the candidate and the k-th reference solution (see outer loop) in the i-th domain using nJ rollouts.

    :param i: index of the domain to evaluate in
    :return: average return values for the candidate and the k-th reference in the i-th domain
    """
    cand_ret_avg = 0.0
    refs_ret_avg = 0.0

    # Do nJ rollouts for each set of physics params
    for r in range(self.nJ):
        # Set the circular index for the particular realization
        self.env_dr.ring_idx = i
        # Do the rollout and collect the return
        ro_cand = rollout(self.env_dr, self._subrtn_cand.policy, eval=True, seed=self.base_seed + i * self.nJ + r)
        cand_ret_avg += ro_cand.undiscounted_return()

        # Set the circular index for the particular realization
        self.env_dr.ring_idx = i
        # Do the rollout and collect the return
        ro_ref = rollout(self.env_dr, self._subrtn_refs.policy, eval=True, seed=self.base_seed + i * self.nJ + r)
        refs_ret_avg += ro_ref.undiscounted_return()

    return cand_ret_avg / self.nJ, refs_ret_avg / self.nJ  # average over nJ seeds
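# Hedged note (an assumption drawn from the seed arithmetic above, not an official pyrado API): the
# candidate and the reference are evaluated on identical random realizations, since both rollouts
# for domain i and repetition r receive the same seed.
def _paired_seed(base_seed: int, i: int, r: int, nJ: int) -> int:
    """Hypothetical helper mirroring the seed layout base_seed + i * nJ + r used above."""
    return base_seed + i * nJ + r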
def create_qbb_setup(factor, dt, max_steps):
    # Set up environment
    init_state = np.array([0, 0, 0.1, 0.1, 0, 0, 0, 0])
    env = QBallBalancerSim(dt=dt, max_steps=max_steps)
    env = ActNormWrapper(env)

    # Set up policy
    policy = QBallBalancerPDCtrl(env.spec)

    # Simulate
    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt), init_state=init_state),
        render_mode=RenderMode(video=True),
        max_steps=max_steps,
    )
    act_500Hz = ro.actions

    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt * factor), init_state=init_state),
        render_mode=RenderMode(video=True),
        max_steps=int(max_steps / factor),
    )
    act_100Hz = ro.actions

    env = DownsamplingWrapper(env, factor)

    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt), init_state=init_state),
        render_mode=RenderMode(video=True),
        max_steps=max_steps,
    )
    act_500Hz_w = ro.actions

    # Time in seconds
    time_500Hz = np.linspace(0, int(len(act_500Hz) * dt), int(len(act_500Hz)))
    time_100Hz = np.linspace(0, int(len(act_100Hz) * dt), int(len(act_100Hz)))
    time_500Hz_w = np.linspace(0, int(len(act_500Hz_w) * dt), int(len(act_500Hz_w)))

    # Plot
    _, axs = plt.subplots(nrows=2)
    for i in range(2):
        axs[i].plot(time_500Hz, act_500Hz[:, i], label="500 Hz (original)")
        axs[i].plot(time_100Hz, act_100Hz[:, i], label="100 Hz", ls="--")
        axs[i].plot(time_500Hz_w, act_500Hz_w[:, i], label="500 Hz (wrapped)", ls="--")
        axs[i].legend()
        axs[i].set_ylabel(env.act_space.labels[i])
    axs[1].set_xlabel("time [s]")
def create_qq_setup(factor, dt, max_steps, render_mode):
    # Set up environment
    init_state = np.array([0.1, 0.0, 0.0, 0.0])
    env = QQubeSwingUpSim(dt=dt, max_steps=max_steps)
    env = ActNormWrapper(env)

    # Set up policy
    policy = QQubeSwingUpAndBalanceCtrl(env.spec)

    # Simulate
    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt), init_state=init_state),
        render_mode=render_mode,
        max_steps=max_steps,
    )
    act_500Hz = ro.actions

    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt * factor), init_state=init_state),
        render_mode=render_mode,
        max_steps=int(max_steps / factor),
    )
    act_100Hz = ro.actions

    env = DownsamplingWrapper(env, factor)

    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt), init_state=init_state),
        render_mode=render_mode,
        max_steps=max_steps,
    )
    act_500Hz_w = ro.actions

    # Time in seconds
    time_500Hz = np.linspace(0, int(len(act_500Hz) * dt), int(len(act_500Hz)))
    time_100Hz = np.linspace(0, int(len(act_100Hz) * dt), int(len(act_100Hz)))
    time_500Hz_w = np.linspace(0, int(len(act_500Hz_w) * dt), int(len(act_500Hz_w)))

    # Plot
    _, ax = plt.subplots(nrows=1)
    ax.plot(time_500Hz, act_500Hz, label="500 Hz (original)")
    ax.plot(time_100Hz, act_100Hz, label="100 Hz", ls="--")
    ax.plot(time_500Hz_w, act_500Hz_w, label="500 Hz (wrapped)", ls="--")
    ax.legend()
    ax.set_ylabel(env.act_space.labels)
    ax.set_xlabel("time [s]")
def create_qq_setup(factor, dt, max_steps):
    # Set up environment
    init_state = np.array([0.1, 0.0, 0.0, 0.0])
    env = QQubeSwingUpSim(dt=dt, max_steps=max_steps)
    env = ActNormWrapper(env)

    # Set up policy
    policy = QQubeSwingUpAndBalanceCtrl(env.spec)

    # Simulate
    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt), init_state=init_state),
        render_mode=RenderMode(video=True),
        max_steps=max_steps,
    )
    act_500Hz = ro.actions

    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt * factor), init_state=init_state),
        render_mode=RenderMode(video=True),
        max_steps=int(max_steps / factor),
    )
    act_100Hz = ro.actions
    act_100Hz_zoh = np.repeat(act_100Hz, factor, axis=0)  # zero-order hold up-sampling back to the original rate

    env = DownsamplingWrapper(env, factor)

    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt), init_state=init_state),
        render_mode=RenderMode(video=True),
        max_steps=max_steps,
    )
    act_500Hz_wrapped = ro.actions

    # Plot
    _, ax = plt.subplots(nrows=1)
    ax.plot(act_500Hz, label="500 Hz (original)")
    ax.plot(act_100Hz_zoh, label="100 Hz (zoh)")
    ax.plot(act_500Hz_wrapped, label="500 Hz (wrapped)")
    ax.legend()
    ax.set_ylabel(env.act_space.labels)
    ax.set_xlabel("time steps")
    plt.show()
def test_velocity_filter(plot: bool):
    # Set up environment
    env_gt = QQubeSwingUpSim(dt=1 / 500.0, max_steps=350)
    env_gt.init_space = SingularStateSpace(np.array([0.1, np.pi / 2, 3.0, 0]))
    env_filt = ObsVelFiltWrapper(env_gt, idcs_pos=["theta", "alpha"], idcs_vel=["theta_dot", "alpha_dot"])

    # Set up policy
    policy = IdlePolicy(env_gt.spec)

    # Simulate
    ro_gt = rollout(env_gt, policy)
    ro_filt = rollout(env_filt, policy)

    # Extract the velocity observations from both rollouts
    theta_dot_gt = ro_gt.observations[:, 4]
    alpha_dot_gt = ro_gt.observations[:, 5]
    theta_dot_filt = ro_filt.observations[:, 4]
    alpha_dot_filt = ro_filt.observations[:, 5]

    assert theta_dot_filt[0] != pytest.approx(theta_dot_gt[0])  # can't be equal since we set an init vel of 3 rad/s
    assert alpha_dot_filt[0] == pytest.approx(alpha_dot_gt[0], abs=1e-4)

    # Compute the error
    rmse_theta = rmse(theta_dot_gt, theta_dot_filt)
    rmse_alpha = rmse(alpha_dot_gt, alpha_dot_filt)

    if plot:
        from matplotlib import pyplot as plt

        # Plot the filtered signals versus the original observations
        plt.rc("text", usetex=True)
        fig, axs = plt.subplots(2, figsize=(16, 9))
        axs[0].plot(theta_dot_gt, label=r"$\dot{\theta}_{true}$")
        axs[0].plot(theta_dot_filt, label=r"$\dot{\theta}_{filt}$")
        axs[1].plot(alpha_dot_gt, label=r"$\dot{\alpha}_{true}$")
        axs[1].plot(alpha_dot_filt, label=r"$\dot{\alpha}_{filt}$")
        axs[0].set_title(rf"RMSE($\theta$): {rmse_theta}")
        axs[0].set_ylabel(r"$\dot{\theta}$ [rad/s]")
        axs[0].legend()
        axs[1].set_title(rf"RMSE($\alpha$): {rmse_alpha}")
        axs[1].set_xlabel("time steps")
        axs[1].set_ylabel(r"$\dot{\alpha}$ [rad/s]")
        axs[1].legend()
        plt.show()
def create_manual_activation_setup(dt, max_steps, max_dist_force, physics_engine):
    # Set up environment
    env = Planar3LinkTASim(
        physicsEngine=physics_engine,
        dt=dt,
        max_steps=max_steps,
        max_dist_force=max_dist_force,
        positionTasks=True,
        observeTaskSpaceDiscrepancy=True,
    )
    print_domain_params(env.domain_param)

    # Set up policy
    def policy_fcn(t: float):
        pot = np.fromstring(input("Enter potentials for next step: "), dtype=np.double, count=3, sep=' ')
        return 1 / (1 + np.exp(-pot))

    policy = TimePolicy(env.spec, policy_fcn, dt)

    # Simulate
    return rollout(env, policy, render_mode=RenderMode(video=True), stop_on_done=True)
def rollout_dummy_rbf_policy_4dof():
    # Environment
    env = WAMBallInCupSim(
        num_dof=4,
        max_steps=3000,
        # Note, when tuning the task args: the `R` matrices are now 4x4 for the 4-dof WAM
        task_args=dict(R=np.zeros((4, 4)), R_dev=np.diag([0.2, 0.2, 1e-2, 1e-2])),
    )

    # Stabilize the ball and print out the stable state
    env.reset()
    act = np.zeros(env.spec.act_space.flat_dim)
    for i in range(1500):
        env.step(act)
        env.render(mode=RenderMode(video=True))

    # Print the actual positions for the 4-dof setup (just needed to set up the hard-coded values in the class)
    print('Ball pos:', env.sim.data.get_body_xpos('ball'))
    print('Cup goal:', env.sim.data.get_site_xpos('cup_goal'))
    print('Joint pos (incl. first rope angle):', env.sim.data.qpos[:5])

    # Apply DualRBFLinearPolicy and plot the joint states over the desired ones
    rbf_hparam = dict(num_feat_per_dim=7, bounds=(np.array([0.]), np.array([1.])))
    policy = DualRBFLinearPolicy(env.spec, rbf_hparam, dim_mask=2)
    done, param = False, None
    while not done:
        ro = rollout(env, policy, render_mode=RenderMode(video=True), eval=True, reset_kwargs=dict(domain_param=param))
        print_cbt(f'Return: {ro.undiscounted_return()}', 'g', bright=True)
        done, _, param = after_rollout_query(env, policy, ro)
def eval_damping():
    """ Plot joint trajectories for different joint damping parameters """
    # Load the experiment and remove possible randomization wrappers
    ex_dir = ask_for_experiment()
    env, policy, _ = load_experiment(ex_dir)
    env = inner_env(env)
    env.domain_param = WAMBallInCupSim.get_nominal_domain_param()

    data = []
    t = []
    dampings = [0., 1e-2, 1e-1, 1e0]
    print_cbt(f'Run policy for damping coefficients: {dampings}')
    for d in dampings:
        env.reset(domain_param=dict(joint_damping=d))
        ro = rollout(env, policy, render_mode=RenderMode(video=False), eval=True)
        t.append(ro.env_infos['t'])
        data.append(ro.env_infos['qpos'])

    fig, ax = plt.subplots(3, sharex='all')
    ls = ['k-', 'b--', 'g-.', 'r:']  # line style setting for better visibility
    for i, idx in enumerate([1, 3, 5]):
        for j in range(len(dampings)):
            ax[i].plot(t[j], data[j][:, idx], ls[j], label=f'damping: {dampings[j]}')
            if i == 0:
                ax[i].legend()
        ax[i].set_ylabel(f'joint {idx} pos [rad]')
    ax[2].set_xlabel('time [s]')
    plt.suptitle('Evaluation of joint damping coefficient')
    plt.show()
def create_joint_control_setup(dt, max_steps, max_dist_force, physics_engine):
    # Set up environment
    env = Planar3LinkJointCtrlSim(
        physicsEngine=physics_engine,
        dt=dt,
        max_steps=max_steps,
        max_dist_force=max_dist_force,
        taskCombinationMethod="sum",
        checkJointLimits=True,
    )
    print_domain_params(env.domain_param)

    # Set up policy
    def policy_fcn(t: float):
        return [
            10 / 180 * math.pi,
            10 / 180 * math.pi,  # same as init config
            10 / 180 * math.pi + 45.0 / 180.0 * math.pi * math.sin(2.0 * math.pi * 0.2 * t),  # oscillation in last link
        ]

    policy = TimePolicy(env.spec, policy_fcn, dt)

    # Simulate
    return rollout(env, policy, render_mode=RenderMode(video=True), stop_on_done=True)
def _ps_sample_one(G):
    """
    Sample one rollout and return it together with its step count.
    This function is used when a minimum number of steps was given.
    """
    ro = rollout(G.env, G.policy, bernoulli_reset=G.bernoulli_reset)
    return ro, len(ro)
def _handle_neg_samples(self, cand_rets: np.ndarray, refs_rets: np.ndarray, k: int, i: int) -> np.ndarray:
    """
    Process negative optimality gap samples by looking at the other reference solutions.

    :param cand_rets: array of the candidate's return values
    :param refs_rets: array of the references' return values
    :param k: index of the reference solution
    :param i: index of the domain
    :return refs_rets: if a better reference has been found, the associated value will be overwritten
    """
    if refs_rets[k, i] < cand_rets[k, i]:
        print_cbt(f'\nReference {k + 1} is worse than the candidate on domain realization {i + 1}.\n'  # 1-based index
                  'Trying to replace this reference at this realization with a different one', 'y')

        for other_k in range(self.nG):
            if other_k == k:
                # Do nothing for the bad solution that brought us here
                continue
            else:
                # Load a reference solution different from the k-th
                other_ref = pyrado.load(self._subrtn_refs._policy, 'policy', 'pt', self.save_dir,
                                        dict(prefix=f'iter_{self._curr_iter}', suffix=f'ref_{other_k}'))
                other_ref_ret = 0
                for r in range(self.nJ):
                    # Set the same random seed
                    pyrado.set_seed(self.base_seed + i * self.nJ + r)
                    # Set the circular index for the particular realization
                    self.env_dr.ring_idx = i
                    # Do the rollout and collect the return
                    ro_other_ref = rollout(self.env_dr, other_ref, eval=True)
                    other_ref_ret += ro_other_ref.undiscounted_return() / self.nJ  # average over nJ seeds

                # Store the value if it is better
                if other_ref_ret > refs_rets[k, i]:
                    refs_rets[k, i] = other_ref_ret
                    # If a better one was found, do not iterate over the remaining reference solutions
                    break

        if refs_rets[k, i] > cand_rets[k, i]:
            # Found a different reference that achieves a higher return than the candidate
            print_cbt('Successfully handled a negative OG sample', 'g')
        else:
            refs_rets[k, i] = cand_rets[k, i]  # forces the optimality gap sample to be 0
            print_cbt('Unsuccessfully handled a negative OG sample: Set the value to 0', 'r')
    else:
        # Everything is as it should be
        pass

    return refs_rets
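# Hedged illustration (an assumption derived from the handling above, not part of the original class):
# forcing refs_rets[k, i] = cand_rets[k, i] effectively clips the optimality gap sample at zero, i.e.
def _og_sample(cand_ret: float, ref_ret: float) -> float:
    """Hypothetical helper showing the non-negative optimality gap sample implied by the handling above."""
    return max(ref_ret - cand_ret, 0.0)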
def create_ik_activation_setup(dt, max_steps, max_dist_force, physics_engine):
    # Set up environment
    env = Planar3LinkIKActivationSim(
        physicsEngine=physics_engine,
        dt=dt,
        max_steps=max_steps,
        max_dist_force=max_dist_force,
        taskCombinationMethod='product',
        positionTasks=True,
        checkJointLimits=False,
        collisionAvoidanceIK=True,
        observeTaskSpaceDiscrepancy=True,
    )
    print_domain_params(env.domain_param)

    # Set up policy
    def policy_fcn(t: float):
        return [0.3 + 0.2 * math.sin(2. * math.pi * 0.2 * t), 0, 1]

    policy = TimePolicy(env.spec, policy_fcn, dt)

    # Simulate
    return rollout(env, policy, render_mode=RenderMode(video=True), stop_on_done=True)
def joint_control_variant(dt, max_steps, max_dist_force, physics_engine):
    # Set up environment
    env = Planar3LinkJointCtrlSim(
        physicsEngine=physics_engine,
        dt=dt,
        max_steps=max_steps,
        max_dist_force=max_dist_force,
        checkJointLimits=True,
    )
    print_domain_params(env.domain_param)

    # Set up policy
    def policy_fcn(t: float):
        return [
            0.1,
            0.1,  # same as init config
            0.1 + 45. / 180. * math.pi * math.sin(2. * math.pi * 0.2 * t),  # oscillation in last link
        ]

    policy = TimePolicy(env.spec, policy_fcn, dt)

    # Simulate
    return rollout(env, policy, render_mode=RenderMode(video=True), stop_on_done=True)
def generate_oscillation_data(dt, t_end, excitation):
    """
    Use the OneMassOscillatorSim environment to generate a 1-dim damped oscillation signal.

    :param dt: time step size [s]
    :param t_end: time duration [s]
    :param excitation: type of excitation, either (initial) 'position' or 'force' (function of time)
    :return: 1-dim oscillation trajectory
    """
    env = OneMassOscillatorSim(dt, int(np.ceil(t_end / dt)))
    env.domain_param = dict(m=1., k=10., d=2.0)

    if excitation == 'force':
        policy = TimePolicy(env.spec, functools.partial(_dirac_impulse, env_spec=env.spec, amp=0.5), dt)
        reset_kwargs = dict(init_state=np.array([0, 0]))
    elif excitation == 'position':
        policy = IdlePolicy(env.spec)
        reset_kwargs = dict(init_state=np.array([0.5, 0]))
    else:
        raise pyrado.ValueErr(given=excitation, eq_constraint="'force' or 'position'")

    # Generate the data
    ro = rollout(env, policy, reset_kwargs=reset_kwargs, record_dts=False)
    return ro.observations[:, 0]
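# Hedged usage sketch (not part of the original script): how generate_oscillation_data could be
# exercised for both excitation modes; the dt and t_end values below are arbitrary illustration values.
def _compare_excitations():
    traj_pos = generate_oscillation_data(dt=0.01, t_end=5.0, excitation='position')
    traj_frc = generate_oscillation_data(dt=0.01, t_end=5.0, excitation='force')
    return traj_pos, traj_frc  # 1-dim position trajectories, one per excitation type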
def task_activation_variant(dt, max_steps, max_dist_force, physics_engine, graph_file_name):
    # Set up environment
    env = PlanarInsertTASim(
        physicsEngine=physics_engine,
        graphFileName=graph_file_name,
        dt=dt,
        max_steps=max_steps,
        max_dist_force=max_dist_force,
        taskCombinationMethod='sum',  # 'sum', 'mean', 'product', or 'softmax'
        checkJointLimits=False,
        collisionAvoidanceIK=True,
        observeForceTorque=True,
        observePredictedCollisionCost=False,
        observeManipulabilityIndex=False,
        observeCurrentManipulability=True,
        observeGoalDistance=False,
        observeDynamicalSystemDiscrepancy=False,
        observeTaskSpaceDiscrepancy=True,
    )
    env.reset(domain_param=dict(effector_friction=1.))

    # Set up policy
    def policy_fcn(t: float):
        return [0.7, 1, 0, 0.1, 0.5, 0.5]

    policy = TimePolicy(env.spec, policy_fcn, dt)

    # Simulate
    print(env.obs_space.labels)
    return rollout(env, policy, render_mode=RenderMode(video=True), stop_on_done=False)
def _ps_sample_one(G, eval: bool):
    """
    Sample one rollout and return it together with its step count.
    This function is used when a minimum number of steps was given.
    """
    ro = rollout(G.env, G.policy, eval=eval)
    return ro, len(ro)
def task_activation_variant(dt, max_steps, max_dist_force, physics_engine):
    # Set up environment
    env = Planar3LinkTASim(
        physicsEngine=physics_engine,
        dt=dt,
        position_mps=True,
        max_steps=max_steps,
        max_dist_force=max_dist_force,
        checkJointLimits=False,
        collisionAvoidanceIK=True,
        observeCollisionCost=True,
        observeDynamicalSystemDiscrepancy=False,
    )
    print(env.obs_space.labels)

    # Set up policy
    def policy_fcn(t: float):
        if t < 3:
            return [0, 1, 0]
        elif t < 7:
            return [1, 0, 0]
        elif t < 10:
            return [0.5, 0.5, 0]
        else:
            return [0, 0, 1]

    policy = TimePolicy(env.spec, policy_fcn, dt)

    # Simulate
    return rollout(env, policy, render_mode=RenderMode(video=True), stop_on_done=False)
def eval_ddp_policy(rollouts_real):
    init_states_real = np.array([ro.states[0, :] for ro in rollouts_real])

    rollouts_sim = []
    for i, _ in enumerate(range(num_eval_rollouts)):
        rollouts_sim.append(
            rollout(env_sim, behavior_policy, eval=True, reset_kwargs=dict(init_state=init_states_real[i, :]))
        )

    # Truncate the rollouts, yielding two lists of pairwise equally long rollouts
    ros_real_tr, ros_sim_tr = algo.truncate_rollouts(rollouts_real, rollouts_sim, replicate=False)
    assert len(ros_real_tr) == len(ros_sim_tr)
    assert all([np.allclose(r.states[0, :], s.states[0, :]) for r, s in zip(ros_real_tr, ros_sim_tr)])

    # Return the average loss
    losses = [algo.loss_fcn(ro_r, ro_s) for ro_r, ro_s in zip(ros_real_tr, ros_sim_tr)]
    return float(np.mean(np.asarray(losses)))
def eval_policy(save_dir: [str, None],
                env: [RealEnv, SimEnv, MetaDomainRandWrapper],
                policy: Policy,
                mc_estimator: bool,
                prefix: str,
                num_rollouts: int,
                num_parallel_envs: int = 1) -> to.Tensor:
    """
    Evaluate a policy on the target system (real-world platform).
    This method is static to facilitate evaluation of specific policies in hindsight.

    :param save_dir: directory to save the snapshots i.e. the results in, if `None` nothing is saved
    :param env: target environment for evaluation, in the sim-2-sim case this is another simulation instance
    :param policy: policy to evaluate
    :param mc_estimator: estimate the return with a sample average (`True`) or a lower confidence bound (`False`)
                         obtained from bootstrapping
    :param prefix: to control the saving for the evaluation of an initial policy, `None` to deactivate
    :param num_rollouts: number of rollouts to collect on the target system
    :param num_parallel_envs: number of environments for the parallel sampler (only used for SimEnv)
    :return: estimated return in the target domain
    """
    if save_dir is not None:
        print_cbt(f'Executing {prefix}_policy ...', 'c', bright=True)

    rets_real = to.zeros(num_rollouts)
    if isinstance(inner_env(env), RealEnv):
        # Evaluate sequentially when conducting a sim-to-real experiment
        for i in range(num_rollouts):
            rets_real[i] = rollout(env, policy, eval=True).undiscounted_return()
            # If a reward of -1 is given, skip evaluation ahead and set all returns to zero
            if rets_real[i] == -1:
                print_cbt('Set all returns for this policy to zero.', color='c')
                rets_real = to.zeros(num_rollouts)
                break
    elif isinstance(inner_env(env), SimEnv):
        # Create a parallel sampler when conducting a sim-to-sim experiment
        sampler = ParallelRolloutSampler(env, policy, num_workers=num_parallel_envs, min_rollouts=num_rollouts)
        ros = sampler.sample()
        for i in range(num_rollouts):
            rets_real[i] = ros[i].undiscounted_return()
    else:
        raise pyrado.TypeErr(given=inner_env(env), expected_type=[RealEnv, SimEnv])

    if save_dir is not None:
        # Save the evaluation results
        to.save(rets_real, osp.join(save_dir, f'{prefix}_returns_real.pt'))

        print_cbt('Target domain performance', bright=True)
        print(tabulate([['mean return', to.mean(rets_real).item()],
                        ['std return', to.std(rets_real)],
                        ['min return', to.min(rets_real)],
                        ['max return', to.max(rets_real)]]))

    if mc_estimator:
        return to.mean(rets_real)
    else:
        return to.from_numpy(bootstrap_ci(rets_real.numpy(), np.mean,
                                          num_reps=1000, alpha=0.05, ci_sides=1, studentized=False)[1])
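# Hedged illustration (an assumption for intuition, not pyrado's bootstrap_ci implementation): the
# lower-confidence-bound branch above can be pictured as resampling the collected returns with
# replacement and taking a one-sided percentile of the resampled means.
def _bootstrap_lower_bound(returns: np.ndarray, num_reps: int = 1000, alpha: float = 0.05) -> float:
    """Hypothetical sketch of a one-sided percentile-bootstrap lower bound on the mean return."""
    rng = np.random.default_rng()
    means = np.array([np.mean(rng.choice(returns, size=returns.size, replace=True)) for _ in range(num_reps)])
    return float(np.percentile(means, 100 * alpha))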
def test_rollout_wo_policy(env: SimEnv):
    def policy(obs):
        # Callable must receive and return tensors
        return to.from_numpy(env.spec.act_space.sample_uniform())

    ro = rollout(env, policy, render_mode=RenderMode())
    assert isinstance(ro, StepSequence)
    assert len(ro) <= env.max_steps
def test_dualrbf_policy(env, dim_mask):
    # Hyper-parameters for the RBF features are not important here
    rbf_hparam = dict(num_feat_per_dim=7, bounds=(np.array([0.]), np.array([1.])), scale=None)
    policy = DualRBFLinearPolicy(env.spec, rbf_hparam, dim_mask)
    assert policy.num_param == policy.num_active_feat * env.act_space.flat_dim // 2

    ro = rollout(env, policy, eval=True)
    assert ro is not None
def eval_policy(
    save_dir: Optional[pyrado.PathLike],
    env: Env,
    policy: Policy,
    prefix: str,
    num_rollouts: int,
    num_workers: int = 1,
) -> to.Tensor:
    """
    Evaluate a policy either in the source or in the target domain.
    This method is static to facilitate evaluation of specific policies in hindsight.

    :param save_dir: directory to save the snapshots i.e. the results in, if `None` nothing is saved
    :param env: target environment for evaluation, in the sim-2-sim case this is another simulation instance
    :param policy: policy to evaluate
    :param prefix: to control the saving for the evaluation of an initial policy, `None` to deactivate
    :param num_rollouts: number of rollouts to collect on the target system
    :param num_workers: number of environments for the parallel sampler (only used for SimEnv)
    :return: estimated return in the target domain
    """
    if save_dir is not None:
        print_cbt(f"Executing {prefix}_policy ...", "c", bright=True)

    if isinstance(inner_env(env), RealEnv):
        # Evaluate sequentially when evaluating on a real-world device
        rets_real = []
        for i in range(num_rollouts):
            rets_real.append(rollout(env, policy, eval=True).undiscounted_return())
    elif isinstance(inner_env(env), SimEnv):
        # Create a parallel sampler when evaluating in a simulation
        sampler = ParallelRolloutSampler(env, policy, num_workers=num_workers, min_rollouts=num_rollouts)
        ros = sampler.sample(eval=True)
        rets_real = [ro.undiscounted_return() for ro in ros]
    else:
        raise pyrado.TypeErr(given=inner_env(env), expected_type=[RealEnv, SimEnv])
    rets_real = to.as_tensor(rets_real, dtype=to.get_default_dtype())

    if save_dir is not None:
        # Save and print the evaluation results
        pyrado.save(rets_real, "returns_real.pt", save_dir, prefix=prefix)
        print_cbt("Target domain performance", bright=True)
        print(
            tabulate(
                [
                    ["mean return", to.mean(rets_real).item()],
                    ["std return", to.std(rets_real)],
                    ["min return", to.min(rets_real)],
                    ["max return", to.max(rets_real)],
                ]
            )
        )

    return to.mean(rets_real)
def _run_rollout_dp(G, domain_param, init_state=None):
    return rollout(
        G.env,
        G.policy,
        eval=True,
        # render_mode=RenderMode(video=True),
        reset_kwargs={'domain_param': domain_param, 'init_state': init_state},
    )
def _simulator(dp: to.Tensor) -> to.Tensor:
    """The simplest interface of a simulation to sbi, using `env` and `policy` from the outer scope"""
    ro = rollout(env, policy, eval=True, reset_kwargs=dict(domain_param=dict(m=dp[0], k=dp[1], d=dp[2])))
    observation_sim = to.from_numpy(ro.observations[-1]).to(dtype=to.float32)
    return to.atleast_2d(observation_sim)
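# Hedged usage sketch (an assumption, not part of the original script): how such a simulator could be
# exercised by drawing mass, stiffness, and damping samples from a simple box prior. The bounds and the
# number of samples below are made-up illustration values; sbi's own sampling utilities would normally
# take over this role.
def _simulate_prior_samples(num_samples: int = 10) -> to.Tensor:
    low = to.tensor([0.5, 5.0, 0.5])    # hypothetical lower bounds for (m, k, d)
    high = to.tensor([2.0, 20.0, 5.0])  # hypothetical upper bounds for (m, k, d)
    prior = to.distributions.Uniform(low, high)
    dps = prior.sample((num_samples,))  # one (m, k, d) triple per row
    return to.cat([_simulator(dp) for dp in dps], dim=0)  # stack the simulated observations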
def _ps_run_one(G, num: int, eval: bool, seed: int, sub_seed: int):
    """
    Sample one rollout without specifying the initial state or the domain parameters.
    This function is used when a minimum number of rollouts was given.
    """
    return rollout(G.env, G.policy, eval=eval, seed=seed, sub_seed=sub_seed, sub_sub_seed=num)
def _pes_sample_one(G, param):
    """ Sample one rollout with the current setting. """
    pol_param, dom_param, init_state = param
    vector_to_parameters(pol_param, G.policy.parameters())
    return rollout(G.env, G.policy, reset_kwargs={'init_state': init_state, 'domain_param': dom_param})
def test_action_statistics(env: SimEnv, policy: Policy):
    sigma = 1.0  # with lower values like 0.1 we can observe violations of the tolerances

    # Create an action-based exploration strategy
    explstrat = NormalActNoiseExplStrat(policy, std_init=sigma)

    # Sample a deterministic rollout
    ro_policy = rollout(env, policy, eval=True, max_steps=1000, stop_on_done=False, seed=0)
    ro_policy.torch(to.get_default_dtype())

    # Run the exploration strategy on the previously sampled rollout
    if policy.is_recurrent:
        if isinstance(policy, TwoHeadedPolicy):
            act_expl, _, _ = explstrat(ro_policy.observations)
        else:
            act_expl, _ = explstrat(ro_policy.observations)
        # Get the hidden states from the deterministic rollout
        hidden_states = ro_policy.hidden_states
    else:
        if isinstance(policy, TwoHeadedPolicy):
            act_expl, _ = explstrat(ro_policy.observations)
        else:
            act_expl = explstrat(ro_policy.observations)
        hidden_states = [0.0] * ro_policy.length  # just something that does not violate the format

    ro_expl = StepSequence(
        actions=act_expl[:-1],  # truncate act due to last obs
        observations=ro_policy.observations,
        rewards=ro_policy.rewards,  # don't care but necessary
        hidden_states=hidden_states,
    )
    ro_expl.torch()

    # Compute action statistics and the ground truth
    actstats = compute_action_statistics(ro_expl, explstrat)
    gt_logprobs = Normal(loc=ro_policy.actions, scale=sigma).log_prob(ro_expl.actions)
    gt_entropy = Normal(loc=ro_policy.actions, scale=sigma).entropy()

    to.testing.assert_allclose(actstats.log_probs, gt_logprobs, rtol=1e-4, atol=1e-5)
    to.testing.assert_allclose(actstats.entropy, gt_entropy, rtol=1e-4, atol=1e-5)
def simulator(mu):
    # In the end, the output of this could be a distance measure over trajectories instead of just the final state
    ro = rollout(
        env,
        policy,
        eval=True,
        reset_kwargs=dict(
            # domain_param=dict(k=mu[0], d=mu[1]), init_state=np.array([-0.7, 0.])  # no variance over the init state
            domain_param=dict(k=mu[0], d=mu[1])  # no variance over the parameters
        ))
    return to.from_numpy(ro.observations[-1]).to(dtype=to.float32)
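# Hedged sketch (an assumption following the comment inside `simulator` above, not part of the original
# script): a distance measure over whole trajectories could serve as the summary statistic instead of the
# final observation. The reference trajectory `observations_ref` is a hypothetical name for pre-recorded data.
def trajectory_distance(mu, observations_ref: np.ndarray) -> to.Tensor:
    ro = rollout(env, policy, eval=True, reset_kwargs=dict(domain_param=dict(k=mu[0], d=mu[1])))
    n = min(len(ro.observations), len(observations_ref))  # compare only the overlapping part
    diff = ro.observations[:n] - observations_ref[:n]
    return to.tensor(np.sqrt(np.mean(diff ** 2)), dtype=to.float32)  # RMSE over the trajectories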