def test_param_expl_sampler(
    env: SimEnv,
    policy: Policy,
    num_init_states_per_domain: int,
    fixed_init_state: bool,
    num_domains: int,
    num_workers: int,
):
    pyrado.set_seed(0)

    # Add randomizer
    pert = create_default_randomizer(env)
    env = DomainRandWrapperLive(env, pert)

    # Create the sampler
    sampler = ParameterExplorationSampler(
        env, policy, num_init_states_per_domain, num_domains, num_workers=num_workers
    )

    # Use some random parameters
    num_ps = 7
    params = to.rand(num_ps, policy.num_param)

    if fixed_init_state:
        # Sample a custom init state
        init_states = [env.init_space.sample_uniform()] * num_init_states_per_domain
    else:
        # Let the sampler forward to the env to randomly sample an init state
        init_states = None

    # Do the sampling
    samples = sampler.sample(param_sets=params, init_states=init_states)

    # Check if the correct number of rollouts has been sampled
    assert num_ps == len(samples)
    num_rollouts_per_param = num_init_states_per_domain * num_domains
    assert num_ps * num_rollouts_per_param == samples.num_rollouts
    for ps in samples:
        assert len(ps.rollouts) == num_rollouts_per_param

    # Compare rollouts that should be matching
    for idx in range(num_rollouts_per_param):
        # Use the first parameter set as pivot
        piter = iter(samples)
        pivot = next(piter).rollouts[idx]
        # Iterate through the others
        for ops in piter:
            other_ro = ops.rollouts[idx]
            # Compare domain params
            assert pivot.rollout_info["domain_param"] == other_ro.rollout_info["domain_param"]
            # Compare first observation a.k.a. init state
            assert pivot[0].observation == pytest.approx(other_ro[0].observation)
def test_parameter_exploration_sampler(env: SimEnv, policy: Policy, num_workers: int):
    # Use some random parameters
    num_ps = 7
    params = to.rand(num_ps, policy.num_param)

    sampler = ParameterExplorationSampler(
        env, policy, num_init_states_per_domain=1, num_domains=1, num_workers=num_workers
    )

    psr = sampler.sample(param_sets=params)
    assert isinstance(psr, ParameterSamplingResult)
    assert len(psr.rollouts) >= 1 * 1 * num_ps
def test_param_expl_sampler(default_bob, bob_pert):
    # Add randomizer
    env = DomainRandWrapperLive(default_bob, bob_pert)

    # Use a simple policy
    policy = FNNPolicy(env.spec, hidden_sizes=[8], hidden_nonlin=to.tanh)

    # Create the sampler
    num_rollouts_per_param = 12
    sampler = ParameterExplorationSampler(
        env,
        policy,
        num_envs=1,
        num_rollouts_per_param=num_rollouts_per_param,
    )

    # Use some random parameters
    num_ps = 12
    params = to.rand(num_ps, policy.num_param)

    # Do the sampling
    samples = sampler.sample(params)
    assert num_ps == len(samples)
    for ps in samples:
        assert len(ps.rollouts) == num_rollouts_per_param

    # Compare rollouts that should be matching
    for ri in range(num_rollouts_per_param):
        # Use the first parameter set as pivot
        piter = iter(samples)
        pivot = next(piter).rollouts[ri]
        # Iterate through the others
        for ops in piter:
            ro = ops.rollouts[ri]
            # Compare domain params
            assert pivot.rollout_info['domain_param'] == ro.rollout_info['domain_param']
            # Compare first observation a.k.a. init state
            assert pivot[0].observation == pytest.approx(ro[0].observation)
def test_param_expl_sampler(env: SimEnv, policy: Policy):
    # Add randomizer
    pert = create_default_randomizer(env)
    env = DomainRandWrapperLive(env, pert)

    # Create the sampler
    num_rollouts_per_param = 12
    sampler = ParameterExplorationSampler(
        env, policy, num_workers=1, num_rollouts_per_param=num_rollouts_per_param
    )

    # Use some random parameters
    num_ps = 12
    params = to.rand(num_ps, policy.num_param)

    # Do the sampling
    samples = sampler.sample(params)
    assert num_ps == len(samples)
    for ps in samples:
        assert len(ps.rollouts) == num_rollouts_per_param

    # Compare rollouts that should be matching
    for ri in range(num_rollouts_per_param):
        # Use the first parameter set as pivot
        piter = iter(samples)
        pivot = next(piter).rollouts[ri]
        # Iterate through the others
        for ops in piter:
            ro = ops.rollouts[ri]
            # Compare domain params
            assert pivot.rollout_info['domain_param'] == ro.rollout_info['domain_param']
            # Compare first observation a.k.a. init state
            assert pivot[0].observation == pytest.approx(ro[0].observation)
def test_parameter_exploration_sampler_deterministic(
    env: SimEnv,
    policy: Policy,
    num_params: int,
    num_init_states_per_domain: int,
    num_domains: int,
    set_init_states: bool,
):
    param_sets = to.rand(num_params, policy.num_param)

    if set_init_states:
        init_states = [
            env.spec.state_space.sample_uniform() for _ in range(num_init_states_per_domain * num_domains)
        ]
    else:
        init_states = None

    nums_workers = (1, 2, 4)

    all_results = []
    for num_workers in nums_workers:
        # Reset the seed every time because sample() uses the root sampler. This does not matter for regular runs,
        # but for this test it is very relevant.
        pyrado.set_seed(0)
        all_results.append(
            ParameterExplorationSampler(
                env,
                policy,
                num_init_states_per_domain=num_init_states_per_domain,
                num_domains=num_domains,
                num_workers=num_workers,
                seed=0,
            ).sample(param_sets=param_sets, init_states=init_states)
        )

    # Test that the rollouts are equal for all numbers of workers
    for psr_a, psr_b in [(a, b) for a in all_results for b in all_results]:
        assert psr_a.parameters == pytest.approx(psr_b.parameters)
        assert psr_a.mean_returns == pytest.approx(psr_b.mean_returns)
        assert psr_a.num_rollouts == psr_b.num_rollouts
        assert len(psr_a.rollouts) == len(psr_b.rollouts)
        for ros_a, ros_b in zip(psr_a.rollouts, psr_b.rollouts):
            for ro_a, ro_b in zip(ros_a, ros_b):
                assert ro_a.rewards == pytest.approx(ro_b.rewards)
                assert ro_a.observations == pytest.approx(ro_b.observations)
                assert ro_a.actions == pytest.approx(ro_b.actions)
class ParameterExploring(Algorithm):
    """ Base class for all algorithms that explore directly in the policy parameter space """

    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 max_iter: int,
                 num_rollouts: int,
                 pop_size: Optional[int] = None,
                 num_workers: int = 4,
                 logger: StepLogger = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots, i.e. the results, in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param num_rollouts: number of rollouts per policy parameter set
        :param pop_size: number of solutions in the population, pass `None` to use a default that scales
                         logarithmically with the number of policy parameters
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        if not (isinstance(pop_size, int) or pop_size is None):
            raise pyrado.TypeErr(given=pop_size, expected_type=int)
        if isinstance(pop_size, int) and pop_size <= 0:
            raise pyrado.ValueErr(given=pop_size, g_constraint='0')

        # Call Algorithm's constructor
        super().__init__(save_dir, max_iter, policy, logger)

        # Store the inputs
        self._env = env
        self.num_rollouts = num_rollouts

        # Auto-select the population size if needed
        if pop_size is None:
            pop_size = 4 + int(3 * np.log(policy.num_param))
            print_cbt(f'Initialized population size to {pop_size}.', 'y')
        self.pop_size = pop_size

        # Create the sampler
        self.sampler = ParameterExplorationSampler(
            env,
            policy,
            num_workers=num_workers,
            num_rollouts_per_param=num_rollouts,
        )

        # Stopping criterion
        self.ret_avg_stack = 1e3 * np.random.randn(20)  # stack size = 20
        self.thold_ret_std = 1e-1  # the algorithm terminates if the returns' std stays below this for multiple iterations

        # Store the best policy parameters (this is not the mean for policy parameter exploration)
        self.best_policy_param = policy.param_values.clone()

        # Set this in subclasses
        self._expl_strat = None

    @property
    def env(self) -> Env:
        """ Get the environment in which the algorithm trains. """
        return self._env

    @property
    def expl_strat(self) -> StochasticParamExplStrat:
        return self._expl_strat

    def stopping_criterion_met(self) -> bool:
        """
        Check if the average return of the mean policy did not change by more than the specified threshold over the
        last iterations.
""" if np.std(self.ret_avg_stack) < self.thold_ret_std: return True else: return False def reset(self, seed: int = None): # Reset the exploration strategy, internal variables and the random seeds super().reset(seed) def step(self, snapshot_mode: str, meta_info: dict = None): # Sample new policy parameters param_sets = self._expl_strat.sample_param_sets( self._policy.param_values, self.pop_size, # If you do not want to include the current policy parameters, be aware that you also have to do follow-up # changes in the update() functions in all subclasses of ParameterExploring include_nominal_params=True) with to.no_grad(): # Sample rollouts using these parameters param_samp_res = self.sampler.sample(param_sets) # Evaluate the current policy (first one in list if include_nominal_params is True) ret_avg_curr = param_samp_res[0].mean_undiscounted_return # Store the average return for the stopping criterion self.ret_avg_stack = np.delete(self.ret_avg_stack, 0) self.ret_avg_stack = np.append(self.ret_avg_stack, ret_avg_curr) all_rets = param_samp_res.mean_returns all_lengths = np.array( [len(ro) for pss in param_samp_res for ro in pss.rollouts]) # Log metrics computed from the old policy (before the update) self._cnt_samples += int(np.sum(all_lengths)) self.logger.add_value('curr policy return', ret_avg_curr, 4) self.logger.add_value('max return', np.max(all_rets), 4) self.logger.add_value('median return', np.median(all_rets), 4) self.logger.add_value('min return', np.min(all_rets), 4) self.logger.add_value('avg return', np.mean(all_rets), 4) self.logger.add_value('std return', np.std(all_rets), 4) self.logger.add_value('avg rollout len', np.mean(all_lengths), 4) self.logger.add_value('num total samples', self._cnt_samples) self.logger.add_value( 'min mag policy param', self._policy.param_values[to.argmin( abs(self._policy.param_values))]) self.logger.add_value( 'max mag policy param', self._policy.param_values[to.argmax( abs(self._policy.param_values))]) # Extract the best policy parameter sample for saving it later self.best_policy_param = param_samp_res.parameters[np.argmax( param_samp_res.mean_returns)].clone() # Update the policy self.update(param_samp_res, ret_avg_curr) # Save snapshot data self.make_snapshot(snapshot_mode, float(np.max(param_samp_res.mean_returns)), meta_info) @abstractmethod def update(self, param_results: ParameterSamplingResult, ret_avg_curr: float): """ Update the policy from the given samples. :param param_results: Sampled parameters with evaluation :param ret_avg_curr: Average return for the current parameters """ raise NotImplementedError def save_snapshot(self, meta_info: dict = None): super().save_snapshot(meta_info) # Save the best element of the current population best_policy = deepcopy(self._policy) best_policy.param_values = self.best_policy_param pyrado.save(best_policy, 'policy', 'pt', self.save_dir, meta_info) if meta_info is None: # This algorithm instance is not a subroutine of another algorithm pyrado.save(self._env, 'env', 'pkl', self.save_dir, meta_info)