def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 max_iter: int,
                 num_rollouts: int,
                 pop_size: Optional[int] = None,
                 num_workers: int = 4,
                 logger: StepLogger = None):
        """
        Constructor

        :param save_dir: directory in which to save the snapshots, i.e. the results
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param num_rollouts: number of rollouts per policy parameter set
        :param pop_size: number of solutions in the population, pass `None` to use a default that scales logarithmically
                         with the number of policy parameters
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        if not (isinstance(pop_size, int) or pop_size is None):
            raise pyrado.TypeErr(given=pop_size, expected_type=int)
        if isinstance(pop_size, int) and pop_size <= 0:
            raise pyrado.ValueErr(given=pop_size, g_constraint='0')

        # Call Algorithm's constructor
        super().__init__(save_dir, max_iter, policy, logger)

        # Store the inputs
        self._env = env
        self.num_rollouts = num_rollouts

        # Auto-select population size if needed
        if pop_size is None:
            pop_size = 4 + int(3 * np.log(policy.num_param))
            print_cbt(f'Initialized population size to {pop_size}.', 'y')
        self.pop_size = pop_size

        # Create sampler
        self.sampler = ParameterExplorationSampler(
            env,
            policy,
            num_workers=num_workers,
            num_rollouts_per_param=num_rollouts,
        )

        # Stopping criterion
        self.ret_avg_stack = 1e3 * np.random.randn(20)  # stack size = 20
        self.thold_ret_std = 1e-1  # algorithm terminates if below for multiple iterations

        # Store the parameters of the best policy seen so far (not the mean of the exploration distribution)
        self.best_policy_param = policy.param_values.clone()

        # Set this in subclasses
        self._expl_strat = None
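
For illustration only (this snippet is not part of pyrado): the auto-selected population size above follows pop_size = 4 + int(3 * np.log(policy.num_param)), so it grows only logarithmically with the number of policy parameters.

import numpy as np

# Reproduces the default rule from the constructor above for a few policy sizes
for num_param in (10, 100, 1000):
    pop_size = 4 + int(3 * np.log(num_param))
    print(f'{num_param} policy parameters -> default pop_size = {pop_size}')
# 10 -> 10, 100 -> 17, 1000 -> 24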
Example #2
def test_param_expl_sampler(
    env: SimEnv,
    policy: Policy,
    num_init_states_per_domain: int,
    fixed_init_state: bool,
    num_domains: int,
    num_workers: int,
):
    pyrado.set_seed(0)

    # Add randomizer
    pert = create_default_randomizer(env)
    env = DomainRandWrapperLive(env, pert)

    # Create the sampler
    sampler = ParameterExplorationSampler(env,
                                          policy,
                                          num_init_states_per_domain,
                                          num_domains,
                                          num_workers=num_workers)

    # Use some random parameters
    num_ps = 7
    params = to.rand(num_ps, policy.num_param)

    if fixed_init_state:
        # Sample a custom init state
        init_states = [env.init_space.sample_uniform()] * num_init_states_per_domain
    else:
        # Let the sampler forward to the env to randomly sample an init state
        init_states = None

    # Do the sampling
    samples = sampler.sample(param_sets=params, init_states=init_states)

    # Check if the correct number of rollouts has been sampled
    assert num_ps == len(samples)
    num_rollouts_per_param = num_init_states_per_domain * num_domains
    assert num_ps * num_rollouts_per_param == samples.num_rollouts
    for ps in samples:
        assert len(ps.rollouts) == num_rollouts_per_param

    # Compare rollouts that should be matching
    for idx in range(num_rollouts_per_param):
        # Use the first parameter set as pivot
        piter = iter(samples)
        pivot = next(piter).rollouts[idx]

        # Iterate through others
        for ops in piter:
            other_ro = ops.rollouts[idx]
            # Compare domain params
            assert pivot.rollout_info["domain_param"] == other_ro.rollout_info["domain_param"]
            # Compare first observation a.k.a. init state
            assert pivot[0].observation == pytest.approx(other_ro[0].observation)
Example #3
def test_parameter_exploration_sampler(env: SimEnv, policy: Policy,
                                       num_workers: int):
    # Use some random parameters
    num_ps = 7
    params = to.rand(num_ps, policy.num_param)

    sampler = ParameterExplorationSampler(env,
                                          policy,
                                          num_init_states_per_domain=1,
                                          num_domains=1,
                                          num_workers=num_workers)
    psr = sampler.sample(param_sets=params)
    assert isinstance(psr, ParameterSamplingResult)
    assert len(psr.rollouts) >= 1 * 1 * num_ps
Example #4
def test_param_expl_sampler(default_bob, bob_pert):
    # Add randomizer
    env = DomainRandWrapperLive(default_bob, bob_pert)

    # Use a simple policy
    policy = FNNPolicy(env.spec, hidden_sizes=[8], hidden_nonlin=to.tanh)

    # Create the sampler
    num_rollouts_per_param = 12
    sampler = ParameterExplorationSampler(
        env,
        policy,
        num_envs=1,
        num_rollouts_per_param=num_rollouts_per_param,
    )

    # Use some random parameters
    num_ps = 12
    params = to.rand(num_ps, policy.num_param)

    # Do the sampling
    samples = sampler.sample(params)

    assert num_ps == len(samples)
    for ps in samples:
        assert len(ps.rollouts) == num_rollouts_per_param

    # Compare rollouts that should be matching
    for ri in range(num_rollouts_per_param):
        # Use the first paramset as pivot
        piter = iter(samples)
        pivot = next(piter).rollouts[ri]
        # Iterate through others
        for ops in piter:
            ro = ops.rollouts[ri]

            # Compare domain params
            assert pivot.rollout_info['domain_param'] == ro.rollout_info['domain_param']
            # Compare first observation a.k.a. init state
            assert pivot[0].observation == pytest.approx(ro[0].observation)
Example #5
def test_param_expl_sampler(env: SimEnv, policy: Policy):
    # Add randomizer
    pert = create_default_randomizer(env)
    env = DomainRandWrapperLive(env, pert)

    # Create the sampler
    num_rollouts_per_param = 12
    sampler = ParameterExplorationSampler(
        env,
        policy,
        num_workers=1,
        num_rollouts_per_param=num_rollouts_per_param)

    # Use some random parameters
    num_ps = 12
    params = to.rand(num_ps, policy.num_param)

    # Do the sampling
    samples = sampler.sample(params)

    assert num_ps == len(samples)
    for ps in samples:
        assert len(ps.rollouts) == num_rollouts_per_param

    # Compare rollouts that should be matching
    for ri in range(num_rollouts_per_param):
        # Use the first paramset as pivot
        piter = iter(samples)
        pivot = next(piter).rollouts[ri]
        # Iterate through others
        for ops in piter:
            ro = ops.rollouts[ri]

            # Compare domain params
            assert pivot.rollout_info['domain_param'] == ro.rollout_info['domain_param']
            # Compare first observation a.k.a. init state
            assert pivot[0].observation == pytest.approx(ro[0].observation)
Example #6
def test_parameter_exploration_sampler_deterministic(
    env: SimEnv,
    policy: Policy,
    num_params: int,
    num_init_states_per_domain: int,
    num_domains: int,
    set_init_states: bool,
):
    param_sets = to.rand(num_params, policy.num_param)

    if set_init_states:
        init_states = [
            env.spec.state_space.sample_uniform()
            for _ in range(num_init_states_per_domain * num_domains)
        ]
    else:
        init_states = None

    nums_workers = (1, 2, 4)

    all_results = []
    for num_workers in nums_workers:
        # Reset the seed every time because sample() uses the root sampler. This does not matter for regular runs,
        # but for this test it is very relevant.
        pyrado.set_seed(0)
        all_results.append(
            ParameterExplorationSampler(
                env,
                policy,
                num_init_states_per_domain=num_init_states_per_domain,
                num_domains=num_domains,
                num_workers=num_workers,
                seed=0,
            ).sample(param_sets=param_sets, init_states=init_states))

    # Test that the rollouts for all number of workers are equal.
    for psr_a, psr_b in [(a, b) for a in all_results for b in all_results]:
        assert psr_a.parameters == pytest.approx(psr_b.parameters)
        assert psr_a.mean_returns == pytest.approx(psr_b.mean_returns)
        assert psr_a.num_rollouts == psr_b.num_rollouts
        assert len(psr_a.rollouts) == len(psr_b.rollouts)
        for ros_a, ros_b in zip(psr_a.rollouts, psr_b.rollouts):
            for ro_a, ro_b in zip(ros_a, ros_b):
                assert ro_a.rewards == pytest.approx(ro_b.rewards)
                assert ro_a.observations == pytest.approx(ro_b.observations)
                assert ro_a.actions == pytest.approx(ro_b.actions)
class ParameterExploring(Algorithm):
    """ Base for all algorithms that explore directly in the policy parameter space """
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 max_iter: int,
                 num_rollouts: int,
                 pop_size: Optional[int] = None,
                 num_workers: int = 4,
                 logger: StepLogger = None):
        """
        Constructor

        :param save_dir: directory in which to save the snapshots, i.e. the results
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param num_rollouts: number of rollouts per policy parameter set
        :param pop_size: number of solutions in the population, pass `None` to use a default that scales logarithmically
                         with the number of policy parameters
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        if not (isinstance(pop_size, int) or pop_size is None):
            raise pyrado.TypeErr(given=pop_size, expected_type=int)
        if isinstance(pop_size, int) and pop_size <= 0:
            raise pyrado.ValueErr(given=pop_size, g_constraint='0')

        # Call Algorithm's constructor
        super().__init__(save_dir, max_iter, policy, logger)

        # Store the inputs
        self._env = env
        self.num_rollouts = num_rollouts

        # Auto-select population size if needed
        if pop_size is None:
            pop_size = 4 + int(3 * np.log(policy.num_param))
            print_cbt(f'Initialized population size to {pop_size}.', 'y')
        self.pop_size = pop_size

        # Create sampler
        self.sampler = ParameterExplorationSampler(
            env,
            policy,
            num_workers=num_workers,
            num_rollouts_per_param=num_rollouts,
        )

        # Stopping criterion
        self.ret_avg_stack = 1e3 * np.random.randn(20)  # stack size = 20
        self.thold_ret_std = 1e-1  # algorithm terminates if below for multiple iterations

        # Store the parameters of the best policy seen so far (not the mean of the exploration distribution)
        self.best_policy_param = policy.param_values.clone()

        # Set this in subclasses
        self._expl_strat = None

    @property
    def env(self) -> Env:
        """ Get the environment in which the algorithm exploration trains. """
        return self._env

    @property
    def expl_strat(self) -> StochasticParamExplStrat:
        return self._expl_strat

    def stopping_criterion_met(self) -> bool:
        """
        Check if the standard deviation of the mean policy's average return over the last iterations dropped below
        the specified threshold.
        """
        return bool(np.std(self.ret_avg_stack) < self.thold_ret_std)

    def reset(self, seed: int = None):
        # Reset the exploration strategy, internal variables and the random seeds
        super().reset(seed)

    def step(self, snapshot_mode: str, meta_info: dict = None):
        # Sample new policy parameters
        param_sets = self._expl_strat.sample_param_sets(
            self._policy.param_values,
            self.pop_size,
            # If you do not want to include the current policy parameters, be aware that you also have to do follow-up
            # changes in the update() functions in all subclasses of ParameterExploring
            include_nominal_params=True)

        with to.no_grad():
            # Sample rollouts using these parameters
            param_samp_res = self.sampler.sample(param_sets)

        # Evaluate the current policy (first one in list if include_nominal_params is True)
        ret_avg_curr = param_samp_res[0].mean_undiscounted_return

        # Store the average return for the stopping criterion
        self.ret_avg_stack = np.delete(self.ret_avg_stack, 0)
        self.ret_avg_stack = np.append(self.ret_avg_stack, ret_avg_curr)

        all_rets = param_samp_res.mean_returns
        all_lengths = np.array(
            [len(ro) for pss in param_samp_res for ro in pss.rollouts])

        # Log metrics computed from the old policy (before the update)
        self._cnt_samples += int(np.sum(all_lengths))
        self.logger.add_value('curr policy return', ret_avg_curr, 4)
        self.logger.add_value('max return', np.max(all_rets), 4)
        self.logger.add_value('median return', np.median(all_rets), 4)
        self.logger.add_value('min return', np.min(all_rets), 4)
        self.logger.add_value('avg return', np.mean(all_rets), 4)
        self.logger.add_value('std return', np.std(all_rets), 4)
        self.logger.add_value('avg rollout len', np.mean(all_lengths), 4)
        self.logger.add_value('num total samples', self._cnt_samples)
        self.logger.add_value(
            'min mag policy param', self._policy.param_values[to.argmin(
                abs(self._policy.param_values))])
        self.logger.add_value(
            'max mag policy param', self._policy.param_values[to.argmax(
                abs(self._policy.param_values))])

        # Extract the best policy parameter sample for saving it later
        self.best_policy_param = param_samp_res.parameters[np.argmax(
            param_samp_res.mean_returns)].clone()

        # Update the policy
        self.update(param_samp_res, ret_avg_curr)

        # Save snapshot data
        self.make_snapshot(snapshot_mode,
                           float(np.max(param_samp_res.mean_returns)),
                           meta_info)

    @abstractmethod
    def update(self, param_results: ParameterSamplingResult,
               ret_avg_curr: float):
        """
        Update the policy from the given samples.

        :param param_results: Sampled parameters with evaluation
        :param ret_avg_curr: Average return for the current parameters
        """
        raise NotImplementedError

    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        # Save the best element of the current population
        best_policy = deepcopy(self._policy)
        best_policy.param_values = self.best_policy_param
        pyrado.save(best_policy, 'policy', 'pt', self.save_dir, meta_info)

        if meta_info is None:
            # This algorithm instance is not a subroutine of another algorithm
            pyrado.save(self._env, 'env', 'pkl', self.save_dir, meta_info)
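
A minimal sketch (hypothetical, not part of pyrado) of how a concrete subclass could fill in the abstract update() hook defined above; setting up the exploration strategy (self._expl_strat) is omitted here since its constructor is not shown in this snippet.

class HillClimbing(ParameterExploring):
    """ Hypothetical minimal subclass: jump to the best parameter set found in each iteration """

    # A real subclass would also assign a concrete StochasticParamExplStrat to self._expl_strat in its
    # constructor, since step() samples the candidate parameter sets from it; that part is omitted here.

    def update(self, param_results: ParameterSamplingResult, ret_avg_curr: float):
        # Move the policy to the best-performing sampled parameter set
        best_idx = np.argmax(param_results.mean_returns)
        self._policy.param_values = param_results.parameters[best_idx].clone()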