Example #1
    def __call__(self, n_to_produce: Union[int, tf.Tensor], limits: Space,
                 dtype):
        rnd_samples = []
        thresholds_unscaled_list = []
        weights = ztf.constant(1., shape=(1, ))

        for (lower, upper), area in zip(limits.iter_limits(as_tuple=True),
                                        limits.iter_areas(rel=True)):
            n_partial_to_produce = tf.to_int64(
                ztf.to_real(n_to_produce) *
                ztf.to_real(area))  # TODO(Mayou36): split right!
            lower = ztf.convert_to_tensor(lower, dtype=dtype)
            upper = ztf.convert_to_tensor(upper, dtype=dtype)
            sample_drawn = tf.random_uniform(
                shape=(n_partial_to_produce, limits.n_obs + 1),
                # + 1 dim for the function value
                dtype=ztypes.float)
            rnd_sample = sample_drawn[:, :-1] * (
                upper - lower) + lower  # -1: all except func value
            thresholds_unscaled = sample_drawn[:, -1]
            # if not multiple_limits:
            #     return rnd_sample, thresholds_unscaled
            rnd_samples.append(rnd_sample)
            thresholds_unscaled_list.append(thresholds_unscaled)

        rnd_sample = tf.concat(rnd_samples, axis=0)
        thresholds_unscaled = tf.concat(thresholds_unscaled_list, axis=0)

        n_drawn = n_to_produce
        return rnd_sample, thresholds_unscaled, weights, weights, n_drawn
Example #2
def create_params2(nameadd=""):
    mu2 = Parameter("mu25" + nameadd,
                    ztf.to_real(mu_true) - 0.2, mu_true - 1., mu_true + 1.)
    sigma2 = Parameter("sigma25" + nameadd,
                       ztf.to_real(sigma_true) - 0.3, sigma_true - 2.,
                       sigma_true + 2.)
    return mu2, sigma2
Example #3
def create_params1(nameadd=""):
    mu1 = Parameter("mu1" + nameadd,
                    ztf.to_real(mu_true) - 0.2, mu_true - 1., mu_true + 1.)
    sigma1 = Parameter("sigma1" + nameadd,
                       ztf.to_real(sigma_true) - 0.3, sigma_true - 2.,
                       sigma_true + 2.)
    return mu1, sigma1
Example #4
    def _analytic_integrate(self, limits, norm_range):
        lower, upper = limits.limits
        if np.all(-np.array(lower) == np.array(upper)) and np.all(np.array(upper) == np.infty):
            return ztf.to_real(1.)  # tfp distributions are normalized to 1
        lower = ztf.to_real(lower[0], dtype=self.dtype)
        upper = ztf.to_real(upper[0], dtype=self.dtype)
        integral = self.distribution.cdf(upper) - self.distribution.cdf(lower)
        return integral[0]
Example #5
def create_params3(nameadd=""):
    mu3 = Parameter("mu35" + nameadd,
                    ztf.to_real(mu_true) - 0.2, mu_true - 1., mu_true + 1.)
    sigma3 = Parameter("sigma35" + nameadd,
                       ztf.to_real(sigma_true) - 0.3, sigma_true - 2.,
                       sigma_true + 2.)
    yield3 = Parameter("yield35" + nameadd, yield_true + 300, 0,
                       yield_true + 20000)
    return mu3, sigma3, yield3
Example #6
def convert_to_parameter(value) -> "Parameter":
    """Convert a *numerical* to a fixed parameter or return if already a parameter.

    Args:
        value ():
    """
    if isinstance(value, tf.Variable):
        return value

    # convert to Tensor if not yet
    if not isinstance(value, tf.Tensor):
        if isinstance(value, complex):
            value = ztf.to_complex(value)
        else:
            value = ztf.to_real(value)

    if value.dtype.is_complex:
        value = ComplexParameter("FIXED_autoparam_" + str(get_auto_number()),
                                 value=value)

    else:
        # value = Parameter("FIXED_autoparam_" + str(get_auto_number()), value=value, floating=False)
        independend_params = tf.get_collection("zfit_independent")
        params = get_dependents(tensor=value, candidates=independend_params)
        if params:
            value = ComposedParameter("composite_autoparam_" +
                                      str(get_auto_number()),
                                      tensor=value)
        else:
            value = Parameter("FIXED_autoparam_" + str(get_auto_number()),
                              value=value,
                              floating=False)

    # value.floating = False
    return value
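
A minimal usage sketch of the converter above (illustrative only; the auto-generated "FIXED_autoparam_*"/"composite_autoparam_*" names contain an internal counter, so exact names will differ):

# illustrative sketch, not part of the original source
fixed = convert_to_parameter(1.5)        # plain float -> fixed (non-floating) Parameter
cplx = convert_to_parameter(1. + 2.j)    # complex number -> ComplexParameter
mu = Parameter("mu_demo", 1.0, 0.0, 2.0)
same = convert_to_parameter(mu)          # a Parameter is a tf.Variable, so it is returned unchanged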
Example #7
    def pdf(self,
            x: ztyping.XTypeInput,
            norm_range: ztyping.LimitsTypeInput = None,
            name: str = "model") -> ztyping.XType:
        """Probability density function, normalized over `norm_range`.

        Args:
          x (numerical): `float` or `double` `Tensor`.
          norm_range (tuple, :py:class:`~zfit.Space`): :py:class:`~zfit.Space` to normalize over
          name (str): Prepended to names of ops created by this function.

        Returns:
          :py:class:`tf.Tensor` of type `self.dtype`.
        """
        norm_range = self._check_input_norm_range(norm_range,
                                                  caller_name=name,
                                                  none_is_error=True)
        with self._convert_sort_x(x) as x:
            value = self._single_hook_pdf(x=x,
                                          norm_range=norm_range,
                                          name=name)
            if run.numeric_checks:
                assert_op = ztf.check_numerics(
                    value,
                    message="Check if pdf output contains any NaNs of Infs")
                assert_op = [assert_op]
            else:
                assert_op = []
            with tf.control_dependencies(assert_op):
                return ztf.to_real(value)
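
A hedged usage sketch of pdf on a Gauss model like the ones elsewhere on this page; the graph-mode (TF1-era) zfit API is assumed, so the result is evaluated with zfit.run, and the observable and limit values are made up:

# illustrative sketch, not part of the original source
import numpy as np
gauss = Gauss(mu1, sigma1, obs='obs1', name="gauss_demo")
x = np.linspace(-5., 5., 200)
probs = gauss.pdf(x, norm_range=(-5., 5.))  # Tensor of densities, normalized over norm_range
probs_np = zfit.run(probs)                  # evaluate the graph (assumed TF1-era API)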
Example #8
def convert_to_parameter(value,
                         name=None,
                         prefer_floating=False) -> "ZfitParameter":
    """Convert a *numerical* to a fixed parameter or return if already a parameter.

    Args:
        value ():
    """
    floating = False
    is_python = False
    if name is not None:
        name = str(name)

    if isinstance(
            value,
            ZfitParameter):  # TODO(Mayou36): autoconvert variable. TF 2.0?
        return value
    elif isinstance(value, tf.Variable):
        raise TypeError(
            "Currently, cannot autoconvert tf.Variable to zfit.Parameter.")

    # convert to Tensor if not yet
    if not isinstance(value, tf.Tensor):
        is_python = True
        if isinstance(value, complex):
            value = ztf.to_complex(value)
        else:
            floating = prefer_floating
            value = ztf.to_real(value)

    if not run._enable_parameter_autoconversion:
        return value

    if value.dtype.is_complex:
        if name is None:
            name = "FIXED_complex_autoparam_" + str(get_auto_number())
        value = ComplexParameter(name, value=value, floating=False)

    else:
        # value = Parameter("FIXED_autoparam_" + str(get_auto_number()), value=value, floating=False)
        if is_python:
            params = {}
        else:
            independend_params = tf.get_collection("zfit_independent")
            params = get_dependents_auto(tensor=value,
                                         candidates=independend_params)
        if params:
            if name is None:
                name = "composite_autoparam_" + str(get_auto_number())
            value = ComposedParameter(name, tensor=value)
        else:
            if name is None:
                name = "FIXED_autoparam_" + str(get_auto_number())
            value = Parameter(name, value=value, floating=floating)

    # value.floating = False
    return value
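
Compared to Example #6, this later variant also accepts an explicit name and a prefer_floating flag: a plain Python number converted with prefer_floating=True becomes a floating Parameter instead of a fixed one, roughly convert_to_parameter(0.5, name="frac", prefer_floating=True) (an illustrative call, not taken from the original source).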
Example #9
    def _loss_func(self, model, data, fit_range, constraints):
        nll = super()._loss_func(model=model, data=data, fit_range=fit_range, constraints=constraints)
        poisson_terms = []
        for mod, dat in zip(model, data):
            if not mod.is_extended:
                raise NotExtendedPDFError("The pdf {} is not extended but has to be (for an extended fit)".format(mod))
            nevents = dat.nevents if dat.weights is None else ztf.reduce_sum(dat.weights)
            poisson_terms.append(-mod.get_yield() + ztf.to_real(nevents) * tf.log(mod.get_yield()))
        nll -= tf.reduce_sum(poisson_terms)
        return nll
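
For reference, the appended Poisson term is the constant-free logarithm of the extended-likelihood factor: with yield nu = mod.get_yield() and N = nevents observed events, log Pois(N; nu) = N*log(nu) - nu - log(N!), where the data-only term log(N!) is dropped. Subtracting the sum of these terms from the NLL is what turns the plain NLL into an extended NLL.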
Example #10
    def step_size(self):  # TODO: improve default step_size?
        step_size = self._step_size
        if step_size is None:
            # auto-infer from limits
            step_splits = 1e4
            # step_size = (self.upper_limit - self.lower_limit) / step_splits  # TODO improve? can be tensor?
            step_size = 0.001
            if np.isnan(step_size):  # a direct `== np.nan` comparison is always False
                if self.lower_limit == -np.infty or self.upper_limit == np.infty:
                    step_size = 0.001
                else:
                    raise ValueError("Could not set step size. Is NaN.")
            # TODO: how to deal with infinities?
            step_size = ztf.to_real(step_size)
            self.step_size = step_size

        return step_size
Example #11
    def set_weights(self, weights: ztyping.WeightsInputType):
        """Set (temporarily) the weights of the dataset.

        Args:
            weights (`tf.Tensor`, np.ndarray, None): Event weights (one value per event) or
                None to remove the weights.
        """
        if weights is not None:
            weights = ztf.convert_to_tensor(weights)
            weights = ztf.to_real(weights)
            if weights.shape.ndims != 1:
                raise ShapeIncompatibleError(
                    "Weights have to be 1-Dim objects.")

        def setter(value):
            self._weights = value

        def getter():
            return self.weights

        return TemporarilySet(value=weights, getter=getter, setter=setter)
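
A hedged usage sketch, assuming the returned TemporarilySet acts as a context manager that restores the previous weights on exit (its name and the getter/setter pair suggest this, but the snippet itself does not show it):

# illustrative sketch, not part of the original source; `data` is assumed to be a zfit Data object
import numpy as np
weights = np.random.uniform(0.5, 1.5, size=1000)  # one weight per event (must be 1-dim, see above)
with data.set_weights(weights):
    n_weighted = zfit.run(ztf.reduce_sum(data.weights))  # weights active inside the block
# on leaving the block, the previous weights are (presumably) restored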
Example #12
    def __init__(self,
                 data: tf.Tensor,
                 bandwidth: ztyping.ParamTypeInput,
                 obs: ztyping.ObsTypeInput,
                 name: str = "GaussianKDE"):
        """Gaussian Kernel Density Estimation using Silverman's rule of thumb

        Args:
            data: Data points to build a kernel around
            bandwidth: sigmas for the covariance matrix of the multivariate gaussian
            obs:
            name: Name of the PDF
        """
        dtype = zfit.settings.ztypes.float
        if isinstance(data, zfit.core.interfaces.ZfitData):

            raise WorkInProgressError("Currently, no dataset supported yet")
            # size = data.nevents
            # dims = data.n_obs
            # with data.
            # data = data.value()
            # if data.weights is not None:

        else:
            if not isinstance(data, tf.Tensor):
                data = ztf.convert_to_tensor(value=data)
            data = ztf.to_real(data)

            shape_data = tf.shape(data)
            size = tf.cast(shape_data[0], dtype=dtype)
            dims = tf.cast(shape_data[-1], dtype=dtype)
        bandwidth = convert_to_container(bandwidth)

        # Bandwidth definition, use silverman's rule of thumb for nd
        def reshaped_kerner_factory():
            cov = tf.linalg.diag([
                tf.square((4. / (dims + 2.))**(1 / (dims + 4)) *
                          size**(-1 / (dims + 4)) * s) for s in bandwidth
            ])
            # kernel prob output shape: (n,)
            kernel = tfd.MultivariateNormalFullCovariance(
                loc=data, covariance_matrix=cov)
            return tfd.Independent(kernel)

        # reshaped_kernel = kernel

        probs = tf.broadcast_to(1 / size, shape=(tf.cast(size, tf.int32), ))
        categorical = tfd.Categorical(
            probs=probs)  # no grad -> no need to recreate
        dist_kwargs = lambda: dict(mixture_distribution=categorical,
                                   components_distribution=
                                   reshaped_kerner_factory())
        distribution = tfd.MixtureSameFamily
        # TODO lambda for params
        params = OrderedDict(
            (f"bandwidth_{i}", h) for i, h in enumerate(bandwidth))
        super().__init__(distribution=distribution,
                         dist_params={},
                         dist_kwargs=dist_kwargs,
                         params=params,
                         obs=obs,
                         name=name)
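
A minimal construction sketch under the same TF1-era API as above; the data values, bandwidth and observable name are made up:

# illustrative sketch, not part of the original source
import numpy as np
data_np = np.random.normal(loc=0., scale=1., size=(500, 1))   # 500 events, 1 observable
kde = GaussianKDE(data=data_np, bandwidth=0.3, obs='obs1')    # scalar bandwidth -> one sigma per observable
probs = kde.pdf(np.linspace(-3., 3., 50), norm_range=(-3., 3.))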
Example #13
    def sample_body(n,
                    sample,
                    n_produced=0,
                    n_total_drawn=0,
                    eff=1.0,
                    is_sampled=None):
        eff = tf.reduce_max([eff, ztf.to_real(1e-6)])

        n_to_produce = n - n_produced

        if isinstance(
                limits,
                EventSpace):  # EXPERIMENTAL(Mayou36): added to test EventSpace
            limits.create_limits(n=n)

        do_print = settings.get_verbosity() > 5
        if do_print:
            print_op = tf.print("Number of samples to produce:", n_to_produce,
                                " with efficiency ", eff)
        with tf.control_dependencies([print_op] if do_print else []):
            n_to_produce = tf.identity(n_to_produce)
        if dynamic_array_shape:
            n_to_produce = tf.to_int32(ztf.to_real(n_to_produce) / eff *
                                       1.01) + 10  # just to make sure
            # TODO: adjustable efficiency cap for memory efficiency (prevent too many samples at once produced)
            n_to_produce = tf.minimum(
                n_to_produce,
                tf.to_int32(8e5))  # introduce a cap to force serial
            new_limits = limits
        else:
            # TODO(Mayou36): add cap for n_to_produce here as well
            if multiple_limits:
                raise DueToLazynessNotImplementedError(
                    "Multiple limits for fixed event space not yet implemented"
                )
            is_not_sampled = tf.logical_not(is_sampled)
            (lower, ), (upper, ) = limits.limits
            lower = tuple(
                tf.boolean_mask(low, is_not_sampled) for low in lower)
            upper = tuple(tf.boolean_mask(up, is_not_sampled) for up in upper)
            new_limits = limits.with_limits(limits=((lower, ), (upper, )))
            draw_indices = tf.where(is_not_sampled)

        with tf.control_dependencies([n_to_produce]):
            rnd_sample, thresholds_unscaled, weights, weights_max, n_drawn = sample_and_weights(
                n_to_produce=n_to_produce, limits=new_limits, dtype=dtype)

        n_drawn = tf.cast(n_drawn, dtype=tf.int32)
        if run.numeric_checks:
            assert_op_n_drawn = tf.assert_non_negative(n_drawn)
            tfdeps = [assert_op_n_drawn]
        else:
            tfdeps = []
        with tf.control_dependencies(tfdeps):
            n_total_drawn += n_drawn

            probabilities = prob(rnd_sample)
        shape_rnd_sample = tf.shape(rnd_sample)[0]
        if run.numeric_checks:
            assert_prob_rnd_sample_op = tf.assert_equal(
                tf.shape(probabilities), shape_rnd_sample)
            tfdeps = [assert_prob_rnd_sample_op]
        else:
            tfdeps = []
        # assert_weights_rnd_sample_op = tf.assert_equal(tf.shape(weights), shape_rnd_sample)
        # print_op = tf.print("shapes: ", tf.shape(weights), shape_rnd_sample, "shapes end")
        with tf.control_dependencies(tfdeps):
            probabilities = tf.identity(probabilities)
        if prob_max is None or weights_max is None:  # TODO(performance): estimate prob_max, after enough estimations -> fix it?
            # TODO(Mayou36): This control dependency is needed because otherwise the max won't be determined
            # correctly. A bug report will be filed (WIP).
            # The behavior is very odd: if we do not force a kind of copy, the `reduce_max` returns
            # a value smaller by a factor of 1e-14
            # with tf.control_dependencies([probabilities]):
            # UPDATE: this works now? Was it just a one-time bug?
            weights_scaling = tf.reduce_max(probabilities / weights)
        else:
            weights_scaling = prob_max / weights_max

        weights_scaled = weights_scaling * weights
        random_thresholds = thresholds_unscaled * weights_scaled
        if run.numeric_checks:
            assert_op = [
                tf.assert_greater_equal(
                    x=weights_scaled,
                    y=probabilities,
                    message="Not all weights are >= probs so the sampling "
                    "will be biased. If a custom `sample_and_weights` "
                    "was used, make sure that either the shape of the "
                    "custom sampler (resp. it's weights) overlap better "
                    "or decrease the `max_weight`")
            ]
        else:
            assert_op = []
        with tf.control_dependencies(assert_op):
            take_or_not = probabilities > random_thresholds
        take_or_not = take_or_not[0] if len(
            take_or_not.shape) == 2 else take_or_not
        filtered_sample = tf.boolean_mask(rnd_sample, mask=take_or_not, axis=0)

        n_accepted = tf.shape(filtered_sample)[0]
        n_produced_new = n_produced + n_accepted
        if not dynamic_array_shape:
            indices = tf.boolean_mask(draw_indices, mask=take_or_not)
            current_sampled = tf.sparse_tensor_to_dense(tf.SparseTensor(
                indices=indices,
                values=tf.broadcast_to(input=(True, ), shape=(n_accepted, )),
                dense_shape=(tf.cast(n, dtype=tf.int64), )),
                                                        default_value=False)
            is_sampled = tf.logical_or(is_sampled, current_sampled)
            indices = indices[:, 0]
        else:
            indices = tf.range(n_produced, n_produced_new)

        sample_new = sample.scatter(indices=tf.cast(indices, dtype=tf.int32),
                                    value=filtered_sample)

        # efficiency (estimate) of how many samples we get
        eff = tf.reduce_max([ztf.to_real(n_produced_new),
                             ztf.to_real(1.)]) / tf.reduce_max(
                                 [ztf.to_real(n_total_drawn),
                                  ztf.to_real(1.)])
        return n, sample_new, n_produced_new, n_total_drawn, eff, is_sampled
Example #14
# np, ztf and Gauss are used below but were not imported in the original excerpt
import numpy as np

import zfit
from zfit import ztf
from zfit.pdf import Gauss
from zfit.core.parameter import Parameter
import zfit.settings
from zfit.core.loss import _unbinned_nll_tf, UnbinnedNLL
from zfit.util.exception import IntentionNotUnambiguousError

mu_true = 1.2
sigma_true = 4.1
mu_true2 = 1.01
sigma_true2 = 3.5

yield_true = 3000
test_values_np = np.random.normal(loc=mu_true, scale=sigma_true, size=(yield_true, 1))
test_values_np2 = np.random.normal(loc=mu_true2, scale=sigma_true2, size=yield_true)

low, high = -24.3, 28.6
mu1 = Parameter("mu1", ztf.to_real(mu_true) - 0.2, mu_true - 1., mu_true + 1.)
sigma1 = Parameter("sigma1", ztf.to_real(sigma_true) - 0.3, sigma_true - 2., sigma_true + 2.)
mu2 = Parameter("mu25", ztf.to_real(mu_true) - 0.2, mu_true - 1., mu_true + 1.)
sigma2 = Parameter("sigma25", ztf.to_real(sigma_true) - 0.3, sigma_true - 2., sigma_true + 2.)
mu3 = Parameter("mu35", ztf.to_real(mu_true) - 0.2, mu_true - 1., mu_true + 1.)
sigma3 = Parameter("sigma35", ztf.to_real(sigma_true) - 0.3, sigma_true - 2., sigma_true + 2.)
yield3 = Parameter("yield35", yield_true + 300, 0, yield_true + 20000)

obs1 = 'obs1'

mu_constr = [1.6, 0.2]  # mu, sigma
sigma_constr = [3.8, 0.2]

gaussian1 = Gauss(mu1, sigma1, obs=obs1, name="gaussian1")
gaussian2 = Gauss(mu2, sigma2, obs=obs1, name="gaussian2")
gaussian3 = Gauss(mu3, sigma3, obs=obs1, name="gaussian3")
Example #15
    def sample_body(n,
                    sample,
                    n_produced=0,
                    n_total_drawn=0,
                    eff=1.0,
                    is_sampled=None,
                    weights_scaling=0.):
        eff = tf.reduce_max([eff, ztf.to_real(1e-6)])

        n_to_produce = n - n_produced

        if isinstance(
                limits,
                EventSpace):  # EXPERIMENTAL(Mayou36): added to test EventSpace
            limits.create_limits(n=n)

        do_print = settings.get_verbosity() > 5
        if do_print:
            print_op = tf.print("Number of samples to produce:", n_to_produce,
                                " with efficiency ", eff,
                                " with total produced ", n_produced,
                                " and total drawn ", n_total_drawn,
                                " with weights scaling", weights_scaling)
        with tf.control_dependencies([print_op] if do_print else []):
            n_to_produce = tf.identity(n_to_produce)
        if dynamic_array_shape:
            n_to_produce = tf.to_int32(
                ztf.to_real(n_to_produce) / eff *
                (1.1)) + 10  # just to make sure
            # TODO: adjustable efficiency cap for memory efficiency (prevent too many samples at once produced)
            max_produce_cap = tf.to_int32(8e5)
            safe_to_produce = tf.maximum(
                max_produce_cap,
                n_to_produce)  # protect against overflow, n_to_prod -> neg.
            n_to_produce = tf.minimum(
                safe_to_produce,
                max_produce_cap)  # introduce a cap to force serial
            new_limits = limits
        else:
            # TODO(Mayou36): add cap for n_to_produce here as well
            if multiple_limits:
                raise DueToLazynessNotImplementedError(
                    "Multiple limits for fixed event space not yet implemented"
                )
            is_not_sampled = tf.logical_not(is_sampled)
            (lower, ), (upper, ) = limits.limits
            lower = tuple(
                tf.boolean_mask(low, is_not_sampled) for low in lower)
            upper = tuple(tf.boolean_mask(up, is_not_sampled) for up in upper)
            new_limits = limits.with_limits(limits=((lower, ), (upper, )))
            draw_indices = tf.where(is_not_sampled)

        with tf.control_dependencies([n_to_produce]):
            rnd_sample, thresholds_unscaled, weights, weights_max, n_drawn = sample_and_weights(
                n_to_produce=n_to_produce, limits=new_limits, dtype=dtype)

        n_drawn = tf.cast(n_drawn, dtype=tf.int32)
        if run.numeric_checks:
            assert_op_n_drawn = tf.assert_non_negative(n_drawn)
            tfdeps = [assert_op_n_drawn]
        else:
            tfdeps = []
        with tf.control_dependencies(tfdeps):
            n_total_drawn += n_drawn

            probabilities = prob(rnd_sample)
        shape_rnd_sample = tf.shape(rnd_sample)[0]
        if run.numeric_checks:
            assert_prob_rnd_sample_op = tf.assert_equal(
                tf.shape(probabilities), shape_rnd_sample)
            tfdeps = [assert_prob_rnd_sample_op]
        else:
            tfdeps = []
        # assert_weights_rnd_sample_op = tf.assert_equal(tf.shape(weights), shape_rnd_sample)
        # print_op = tf.print("shapes: ", tf.shape(weights), shape_rnd_sample, "shapes end")
        with tf.control_dependencies(tfdeps):
            probabilities = tf.identity(probabilities)
        if prob_max is None or weights_max is None:  # TODO(performance): estimate prob_max, after enough estimations -> fix it?
            # TODO(Mayou36): This control dependency is needed because otherwise the max won't be determined
            # correctly. A bug report will be filed (WIP).
            # The behavior is very odd: if we do not force a kind of copy, the `reduce_max` returns
            # a value smaller by a factor of 1e-14
            # with tf.control_dependencies([probabilities]):
            # UPDATE: this works now? Was it just a one-time bug?

            # safety margin, predicting future, improve for small samples?
            weights_maximum = tf.reduce_max(weights)
            weights_clipped = tf.maximum(weights, weights_maximum * 1e-5)
            # prob_weights_ratio = probabilities / weights
            prob_weights_ratio = probabilities / weights_clipped
            # min_prob_weights_ratio = tf.reduce_min(prob_weights_ratio)
            max_prob_weights_ratio = tf.reduce_max(prob_weights_ratio)
            ratio_threshold = 50000000.
            # clipping means that we do not scale by more than a certain threshold
            # to properly account for very small numbers, the thresholds should be scaled to match the ratio
            # but if a weight of a sample is very low (compared to the other weights), this would force the acceptance
            # of other samples to decrease strongly. We introduce a cut here, meaning that any event with an acceptance
            # chance of less than 1 in ratio_threshold will be underestimated.
            # TODO(Mayou36): make ratio_threshold a global setting
            # max_prob_weights_ratio_clipped = tf.minimum(max_prob_weights_ratio,
            #                                             min_prob_weights_ratio * ratio_threshold)
            max_prob_weights_ratio_clipped = max_prob_weights_ratio
            weights_scaling = tf.maximum(
                weights_scaling, max_prob_weights_ratio_clipped * (1 + 1e-2))
        else:
            weights_scaling = prob_max / weights_max
            min_prob_weights_ratio = weights_scaling

        weights_scaled = weights_scaling * weights * (1 + 1e-8
                                                      )  # numerical epsilon
        random_thresholds = thresholds_unscaled * weights_scaled
        if run.numeric_checks:
            invalid_probs_weights = tf.greater(probabilities, weights_scaled)
            failed_weights = tf.boolean_mask(weights_scaled,
                                             mask=invalid_probs_weights)
            failed_probs = tf.boolean_mask(probabilities,
                                           mask=invalid_probs_weights)

            print_op = tf.print(
                "HACK WARNING: if the following is NOT empty, your sampling _may_ be biased."
                " Failed weights:", failed_weights, " failed probs",
                failed_probs)
            assert_no_failed_probs = tf.assert_equal(tf.shape(failed_weights),
                                                     [0])
            # assert_op = [print_op]
            assert_op = [assert_no_failed_probs]
            # for weights scaled more than ratio_threshold
            # assert_op = [tf.assert_greater_equal(x=weights_scaled, y=probabilities,
            #                                      data=[tf.shape(failed_weights), failed_weights, failed_probs],
            #                                      message="Not all weights are >= probs so the sampling "
            #                                              "will be biased. If a custom `sample_and_weights` "
            #                                              "was used, make sure that either the shape of the "
            #                                              "custom sampler (resp. its weights) overlaps better "
            #                                              "or decrease the `max_weight`")]
            #
            # # check disabled (below not added to deps)
            # assert_scaling_op = tf.assert_less(weights_scaling / min_prob_weights_ratio, ztf.constant(ratio_threshold),
            #                                    data=[weights_scaling, min_prob_weights_ratio],
            #                                    message="The ratio between the probabilities from the pdf and the"
            #                                    f"probability from the sampler is higher "
            #                                    f" than {ratio_threshold}. This will most probably bias the sampling. "
            #                                    f"Use importance sampling or, to disable this check, do"
            #                                    f"zfit.run.numeric_checks = False")
            # assert_op.append(assert_scaling_op)
        else:
            assert_op = []
        with tf.control_dependencies(assert_op):
            take_or_not = probabilities > random_thresholds
        take_or_not = take_or_not[0] if len(
            take_or_not.shape) == 2 else take_or_not
        filtered_sample = tf.boolean_mask(rnd_sample, mask=take_or_not, axis=0)

        n_accepted = tf.shape(filtered_sample)[0]
        n_produced_new = n_produced + n_accepted
        if not dynamic_array_shape:
            indices = tf.boolean_mask(draw_indices, mask=take_or_not)
            current_sampled = tf.sparse_tensor_to_dense(tf.SparseTensor(
                indices=indices,
                values=tf.broadcast_to(input=(True, ), shape=(n_accepted, )),
                dense_shape=(tf.cast(n, dtype=tf.int64), )),
                                                        default_value=False)
            is_sampled = tf.logical_or(is_sampled, current_sampled)
            indices = indices[:, 0]
        else:
            indices = tf.range(n_produced, n_produced_new)

        sample_new = sample.scatter(indices=tf.cast(indices, dtype=tf.int32),
                                    value=filtered_sample)

        # efficiency (estimate) of how many samples we get
        eff = tf.reduce_max([ztf.to_real(n_produced_new),
                             ztf.to_real(1.)]) / tf.reduce_max(
                                 [ztf.to_real(n_total_drawn),
                                  ztf.to_real(1.)])
        return n, sample_new, n_produced_new, n_total_drawn, eff, is_sampled, weights_scaling
Example #16
def accept_reject_sample(
        prob: Callable,
        n: int,
        limits: Space,
        sample_and_weights_factory: Callable = UniformSampleAndWeights,
        dtype=ztypes.float,
        prob_max: Union[None, int] = None,
        efficiency_estimation: float = 1.0) -> tf.Tensor:
    """Accept reject sample from a probability distribution.

    Args:
        prob (function): A function taking x (a Tensor) as an argument and returning the probability
            (or anything that is proportional to the probability).
        n (int): Number of samples to produce
        limits (:py:class:`~zfit.Space`): The limits to sample from
        sample_and_weights_factory (Callable): A factory function that returns the following function:
            A function that returns the sample to insert into `prob` and the weights
            (probability density) of each sample together with the random thresholds. The API looks as follows:

            - Parameters:

                - n_to_produce (int, tf.Tensor): The number of events to produce (approximately; the actual number may differ).
                - limits (Space): the limits in which the samples will be.
                - dtype (dtype): DType of the output.

            - Return:
                A tuple of length 5:
                - proposed sample (tf.Tensor with shape=(n_to_produce, n_obs)): The new (proposed) sample
                    whose values are inside `limits`.
                - thresholds_unscaled (tf.Tensor with shape=(n_to_produce,)): Uniformly distributed
                    random values **between 0 and 1**.
                - weights (tf.Tensor with shape=(n_to_produce)): (Proportional to the) probability
                    for each sample of the distribution it was drawn from.
                - weights_max (int, tf.Tensor, None): The maximum of the weights (if known). This is
                    what the probability maximum will be scaled with, so it should be rather lower than the maximum
                    if the peaks do not exactly coincide. Otherwise return None (which will **assume**
                    that the peaks coincide).
                - n_produced: the number of events produced. Can deviate from the requested number.

        dtype ():
        prob_max (Union[None, int]): The maximum of the model function for the given limits. If None
            is given, it will be estimated automatically and safely, at the cost of roughly a 10%
            increase in computation time (constant weak scaling).
        efficiency_estimation (float): estimation of the initial sampling efficiency.

    Returns:
        tf.Tensor:
    """
    multiple_limits = limits.n_limits > 1

    sample_and_weights = sample_and_weights_factory()
    n = tf.to_int32(n)
    if run.numeric_checks:
        assert_valid_n_op = tf.assert_non_negative(n)
        deps = [assert_valid_n_op]
    else:
        deps = []
    # whether we may produce more than n; we normally do (except for EventSpace, which is not a generator)
    # we cannot cut inside the while loop as soon as we have produced enough because we may sample from
    # multiple limits and therefore need to randomly remove events, otherwise we are biased because the
    # drawn samples are ordered in the different limits
    dynamic_array_shape = True

    # for fixed limits in EventSpace we need to know which indices have been successfully sampled. Therefore this
    # can be None (if not needed) or a boolean tensor with the size `n`
    initial_is_sampled = tf.constant("EMPTY")
    if isinstance(limits, EventSpace) and not limits.is_generator:
        dynamic_array_shape = False
        if run.numeric_checks:
            assert_n_matches_limits_op = tf.assert_equal(
                tf.shape(limits.lower[0][0])[0], n)
            tfdeps = [assert_n_matches_limits_op]
        else:
            tfdeps = []
        with tf.control_dependencies(
                tfdeps):  # TODO(Mayou36): good check? could be 1d
            initial_is_sampled = tf.fill(value=False, dims=(n, ))
        efficiency_estimation = 1.0  # generate exactly n
    with tf.control_dependencies(deps):
        inital_n_produced = tf.constant(0, dtype=tf.int32)
        initial_n_drawn = tf.constant(0, dtype=tf.int32)
        with tf.control_dependencies([n]):
            sample = tf.TensorArray(
                dtype=dtype,
                size=n,
                dynamic_size=dynamic_array_shape,
                clear_after_read=True,  # we read only once at end to tensor
                element_shape=(limits.n_obs, ))

    def not_enough_produced(n, sample, n_produced, n_total_drawn, eff,
                            is_sampled, weights_scaling):
        return tf.greater(n, n_produced)

    def sample_body(n,
                    sample,
                    n_produced=0,
                    n_total_drawn=0,
                    eff=1.0,
                    is_sampled=None,
                    weights_scaling=0.):
        eff = tf.reduce_max([eff, ztf.to_real(1e-6)])

        n_to_produce = n - n_produced

        if isinstance(
                limits,
                EventSpace):  # EXPERIMENTAL(Mayou36): added to test EventSpace
            limits.create_limits(n=n)

        do_print = settings.get_verbosity() > 5
        if do_print:
            print_op = tf.print("Number of samples to produce:", n_to_produce,
                                " with efficiency ", eff,
                                " with total produced ", n_produced,
                                " and total drawn ", n_total_drawn,
                                " with weights scaling", weights_scaling)
        with tf.control_dependencies([print_op] if do_print else []):
            n_to_produce = tf.identity(n_to_produce)
        if dynamic_array_shape:
            n_to_produce = tf.to_int32(
                ztf.to_real(n_to_produce) / eff *
                (1.1)) + 10  # just to make sure
            # TODO: adjustable efficiency cap for memory efficiency (prevent too many samples at once produced)
            max_produce_cap = tf.to_int32(8e5)
            safe_to_produce = tf.maximum(
                max_produce_cap,
                n_to_produce)  # protect against overflow, n_to_prod -> neg.
            n_to_produce = tf.minimum(
                safe_to_produce,
                max_produce_cap)  # introduce a cap to force serial
            new_limits = limits
        else:
            # TODO(Mayou36): add cap for n_to_produce here as well
            if multiple_limits:
                raise DueToLazynessNotImplementedError(
                    "Multiple limits for fixed event space not yet implemented"
                )
            is_not_sampled = tf.logical_not(is_sampled)
            (lower, ), (upper, ) = limits.limits
            lower = tuple(
                tf.boolean_mask(low, is_not_sampled) for low in lower)
            upper = tuple(tf.boolean_mask(up, is_not_sampled) for up in upper)
            new_limits = limits.with_limits(limits=((lower, ), (upper, )))
            draw_indices = tf.where(is_not_sampled)

        with tf.control_dependencies([n_to_produce]):
            rnd_sample, thresholds_unscaled, weights, weights_max, n_drawn = sample_and_weights(
                n_to_produce=n_to_produce, limits=new_limits, dtype=dtype)

        n_drawn = tf.cast(n_drawn, dtype=tf.int32)
        if run.numeric_checks:
            assert_op_n_drawn = tf.assert_non_negative(n_drawn)
            tfdeps = [assert_op_n_drawn]
        else:
            tfdeps = []
        with tf.control_dependencies(tfdeps):
            n_total_drawn += n_drawn

            probabilities = prob(rnd_sample)
        shape_rnd_sample = tf.shape(rnd_sample)[0]
        if run.numeric_checks:
            assert_prob_rnd_sample_op = tf.assert_equal(
                tf.shape(probabilities), shape_rnd_sample)
            tfdeps = [assert_prob_rnd_sample_op]
        else:
            tfdeps = []
        # assert_weights_rnd_sample_op = tf.assert_equal(tf.shape(weights), shape_rnd_sample)
        # print_op = tf.print("shapes: ", tf.shape(weights), shape_rnd_sample, "shapes end")
        with tf.control_dependencies(tfdeps):
            probabilities = tf.identity(probabilities)
        if prob_max is None or weights_max is None:  # TODO(performance): estimate prob_max, after enough estimations -> fix it?
            # TODO(Mayou36): This control dependency is needed because otherwise the max won't be determined
            # correctly. A bug report will be filed (WIP).
            # The behavior is very odd: if we do not force a kind of copy, the `reduce_max` returns
            # a value smaller by a factor of 1e-14
            # with tf.control_dependencies([probabilities]):
            # UPDATE: this works now? Was it just a one-time bug?

            # safety margin, predicting future, improve for small samples?
            weights_maximum = tf.reduce_max(weights)
            weights_clipped = tf.maximum(weights, weights_maximum * 1e-5)
            # prob_weights_ratio = probabilities / weights
            prob_weights_ratio = probabilities / weights_clipped
            # min_prob_weights_ratio = tf.reduce_min(prob_weights_ratio)
            max_prob_weights_ratio = tf.reduce_max(prob_weights_ratio)
            ratio_threshold = 50000000.
            # clipping means that we do not scale by more than a certain threshold
            # to properly account for very small numbers, the thresholds should be scaled to match the ratio
            # but if a weight of a sample is very low (compared to the other weights), this would force the acceptance
            # of other samples to decrease strongly. We introduce a cut here, meaning that any event with an acceptance
            # chance of less than 1 in ratio_threshold will be underestimated.
            # TODO(Mayou36): make ratio_threshold a global setting
            # max_prob_weights_ratio_clipped = tf.minimum(max_prob_weights_ratio,
            #                                             min_prob_weights_ratio * ratio_threshold)
            max_prob_weights_ratio_clipped = max_prob_weights_ratio
            weights_scaling = tf.maximum(
                weights_scaling, max_prob_weights_ratio_clipped * (1 + 1e-2))
        else:
            weights_scaling = prob_max / weights_max
            min_prob_weights_ratio = weights_scaling

        weights_scaled = weights_scaling * weights * (1 + 1e-8
                                                      )  # numerical epsilon
        random_thresholds = thresholds_unscaled * weights_scaled
        if run.numeric_checks:
            invalid_probs_weights = tf.greater(probabilities, weights_scaled)
            failed_weights = tf.boolean_mask(weights_scaled,
                                             mask=invalid_probs_weights)
            failed_probs = tf.boolean_mask(probabilities,
                                           mask=invalid_probs_weights)

            print_op = tf.print(
                "HACK WARNING: if the following is NOT empty, your sampling _may_ be biased."
                " Failed weights:", failed_weights, " failed probs",
                failed_probs)
            assert_no_failed_probs = tf.assert_equal(tf.shape(failed_weights),
                                                     [0])
            # assert_op = [print_op]
            assert_op = [assert_no_failed_probs]
            # for weights scaled more than ratio_threshold
            # assert_op = [tf.assert_greater_equal(x=weights_scaled, y=probabilities,
            #                                      data=[tf.shape(failed_weights), failed_weights, failed_probs],
            #                                      message="Not all weights are >= probs so the sampling "
            #                                              "will be biased. If a custom `sample_and_weights` "
            #                                              "was used, make sure that either the shape of the "
            #                                              "custom sampler (resp. its weights) overlaps better "
            #                                              "or decrease the `max_weight`")]
            #
            # # check disabled (below not added to deps)
            # assert_scaling_op = tf.assert_less(weights_scaling / min_prob_weights_ratio, ztf.constant(ratio_threshold),
            #                                    data=[weights_scaling, min_prob_weights_ratio],
            #                                    message="The ratio between the probabilities from the pdf and the"
            #                                    f"probability from the sampler is higher "
            #                                    f" than {ratio_threshold}. This will most probably bias the sampling. "
            #                                    f"Use importance sampling or, to disable this check, do"
            #                                    f"zfit.run.numeric_checks = False")
            # assert_op.append(assert_scaling_op)
        else:
            assert_op = []
        with tf.control_dependencies(assert_op):
            take_or_not = probabilities > random_thresholds
        take_or_not = take_or_not[0] if len(
            take_or_not.shape) == 2 else take_or_not
        filtered_sample = tf.boolean_mask(rnd_sample, mask=take_or_not, axis=0)

        n_accepted = tf.shape(filtered_sample)[0]
        n_produced_new = n_produced + n_accepted
        if not dynamic_array_shape:
            indices = tf.boolean_mask(draw_indices, mask=take_or_not)
            current_sampled = tf.sparse_tensor_to_dense(tf.SparseTensor(
                indices=indices,
                values=tf.broadcast_to(input=(True, ), shape=(n_accepted, )),
                dense_shape=(tf.cast(n, dtype=tf.int64), )),
                                                        default_value=False)
            is_sampled = tf.logical_or(is_sampled, current_sampled)
            indices = indices[:, 0]
        else:
            indices = tf.range(n_produced, n_produced_new)

        sample_new = sample.scatter(indices=tf.cast(indices, dtype=tf.int32),
                                    value=filtered_sample)

        # efficiency (estimate) of how many samples we get
        eff = tf.reduce_max([ztf.to_real(n_produced_new),
                             ztf.to_real(1.)]) / tf.reduce_max(
                                 [ztf.to_real(n_total_drawn),
                                  ztf.to_real(1.)])
        return n, sample_new, n_produced_new, n_total_drawn, eff, is_sampled, weights_scaling

    efficiency_estimation = ztf.to_real(efficiency_estimation)
    weights_scaling = ztf.constant(0.)
    loop_vars = (n, sample, inital_n_produced, initial_n_drawn,
                 efficiency_estimation, initial_is_sampled, weights_scaling)

    sample_array = tf.while_loop(
        cond=not_enough_produced,
        body=sample_body,  # paraopt
        loop_vars=loop_vars,
        swap_memory=True,
        parallel_iterations=1,
        back_prop=False)[1]  # backprop not needed here
    new_sample = sample_array.stack()
    if multiple_limits:
        new_sample = tf.random.shuffle(
            new_sample)  # to make sure, randomly remove and not biased.
    if dynamic_array_shape:  # if not dynamic we produced exact n -> no need to cut
        new_sample = new_sample[:n, :]  # cut away the surplus events (too many were produced)

    # if no failure, uncomment both for improvement of shape inference, but what if n is tensor?
    # with suppress(AttributeError):  # if n_samples_int is not a numpy object
    #     new_sample.set_shape((n_samples_int, n_dims))
    return new_sample
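
To make the sample_and_weights factory API from the docstring concrete, here is a hedged sketch of a custom sampler for a single rectangular limit. It mirrors the uniform proposal of Example #1 and assumes the same TF1-era helpers (tf.random_uniform, ztf.constant, ztypes.float) used throughout this page; the class and all names are illustrative and not part of zfit.

class MyUniformSampleAndWeights:  # illustrative custom proposal, not from the original source
    def __call__(self, n_to_produce, limits, dtype):
        # single limit assumed: unpack the (lower,), (upper,) tuples as done in the snippets above
        (lower,), (upper,) = limits.limits
        lower = ztf.convert_to_tensor(lower, dtype=dtype)
        upper = ztf.convert_to_tensor(upper, dtype=dtype)

        # draw n_obs coordinates plus one extra column used as the acceptance threshold
        sample_drawn = tf.random_uniform(shape=(n_to_produce, limits.n_obs + 1),
                                         dtype=ztypes.float)
        rnd_sample = sample_drawn[:, :-1] * (upper - lower) + lower  # scale into the limits
        thresholds_unscaled = sample_drawn[:, -1]                    # uniform in [0, 1)

        # a uniform proposal has constant density -> constant weights;
        # returning weights_max=None lets accept_reject_sample estimate the scaling itself
        weights = ztf.constant(1., shape=(1,))
        weights_max = None
        n_drawn = n_to_produce
        return rnd_sample, thresholds_unscaled, weights, weights_max, n_drawn

# the factory argument expects a callable that *returns* the sampler, so the class itself can be passed
# (calling it constructs an instance), roughly:
# sample = accept_reject_sample(prob=my_prob_func, n=10000, limits=my_space,
#                               sample_and_weights_factory=MyUniformSampleAndWeights)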
Example #17
    def sample_body(n, sample, n_total_drawn=0, eff=1.0):
        if sample is None:
            n_to_produce = n
        else:
            n_to_produce = n - tf.shape(sample, out_type=tf.int64)[0]
        do_print = settings.get_verbosity() > 5
        if do_print:
            print_op = tf.print("Number of samples to produce:", n_to_produce,
                                " with efficiency ", eff)
        with tf.control_dependencies([print_op] if do_print else []):
            n_to_produce = tf.to_int64(ztf.to_real(n_to_produce) / eff *
                                       1.01) + 100  # just to make sure
        # TODO: adjustable efficiency cap for memory efficiency (prevent too many samples at once produced)
        n_to_produce = tf.minimum(
            n_to_produce, tf.to_int64(5e5))  # introduce a cap to force serial

        rnd_sample, thresholds_unscaled, weights, weights_max, n_drawn = sample_and_weights(
            n_to_produce=n_to_produce, limits=limits, dtype=dtype)

        # if n_produced is None:
        #     raise ShapeIncompatibleError("`sample_and_weights` has to return thresholds with a defined shape."
        #                                  "Use `Tensor.set_shape()` if the automatic propagation of the shape "
        #                                  "is not available.")
        n_total_drawn += n_drawn
        n_total_drawn = tf.to_int64(n_total_drawn)

        probabilities = prob(rnd_sample)
        if prob_max is None:  # TODO(performance): estimate prob_max, after enough estimations -> fix it?
            # TODO(Mayou36): This control dependency is needed because otherwise the max won't be determined
            # correctly. A bug report will be filed (WIP).
            # The behavior is very odd: if we do not force a kind of copy, the `reduce_max` returns
            # a value smaller by a factor of 1e-14
            # with tf.control_dependencies([probabilities]):
            # UPDATE: this works now? Was it just a one-time bug?
            prob_max_inferred = tf.reduce_max(probabilities)
        else:
            prob_max_inferred = prob_max

        if weights_max is None:
            weights_max = tf.reduce_max(
                weights
            ) * 0.99  # safety margin, also taking numericals into account

        weights_scaled = prob_max_inferred / weights_max * weights
        random_thresholds = thresholds_unscaled * weights_scaled
        if run.numeric_checks:
            assert_op = [
                tf.assert_greater_equal(
                    x=weights_scaled,
                    y=probabilities,
                    message="Not all weights are >= probs so the sampling "
                    "will be biased. If a custom `sample_and_weights` "
                    "was used, make sure that either the shape of the "
                    "custom sampler (resp. it's weights) overlap better "
                    "or decrease the `max_weight`")
            ]
        else:
            assert_op = []
        with tf.control_dependencies(assert_op):
            take_or_not = probabilities > random_thresholds
        # rnd_sample = tf.expand_dims(rnd_sample, dim=0) if len(rnd_sample.shape) == 1 else rnd_sample
        take_or_not = take_or_not[0] if len(
            take_or_not.shape) == 2 else take_or_not
        filtered_sample = tf.boolean_mask(rnd_sample, mask=take_or_not, axis=0)

        if sample is None:
            sample = filtered_sample
        else:
            sample = tf.concat([sample, filtered_sample], axis=0)

        # efficiency (estimate) of how many samples we get
        eff = ztf.to_real(tf.shape(
            sample, out_type=tf.int64)[0]) / ztf.to_real(n_total_drawn)  # index 0: number of accepted events
        return n, sample, n_total_drawn, eff
Example #18
    def real(self):
        real = self._real
        if real is None:
            real = ztf.to_real(self)
        return real