def _sample(self, n, limits: ZfitSpace):
    pdf = self.pdfs[0]
    # TODO: use real limits, currently not supported in binned sample
    sample = pdf.sample(n=n)
    edges = sample.space.binning.edges
    ndim = len(edges)
    edges = [znp.array(edge) for edge in edges]
    edges_flat = [znp.reshape(edge, [-1]) for edge in edges]
    lowers = [edge[:-1] for edge in edges_flat]
    uppers = [edge[1:] for edge in edges_flat]
    lowers_meshed = znp.meshgrid(*lowers, indexing="ij")
    uppers_meshed = znp.meshgrid(*uppers, indexing="ij")
    lowers_meshed_flat = [znp.reshape(lower_mesh, [-1]) for lower_mesh in lowers_meshed]
    uppers_meshed_flat = [znp.reshape(upper_mesh, [-1]) for upper_mesh in uppers_meshed]
    lower_flat = znp.stack(lowers_meshed_flat, axis=-1)
    upper_flat = znp.stack(uppers_meshed_flat, axis=-1)
    counts_flat = znp.reshape(sample.values(), (-1,))
    counts_flat = tf.cast(counts_flat, znp.int32)  # TODO: what if we have fractions?
    lower_flat_repeated = tf.repeat(lower_flat, counts_flat, axis=0)
    upper_flat_repeated = tf.repeat(upper_flat, counts_flat, axis=0)
    sample_unbinned = tf.random.uniform(
        (znp.sum(counts_flat), ndim),
        minval=lower_flat_repeated,
        maxval=upper_flat_repeated,
        dtype=self.dtype,
    )
    return sample_unbinned
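# A minimal NumPy sketch (not the zfit API) of the idea used in `_sample` above:
# every bin contributes `count` events drawn uniformly between its lower and upper
# edge. The names and numbers below are illustrative only.
import numpy as np

edges = np.array([0.0, 1.0, 2.0, 3.0])  # 1D binning with 3 bins
counts = np.array([2, 0, 3])  # integer counts per bin
lowers = np.repeat(edges[:-1], counts)  # lower edge repeated once per event
uppers = np.repeat(edges[1:], counts)  # upper edge repeated once per event
events = np.random.uniform(low=lowers, high=uppers)  # one uniform draw per event
assert events.shape == (counts.sum(),)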
def _loss_func(self, model, data, fit_range, constraints, log_offset):
    nll = super()._loss_func(
        model=model,
        data=data,
        fit_range=fit_range,
        constraints=constraints,
        log_offset=log_offset,
    )
    yields = []
    nevents_collected = []
    for mod, dat in zip(model, data):
        if not mod.is_extended:
            raise NotExtendedPDFError(
                f"The pdf {mod} is not extended but has to be (for an extended fit)"
            )
        nevents = dat.n_events if dat.weights is None else z.reduce_sum(dat.weights)
        nevents = tf.cast(nevents, tf.float64)
        nevents_collected.append(nevents)
        yields.append(mod.get_yield())
    yields = znp.stack(yields, axis=0)
    nevents_collected = znp.stack(nevents_collected, axis=0)
    term_new = tf.nn.log_poisson_loss(nevents_collected, znp.log(yields))
    if log_offset is not None:
        term_new += log_offset
    nll += znp.sum(term_new, axis=0)
    return nll
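# Hedged check of the Poisson yield term used above: with the default
# `compute_full_loss=False`, `tf.nn.log_poisson_loss(targets=n, log_input=log(lam))`
# evaluates to `lam - n * log(lam)`, i.e. the negative log of the Poisson term up to
# the data-only log(n!) constant. The numbers are illustrative only.
import numpy as np
import tensorflow as tf

n = tf.constant([100.0, 250.0], dtype=tf.float64)  # observed number of events
lam = tf.constant([110.0, 240.0], dtype=tf.float64)  # expected yields
term = tf.nn.log_poisson_loss(n, tf.math.log(lam))
np.testing.assert_allclose(term.numpy(), lam.numpy() - n.numpy() * np.log(lam.numpy()))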
def sumfunc(params):
    values = self.pdfs[0].counts(obs)
    sysshape = list(params.values())
    if sysshape:
        sysshape_flat = tf.stack(sysshape)
        sysshape = znp.reshape(sysshape_flat, values.shape)
        values = values * sysshape
    return znp.sum(values)
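# Illustrative NumPy version of the per-bin modifier logic above: a flat list of
# shape-systematic factors is reshaped onto the bin grid and applied
# multiplicatively before summing. Values are made up.
import numpy as np

counts = np.array([[10.0, 20.0], [30.0, 40.0]])
sysshape_flat = np.array([1.1, 0.9, 1.0, 1.2])  # one factor per bin
modified = counts * sysshape_flat.reshape(counts.shape)
print(modified.sum())  # total yield after applying the modifiers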
def _nll_calc_unbinned_tf(log_probs, weights=None, log_offset=None):
    if weights is not None:
        log_probs *= weights  # because it's prob ** weights
    if log_offset is not None:
        log_probs -= log_offset
    nll = -znp.sum(log_probs, axis=0)
    # nll = -tfp.math.reduce_kahan_sum(input_tensor=log_probs, axis=0)
    return nll
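# Illustrative-only check of the weighting convention above: weighting an event by w
# corresponds to prob ** w, hence w * log(prob) in log space.
import numpy as np

probs = np.array([0.2, 0.5, 0.8])
weights = np.array([1.0, 2.0, 0.5])
nll_weighted = -np.sum(weights * np.log(probs))
assert np.isclose(nll_weighted, -np.log(np.prod(probs ** weights)))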
def __init__(
    self,
    data: ztyping.BinnedDataInputType,
    extended: Optional[ztyping.ExtendedInputType] = None,
    norm: Optional[ztyping.NormInputType] = None,
    name: str = "HistogramPDF",
) -> None:
    """Binned PDF resembling a histogram.

    Simple histogram PDF that can be used to model a histogram as a PDF.

    Args:
        data: Histogram to be used as PDF.
        extended: |@doc:pdf.init.extended| The overall yield of the PDF.
            If this is parameter-like, it will be used as the yield,
            the expected number of events, and the PDF will be extended.
            An extended PDF has additional functionality, such as the
            `ext_*` methods and the `counts` (for binned PDFs). |@docend:pdf.init.extended|
            |@doc:pdf.init.extended.auto| If `True`, the PDF will automatically be
            extended using the total number of events in the histogram.
            This is the default. |@docend:pdf.init.extended.auto|
        norm: |@doc:pdf.init.norm| Normalization of the PDF.
            By default, this is the same as the default space of the PDF. |@docend:pdf.init.norm|
        name: |@doc:model.init.name| Human-readable name
            or label of the PDF for better identification.
            Has no programmatic, functional purpose other than identification. |@docend:model.init.name|
    """
    if extended is None:
        extended = True
    if not isinstance(data, ZfitBinnedData):
        if isinstance(data, PlottableHistogram):
            from zfit._data.binneddatav1 import BinnedData

            data = BinnedData.from_hist(data)
        else:
            raise TypeError(
                "data must be of type PlottableHistogram (UHI) or ZfitBinnedData"
            )
    params = {}
    if extended is True:
        self._automatically_extended = True
        extended = znp.sum(data.values())
    else:
        self._automatically_extended = False
    super().__init__(
        obs=data.space, extended=extended, norm=norm, params=params, name=name
    )
    self._data = data
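# Usage sketch for the constructor above, assuming the zfit binned API used in the
# tests elsewhere in this file; the space, counts, and names are illustrative only.
import numpy as np
import zfit
import zfit.z.numpy as znp

counts = np.array([10.0, 20.0, 30.0])
obs = zfit.Space("x", binning=zfit.binned.RegularBinning(3, 0.0, 3.0, name="x"))
histdata = zfit.data.BinnedData.from_tensor(obs, counts)
pdf = zfit.pdf.HistogramPDF(data=histdata)  # extended=None -> yield is sum(counts)
print(float(znp.sum(pdf.counts(obs))))  # ~60, the total yield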
def test_z_numpy_ndarray_is_tensorflow_tensor():
    """In TensorFlow 2.4.1, tf.experimental.numpy.ndarray was a wrapper around tf.Tensor.

    This concept has since been dropped and tf.experimental.numpy.ndarray is now just
    an alias for tf.Tensor. See the commit history of
    https://github.com/tensorflow/tensorflow/commits/master/tensorflow/python/ops/numpy_ops/np_arrays.py
    """
    assert znp.ndarray is tf.Tensor
    assert isinstance(znp.array(1), tf.Tensor)
    assert isinstance(znp.sum(znp.array(0)), tf.Tensor)
def _unbinned_nll_tf(
    model: ztyping.PDFInputType,
    data: ztyping.DataInputType,
    fit_range: ZfitSpace,
    log_offset=None,
):
    """Return the unbinned negative log likelihood for a PDF.

    Args:
        model: |@doc:loss.init.model| PDFs that return the normalized probability for
            *data* under the given parameters.
            If multiple model and data are given, they will be used
            in the same order to do a simultaneous fit. |@docend:loss.init.model|
        data: |@doc:loss.init.data| Dataset that will be given to the *model*.
            If multiple model and data are given, they will be used
            in the same order to do a simultaneous fit. |@docend:loss.init.data|
        fit_range: Range to fit in; the data is restricted to it and the PDF is
            normalized over it.

    Returns:
        The unbinned nll
    """
    if is_container(model):
        nlls = [
            _unbinned_nll_tf(model=p, data=d, fit_range=r, log_offset=log_offset)
            for p, d, r in zip(model, data, fit_range)
        ]
        # nlls_total = [nll.total for nll in nlls]
        # nlls_correction = [nll.correction for nll in nlls]
        # nlls_total_summed = znp.sum(input_tensor=nlls_total, axis=0)
        nlls_summed = znp.sum(nlls, axis=0)
        # nlls_correction_summed = znp.sum(input_tensor=nlls_correction, axis=0)
        # nll_finished = (nlls_total_summed, nlls_correction_summed)
        nll_finished = nlls_summed
    else:
        if fit_range is not None:
            with data.set_data_range(fit_range):
                probs = model.pdf(data, norm_range=fit_range)
        else:
            probs = model.pdf(data)
        log_probs = znp.log(
            probs + znp.asarray(1e-307, dtype=znp.float64)
        )  # minor offset to avoid NaNs from log(0)
        nll = _nll_calc_unbinned_tf(
            log_probs=log_probs,
            weights=data.weights if data.weights is not None else None,
            log_offset=log_offset,
        )
        nll_finished = nll
    return nll_finished
def test_sum_histogram_pdf():
    bins1 = 5
    bins2 = 7
    counts = znp.random.uniform(high=1, size=(bins1, bins2))  # generate counts
    counts2 = np.random.normal(loc=5, size=(bins1, bins2))
    counts3 = (
        znp.linspace(0, 10, num=bins1)[:, None] * znp.linspace(0, 5, num=bins2)[None, :]
    )
    binnings = [
        zfit.binned.RegularBinning(bins1, 0, 10, name="obs1"),
        zfit.binned.RegularBinning(7, -10, bins2, name="obs2"),
    ]
    binning = binnings
    obs = zfit.Space(obs=["obs1", "obs2"], binning=binning)

    data = BinnedData.from_tensor(
        space=obs, values=counts, variances=znp.ones_like(counts) * 1.3
    )
    data2 = BinnedData.from_tensor(obs, counts2)
    data3 = BinnedData.from_tensor(obs, counts3)

    pdf = zfit.pdf.HistogramPDF(data=data, extended=znp.sum(counts))
    pdf2 = zfit.pdf.HistogramPDF(data=data2, extended=znp.sum(counts2))
    pdf3 = zfit.pdf.HistogramPDF(data=data3, extended=znp.sum(counts3))
    assert len(pdf.ext_pdf(data)) > 0
    pdf_sum = zfit.pdf.BinnedSumPDF(pdfs=[pdf, pdf2, pdf3], obs=obs)

    probs = pdf_sum.counts(data)
    true_sum_counts = counts + counts2 + counts3
    np.testing.assert_allclose(true_sum_counts, probs)
    nsamples = 100_000_000
    sample = pdf_sum.sample(n=nsamples)
    np.testing.assert_allclose(
        true_sum_counts, sample.values() / nsamples * pdf_sum.get_yield(), rtol=0.03
    )

    # integrate
    true_integral = znp.sum(true_sum_counts)
    integral = pdf_sum.ext_integrate(limits=obs)
    assert pytest.approx(float(true_integral)) == float(integral)
def _precompile(self):
    do_subtr = self._options.get("subtr_const", False)
    if do_subtr:
        if do_subtr is not True:
            self._options["subtr_const_value"] = do_subtr
        log_offset = self._options.get("subtr_const_value")
        if log_offset is None:
            from zfit import run

            run.assert_executing_eagerly()  # first time subtr
            nevents_tot = znp.sum([d._approx_nevents for d in self.data])
            log_offset_sum = (
                self._call_value(
                    data=self.data,
                    model=self.model,
                    fit_range=self.fit_range,
                    constraints=self.constraints,
                    log_offset=z.convert_to_tensor(0.0),
                )
                - 1000.0
            )  # presumably we're not at the minimum, so the loss will decrease
            log_offset = tf.stop_gradient(-znp.divide(log_offset_sum, nevents_tot))
            self._options["subtr_const_value"] = log_offset
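# Illustrative NumPy sketch of why the constant subtraction above is safe: shifting
# every per-event log-probability by a constant only shifts the total NLL, it does
# not move the minimum. All names and numbers below are made up.
import numpy as np

def nll(mu, data, log_offset=0.0):
    log_probs = -0.5 * (data - mu) ** 2 - 0.5 * np.log(2 * np.pi)  # unit-width Gaussian
    return -np.sum(log_probs - log_offset)

data = np.array([0.8, 1.1, 1.3])
mus = np.linspace(0.0, 2.0, 201)
plain = [nll(mu, data) for mu in mus]
shifted = [nll(mu, data, log_offset=-5.0) for mu in mus]
assert np.argmin(plain) == np.argmin(shifted)  # same best-fit mu, different offset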
def _approx_nevents(self):
    return znp.sum(self.values())
def nevents(self):
    return znp.sum(self.values())
def _rel_counts(self, x, norm=None):
    values = self._counts_with_modifiers(x, norm)
    return values / znp.sum(values)
def tot_variances(x):
    nonlocal count
    count += 1
    return znp.sum(x.variances)
def test_hypotest(benchmark, n_bins, hypotest, eager):
    """Benchmark the performance of pyhf.utils.hypotest() for various numbers of bins
    and different backends.

    Args:
        benchmark: pytest-benchmark fixture
        n_bins: number of bins, given by pytest parameterization
        hypotest: which implementation to benchmark ("pyhf" or "zfit"), given by
            pytest parameterization
        eager: whether to run eagerly instead of in graph/compiled mode

    Returns:
        None
    """
    source = generate_source_static(n_bins)

    signp = source["bindata"]["sig"]
    bkgnp = source["bindata"]["bkg"]
    uncnp = source["bindata"]["bkgerr"]
    datanp = source["bindata"]["data"]

    if "pyhf" in hypotest:
        hypotest = hypotest_pyhf
        if eager:
            pyhf.set_backend("numpy")
        else:
            pyhf.set_backend("jax")

        pdf = uncorrelated_background(signp, bkgnp, uncnp)
        data = datanp + pdf.config.auxdata
        benchmark(hypotest, pdf, data)
    elif hypotest == "zfit":
        with zfit.run.set_graph_mode(not eager):
            hypotest = hypotest_zfit
            obs = zfit.Space(
                "signal",
                binning=zfit.binned.RegularBinning(
                    n_bins, -0.5, n_bins + 0.5, name="signal"
                ),
            )
            zdata = zfit.data.BinnedData.from_tensor(obs, datanp)
            zmcsig = zfit.data.BinnedData.from_tensor(obs, signp)
            zmcbkg = zfit.data.BinnedData.from_tensor(obs, bkgnp)

            shapesys = {
                f"shapesys_{i}": zfit.Parameter(f"shapesys_{i}", 1, 0.1, 10)
                for i in range(n_bins)
            }
            bkgmodel = BinnedTemplatePDFV1(zmcbkg, sysshape=shapesys)
            # sigyield = zfit.Parameter('sigyield', znp.sum(zmcsig.values()))
            mu = zfit.Parameter("mu", 1, 0.1, 10)
            # sigmodeltmp = BinnedTemplatePDFV1(zmcsig)
            sigyield = zfit.ComposedParameter(
                "sigyield",
                lambda params: params["mu"] * znp.sum(zmcsig.values()),
                params={"mu": mu},
            )
            sigmodel = BinnedTemplatePDFV1(zmcsig, extended=sigyield)
            zmodel = BinnedSumPDF([sigmodel, bkgmodel])
            unc = np.array(uncnp) / np.array(bkgnp)
            constraint = zfit.constraint.GaussianConstraint(
                list(shapesys.values()), np.ones_like(unc).tolist(), unc
            )
            nll = zfit.loss.ExtendedBinnedNLL(zmodel, zdata, constraints=constraint)
            minimizer = zfit.minimize.Minuit(tol=1e-3, gradient=False)

            # warm up: build and compile the value and gradient before benchmarking
            nll.value()
            nll.value()
            nll.gradient()
            nll.gradient()
            benchmark(hypotest, minimizer, nll)

    assert True
def _rel_counts(self, x, norm=None):
    values = self._data.values()
    return values / znp.sum(values)
def create_poly(x, polys, coeffs, recurrence):
    degree = len(coeffs) - 1
    polys = do_recurrence(x, polys=polys, degree=degree, recurrence=recurrence)
    sum_polys = znp.sum([coeff * poly for coeff, poly in zip(coeffs, polys)], axis=0)
    return sum_polys
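# Hedged NumPy sketch of the recurrence idea above, using the Legendre polynomials as
# a concrete case; `legendre_recurrence` is a hypothetical stand-in for the
# `recurrence` callable passed to `create_poly`, and the coefficients are made up.
import numpy as np

def legendre_recurrence(p_prev, p_prev2, n, x):
    # Bonnet's recursion: (n + 1) P_{n+1}(x) = (2n + 1) x P_n(x) - n P_{n-1}(x)
    return ((2 * n + 1) * x * p_prev - n * p_prev2) / (n + 1)

x = np.linspace(-1, 1, 5)
coeffs = [0.5, -0.2, 0.1]  # degree 2
polys = [np.ones_like(x), x]  # P_0 and P_1 as starting values
for n in range(1, len(coeffs) - 1):
    polys.append(legendre_recurrence(polys[n], polys[n - 1], n, x))
poly_sum = np.sum([c * p for c, p in zip(coeffs, polys)], axis=0)
np.testing.assert_allclose(poly_sum, np.polynomial.legendre.legval(x, coeffs))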
def test_simple_examples_1D():
    import zfit.data
    import zfit.z.numpy as znp

    bkgnp = [50.0, 60.0]
    signp = [5.0, 10.0]
    datanp = [60.0, 80.0]
    uncnp = [5.0, 12.0]

    serialized = (
        """{
        "channels": [
            {
                "name": "singlechannel",
                "samples": [
                    {
                        "name": "signal",
        """
        + f'"data": {signp},'
        + """
                        "modifiers": [
                            {"name": "mu", "type": "normfactor", "data": null}
                        ]
                    },
                    {
                        "name": "background",
        """
        + f'"data": {bkgnp},'
        + """
                        "modifiers": [
                            {"name": "uncorr_bkguncrt", "type": "shapesys",
        """
        + f'"data": {uncnp}'
        + """
                            }
                        ]
                    }
                ]
            }
        ],
        "observations": [
            {
        """
        + f'"name": "singlechannel", "data": {datanp}'
        + """
            }
        ],
        "measurements": [
            {"name": "Measurement", "config": {"poi": "mu", "parameters": []}}
        ],
        "version": "1.0.0"
    }"""
    )

    obs = zfit.Space(
        "signal", binning=zfit.binned.RegularBinning(2, 0, 2, name="signal")
    )
    zdata = zfit.data.BinnedData.from_tensor(obs, datanp)
    zmcsig = zfit.data.BinnedData.from_tensor(obs, signp)
    zmcbkg = zfit.data.BinnedData.from_tensor(obs, bkgnp)

    shapesys = {
        f"shapesys_{i}": zfit.Parameter(f"shapesys_{i}", 1, 0.1, 10) for i in range(2)
    }
    bkgmodel = BinnedTemplatePDFV1(zmcbkg, sysshape=shapesys)
    # sigyield = zfit.Parameter('sigyield', znp.sum(zmcsig.values()))
    mu = zfit.Parameter("mu", 1, 0.1, 10)
    # sigmodeltmp = BinnedTemplatePDFV1(zmcsig)
    sigyield = zfit.ComposedParameter(
        "sigyield",
        lambda params: params["mu"] * znp.sum(zmcsig.values()),
        params={"mu": mu},
    )
    sigmodel = BinnedTemplatePDFV1(zmcsig, extended=sigyield)
    zmodel = BinnedSumPDF([sigmodel, bkgmodel])
    unc = np.array(uncnp) / np.array(bkgnp)
    nll = zfit.loss.ExtendedBinnedNLL(
        zmodel,
        zdata,
        constraints=zfit.constraint.GaussianConstraint(
            list(shapesys.values()), [1, 1], unc
        ),
    )
    # print(nll.value())
    # print(nll.gradient())
    # minimizer = zfit.minimize.ScipyLBFGSBV1()
    # minimizer = zfit.minimize.IpyoptV1()
    minimizer = zfit.minimize.Minuit(tol=1e-5, gradient=False)
    result = minimizer.minimize(nll)
    result.hesse(method="hesse_np")
    # result.errors()
    print(result)
    # mu_z = sigmodel.get_yield() / znp.sum(zmcsig.values())
    zbestfit = zfit.run(result.params)
    errors = [p["hesse"]["error"] for p in result.params.values()]
    # print('minval actual:', nll.value(), nll.gradient())
    # errors = np.ones(3) * 0.1
    # print('mu:', mu_z)

    spec = json.loads(serialized)
    workspace = pyhf.Workspace(spec)
    model = workspace.model(poi_name="mu")
    pars = model.config.suggested_init()
    data = workspace.data(model)
    model.logpdf(pars, data)
    bestfit_pars, twice_nll = pyhf.infer.mle.fit(data, model, return_fitted_val=True)
    diff = (bestfit_pars - zbestfit) / errors
    # print(bestfit_pars)
    np.testing.assert_allclose(diff, 0, atol=1e-3)