def build_histogram(self):
    """Histogram the density-estimate events and derive a binned PDF from them.

    A ``Histdd`` is created over the bins in ``self.config['analysis_space']``
    and filled with the events provided by
    ``self.get_events_for_density_estimate``. The following attributes are set:

    - ``self.fraction_in_range``: ``mh.n`` divided by the summed
      ``n_simulated`` reported by the event source.
    - ``self._pdf_histogram``: the counts divided by ``mh.n`` and by the
      bin volumes, i.e. a density estimate.
    - ``self._bin_volumes``: array of bin volumes, one entry per bin.
    - ``self._n_events_histogram``: the raw count histogram.

    :returns: the raw count histogram (same object as
        ``self._n_events_histogram``).
    """
    # Get the events to estimate the PDF
    dimnames, bins = zip(*self.config['analysis_space'])
    mh = Histdd(bins=bins, axis_names=dimnames)

    # Get a generator function which will give us the events.
    # A plain (non-generator) method is wrapped so that both cases can be
    # consumed as an iterable of (events, n_simulated) pairs.
    get = self.get_events_for_density_estimate
    if not inspect.isgeneratorfunction(get):
        def get():
            return [self.get_events_for_density_estimate()]

    n_events = 0
    for events, n_simulated in get():
        n_events += n_simulated
        mh.add(*utils._events_to_analysis_dimensions(events, self.config['analysis_space']))

    # NOTE(review): there is no guard for n_events == 0 or mh.n == 0; both
    # divisions below would then fail or produce non-finite values.
    self.fraction_in_range = mh.n / n_events

    # Convert the histogram to a density estimate
    # This means we have to divide by
    #  - the number of events IN RANGE received
    #    (fraction_in_range keeps track of how many events were not in range)
    #  - the bin sizes
    self._pdf_histogram = mh.similar_blank_hist()
    # Builtin float instead of np.float: the alias was removed in NumPy 1.24
    # and always meant the builtin, so the resulting dtype is unchanged.
    self._pdf_histogram.histogram = mh.histogram.astype(float) / mh.n

    # For the bin widths we need to take an outer product of several vectors, for which numpy has no builtin
    # This reduce trick does the job instead, see http://stackoverflow.com/questions/17138393
    self._bin_volumes = reduce(np.multiply, np.ix_(*[np.diff(bs) for bs in bins]))
    self._pdf_histogram.histogram /= self._bin_volumes

    self._n_events_histogram = mh

    return mh
def build_histogram(self):
    """Fill the event histogram and convert it into a density estimate.

    Events come from ``self.get_events_for_density_estimate``, which may be a
    generator function yielding ``(events, n_simulated)`` pairs or a regular
    method returning a single such pair. They are histogrammed in the bins
    given by ``self.config['analysis_space']``.

    Side effects: sets ``self.fraction_in_range``, ``self._pdf_histogram``,
    ``self._bin_volumes`` and ``self._n_events_histogram``.

    :returns: the histogram of raw event counts.
    """
    # Get the events to estimate the PDF
    dimnames, bins = zip(*self.config['analysis_space'])
    mh = Histdd(bins=bins, axis_names=dimnames)

    # Get a generator function which will give us the events
    get = self.get_events_for_density_estimate
    if not inspect.isgeneratorfunction(get):
        # Wrap the single (events, n_simulated) result in a list so the loop
        # below handles both kinds of event source.
        def get():
            return [self.get_events_for_density_estimate()]

    n_events = 0
    for events, n_simulated in get():
        n_events += n_simulated
        mh.add(*utils._events_to_analysis_dimensions(
            events, self.config['analysis_space']))

    # NOTE(review): no protection against n_events == 0 or mh.n == 0 here.
    self.fraction_in_range = mh.n / n_events

    # Convert the histogram to a density estimate
    # This means we have to divide by
    #  - the number of events IN RANGE received
    #    (fraction_in_range keeps track of how many events were not in range)
    #  - the bin sizes
    self._pdf_histogram = mh.similar_blank_hist()
    # np.float no longer exists in NumPy >= 1.24; the builtin float is the
    # exact type the alias referred to.
    self._pdf_histogram.histogram = mh.histogram.astype(float) / mh.n

    # For the bin widths we need to take an outer product of several vectors, for which numpy has no builtin
    # This reduce trick does the job instead, see http://stackoverflow.com/questions/17138393
    self._bin_volumes = reduce(np.multiply,
                               np.ix_(*[np.diff(bs) for bs in bins]))
    self._pdf_histogram.histogram /= self._bin_volumes

    self._n_events_histogram = mh

    return mh
class BinnedLogLikelihood(LogLikelihoodBase):
    """Binned Poisson log likelihood.

    The data are histogrammed in the bins of the base model's
    ``analysis_space`` and compared, bin by bin, with the summed expected
    counts of all sources (expected events times per-bin PMF).

    If ``self.config`` contains
    ``model_statistical_uncertainty_handling == 'bb_single'``,
    ``adjust_expectations`` replaces the PMF and expected events of the source
    named by ``bb_single_source`` using the roots returned by
    ``beeston_barlow_roots``.
    """

    def __init__(self, pdf_base_config, likelihood_config=None, **kwargs):
        # Request piecewise interpolation BEFORE the base initializer receives
        # the config, so that anything it builds from the config sees the
        # setting. NOTE: this mutates the caller's dict, as it always did.
        pdf_base_config['pdf_interpolation_method'] = 'piecewise'
        LogLikelihoodBase.__init__(self, pdf_base_config, likelihood_config, **kwargs)
        self.model_statistical_uncertainty_handling = self.config.get(
            'model_statistical_uncertainty_handling')

    @inherit_docstring_from(LogLikelihoodBase)
    def prepare(self, *args):
        # Delegate to the real base class, like every other override here.
        LogLikelihoodBase.prepare(self, *args)

        self.ps, self.n_model_events = self.base_model.pmf_grids()

        if len(self.shape_parameters):
            # Interpolate the per-bin PMFs between the anchor models
            self.ps_interpolator = self.morpher.make_interpolator(
                f=lambda m: m.pmf_grids()[0],
                extra_dims=list(self.ps.shape),
                anchor_models=self.anchor_models)

            if self.model_statistical_uncertainty_handling is not None:
                # The per-bin model event counts are only needed when model
                # statistical uncertainties are handled.
                self.n_model_events_interpolator = self.morpher.make_interpolator(
                    f=lambda m: m.pmf_grids()[1],
                    extra_dims=list(self.ps.shape),
                    anchor_models=self.anchor_models)

    @inherit_docstring_from(LogLikelihoodBase)
    def set_data(self, d):
        LogLikelihoodBase.set_data(self, d)
        # Bin the data in the analysis space
        dimnames, bins = zip(*self.base_model.config['analysis_space'])
        self.data_events_per_bin = Histdd(bins=bins, axis_names=dimnames)
        self.data_events_per_bin.add(*self.base_model.to_analysis_dimensions(d))

    @inherit_docstring_from(LogLikelihoodBase)
    def _compute_single_pdf(self, **kwargs):
        model = self._compute_single_model(**kwargs)
        mus = model.expected_events()
        ps, n_model_events = model.pmf_grids()
        return mus, ps, n_model_events

    @_needs_data
    @inherit_docstring_from(LogLikelihoodBase)
    def adjust_expectations(self, mus, pmfs, n_model_events):
        # Without 'bb_single' handling the inputs are returned untouched.
        # With it, mus[source_i] and pmfs[source_i] are overwritten IN PLACE.
        if self.model_statistical_uncertainty_handling == 'bb_single':

            source_i = self.config.get('bb_single_source')
            if source_i is None:
                raise ValueError("You need to specify bb_single_source to use bb_single_source expectation adjustment")
            source_i = self.base_model.get_source_i(source_i)

            assert pmfs.shape == n_model_events.shape

            # Get the number of events expected for the sources we will NOT adjust
            counts_per_bin = pmfs.copy()
            for i, (mu, _x) in enumerate(zip(mus, counts_per_bin)):
                # _x is a view into counts_per_bin, so these scale it in place
                if i != source_i:
                    _x *= mu
                else:
                    _x *= 0.
            u_bins = np.sum(counts_per_bin, axis=0)

            p_calibration = mus[source_i] / n_model_events[source_i].sum()

            a_bins = n_model_events[source_i]

            A_bins_1, A_bins_2 = beeston_barlow_roots(a_bins, p_calibration, u_bins,
                                                      self.data_events_per_bin.histogram)
            assert np.all(A_bins_1 <= 0)  # it seems(?) the 1st root is always negative

            # For U=0, the solution above is singular; we need to use a special case instead
            A_bins_special = (self.data_events_per_bin.histogram + a_bins) / (1. + p_calibration)
            # np.choose index: False (0) -> A_bins_2, True (1) -> A_bins_special
            A_bins = np.choose(u_bins == 0, [A_bins_2, A_bins_special])

            assert np.all(0 <= A_bins)
            pmfs[source_i] = A_bins / A_bins.sum()
            mus[source_i] = A_bins.sum() * p_calibration

        return mus, pmfs

    def _compute_likelihood(self, mus, pmfs):
        """Return binned Poisson log likelihood

        :param mus: numpy array with expected rates for each source
        :param pmfs: array (sources, *analysis_space) of PMFs for each source in each bin
        :returns: sum over all bins of
            ``observed * log(expected) - expected - gammaln(observed + 1)``.
        """
        expected_counts = pmfs.copy()
        for mu, _p_bin_source in zip(mus, expected_counts):
            _p_bin_source *= mu  # Works because of numpy view magic...
        expected_total = np.sum(expected_counts, axis=0)

        observed_counts = self.data_events_per_bin.histogram

        # NOTE(review): a bin with expected_total == 0 yields log(0) = -inf,
        # and nan if the observed count there is 0 as well.
        ret = observed_counts * np.log(expected_total) - expected_total - gammaln(observed_counts + 1.).real
        return np.sum(ret)
class TestHistdd(TestCase):
    """Tests for filling, projecting and cumulating a two-dimensional Histdd."""

    def setUp(self):
        # Every test starts from a fresh, empty histogram with named axes.
        self.m = Histdd(range=test_range_2d,
                        bins=test_bins_2d,
                        axis_names=['foo', 'bar'])

    def test_is_instance(self):
        self.assertIsInstance(self.m, Histdd)

    def test_add_data(self):
        hist = self.m
        x = [0.1, 0.8, -0.4]
        y = [0, 0, 0]

        def reference(xs, ys):
            # What numpy itself produces for the same range and binning.
            counts = np.histogram2d(xs, ys, range=test_range_2d, bins=test_bins_2d)[0]
            return counts.tolist()

        hist.add(x, y)
        self.assertEqual(hist.histogram.tolist(), reference(x, y))

        # Adding the same data again must accumulate, not overwrite.
        hist.add(x, y)
        self.assertEqual(hist.histogram.tolist(), reference(x * 2, y * 2))

        # These points are expected to leave the contents unchanged
        # (presumably they lie outside test_range_2d).
        hist.add([999, 999], [111, 111])
        self.assertEqual(hist.histogram.tolist(), reference(x * 2, y * 2))

    def test_pandas(self):
        import pandas as pd
        hist = self.m
        frame = pd.DataFrame([{'foo': 0, 'bar': 0}, {'foo': 0, 'bar': 5}])
        hist.add(frame)
        expected = np.histogram2d([0, 0], [0, 5],
                                  range=test_range_2d,
                                  bins=test_bins_2d)[0].tolist()
        self.assertEqual(hist.histogram.tolist(), expected)

    def test_projection(self):
        hist = self.m
        x = [0.1, 0.8, -0.4]
        y = [0, 0, 0]
        hist.add(x, y)

        # NOTE(review): the edge checks below sum signed differences, so
        # deviations of opposite sign cancel; this is a weak assertion.
        first = hist.projection(0)
        self.assertEqual(first.histogram.tolist(), [1, 1, 1])
        first_offset = np.sum(first.bin_edges - np.array([-1, -1 / 3, 1 / 3, 1]))
        self.assertAlmostEqual(first_offset, 0)

        second = hist.projection(1)
        self.assertEqual(second.histogram.tolist(), [0, 3, 0])
        second_offset = np.sum(second.bin_edges - np.array([-1, -1 / 3, 1 / 3, 1]))
        self.assertAlmostEqual(second_offset, 0)

        # Projecting by axis name must match projecting by axis index.
        by_name = hist.projection('bar')
        self.assertEqual(second.histogram.tolist(), by_name.histogram.tolist())
        self.assertEqual(second.bin_edges.tolist(), by_name.bin_edges.tolist())

    def test_cumulate(self):
        hist = self.m

        def fill_diagonal():
            hist.add([-1, 0, 1], [-10, 0, 10])

        fill_diagonal()
        np.testing.assert_equal(
            hist.histogram,
            np.array([[1, 0, 0],
                      [0, 1, 0],
                      [0, 0, 1]]))
        np.testing.assert_equal(
            hist.cumulate(0).histogram,
            np.array([[1, 0, 0],
                      [1, 1, 0],
                      [1, 1, 1]]))
        np.testing.assert_equal(
            hist.cumulate(1).histogram,
            np.array([[1, 1, 1],
                      [0, 1, 1],
                      [0, 0, 1]]))
        np.testing.assert_equal(
            hist.cumulate(1).histogram,
            hist.cumulative_density(1).histogram)

        # After a second fill, cumulate is expected to be twice the
        # cumulative density.
        fill_diagonal()
        np.testing.assert_equal(
            hist.cumulate(1).histogram,
            2 * hist.cumulative_density(1).histogram)
class TestHistdd(TestCase):
    """Exercise the basic Histdd operations on a small 2-d histogram."""

    def setUp(self):
        # Shared fixture: an empty histogram with axes named 'foo' and 'bar'.
        self.m = Histdd(range=test_range_2d, bins=test_bins_2d, axis_names=['foo', 'bar'])

    def test_is_instance(self):
        self.assertIsInstance(self.m, Histdd)

    def test_add_data(self):
        x = [0.1, 0.8, -0.4]
        y = [0, 0, 0]

        # First fill: contents must match numpy's own 2-d histogram.
        self.m.add(x, y)
        once = np.histogram2d(x, y, range=test_range_2d, bins=test_bins_2d)[0]
        self.assertEqual(self.m.histogram.tolist(), once.tolist())

        # Second fill with identical data: counts accumulate.
        self.m.add(x, y)
        twice = np.histogram2d(x * 2, y * 2, range=test_range_2d, bins=test_bins_2d)[0]
        self.assertEqual(self.m.histogram.tolist(), twice.tolist())

        # Third fill: these points are expected not to change the contents
        # (presumably they fall outside test_range_2d).
        self.m.add([999, 999], [111, 111])
        still_twice = np.histogram2d(x * 2, y * 2, range=test_range_2d, bins=test_bins_2d)[0]
        self.assertEqual(self.m.histogram.tolist(), still_twice.tolist())

    def test_pandas(self):
        import pandas as pd
        rows = [{'foo': 0, 'bar': 0}, {'foo': 0, 'bar': 5}]
        self.m.add(pd.DataFrame(rows))
        reference = np.histogram2d([0, 0], [0, 5], range=test_range_2d, bins=test_bins_2d)[0]
        self.assertEqual(self.m.histogram.tolist(), reference.tolist())

    def test_projection(self):
        self.m.add([0.1, 0.8, -0.4], [0, 0, 0])

        # NOTE(review): summing the signed edge differences allows errors of
        # opposite sign to cancel, so the edge checks here are weak.
        proj_0 = self.m.projection(0)
        self.assertEqual(proj_0.histogram.tolist(), [1, 1, 1])
        self.assertAlmostEqual(np.sum(proj_0.bin_edges - np.array([-1, -1/3, 1/3, 1])), 0)

        proj_1 = self.m.projection(1)
        self.assertEqual(proj_1.histogram.tolist(), [0, 3, 0])
        self.assertAlmostEqual(np.sum(proj_1.bin_edges - np.array([-1, -1/3, 1/3, 1])), 0)

        # An axis can be selected by name as well as by index.
        proj_bar = self.m.projection('bar')
        self.assertEqual(proj_1.histogram.tolist(), proj_bar.histogram.tolist())
        self.assertEqual(proj_1.bin_edges.tolist(), proj_bar.bin_edges.tolist())

    def test_cumulate(self):
        check = np.testing.assert_equal

        self.m.add([-1, 0, 1], [-10, 0, 10])
        check(self.m.histogram, np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]))
        check(self.m.cumulate(0).histogram, np.array([[1, 0, 0], [1, 1, 0], [1, 1, 1]]))
        check(self.m.cumulate(1).histogram, np.array([[1, 1, 1], [0, 1, 1], [0, 0, 1]]))
        check(self.m.cumulate(1).histogram, self.m.cumulative_density(1).histogram)

        # With the data added twice, cumulate is expected to be twice the
        # cumulative density.
        self.m.add([-1, 0, 1], [-10, 0, 10])
        check(self.m.cumulate(1).histogram, 2 * self.m.cumulative_density(1).histogram)