def test_bin_to_group_indices(size=100, bins=10):
    """Check that bin_to_group_indices covers exactly the masked events.

    Random bin assignments and a random boolean mask are generated for
    ``size`` events; the groups returned for them must together contain
    every masked event index once and nothing else.
    """
    event_bins = RandomState().randint(0, bins, size=size)
    selected = RandomState().randint(0, 2, size=size) > 0.5
    groups = bin_to_group_indices(event_bins, mask=selected)

    # Total number of grouped events equals the number of masked events.
    group_sizes = [len(members) for members in groups]
    assert numpy.sum(group_sizes) == numpy.sum(selected)

    # Flattened and sorted, the groups reproduce the masked positions.
    collected = numpy.sort(numpy.concatenate(groups))
    expected = numpy.where(selected > 0.5)[0]
    assert numpy.all(collected == expected), 'group indices are computed wrongly'
def generate_binned_dataset(n_samples, n_bins):
    """Generate a random dataset with bins, groups and random weights.

    This is used to test correlation functions.

    :param n_samples: number of events to generate
    :param n_bins: number of bins; bin indices are drawn from [0, n_bins)
    :return: tuple ``(y, pred, weights, bins, groups)`` where ``y`` is a
        boolean label array, ``pred`` has shape (n_samples, 2), ``weights``
        are exponentially distributed, ``bins`` are integer bin indices and
        ``groups`` is the result of ``bin_to_group_indices`` for events
        with ``y == 1``.
    """
    rng = RandomState()
    # Draw order is kept as-is: labels, predictions, weights, bin indices.
    labels = rng.uniform(size=n_samples) > 0.5
    predictions = rng.uniform(size=(n_samples, 2))
    sample_weights = rng.exponential(size=(n_samples,))
    bin_ids = rng.randint(0, n_bins, n_samples)

    signal_groups = bin_to_group_indices(bin_indices=bin_ids, mask=(labels == 1))
    return labels, predictions, sample_weights, bin_ids, signal_groups
def test_cvm(size=1000):
    """Check that bin-based and group-based CvM give the same value.

    Both metrics are evaluated on the same random predictions, weights and
    bin assignment, restricted to events with ``y == 1``.
    """
    # NOTE(review): `random` is not defined in this function; presumably a
    # module-level RandomState instance -- confirm against the file header.
    predictions = random.uniform(size=size)
    labels = random.uniform(size=size) > 0.5
    weights = random.exponential(size=size)
    event_bins = random.randint(0, 10, size=size)

    signal = labels == 1
    signal_groups = bin_to_group_indices(bin_indices=event_bins, mask=signal)

    # Bin-based variant receives only the selected events ...
    from_bins = bin_based_cvm(
        predictions[signal],
        sample_weight=weights[signal],
        bin_indices=event_bins[signal],
    )
    # ... while the group-based variant receives everything plus the mask.
    from_groups = group_based_cvm(
        predictions,
        mask=signal,
        sample_weight=weights,
        groups_indices=signal_groups,
    )
    assert numpy.allclose(from_bins, from_groups)
def test_groups_matrix(size=1000, bins=4):
    """Check the groups matrix built from group indices.

    For random bin assignments and a random mask, the matrix must sum to
    the number of masked events, and the entry at (bin of event, event)
    must equal that event's mask value.
    """
    event_bins = RandomState().randint(0, bins, size=size)
    selected = RandomState().randint(0, 2, size=size) > 0.5
    n_selected = numpy.sum(selected)

    groups = bin_to_group_indices(event_bins, mask=selected)
    group_sizes = [len(members) for members in groups]
    assert numpy.sum(group_sizes) == n_selected

    matrix = group_indices_to_groups_matrix(groups, n_events=size)
    assert matrix.sum() == n_selected

    # Walk over events: the cell for (own bin, event) mirrors the mask.
    for event_id, is_signal in enumerate(selected):
        bin_id = event_bins[event_id]
        assert matrix[bin_id, event_id] == is_signal
def _compute_groups_indices(self, X, y, label):
    """Return a list, each element is events' indices in some group.

    Only events with ``y == label`` are grouped. For every feature in
    ``self.uniform_features`` a grid of ``2 * self.n_bins + 1`` percentiles
    is computed on those events. Two interleaved sets of bin limits are
    then taken from that grid (its odd-positioned and its even-positioned
    interior points), and the groups from both binnings are concatenated.

    :param X: pandas.DataFrame containing the columns in
        ``self.uniform_features``
    :param y: array of labels, compared element-wise against ``label``
    :param label: label value selecting the events to be grouped
    :return: list of groups as produced by ``bin_to_group_indices``
    """
    label_mask = y == label
    percentile_grid = numpy.linspace(0, 100, 2 * self.n_bins + 1)
    extended_bin_limits = [
        numpy.percentile(X[var][label_mask], percentile_grid)
        for var in self.uniform_features
    ]

    # `.ix` was removed in pandas 1.0; `.loc` is the label-based equivalent.
    # The feature matrix does not depend on the shift, so extract it once.
    features = X.loc[:, self.uniform_features].values

    groups_indices = []
    for shift in (0, 1):
        # Every second interior limit, starting at position 1 or 2.
        bin_limits = [axis_limits[1 + shift:-1:2]
                      for axis_limits in extended_bin_limits]
        bin_indices = compute_bin_indices(features, bin_limits=bin_limits)
        groups_indices += list(bin_to_group_indices(bin_indices, mask=label_mask))
    return groups_indices