# Imports reconstructed for this excerpt: the original file header was not shown.
# check_arrays, check_sample_weight and computeSignalKnnIndices are assumed to come
# from this package's commonutils module, ut from its metrics utilities, and
# BinomialDevianceLossFunction (used in fit below) from its losses module.
import numpy
import pandas
from sklearn.utils import column_or_1d

from .commonutils import check_arrays, check_sample_weight, computeSignalKnnIndices
from .losses import BinomialDevianceLossFunction
from . import metrics_utils as ut


def cvm_flatness(y, proba, X, uniform_variables, sample_weight=None, label=1, knn=30):
    """
    The simplest way to compute Cramer-von Mises flatness; however, it is very slow
    if you need to compute it many times.

    :param y: real classes of events, shape = [n_samples]
    :param proba: predicted probabilities, shape = [n_samples, n_classes]
    :param X: pandas.DataFrame with uniform features (i.e. test dataset)
    :param uniform_variables: features along which uniformity is desired, list of strings
    :param sample_weight: weights of events, shape = [n_samples]
    :param label: class for which uniformity is measured (usually 0 is background, 1 is signal)
    :param knn: number of nearest neighbours used in knn

    Example of usage:
    proba = classifier.predict_proba(testX)
    cvm_flatness(testY, proba=proba, X=testX, uniform_variables=['mass'])
    """
    y, proba = check_arrays(y, proba)
    assert len(y) == len(proba) == len(X), 'Different lengths'
    y = column_or_1d(y)
    sample_weight = check_sample_weight(y, sample_weight=sample_weight)

    X = pandas.DataFrame(X)

    signal_mask = y == label
    groups_indices = computeSignalKnnIndices(uniform_variables=uniform_variables, dataframe=X,
                                             is_signal=signal_mask, n_neighbors=knn)
    groups_indices = groups_indices[signal_mask, :]

    return ut.group_based_cvm(proba[:, label], mask=signal_mask,
                              groups_indices=groups_indices, sample_weight=sample_weight)
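# A conceptual, unweighted sketch of the statistic ut.group_based_cvm computes, for
# readers unfamiliar with it: each knn group's empirical CDF of the predictions is
# compared with the global empirical CDF, Cramer-von Mises style. This is an
# illustration only, not the library's weighted, optimized implementation.
def _cvm_flatness_sketch(predictions, groups_indices):
    predictions = numpy.asarray(predictions, dtype=float)
    sorted_global = numpy.sort(predictions)
    # global empirical CDF evaluated at every prediction
    global_cdf = numpy.searchsorted(sorted_global, predictions, side='right') / float(len(predictions))
    deviations = []
    for group in groups_indices:
        sorted_local = numpy.sort(predictions[group])
        # the group's empirical CDF evaluated at the same points
        local_cdf = numpy.searchsorted(sorted_local, predictions, side='right') / float(len(group))
        # integrate the squared CDF difference against the global distribution
        deviations.append(numpy.mean((local_cdf - global_cdf) ** 2))
    return numpy.mean(deviations)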
def sde(y, proba, X, uniform_variables, sample_weight=None, label=1, knn=30):
    """
    The simplest way to compute SDE; however, it is very slow if you need
    to recompute SDE many times.

    :param y: real classes of events, shape = [n_samples]
    :param proba: predicted probabilities, shape = [n_samples, n_classes]
    :param X: pandas.DataFrame with uniform features
    :param uniform_variables: features along which uniformity is desired, list of strings
    :param sample_weight: weights of events, shape = [n_samples]
    :param label: class for which uniformity is measured (usually 0 is background, 1 is signal)
    :param knn: number of nearest neighbours used in knn

    Example of usage:
    proba = classifier.predict_proba(testX)
    sde(testY, proba=proba, X=testX, uniform_variables=['mass'])
    """
    y, proba = check_arrays(y, proba)
    assert len(y) == len(proba) == len(X), 'Different lengths'
    y = column_or_1d(y)
    sample_weight = check_sample_weight(y, sample_weight=sample_weight)

    X = pandas.DataFrame(X)

    mask = y == label
    groups = computeSignalKnnIndices(uniform_variables=uniform_variables, dataframe=X,
                                     is_signal=mask, n_neighbors=knn)
    groups = groups[mask, :]

    return ut.compute_sde_on_groups(proba[:, label], mask=mask, groups_indices=groups,
                                    target_efficiencies=[0.5, 0.6, 0.7, 0.8, 0.9],
                                    sample_weight=sample_weight)
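# A minimal end-to-end usage sketch for both metrics above, on a toy dataset;
# the classifier choice, sample sizes and column names here are illustrative
# assumptions, not prescribed by the library.
def _example_metrics_usage():
    from sklearn.ensemble import GradientBoostingClassifier

    rng = numpy.random.RandomState(42)
    n_samples = 2000
    testX = pandas.DataFrame({
        'mass': rng.uniform(0., 1., size=n_samples),   # variable to stay uniform in
        'f1': rng.normal(size=n_samples),              # discriminating features
        'f2': rng.normal(size=n_samples),
    })
    testY = (testX['f1'].values + rng.normal(size=n_samples) > 0).astype(int)

    clf = GradientBoostingClassifier(n_estimators=50)
    clf.fit(testX, testY)
    proba = clf.predict_proba(testX)

    # lower values of both metrics mean signal predictions are flatter along 'mass'
    print('CvM flatness:', cvm_flatness(testY, proba=proba, X=testX, uniform_variables=['mass']))
    print('SDE:', sde(testY, proba=proba, X=testX, uniform_variables=['mass']))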
def fit(self, X, y, sample_weight=None, iterations=100, loss=None):
    X, y = check_arrays(X, y)
    self.n_features = X.shape[1]
    sample_weight = check_sample_weight(y, sample_weight=sample_weight)
    if loss is None:
        loss = BinomialDevianceLossFunction()
    loss.fit(X, y, sample_weight=sample_weight)
    self.coeffs = numpy.zeros([self.compute_n_features(), 2 ** self.power_categories], dtype='float')
    y_pred = numpy.zeros(len(X), dtype='float')

    for iteration in range(iterations):
        # training-loss trace
        print(iteration, loss(y_pred))
        for feature, feature_values in enumerate(self.enumerate_features(X)):
            # TODO compute once per iteration!
            ngradient = loss.negative_gradient(y_pred)
            # per-category sum of the negative gradient, shrunk by the L1/L2 penalties
            numerator = numpy.bincount(feature_values, weights=ngradient,
                                       minlength=2 ** self.power_categories)
            numerator -= (2 * self.l2_reg * self.coeffs[feature, :]
                          + self.l1_reg * numpy.sign(self.coeffs[feature, :]))
            # diagonal hessian approximation for the binomial deviance
            denominator = numpy.abs(ngradient) * (1. - numpy.abs(ngradient))
            denominator = numpy.bincount(feature_values, weights=denominator,
                                         minlength=2 ** self.power_categories)
            denominator += 2 * self.l2_reg
            gradients = numerator / denominator
            right_gradients = gradients.copy()
            # coefficients that are already zero stay zero while the gradient
            # sits inside the L1 dead zone
            mask = (self.coeffs[feature, :] == 0) & (numpy.abs(gradients) < self.l1_reg)
            right_gradients[mask] = 0
            # nonzero coefficients are clipped at zero rather than crossing it;
            # copy the row, since it is overwritten below and basic indexing
            # returns a view (without the copy, y_diff would always be zero)
            old_coeffs = self.coeffs[feature, :].copy()
            new_coeffs = old_coeffs + self.learning_rate * right_gradients
            new_coeffs[new_coeffs * old_coeffs < 0] = 0
            self.coeffs[feature, :] = new_coeffs
            # update predictions incrementally instead of recomputing from scratch
            y_diff = numpy.take(new_coeffs - old_coeffs, feature_values)
            y_pred += y_diff
    return self
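# The heart of the update in fit() is aggregating the negative gradient per
# category with numpy.bincount; the standalone demonstration below uses made-up
# values to show what one such aggregation produces.
def _bincount_update_demo():
    feature_values = numpy.array([0, 1, 1, 2, 0])          # category of each sample
    ngradient = numpy.array([0.5, -0.2, 0.1, 0.4, -0.3])   # per-sample negative gradient
    per_category = numpy.bincount(feature_values, weights=ngradient, minlength=4)
    # per_category == [0.2, -0.1, 0.4, 0.0]; fit() then shrinks these sums with the
    # L1/L2 penalties, divides by a per-category hessian estimate, and takes one
    # learning_rate-sized step per category coefficient.
    print(per_category)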