import numpy
from scipy.special import logit
from sklearn.base import clone
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split


def calibrate_probs(labels, weights, probs, logistic=False, random_state=11, threshold=0.,
                    return_calibrator=False, symmetrize=False):
    """
    Calibrate output to probabilities using 2-folding to calibrate all data

    :param probs: probabilities, numpy.array of shape [n_samples]
    :param labels: numpy.array of shape [n_samples] with labels
    :param weights: numpy.array of shape [n_samples]
    :param threshold: float, to set labels 0/1
    :param logistic: bool, use logistic or isotonic regression
    :param symmetrize: bool, do symmetric calibration, e.g. for B+, B-
    :return: calibrated probabilities
    """
    labels = (labels > threshold) * 1
    ind = numpy.arange(len(probs))
    ind_1, ind_2 = train_test_split(ind, random_state=random_state, train_size=0.5)

    calibrator = (LogisticRegression(C=100) if logistic
                  else IsotonicRegression(y_min=0, y_max=1, out_of_bounds='clip'))
    est_calib_1, est_calib_2 = clone(calibrator), clone(calibrator)
    probs_1 = probs[ind_1]
    probs_2 = probs[ind_2]

    if logistic:
        # Fit in logit space; note that weights are not used in this branch.
        probs_1 = numpy.clip(probs_1, 0.001, 0.999)
        probs_2 = numpy.clip(probs_2, 0.001, 0.999)
        probs_1 = logit(probs_1)[:, numpy.newaxis]
        probs_2 = logit(probs_2)[:, numpy.newaxis]
        if symmetrize:
            # The p <-> 1-p symmetry becomes a sign flip in logit space,
            # since logit(1 - p) == -logit(p).
            est_calib_1.fit(numpy.r_[probs_1, -probs_1],
                            numpy.r_[labels[ind_1] > 0, labels[ind_1] <= 0])
            est_calib_2.fit(numpy.r_[probs_2, -probs_2],
                            numpy.r_[labels[ind_2] > 0, labels[ind_2] <= 0])
        else:
            est_calib_1.fit(probs_1, labels[ind_1])
            est_calib_2.fit(probs_2, labels[ind_2])
    else:
        if symmetrize:
            est_calib_1.fit(numpy.r_[probs_1, 1 - probs_1],
                            numpy.r_[labels[ind_1] > 0, labels[ind_1] <= 0],
                            numpy.r_[weights[ind_1], weights[ind_1]])
            est_calib_2.fit(numpy.r_[probs_2, 1 - probs_2],
                            numpy.r_[labels[ind_2] > 0, labels[ind_2] <= 0],
                            numpy.r_[weights[ind_2], weights[ind_2]])
        else:
            est_calib_1.fit(probs_1, labels[ind_1], weights[ind_1])
            est_calib_2.fit(probs_2, labels[ind_2], weights[ind_2])

    # Cross-predict: each half is calibrated by the estimator fitted on the other half
    calibrated_probs = numpy.zeros(len(probs))
    if logistic:
        calibrated_probs[ind_1] = est_calib_2.predict_proba(probs_1)[:, 1]
        calibrated_probs[ind_2] = est_calib_1.predict_proba(probs_2)[:, 1]
    else:
        calibrated_probs[ind_1] = est_calib_2.transform(probs_1)
        calibrated_probs[ind_2] = est_calib_1.transform(probs_2)
    if return_calibrator:
        return calibrated_probs, (est_calib_1, est_calib_2)
    return calibrated_probs
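
# Usage sketch (illustrative only, not part of the original module): calibrate
# raw classifier scores with the default two-fold isotonic regression. The toy
# arrays below are hypothetical stand-ins for real tagger output.
def _demo_calibrate_probs():
    rng = numpy.random.RandomState(42)
    toy_scores = rng.uniform(size=10000)
    # labels correlated with the scores, so the calibrator has structure to learn
    toy_labels = (rng.uniform(size=len(toy_scores)) < toy_scores).astype(int)
    toy_weights = numpy.ones(len(toy_scores))
    calibrated = calibrate_probs(toy_labels, toy_weights, toy_scores)
    assert 0 <= calibrated.min() and calibrated.max() <= 1
    return calibrated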
def _compute_inds(self, length):
    """Randomly split the indices [0, length) into two halves for 2-folding."""
    ind = numpy.arange(length)
    ind_1, ind_2 = train_test_split(ind, random_state=self.random_state, train_size=0.5)
    return ind_1, ind_2
def bootstrap_calibrate_prob(labels, weights, probs, n_calibrations=30,
                             group_column=None, threshold=0., symmetrize=False):
    """
    Bootstrap isotonic calibration:

    * randomly divide data into train-test
    * isotonic regression is fitted on train and applied to test
    * on test, D2 and AUC are computed using the calibrated probs p(B+)

    :param probs: probabilities, numpy.array of shape [n_samples]
    :param labels: numpy.array of shape [n_samples] with labels
    :param weights: numpy.array of shape [n_samples]
    :param threshold: float, to set labels 0/1
    :param symmetrize: bool, do symmetric calibration, e.g. for B+, B-
    :return: D2 array and AUC array
    """
    aucs = []
    D2_array = []
    labels = (labels > threshold) * 1
    for _ in range(n_calibrations):
        if group_column is not None:
            # train_test_split_group is a project-local helper that splits
            # without separating events of the same group
            (train_probs, test_probs, train_labels, test_labels,
             train_weights, test_weights) = train_test_split_group(
                group_column, probs, labels, weights, train_size=0.5)
        else:
            (train_probs, test_probs, train_labels, test_labels,
             train_weights, test_weights) = train_test_split(
                probs, labels, weights, train_size=0.5)

        iso_est = IsotonicRegression(y_min=0, y_max=1, out_of_bounds='clip')
        if symmetrize:
            iso_est.fit(numpy.r_[train_probs, 1 - train_probs],
                        numpy.r_[train_labels > 0, train_labels <= 0],
                        numpy.r_[train_weights, train_weights])
        else:
            iso_est.fit(train_probs, train_labels, train_weights)

        probs_calib = iso_est.transform(test_probs)
        # per-event squared dilution: D2 = (1 - 2 p)^2 with p the calibrated p(B+)
        alpha = (1 - 2 * probs_calib) ** 2
        aucs.append(roc_auc_score(test_labels, test_probs, sample_weight=test_weights))
        D2_array.append(numpy.average(alpha, weights=test_weights))
    return D2_array, aucs
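
# Usage sketch (illustrative only): estimate the spread of the tagging power D2
# and of the AUC over bootstrap splits; `_demo_bootstrap` and the toy arrays
# are hypothetical, not part of the original module.
def _demo_bootstrap():
    rng = numpy.random.RandomState(7)
    toy_scores = rng.uniform(size=5000)
    toy_labels = (rng.uniform(size=len(toy_scores)) < toy_scores).astype(int)
    toy_weights = numpy.ones(len(toy_scores))
    D2_array, aucs = bootstrap_calibrate_prob(toy_labels, toy_weights, toy_scores,
                                              n_calibrations=10)
    print('D2  = {:.4f} +- {:.4f}'.format(numpy.mean(D2_array), numpy.std(D2_array)))
    print('AUC = {:.4f} +- {:.4f}'.format(numpy.mean(aucs), numpy.std(aucs)))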
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from rep.data import LabeledDataStorage
from rep.estimators import SklearnClassifier

# `signal`, `backgr`, `primitiv` and `Mayou` are defined earlier in this script
X = pd.concat((signal, backgr))
y = np.concatenate((np.ones(signal.shape[0]), np.zeros(backgr.shape[0])))
w = np.ones(len(X))

if primitiv:
    # tiny hand-made toy dataset for quick sanity checks
    X = pd.DataFrame({'odin': np.array([2., 2., 2., 2., 3., 3., 2., 3.,
                                        8., 7., 8., 7., 8., 8., 7., 8.]),
                      'dwa': np.array([2.2, 2.1, 2.2, 2.3, 3.1, 3.1, 2.1, 3.2,
                                       8.1, 7.5, 8.2, 7.1, 8.5, 8.2, 7.6, 8.1])})
    y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1])
    w = np.ones(16)

branch_names = ['odin', 'dwa']
print(branch_names)

X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(X, y, w, test_size=0.33)
lds = LabeledDataStorage(X_test, y_test, w_test)

# CLASSIFIER
clf_stacking = SklearnClassifier(RandomForestClassifier(n_estimators=5000,
                                                        bootstrap=False, n_jobs=7))
# clf_stacking = XGBoostClassifier(n_estimators=700, eta=0.1, nthreads=8,
#                                  subsample=0.5)
# clf_stacking = 'nn'
clf = Mayou(base_estimators={'xgb': None}, bagging_base=None, bagging_stack=8,
            stacking=clf_stacking, features_stack=branch_names,
            transform=False, transform_pred=False)
# clf = SklearnClassifier(GaussianNB())
# clf = SklearnClassifier(BaggingClassifier(n_jobs=1, max_features=1.,
#                                           bootstrap=False, base_estimator=clf,
#                                           n_estimators=20, max_samples=0.1))
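
# Hypothetical next step (not in the original script): train the configured
# stacking classifier and check its ROC AUC on the held-out split. This assumes
# `clf` follows the REP fit/predict_proba convention used by SklearnClassifier.
from sklearn.metrics import roc_auc_score

clf.fit(X_train, y_train, w_train)
proba = clf.predict_proba(X_test)[:, 1]
print('test AUC: {:.4f}'.format(roc_auc_score(y_test, proba, sample_weight=w_test)))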
import numpy
import pandas
import matplotlib.pyplot as plt
from scipy.special import logit
from sklearn.base import clone
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


def calibrate_probs(labels, weights, probs, logistic=False, random_state=11, threshold=0.,
                    return_calibrator=False, symmetrize=False, inEtaSpace=False, plot=False):
    """
    Calibrate output to probabilities using 2-folding to calibrate all data

    :param probs: probabilities, numpy.array of shape [n_samples]
    :param labels: numpy.array of shape [n_samples] with labels
    :param weights: numpy.array of shape [n_samples]
    :param threshold: float, to set labels 0/1
    :param logistic: bool, use logistic or isotonic regression
    :param inEtaSpace: bool, do calibration in eta between 0 and 0.5
    :param symmetrize: bool, do symmetric calibration, e.g. for B+, B-
    :return: calibrated probabilities and tagging power D2
    """
    labels = (labels > threshold) * 1
    ind = numpy.arange(len(probs))
    ind_1, ind_2 = train_test_split(ind, random_state=random_state, train_size=0.5)

    calibrator = (LogisticRegression(C=100, solver='sag') if logistic
                  else IsotonicRegression(y_min=0, y_max=1, out_of_bounds='clip'))
    est_calib_1, est_calib_2 = clone(calibrator), clone(calibrator)
    probs_1, probs_2 = probs[ind_1], probs[ind_2]
    flav_1, flav_2 = labels[ind_1], labels[ind_2]
    w1, w2 = weights[ind_1], weights[ind_2]

    # Turn the 0-1 B+/B- space into the 0-0.5 mistagged/tagged space:
    # dilution D = 2p - 1, tag decision = sign(D), mistag eta = (1 - |D|) / 2
    dil_1 = 2 * probs_1 - 1    # p = 0 -> -1; p = 0.5 -> 0; p = 1 -> +1
    dil_2 = 2 * probs_2 - 1
    tag_1 = numpy.sign(dil_1)  # -1 for p < 0.5, +1 for p > 0.5
    tag_2 = numpy.sign(dil_2)
    eta_1 = 0.5 * (1 - numpy.abs(dil_1))  # p = 0 -> 0; p = 0.5 -> 0.5; p = 1 -> 0
    eta_2 = 0.5 * (1 - numpy.abs(dil_2))

    if False:  # debug: inspect the probs -> eta mapping
        plt.figure(1, figsize=(6, 5))
        plt.scatter(probs_1, eta_1)
        plt.show()

    if inEtaSpace:
        # Regress the mistag rate: the target is "was the tag decision wrong?"
        x1, x2 = eta_1, eta_2
        y1 = tag_1 != 2 * flav_1 - 1
        y2 = tag_2 != 2 * flav_2 - 1
    elif symmetrize:
        # Duplicate the sample with p <-> 1-p and flipped labels; halve the
        # weights so the total weight stays unchanged
        x1 = numpy.r_[probs_1, 1 - probs_1]
        x2 = numpy.r_[probs_2, 1 - probs_2]
        y1 = numpy.r_[flav_1 > 0, flav_1 <= 0]
        y2 = numpy.r_[flav_2 > 0, flav_2 <= 0]
        w1 = numpy.r_[0.5 * w1, 0.5 * w1]
        w2 = numpy.r_[0.5 * w2, 0.5 * w2]
    else:
        x1, x2 = probs_1, probs_2
        y1, y2 = flav_1 > 0, flav_2 > 0

    # For logistic regression, fit in logit(x) space
    dllx1 = dllx2 = None
    if logistic:
        upper = 0.49999 if inEtaSpace else 0.99999
        x1 = numpy.clip(x1, 0.00001, upper)
        x2 = numpy.clip(x2, 0.00001, upper)
        dllx1 = logit(x1)[:, numpy.newaxis]
        dllx2 = logit(x2)[:, numpy.newaxis]

    # Do the fit
    if logistic:
        est_calib_1.fit(dllx1, y1, sample_weight=w1)
        est_calib_2.fit(dllx2, y2, sample_weight=w2)
    else:
        est_calib_1.fit(x1, y1, w1)
        est_calib_2.fit(x2, y2, w2)

    # Plots
    if plot:
        X_test = (numpy.linspace(0.001, 0.499, 500) if inEtaSpace
                  else numpy.linspace(0.001, 0.999, 500))
        if logistic:
            dllX_test = logit(X_test)[:, numpy.newaxis]
            c1 = est_calib_1.predict_proba(dllX_test)[:, 1]
            c2 = est_calib_2.predict_proba(dllX_test)[:, 1]
        else:
            c1 = est_calib_1.transform(X_test)
            c2 = est_calib_2.transform(X_test)

        def binned_profile(x, y, w, try_n=250):
            """Weighted profile of y versus x in up to try_n equal-population
            bins; the bin count is reduced until qcut finds unique edges."""
            x = x.ravel()
            n = try_n
            while True:
                try:
                    groups = pandas.qcut(x, n, labels=range(n))
                    break
                except ValueError:
                    n -= 1
            frame = pandas.DataFrame({'X': x * w, 'g': groups, 'y': y * w, 'w': w})
            gsum = frame.groupby('g').sum()
            gsum['X'] /= gsum['w']
            gsum['y'] /= gsum['w']
            return gsum

        gsum1 = binned_profile(x1, y1, w1)
        gsum2 = binned_profile(x2, y2, w2)

        plt.figure(1, figsize=(12, 5))
        plt.subplot(1, 2, 1)
        plt.scatter(x1.ravel(), y1, color='black', zorder=20)
        plt.scatter(gsum1['X'], gsum1['y'], color='red', zorder=20)
        plt.plot(X_test, c1, color='blue', linewidth=3)
        plt.subplot(1, 2, 2)
        plt.scatter(x2.ravel(), y2, color='black', zorder=20)
        plt.scatter(gsum2['X'], gsum2['y'], color='red', zorder=20)
        plt.plot(X_test, c2, color='blue', linewidth=3)
        plt.show()

    # Cross validate: each half is predicted by the calibrator fitted on the other
    if logistic:
        p1 = est_calib_2.predict_proba(dllx1)[:, 1]
        p2 = est_calib_1.predict_proba(dllx2)[:, 1]
    else:
        p1 = est_calib_2.transform(x1)
        p2 = est_calib_1.transform(x2)
    if symmetrize and not inEtaSpace:
        # x1/x2 were doubled for the symmetric fit; keep only the predictions
        # for the original (non-mirrored) events
        p1 = p1[:len(ind_1)]
        p2 = p2[:len(ind_2)]

    # Transform back from mistag space to flav space: p(B+) = (1 + D) / 2
    if inEtaSpace:
        pdil_1 = (1 - 2 * p1) * tag_1
        pdil_2 = (1 - 2 * p2) * tag_2
        p1 = 0.5 * (1 + pdil_1)
        p2 = 0.5 * (1 + pdil_2)

    # Save
    calibrated_probs = numpy.zeros(len(probs))
    calibrated_probs[ind_1] = p1
    calibrated_probs[ind_2] = p2

    # Return the calibrated probabilities and the tagging power D2 = <(1 - 2p)^2>
    alpha = (1 - 2 * calibrated_probs) ** 2
    D2 = numpy.average(alpha, weights=weights)
    if return_calibrator:
        return calibrated_probs, D2, (est_calib_1, est_calib_2)
    return calibrated_probs, D2
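
# Usage sketch (illustrative only): two-fold logistic calibration in mistag
# (eta) space; the toy data and `_demo_eta_calibration` are hypothetical,
# not part of the original module.
def _demo_eta_calibration():
    rng = numpy.random.RandomState(3)
    toy_scores = rng.uniform(size=5000)
    toy_labels = (rng.uniform(size=len(toy_scores)) < toy_scores).astype(int)
    toy_weights = numpy.ones(len(toy_scores))
    calibrated, D2 = calibrate_probs(toy_labels, toy_weights, toy_scores,
                                     logistic=True, inEtaSpace=True)
    print('tagging power D2 = {:.4f}'.format(D2))
    return calibrated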
    values, bins, _ = plt.hist(signalData8.loc[signalData8.target.values == 1, feature].values,
                               weights=signalData8.loc[signalData8.target.values == 1, weights].values,
                               range=(min_value, max_value), label='Benchmark 8 (v1)', **hist_params)
    areaBKG2 = sum(np.diff(bins) * values)
    values, bins, _ = plt.hist(dataset.loc[dataset.target.values == 1, feature].values,
                               weights=dataset.loc[dataset.target.values == 1, weights].values,
                               range=(min_value, max_value), label='Signal SM', **hist_params)
    areaSig = sum(np.diff(bins) * values)
    # print(areaBKG, " ", areaBKG2, " ", areaSig)
    if n == 0:
        plt.legend(loc='best')
    plt.title(feature)
    plt.savefig("Variables_" + subset + BKG + "_benchmarks_" + ext)
    plt.clf()
"""

#################################################################################
### Define classifiers to test
traindataset, valdataset = train_test_split(dataset, random_state=11, train_size=0.50)
traindatasetmix, valdatasetmix = train_test_split(datasetmix, random_state=11, train_size=0.50)

#################################################################################
arr = valdatasetmix.to_records()
array2root(arr, outputCentral + "_AppliedToMixed" + typedata + ".root", 'tree', 'recreate')
arr = dataset.to_records()
array2root(arr, outputCentral + "_AppliedToPlain" + typedata + ".root", 'tree', 'recreate')
if typedata == "Data":
    arr = dataset20.to_records()
    array2root(arr, outputCentral + "_AppliedTo20pOfPlain" + typedata + ".root", 'tree', 'recreate')

# for ii in range(0, 3):
#     if ii == 0:
#         train = trainFeaturesplot
#         Var = 'All'