import numpy
from scipy.special import logit
from sklearn.base import clone
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split


def calibrate_probs(labels, weights, probs, logistic=False, random_state=11, threshold=0.,
                    return_calibrator=False, symmetrize=False):
    """
    Calibrate output to probabilities using 2-folding to calibrate all data

    :param probs: probabilities, numpy.array of shape [n_samples]
    :param labels: numpy.array of shape [n_samples] with labels
    :param weights: numpy.array of shape [n_samples]
    :param threshold: float, to set labels 0/1
    :param logistic: bool, use logistic or isotonic regression
    :param symmetrize: bool, do symmetric calibration, e.g. for B+, B-
    :return: calibrated probabilities
    """
    labels = (labels > threshold) * 1
    ind = numpy.arange(len(probs))
    ind_1, ind_2 = train_test_split(ind, random_state=random_state, train_size=0.5)

    calibrator = (LogisticRegression(C=100) if logistic
                  else IsotonicRegression(y_min=0, y_max=1, out_of_bounds='clip'))
    est_calib_1, est_calib_2 = clone(calibrator), clone(calibrator)
    probs_1 = probs[ind_1]
    probs_2 = probs[ind_2]

    if logistic:
        # Fit in logit space; note that weights are not used in this branch.
        probs_1 = numpy.clip(probs_1, 0.001, 0.999)
        probs_2 = numpy.clip(probs_2, 0.001, 0.999)
        probs_1 = logit(probs_1)[:, numpy.newaxis]
        probs_2 = logit(probs_2)[:, numpy.newaxis]
        if symmetrize:
            # The p <-> 1-p symmetry becomes a sign flip in logit space,
            # since logit(1 - p) == -logit(p).
            est_calib_1.fit(numpy.r_[probs_1, -probs_1],
                            numpy.r_[labels[ind_1] > 0, labels[ind_1] <= 0])
            est_calib_2.fit(numpy.r_[probs_2, -probs_2],
                            numpy.r_[labels[ind_2] > 0, labels[ind_2] <= 0])
        else:
            est_calib_1.fit(probs_1, labels[ind_1])
            est_calib_2.fit(probs_2, labels[ind_2])
    else:
        if symmetrize:
            est_calib_1.fit(numpy.r_[probs_1, 1 - probs_1],
                            numpy.r_[labels[ind_1] > 0, labels[ind_1] <= 0],
                            numpy.r_[weights[ind_1], weights[ind_1]])
            est_calib_2.fit(numpy.r_[probs_2, 1 - probs_2],
                            numpy.r_[labels[ind_2] > 0, labels[ind_2] <= 0],
                            numpy.r_[weights[ind_2], weights[ind_2]])
        else:
            est_calib_1.fit(probs_1, labels[ind_1], weights[ind_1])
            est_calib_2.fit(probs_2, labels[ind_2], weights[ind_2])

    # Cross-predict: each half is calibrated by the estimator fitted on the other half
    calibrated_probs = numpy.zeros(len(probs))
    if logistic:
        calibrated_probs[ind_1] = est_calib_2.predict_proba(probs_1)[:, 1]
        calibrated_probs[ind_2] = est_calib_1.predict_proba(probs_2)[:, 1]
    else:
        calibrated_probs[ind_1] = est_calib_2.transform(probs_1)
        calibrated_probs[ind_2] = est_calib_1.transform(probs_2)
    if return_calibrator:
        return calibrated_probs, (est_calib_1, est_calib_2)
    return calibrated_probs
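
# Usage sketch (illustrative only, not part of the original module): calibrate
# raw classifier scores with the default two-fold isotonic regression. The toy
# arrays below are hypothetical stand-ins for real tagger output.
def _demo_calibrate_probs():
    rng = numpy.random.RandomState(42)
    toy_scores = rng.uniform(size=10000)
    # labels correlated with the scores, so the calibrator has structure to learn
    toy_labels = (rng.uniform(size=len(toy_scores)) < toy_scores).astype(int)
    toy_weights = numpy.ones(len(toy_scores))
    calibrated = calibrate_probs(toy_labels, toy_weights, toy_scores)
    assert 0 <= calibrated.min() and calibrated.max() <= 1
    return calibrated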
def _compute_inds(self, length):
    """Randomly split the indices [0, length) into two halves for 2-folding."""
    ind = numpy.arange(length)
    ind_1, ind_2 = train_test_split(ind, random_state=self.random_state, train_size=0.5)
    return ind_1, ind_2
def bootstrap_calibrate_prob(labels, weights, probs, n_calibrations=30,
                             group_column=None, threshold=0., symmetrize=False):
    """
    Bootstrap isotonic calibration:

    * randomly divide data into train-test
    * isotonic regression is fitted on train and applied to test
    * on test, D2 and AUC are computed using the calibrated probs p(B+)

    :param probs: probabilities, numpy.array of shape [n_samples]
    :param labels: numpy.array of shape [n_samples] with labels
    :param weights: numpy.array of shape [n_samples]
    :param threshold: float, to set labels 0/1
    :param symmetrize: bool, do symmetric calibration, e.g. for B+, B-
    :return: D2 array and AUC array
    """
    aucs = []
    D2_array = []
    labels = (labels > threshold) * 1
    for _ in range(n_calibrations):
        if group_column is not None:
            # train_test_split_group is a project-local helper that splits
            # without separating events of the same group
            (train_probs, test_probs, train_labels, test_labels,
             train_weights, test_weights) = train_test_split_group(
                group_column, probs, labels, weights, train_size=0.5)
        else:
            (train_probs, test_probs, train_labels, test_labels,
             train_weights, test_weights) = train_test_split(
                probs, labels, weights, train_size=0.5)

        iso_est = IsotonicRegression(y_min=0, y_max=1, out_of_bounds='clip')
        if symmetrize:
            iso_est.fit(numpy.r_[train_probs, 1 - train_probs],
                        numpy.r_[train_labels > 0, train_labels <= 0],
                        numpy.r_[train_weights, train_weights])
        else:
            iso_est.fit(train_probs, train_labels, train_weights)

        probs_calib = iso_est.transform(test_probs)
        # per-event squared dilution: D2 = (1 - 2 p)^2 with p the calibrated p(B+)
        alpha = (1 - 2 * probs_calib) ** 2
        aucs.append(roc_auc_score(test_labels, test_probs, sample_weight=test_weights))
        D2_array.append(numpy.average(alpha, weights=test_weights))
    return D2_array, aucs
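
# Usage sketch (illustrative only): estimate the spread of the tagging power D2
# and of the AUC over bootstrap splits; `_demo_bootstrap` and the toy arrays
# are hypothetical, not part of the original module.
def _demo_bootstrap():
    rng = numpy.random.RandomState(7)
    toy_scores = rng.uniform(size=5000)
    toy_labels = (rng.uniform(size=len(toy_scores)) < toy_scores).astype(int)
    toy_weights = numpy.ones(len(toy_scores))
    D2_array, aucs = bootstrap_calibrate_prob(toy_labels, toy_weights, toy_scores,
                                              n_calibrations=10)
    print('D2  = {:.4f} +- {:.4f}'.format(numpy.mean(D2_array), numpy.std(D2_array)))
    print('AUC = {:.4f} +- {:.4f}'.format(numpy.mean(aucs), numpy.std(aucs)))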
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from rep.data import LabeledDataStorage
from rep.estimators import SklearnClassifier

# `signal`, `backgr`, `primitiv` and `Mayou` are defined earlier in this script
X = pd.concat((signal, backgr))
y = np.concatenate((np.ones(signal.shape[0]), np.zeros(backgr.shape[0])))
w = np.ones(len(X))

if primitiv:
    # tiny hand-made toy dataset for quick sanity checks
    X = pd.DataFrame({'odin': np.array([2., 2., 2., 2., 3., 3., 2., 3.,
                                        8., 7., 8., 7., 8., 8., 7., 8.]),
                      'dwa': np.array([2.2, 2.1, 2.2, 2.3, 3.1, 3.1, 2.1, 3.2,
                                       8.1, 7.5, 8.2, 7.1, 8.5, 8.2, 7.6, 8.1])})
    y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1])
    w = np.ones(16)

branch_names = ['odin', 'dwa']
print(branch_names)

X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(X, y, w, test_size=0.33)
lds = LabeledDataStorage(X_test, y_test, w_test)

# CLASSIFIER
clf_stacking = SklearnClassifier(RandomForestClassifier(n_estimators=5000,
                                                        bootstrap=False, n_jobs=7))
# clf_stacking = XGBoostClassifier(n_estimators=700, eta=0.1, nthreads=8,
#                                  subsample=0.5)
# clf_stacking = 'nn'
clf = Mayou(base_estimators={'xgb': None}, bagging_base=None, bagging_stack=8,
            stacking=clf_stacking, features_stack=branch_names,
            transform=False, transform_pred=False)
# clf = SklearnClassifier(GaussianNB())
# clf = SklearnClassifier(BaggingClassifier(n_jobs=1, max_features=1.,
#                                           bootstrap=False, base_estimator=clf,
#                                           n_estimators=20, max_samples=0.1))
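
# Hypothetical next step (not in the original script): train the configured
# stacking classifier and check its ROC AUC on the held-out split. This assumes
# `clf` follows the REP fit/predict_proba convention used by SklearnClassifier.
from sklearn.metrics import roc_auc_score

clf.fit(X_train, y_train, w_train)
proba = clf.predict_proba(X_test)[:, 1]
print('test AUC: {:.4f}'.format(roc_auc_score(y_test, proba, sample_weight=w_test)))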
import numpy
import pandas
import matplotlib.pyplot as plt
from scipy.special import logit
from sklearn.base import clone
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


def calibrate_probs(labels, weights, probs, logistic=False, random_state=11, threshold=0.,
                    return_calibrator=False, symmetrize=False, inEtaSpace=False, plot=False):
    """
    Calibrate output to probabilities using 2-folding to calibrate all data

    :param probs: probabilities, numpy.array of shape [n_samples]
    :param labels: numpy.array of shape [n_samples] with labels
    :param weights: numpy.array of shape [n_samples]
    :param threshold: float, to set labels 0/1
    :param logistic: bool, use logistic or isotonic regression
    :param inEtaSpace: bool, do calibration in eta between 0 and 0.5
    :param symmetrize: bool, do symmetric calibration, e.g. for B+, B-
    :return: calibrated probabilities and tagging power D2
    """
    labels = (labels > threshold) * 1
    ind = numpy.arange(len(probs))
    ind_1, ind_2 = train_test_split(ind, random_state=random_state, train_size=0.5)

    calibrator = (LogisticRegression(C=100, solver='sag') if logistic
                  else IsotonicRegression(y_min=0, y_max=1, out_of_bounds='clip'))
    est_calib_1, est_calib_2 = clone(calibrator), clone(calibrator)
    probs_1, probs_2 = probs[ind_1], probs[ind_2]
    flav_1, flav_2 = labels[ind_1], labels[ind_2]
    w1, w2 = weights[ind_1], weights[ind_2]

    # Turn the 0-1 B+/B- space into the 0-0.5 mistagged/tagged space:
    # dilution D = 2p - 1, tag decision = sign(D), mistag eta = (1 - |D|) / 2
    dil_1 = 2 * probs_1 - 1    # p = 0 -> -1; p = 0.5 -> 0; p = 1 -> +1
    dil_2 = 2 * probs_2 - 1
    tag_1 = numpy.sign(dil_1)  # -1 for p < 0.5, +1 for p > 0.5
    tag_2 = numpy.sign(dil_2)
    eta_1 = 0.5 * (1 - numpy.abs(dil_1))  # p = 0 -> 0; p = 0.5 -> 0.5; p = 1 -> 0
    eta_2 = 0.5 * (1 - numpy.abs(dil_2))

    if False:  # debug: inspect the probs -> eta mapping
        plt.figure(1, figsize=(6, 5))
        plt.scatter(probs_1, eta_1)
        plt.show()

    if inEtaSpace:
        # Regress the mistag rate: the target is "was the tag decision wrong?"
        x1, x2 = eta_1, eta_2
        y1 = tag_1 != 2 * flav_1 - 1
        y2 = tag_2 != 2 * flav_2 - 1
    elif symmetrize:
        # Duplicate the sample with p <-> 1-p and flipped labels; halve the
        # weights so the total weight stays unchanged
        x1 = numpy.r_[probs_1, 1 - probs_1]
        x2 = numpy.r_[probs_2, 1 - probs_2]
        y1 = numpy.r_[flav_1 > 0, flav_1 <= 0]
        y2 = numpy.r_[flav_2 > 0, flav_2 <= 0]
        w1 = numpy.r_[0.5 * w1, 0.5 * w1]
        w2 = numpy.r_[0.5 * w2, 0.5 * w2]
    else:
        x1, x2 = probs_1, probs_2
        y1, y2 = flav_1 > 0, flav_2 > 0

    # For logistic regression, fit in logit(x) space
    dllx1 = dllx2 = None
    if logistic:
        upper = 0.49999 if inEtaSpace else 0.99999
        x1 = numpy.clip(x1, 0.00001, upper)
        x2 = numpy.clip(x2, 0.00001, upper)
        dllx1 = logit(x1)[:, numpy.newaxis]
        dllx2 = logit(x2)[:, numpy.newaxis]

    # Do the fit
    if logistic:
        est_calib_1.fit(dllx1, y1, sample_weight=w1)
        est_calib_2.fit(dllx2, y2, sample_weight=w2)
    else:
        est_calib_1.fit(x1, y1, w1)
        est_calib_2.fit(x2, y2, w2)

    # Plots
    if plot:
        X_test = (numpy.linspace(0.001, 0.499, 500) if inEtaSpace
                  else numpy.linspace(0.001, 0.999, 500))
        if logistic:
            dllX_test = logit(X_test)[:, numpy.newaxis]
            c1 = est_calib_1.predict_proba(dllX_test)[:, 1]
            c2 = est_calib_2.predict_proba(dllX_test)[:, 1]
        else:
            c1 = est_calib_1.transform(X_test)
            c2 = est_calib_2.transform(X_test)

        def binned_profile(x, y, w, try_n=250):
            """Weighted profile of y versus x in up to try_n equal-population
            bins; the bin count is reduced until qcut finds unique edges."""
            x = x.ravel()
            n = try_n
            while True:
                try:
                    groups = pandas.qcut(x, n, labels=range(n))
                    break
                except ValueError:
                    n -= 1
            frame = pandas.DataFrame({'X': x * w, 'g': groups, 'y': y * w, 'w': w})
            gsum = frame.groupby('g').sum()
            gsum['X'] /= gsum['w']
            gsum['y'] /= gsum['w']
            return gsum

        gsum1 = binned_profile(x1, y1, w1)
        gsum2 = binned_profile(x2, y2, w2)

        plt.figure(1, figsize=(12, 5))
        plt.subplot(1, 2, 1)
        plt.scatter(x1.ravel(), y1, color='black', zorder=20)
        plt.scatter(gsum1['X'], gsum1['y'], color='red', zorder=20)
        plt.plot(X_test, c1, color='blue', linewidth=3)
        plt.subplot(1, 2, 2)
        plt.scatter(x2.ravel(), y2, color='black', zorder=20)
        plt.scatter(gsum2['X'], gsum2['y'], color='red', zorder=20)
        plt.plot(X_test, c2, color='blue', linewidth=3)
        plt.show()

    # Cross validate: each half is predicted by the calibrator fitted on the other
    if logistic:
        p1 = est_calib_2.predict_proba(dllx1)[:, 1]
        p2 = est_calib_1.predict_proba(dllx2)[:, 1]
    else:
        p1 = est_calib_2.transform(x1)
        p2 = est_calib_1.transform(x2)
    if symmetrize and not inEtaSpace:
        # x1/x2 were doubled for the symmetric fit; keep only the predictions
        # for the original (non-mirrored) events
        p1 = p1[:len(ind_1)]
        p2 = p2[:len(ind_2)]

    # Transform back from mistag space to flav space: p(B+) = (1 + D) / 2
    if inEtaSpace:
        pdil_1 = (1 - 2 * p1) * tag_1
        pdil_2 = (1 - 2 * p2) * tag_2
        p1 = 0.5 * (1 + pdil_1)
        p2 = 0.5 * (1 + pdil_2)

    # Save
    calibrated_probs = numpy.zeros(len(probs))
    calibrated_probs[ind_1] = p1
    calibrated_probs[ind_2] = p2

    # Return the calibrated probabilities and the tagging power D2 = <(1 - 2p)^2>
    alpha = (1 - 2 * calibrated_probs) ** 2
    D2 = numpy.average(alpha, weights=weights)
    if return_calibrator:
        return calibrated_probs, D2, (est_calib_1, est_calib_2)
    return calibrated_probs, D2
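
# Usage sketch (illustrative only): two-fold logistic calibration in mistag
# (eta) space; the toy data and `_demo_eta_calibration` are hypothetical,
# not part of the original module.
def _demo_eta_calibration():
    rng = numpy.random.RandomState(3)
    toy_scores = rng.uniform(size=5000)
    toy_labels = (rng.uniform(size=len(toy_scores)) < toy_scores).astype(int)
    toy_weights = numpy.ones(len(toy_scores))
    calibrated, D2 = calibrate_probs(toy_labels, toy_weights, toy_scores,
                                     logistic=True, inEtaSpace=True)
    print('tagging power D2 = {:.4f}'.format(D2))
    return calibrated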
    values, bins, _ = plt.hist(signalData8.loc[signalData8.target.values == 1, feature].values,
                               weights=signalData8.loc[signalData8.target.values == 1, weights].values,
                               range=(min_value, max_value), label='Benchmark 8 (v1)', **hist_params)
    areaBKG2 = sum(np.diff(bins) * values)
    values, bins, _ = plt.hist(dataset.loc[dataset.target.values == 1, feature].values,
                               weights=dataset.loc[dataset.target.values == 1, weights].values,
                               range=(min_value, max_value), label='Signal SM', **hist_params)
    areaSig = sum(np.diff(bins) * values)
    # print(areaBKG, " ", areaBKG2, " ", areaSig)
    if n == 0:
        plt.legend(loc='best')
    plt.title(feature)
    plt.savefig("Variables_" + subset + BKG + "_benchmarks_" + ext)
    plt.clf()
"""

#################################################################################
### Define classifiers to test
traindataset, valdataset = train_test_split(dataset, random_state=11, train_size=0.50)
traindatasetmix, valdatasetmix = train_test_split(datasetmix, random_state=11, train_size=0.50)

#################################################################################
arr = valdatasetmix.to_records()
array2root(arr, outputCentral + "_AppliedToMixed" + typedata + ".root", 'tree', 'recreate')
arr = dataset.to_records()
array2root(arr, outputCentral + "_AppliedToPlain" + typedata + ".root", 'tree', 'recreate')
if typedata == "Data":
    arr = dataset20.to_records()
    array2root(arr, outputCentral + "_AppliedTo20pOfPlain" + typedata + ".root", 'tree', 'recreate')

# for ii in range(0, 3):
#     if ii == 0:
#         train = trainFeaturesplot
#         Var = 'All'