def test_zero_sample_weights_classification():
    # Make sure setting a SW to zero amounts to ignoring the corresponding
    # sample

    X = [[1, 0],
         [1, 0],
         [1, 0],
         [0, 1]]
    y = [0, 0, 1, 0]
    # ignore the first 2 training samples by setting their weight to 0
    sample_weight = [0, 0, 1, 1]
    gb = HistGradientBoostingClassifier(loss='binary_crossentropy',
                                        min_samples_leaf=1)
    gb.fit(X, y, sample_weight=sample_weight)
    assert_array_equal(gb.predict([[1, 0]]), [1])

    X = [[1, 0],
         [1, 0],
         [1, 0],
         [0, 1],
         [1, 1]]
    y = [0, 0, 1, 0, 2]
    # ignore the first 2 training samples by setting their weight to 0
    sample_weight = [0, 0, 1, 1, 1]
    gb = HistGradientBoostingClassifier(loss='categorical_crossentropy',
                                        min_samples_leaf=1)
    gb.fit(X, y, sample_weight=sample_weight)
    assert_array_equal(gb.predict([[1, 0]]), [1])
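For comparison, here is a minimal sketch of the analogous check on the regressor side. It is an added illustration rather than part of the original snippet, and it assumes HistGradientBoostingRegressor is imported alongside the classifier:

def test_zero_sample_weights_regression():
    # Hypothetical sketch: setting a sample weight to zero should be
    # equivalent to dropping that sample from the training set.
    X = [[1, 0],
         [1, 0],
         [1, 0],
         [0, 1]]
    y = [0, 0, 1, 0]
    # ignore the first two training samples by setting their weight to 0
    sample_weight = [0, 0, 1, 1]
    gb = HistGradientBoostingRegressor(min_samples_leaf=1)
    gb.fit(X, y, sample_weight=sample_weight)
    assert gb.predict([[1, 0]])[0] > 0.5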
Example No. 2
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification
    pytest.importorskip("lightgbm")

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 255

    X, y = make_classification(
        n_samples=n_samples,
        n_classes=2,
        n_features=5,
        n_informative=5,
        n_redundant=0,
        random_state=0,
    )

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss="binary_crossentropy",
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        early_stopping=False,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
    )
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib="lightgbm")

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)
    np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn)

    if max_leaf_nodes < 10 and n_samples >= 1000:

        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
Example No. 3
class GradientBoostingMsgClassifierModel(h1.MLModel):
    def load_data(self, num_files=None):
        return utils.load_data(num_files, shuffle=False)

    def prep(self, data):
        def concat_processed_files(files):
            dfs = []
            for f in files:
                z = pd.read_parquet(f)
                z = utils.compute_timediff_fillna(z, dropna_subset=FEATURES)
                dfs.append(z)
            df2 = pd.concat(dfs)
            return df2
        split = int(len(data["attack_files"])*0.5)
        train_files = data["attack_files"][:split]
        test_files = data["attack_files"][split:]
        result = {
            "train_files": train_files,
            "test_files": test_files,
            "train_attack_df": concat_processed_files(train_files),
            "test_attack_df": concat_processed_files(test_files)
        }
        print("len train_attack_df = %s" % len(result["train_attack_df"]))
        print("len test_attack_df = %s" % len(result["test_attack_df"]))
        return result

    def train(self, prepared_data):
        df = prepared_data["train_attack_df"]
        from sklearn.experimental import enable_hist_gradient_boosting
        from sklearn.ensemble import HistGradientBoostingClassifier
        X = df[FEATURES]
        y = df.Label == config.ATTACK_LABEL
        self.base_model = HistGradientBoostingClassifier(max_iter=500).fit(X, y)

    def evaluate(self, prepared_data):
        df = prepared_data["test_attack_df"]
        ypred = self.base_model.predict(df[FEATURES])
        import sklearn.metrics
        cf = sklearn.metrics.confusion_matrix(df.Label == config.ATTACK_LABEL, ypred)
        acc = sklearn.metrics.accuracy_score(df.Label == config.ATTACK_LABEL, ypred)
        print(cf)
        print("Accuracy = %.4f" % acc)
        self.metrics = {"confusion_matrix": cf, "accuracy": acc}
    
    def predict(self, data):
        df = data["df"].copy()
        df = utils.compute_timediff_fillna(df)
        df['MsgIsAttack'] = 0
        df['WindowInAttack'] = 0
        for event_result in data["event_detection_results"]:
            if event_result['WindowInAttack']:
                # print("window %s in attack: event_result = %s" % (event_result['window_start'], event_result))
                in_window = (df.Timestamp >= event_result['window_start']) & (df.Timestamp < event_result['window_start'] + config.WINDOW_SIZE)
                w_df = df[in_window]
                if len(w_df) > 0:
                    ypred = self.base_model.predict(w_df[FEATURES])
                    df.loc[in_window, "WindowInAttack"] = 1
                    df.loc[in_window, "MsgIsAttack"] = ypred.astype(int)
        return {"injection_window_results": df}
Example No. 4
class GradientBoostingMsgClassifierModel(h1.Model):
    def load_data(self, num_samples=None):
        return util.load_data_daic(num_samples, shuffle=True)

    def prep_data(self, data):
        # concat multiple files into separate training/test pd.DataFrame
        def concat_processed_files(files):
            dfs = []
            for f in files:
                z = pd.read_csv(f)
                z.columns = ['Timestamp', 'Label', 'CarSpeed', 'SteeringAngle', 'YawRate', 'Gx', 'Gy',]
                z = util.compute_timediff_fillna(z)
                dfs.append(z)
            df2 = pd.concat(dfs)
            return df2
        return {
            "train_attack_df": concat_processed_files(data["train_attack_files"]),
            "test_attack_df": concat_processed_files(data["test_attack_files"])
        }

    def train(self, prepared_data):
        df = prepared_data["train_attack_df"]
        from sklearn.experimental import enable_hist_gradient_boosting
        from sklearn.ensemble import HistGradientBoostingClassifier
        X = df[FEATURES]
        y = df.Label == "Tx"
        self.model = HistGradientBoostingClassifier(max_iter=500).fit(X, y)

    def evaluate(self, prepared_data):
        df = prepared_data["test_attack_df"]
        ypred = self.model.predict(df[FEATURES])
        import sklearn.metrics
        cf = sklearn.metrics.confusion_matrix(df.Label == "Tx", ypred)
        acc = sklearn.metrics.accuracy_score(df.Label == "Tx", ypred)
        print(cf)
        print("Accuracy = %.4f" % acc)
        self.metrics = {"confusion_matrix": cf, "accuracy": acc}
    
    def predict(self, data):
        df = data["df"].copy()
        df = util.compute_timediff_fillna(df)
        df['MsgIsAttack'] = 0
        df['WindowInAttack'] = 0
        for event_result in data["event_detection_results"]:
            if event_result['WindowInAttack']:
                # print("window %s in attack: event_result = %s" % (event_result['window_start'], event_result))
                in_window = (df.Timestamp >= event_result['window_start']) & (df.Timestamp < event_result['window_start'] + WINDOW_SIZE)
                w_df = df[in_window]
                if len(w_df) > 0:
                    ypred = self.model.predict(w_df[FEATURES])
                    df.loc[in_window, "WindowInAttack"] = 1
                    df.loc[in_window, "MsgIsAttack"] = ypred.astype(int)
        return {"injection_window_results": df}
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 256

    X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=5,
                               n_informative=5, n_redundant=0, random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss='binary_crossentropy',
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        n_iter_no_change=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > .89

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)
    np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn)

    if max_leaf_nodes < 10 and n_samples >= 1000:

        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > .89

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
Example No. 6
def k_fold_cross_val(Xs, y_var, k=10):
    clf = tree.DecisionTreeClassifier()
    clf_forest = RandomForestClassifier(n_estimators=10)
    clf_boost = HistGradientBoostingClassifier()

    num_folds = k
    N = Xs.shape[0]
    test_size = int(N / num_folds)

    test_idxs = np.random.permutation(N)[:num_folds * test_size].reshape(
        num_folds, test_size)

    total_score = np.asarray([0., 0., 0.])
    total_F1_score = np.asarray([0., 0., 0.])

    for i in range(num_folds):
        print("Iteration " + str(i) + ":")
        test_i = Xs.index.isin(test_idxs[i])
        df_train, df_test = Xs[~test_i], Xs[test_i]
        y_train, y_test = y_var[~test_i], y_var[test_i]

        clf = clf.fit(df_train.to_numpy(), y_train.to_numpy().ravel())
        score_b = clf.score(df_test.to_numpy(), y_test.to_numpy().ravel())

        clf_forest = clf_forest.fit(df_train.to_numpy(),
                                    y_train.to_numpy().ravel())
        score_f = clf_forest.score(df_test.to_numpy(),
                                   y_test.to_numpy().ravel())

        clf_boost = clf_boost.fit(df_train.to_numpy(),
                                  y_train.to_numpy().ravel())
        score_h = clf_boost.score(df_test.to_numpy(),
                                  y_test.to_numpy().ravel())

        y_hat = clf.predict(df_test.to_numpy())
        f1_b = f1_score(y_test.to_numpy().ravel(), y_hat, average='binary')
        print("F1 score (tree):", f1_b)

        y_hat = clf_forest.predict(df_test.to_numpy())
        f1_f = f1_score(y_test.to_numpy().ravel(), y_hat, average='binary')
        print("F1 score (forest):", f1_f)

        y_hat = clf_boost.predict(df_test.to_numpy())
        f1_boost = f1_score(y_test.to_numpy().ravel(), y_hat, average='binary')
        print("F1 score (boost):", f1_boost)

        print("Prediction scores for (tree,forest,boost):", score_b, score_f,
              score_h)
        total_score += np.asarray([score_b, score_f, score_h])
        total_F1_score += np.asarray([f1_b, f1_f, f1_boost])

    print("Avg. accuracy scores for (tree,forest,boost):",
          total_score / num_folds)
    print("Avg. F1 scores for (tree,forest,boost):",
          total_F1_score / num_folds)

    return clf, clf_forest, clf_boost
def k_fold_training(rawdata, n_folds=5):

    cv = StratifiedKFold(n_splits=n_folds,shuffle=True)
    target = np.array(rawdata[0].values)
    lure = np.array(rawdata[1].values)
    y = np.array(rawdata['label'].values)

    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    fig, ax = plt.subplots()
    for i, (train, test) in enumerate(cv.split(target, lure, y)):
        print('----------------Training Fold %d---------------'%(i+1))
        X_train = pd.DataFrame({0:target[train],1:lure[train]})
        X_test = pd.DataFrame({0:target[test],1:lure[test]})
        pmfm = create_pmfm(X_train,y[train])
        train_feature = X_train.apply(feature_Encoding, axis = 1, args = (0,1,pmfm)).values
        test_feature = X_test.apply(feature_Encoding, axis = 1, args = (0,1,pmfm)).values
        clf = HistGradientBoostingClassifier(max_iter=500, learning_rate=0.9, max_depth=6, l2_regularization=100)
        train_data = np.matrix([train_feature[i] for i in range(train_feature.shape[0])])
        test_data = np.matrix([test_feature[i] for i in range(test_feature.shape[0])])
        clf.fit(train_data, y[train])
        pred = clf.predict(test_data)
        evaluate(y[test], pred)
        viz = plot_roc_curve(clf, test_data, y[test],
                            name='ROC fold {}'.format(i+1),
                            alpha=0.5, lw=1, ax=ax)
        interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        aucs.append(viz.roc_auc)
    
    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
            label='Chance', alpha=.8)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    ax.plot(mean_fpr, mean_tpr, color='b',
            label=r'Mean ROC (AUC = %0.3f $\pm$ %0.3f)' % (mean_auc, std_auc),
            lw=2, alpha=.8)

    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                    label=r'$\pm$ 1 std. dev.')

    ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
        title="Receiver operating characteristic Curve")
    ax.legend(loc="lower right")
    plt.savefig('roc.png',dpi=300)
Example No. 8
File: p4.py  Project: i72sijia/IMD
def clasificar_HistGradientBoostingClassifier(X, y, df, trainInputs,
                                              trainOutputs, testInputs,
                                              testOutputs, graphname):
    print("\n[" + str(graphname) + "]")
    scoreArray = np.array([])
    clf = HistGradientBoostingClassifier()
    scores = cross_val_score(clf, X, y, cv=10)
    clf = clf.fit(trainInputs, trainOutputs)
    precisionTrain = clf.score(trainInputs, trainOutputs)
    precisionTest = clf.score(testInputs, testOutputs)
    print("\tCCR train = %.2f%% | CCR test = %.2f%%" %
          (precisionTrain * 100, precisionTest * 100))
    prediccion_test = clf.predict(testInputs)
    print(prediccion_test)
    print(testOutputs)
    return precisionTest
Example No. 9
def gradient_boost(train_data, test_data):
    train_y = train_data['state']
    train_X = train_data.iloc[:, FEATURES_INDICES]

    test_y = test_data['state']
    test_X = test_data.iloc[:, FEATURES_INDICES]

    #search(train_X, train_y)
    #search_xgboost(train_X, train_y)
    gd = HistGradientBoostingClassifier(loss='auto',
                                        max_bins=200,
                                        max_depth=10,
                                        max_leaf_nodes=35)

    #gd = XGBClassifier()
    gd.fit(train_X, train_y)

    pred_y = gd.predict(test_X)
    evaluate(gd, test_X, test_y, pred_y)
Example No. 10
def main():
    # loading the dataset from sklearn.datasets
    df_cancer = load_breast_cancer()
    print(df_cancer.keys())
    X = df_cancer.data
    y = df_cancer.target
    print("number of classes are: ", np.unique(y))
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=0)
    # create an instance of HistGradientBoostingClassifier
    hist = HistGradientBoostingClassifier()
    # training the model
    hist.fit(X_train, y_train)
    y_pred = hist.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("accuracy of the model is: ", accuracy)
    clr = classification_report(y_test, y_pred)
    print("Classification report is:", clr)
def test_on_target(rawdata, sitename):
    print('------------Testing on %s-----------' % sitename)
    target_info = pd.read_csv("target_info.csv")
    if sitename in target_info['Site'].values:
        target_dict = target_info.set_index('Site').T.to_dict()
        sequence = target_dict[sitename]['Sequence']
        train_data = rawdata[rawdata[0]!=sequence]
        test_data = rawdata[rawdata[0]==sequence]
        X_train, y_train = create_Input(train_data)
        X_test, y_test = create_Input(test_data)
        pmfm = create_pmfm(X_train,y_train)
        train_feature = X_train.apply(feature_Encoding, axis = 1, args = (0,1,pmfm)).values
        test_feature = X_test.apply(feature_Encoding, axis = 1, args = (0,1,pmfm)).values
        clf = HistGradientBoostingClassifier(max_iter=500, learning_rate=0.9, max_depth=6, l2_regularization=100)
        train_matrix = np.matrix([train_feature[i] for i in range(train_feature.shape[0])])
        test_matrix = np.matrix([test_feature[i] for i in range(test_feature.shape[0])])
        clf.fit(train_matrix, y_train)
        pred = clf.predict(test_matrix)
        evaluate(y_test, pred)
    else:
        print('ERROR: INCORRECT SITE NAME')
Example No. 12
def automatedHistGB(train_X, train_y, test_X, test_y):
    """Executes Histogram-based Gradient Boosting Classifier.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test Features.
    train_y, test_y : numpy array
        Train and test Targets.

    Returns
    -------
    multiclass_RocAuc_Score: float
        AUC score calculated by multiclass_RocAuc_Score.
    """

    log_it('Module: automatedHistGB', 'Starting')
    param_grid = {'max_iter': [1000, 1200, 1500],
                  'learning_rate': [0.1],
                  'max_depth': [25, 50, 75]}
    model = HistGradientBoostingClassifier()
    model = run_RandomSearch(train_X, train_y, model, param_grid)
    pred = model.predict(test_X)
    return multiclass_RocAuc_Score(test_y, pred)
        n_estimators = n_trees // n_classes if objective == 'categorical_crossentropy' else n_trees
        print("Currently processing {}...".format(dataset))
        
        model = HistGradientBoostingClassifier(
            max_iter=n_estimators,
            loss=objective,
            validation_fraction=None
        )

        tic = time.time()
        model.fit(X_train, y_train)
        toc = time.time()
        training_time = toc - tic

        tic = time.time()
        y_pred = model.predict(X_test)
        toc = time.time()
        testing_time = toc - tic
        
        testing_acc = accuracy_score(y_test, y_pred)
        
        records.append((dataset, training_time, testing_time, testing_acc))

    # Write a log file
    with open("all_hgbdt_classification.txt", 'w') as file:
        for dataset, training_time, testing_time, testing_acc in records:
            string = "{}\t{:.5f}\t{:.5f}\t{:.5f}\n".format(
                dataset, training_time, testing_time, testing_acc)
            file.write(string)
        
Example No. 14
                                             max_features=max_features,
                                             verbose=0,
                                             warm_start=warm_start,
                                             presort='deprecated')
        clf.fit(X_train, y_train)

        print("n_estimators : ", n_estimators)
        print("learning rate  :", lr)
        print("Accuracy score (training): {0:.3f}".format(
            clf.score(X_train, y_train)))
        print("Accuracy score (validation): {0:.3f}".format(
            clf.score(X_test, y_test)))
        print("\n")

        filename = 'LGBM' + str(n_estimators) + str(lr) + str(
            max_depth) + str(max_features) + str(warm_start) + str(
                clf.score(X_test, y_test)) + "%" + '.sav'
        if clf.score(X_test, y_test) > 0.93:
            pickle.dump(clf, open(filename, 'wb'))

y_predict = clf.predict(X_test)
score = accuracy_score(y_test, y_predict)
print("n_estimators = ", n_estimators)
print("max_features = ", max_features)
print("warm_start = ", warm_start)
print(score)
print(confusion_matrix(y_test, y_predict))

filename = 'model.sav'
pickle.dump(clf, open(filename, 'wb'))
Example No. 15
n_samples, n_features = data_train.shape
print(f"Training set with {n_samples} records with {n_features} features.")

print("Fitting a sklearn model...")
tic = time()
est = HistGradientBoostingClassifier(loss='binary_crossentropy',
                                     learning_rate=lr,
                                     max_iter=n_trees,
                                     max_bins=max_bins,
                                     max_leaf_nodes=n_leaf_nodes,
                                     early_stopping=False,
                                     random_state=0,
                                     verbose=1)
est.fit(data_train, target_train)
toc = time()
predicted_test = est.predict(data_test)
predicted_proba_test = est.predict_proba(data_test)
roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1])
acc = accuracy_score(target_test, predicted_test)
print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}")

if args.lightgbm:
    print("Fitting a LightGBM model...")
    tic = time()
    lightgbm_est = get_equivalent_estimator(est, lib='lightgbm')
    lightgbm_est.fit(data_train, target_train)
    toc = time()
    predicted_test = lightgbm_est.predict(data_test)
    predicted_proba_test = lightgbm_est.predict_proba(data_test)
    roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1])
    acc = accuracy_score(target_test, predicted_test)
Example No. 16
col = [
    'acc', 'UP_precision', 'DOWN_precision', 'PRESERVE_precision', 'UP_recall',
    'DOWN_recall', 'PRESERVE_recall'
]
res = pd.DataFrame()
res = res.append(pd.DataFrame([accuracy_score(test['label'], pred_test)] + \
            list(precision_score(test['label'],pred_test, average = None)) + \
            list(recall_score(test['label'],pred_test, average = None))).transpose())

# lightgbm
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
hist = HistGradientBoostingClassifier(random_state=0)
hist.fit(train.iloc[:, :6780], train['label'])
from sklearn.metrics import accuracy_score
pred_train = hist.predict(train.iloc[:, :6780])
print(f"Accuracy in train set: {accuracy_score(train['label'], pred_train)}")
pred_cv = hist.predict(cv.iloc[:, :6780])
print(f"Accuracy in valid set: {accuracy_score(cv['label'], pred_cv)}")
pred_test = hist.predict(test.iloc[:, :6780])
print(f"Accuracy in test set: {accuracy_score(test['label'], pred_test)}")
from sklearn.metrics import confusion_matrix
confusion_mat = confusion_matrix(test['label'], pred_test)
plotCM(['UP', 'DOWN', 'PRESERVE'], confusion_mat, 'hist_confusion_matrix')
res = res.append(pd.DataFrame([accuracy_score(test['label'], pred_test)] + \
            list(precision_score(test['label'],pred_test, average = None)) + \
            list(recall_score(test['label'],pred_test, average = None))).transpose())

# mlp
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(random_state=0)
clf_bc = BaggingClassifier(base_estimator=SVC(),
                           n_estimators=10,
                           random_state=0)
clf_bc.fit(x_train, y_train)
bc_pred = clf_bc.predict(x_test)
bc_matrices = evaluate_preds(clf_bc, x_test, y_test, bc_pred)
# ################################################ ExtraTreesClassifier
clf_etc = ExtraTreesClassifier()
clf_etc.fit(x_train, y_train)
etc_pred = clf_etc.predict(x_test)
et_matrices = evaluate_preds(clf_etc, x_test, y_test, etc_pred)
# ############################################################
# ############################################################ HistGradientBoostingClassifier
clf_hgbc = HistGradientBoostingClassifier()
clf_hgbc.fit(x_train, y_train)
hgbc_pred = clf_hgbc.predict(x_test)
hgb_matrices = evaluate_preds(clf_hgbc, x_test, y_test, hgbc_pred)
# ############################################################
# ############################################################ LogisticRegression
clf_lr = LogisticRegression()
clf_lr.fit(x_train, y_train)
clf_pred = clf_lr.predict(x_test)
lr_matrices = evaluate_preds(clf_lr, x_test, y_test, clf_pred)
# ############################################################
# ############################################################ StackingClassifier
clf_sc = StackingClassifier(estimators=estimators,
                            final_estimator=LogisticRegression())
clf_sc.fit(x_train, y_train)
clf_pred = clf_sc.predict(x_test)
sc_matrices = evaluate_preds(clf_sc, x_test, y_test, clf_pred)
# ############################################################
Example No. 18
print("recall")
print(recall)
print("f1score")
print(f1)
print("Confusion Matrix(Multilabel):")
print(sm.multilabel_confusion_matrix(y_test, y_predict))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_predict))
print("Classification Report:")
print(classification_report(y_test, y_predict))
"""
#HISTOGRAMBOOSTINGCLASSIFIER
print("HISTOGRAMBOOSTING_CLASSIFIER:")
Hgb = HistGradientBoostingClassifier()
Hgb.fit(x_train, y_train)
hgb_predict = Hgb.predict(x_test)
#print(y_test.head())
#print(hgb_predict)
acc = r2_score(y_test, hgb_predict)
accuracy = accuracy_score(y_test, hgb_predict)
recall = recall_score(y_test, hgb_predict, average='macro')
precision = precision_score(y_test,
                            hgb_predict,
                            pos_label=1,
                            average='macro',
                            sample_weight=None,
                            zero_division=0)
f1 = f1_score(y_test, hgb_predict, average='macro')
print("Histogram Gradient Boosting Classifier(r2_score):-")
print(acc)
print("Accuracy:")
    print("Evaluating classifiers...")

    print("#" * 128)
    print("Gradient Boosting Classifier:")
    print("Test:")
    print(metrics.classification_report(y_test, t.predict(X_test)))
    print(metrics.confusion_matrix(y_test, t.predict(X_test)))
    print("Training:")
    print(metrics.classification_report(y_train, t.predict(X_train)))
    print(metrics.confusion_matrix(y_train, t.predict(X_train)))

    print("#" * 128)
    print("Hist Gradient Boosting Classifier:")
    print("Test:")
    print(metrics.classification_report(y_test, e.predict(X_test)))
    print(metrics.confusion_matrix(y_test, e.predict(X_test)))
    print("Training:")
    print(metrics.classification_report(y_train, e.predict(X_train)))
    print(metrics.confusion_matrix(y_train, e.predict(X_train)))

    print("#" * 128)
    print("LightGBM Classifier:")
    p = lgb_model.predict(X_test)
    predictions = []

    for x in p:
        predictions.append(np.argmax(x))
    print("Test:")
    print(metrics.classification_report(y_test, predictions))
    print(metrics.confusion_matrix(y_test, predictions))
Example No. 20
def test_same_predictions_multiclass_classification(
        seed, min_samples_leaf, n_samples, max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 256
    lr = 1

    X, y = make_classification(n_samples=n_samples, n_classes=3, n_features=5,
                               n_informative=5, n_redundant=0,
                               n_clusters_per_class=1, random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss='categorical_crossentropy',
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=lr,
        n_iter_no_change=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > .89

    proba_lightgbm = est_lightgbm.predict_proba(X_train)
    proba_sklearn = est_sklearn.predict_proba(X_train)
    # assert more than 75% of the predicted probabilities are the same up to
    # the second decimal
    assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > .75

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)
    np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)

    if max_leaf_nodes < 10 and n_samples >= 1000:

        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > .89

        proba_lightgbm = est_lightgbm.predict_proba(X_test)
        proba_sklearn = est_sklearn.predict_proba(X_test)
        # assert more than 75% of the predicted probabilities are the same up
        # to the second decimal
        assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > .75

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
n_samples, n_features = data_train.shape
print(f"Training set with {n_samples} records with {n_features} features.")

print("Fitting a sklearn model...")
tic = time()
est = HistGradientBoostingClassifier(loss='binary_crossentropy',
                                     learning_rate=lr,
                                     max_iter=n_trees,
                                     max_bins=max_bins,
                                     max_leaf_nodes=n_leaf_nodes,
                                     n_iter_no_change=None,
                                     random_state=0,
                                     verbose=1)
est.fit(data_train, target_train)
toc = time()
predicted_test = est.predict(data_test)
predicted_proba_test = est.predict_proba(data_test)
roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1])
acc = accuracy_score(target_test, predicted_test)
print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}")

if args.lightgbm:
    print("Fitting a LightGBM model...")
    tic = time()
    lightgbm_est = get_equivalent_estimator(est, lib='lightgbm')
    lightgbm_est.fit(data_train, target_train)
    toc = time()
    predicted_test = lightgbm_est.predict(data_test)
    predicted_proba_test = lightgbm_est.predict_proba(data_test)
    roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1])
    acc = accuracy_score(target_test, predicted_test)
Example No. 22
    min_samples_leaf=msl,
),
                        param_grid=param6,
                        scoring=scoring,
                        n_jobs=-1,
                        cv=5)
gsearch6.fit(X_train, y_train)
print('best_params:{0}  best_score:{1}'.format(gsearch6.best_params_,
                                               gsearch6.best_score_))
l2r = gsearch6.best_params_['l2_regularization']
# best_params:{'l2_regularization': 0.30000000000000004}  best_score:0.9780450886460196

hgdbt = HistGradientBoostingClassifier(random_state=10,
                                       learning_rate=lr,
                                       max_iter=mi,
                                       max_leaf_nodes=mln,
                                       max_depth=md,
                                       min_samples_leaf=msl,
                                       l2_regularization=l2r)
hgdbt.fit(X_train, y_train)
y_pred = hgdbt.predict(X_test)
c_m = metrics.confusion_matrix(y_test, y_pred)
print('True negatives: {0}\nFalse negatives: {1}\nTrue positives: {2}\nFalse positives: {3}\n'.format(
    c_m[0][0], c_m[1][0], c_m[1][1], c_m[0][1]))
print("Recall: %.4f" % metrics.recall_score(y_test, y_pred))
print("Precision: %.4f" % metrics.precision_score(y_test, y_pred))
print("F1:%.4f" % metrics.f1_score(y_test, y_pred))
print("roc_auc:%.4f" % metrics.roc_auc_score(y_test, y_pred))
print("F-measure:%.4f" % (metrics.recall_score(y_test, y_pred) *
                          metrics.precision_score(y_test, y_pred)))
Example No. 23
                           scoring='roc_auc',
                           cv = 3,
                           verbose = 10,
                           n_jobs = -1)

start_time = time.time()
grid_search = grid_search.fit(X_train, y_train)
print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
grid_search.best_params_, grid_search.best_score_


# last step
clf_hgb = grid_search.best_estimator_
clf_hgb.fit(X_train, y_train)

y_pred = clf_hgb.predict(X_test)
print(classification_report(y_test, y_pred))

y_pred = clf_hgb.predict_proba(X_test)[:, 1]
print('HGB AUC_ROC: %.3f' % roc_auc_score(y_test, y_pred))


# KF & RS
parameters = {'learning_rate': uniform(0,0.1), 
              'max_depth':sp_randint(3, 11),
              'max_leaf_nodes':sp_randint(2, 32),
              'min_samples_leaf':sp_randint(1, 11),
              'max_iter':[400,600,800,1000,1200],
              'l2_regularization':uniform(0,0.1)}

rand_search = RandomizedSearchCV(estimator = clf_hgb,
Example No. 24
x_test = scaler.transform(x_test)

# build the model
model = HistGradientBoostingClassifier(verbose=1, random_state=42, validation_fraction=0.2)
model.fit(x_train, y_train)

# model & weight save
pickle.dump(model, open('C:\\nmb\\nmb_data\\cp\\5s_last_0510_ml\\HBC_4_val2.data', 'wb')) # wb : write
print("== save complete ==")

# model load
# model = pickle.load(open('C:\\nmb\\nmb_data\\cp\\5s_last_0510_ml\\HBC_4_val2.data', 'rb'))  # rb : read
# time >>  

# evaluate
y_pred = model.predict(x_test)
# print(y_pred[:100])
# print(y_pred[100:])

accuracy = accuracy_score(y_test, y_pred)
loss = log_loss(y_test, y_pred)  # renamed so the imported log_loss function is not shadowed

print("log_loss : \t", loss)                        # similar in concept to cross-entropy loss
print("accuracy : \t", accuracy)

pred = ['C:\\nmb\\nmb_data\\5s_last_0510\\predict_04_26\\F', 'C:\\nmb\\nmb_data\\5s_last_0510\\predict_04_26\\M']

count_f = 0
count_m = 0

for pred_pathAudio in pred:
Example No. 25
        print(
            classification_report(y_test,
                                  y_pred,
                                  target_names=["water", "floating objects"]))

        #### Hist-based Gradient Boosting Classifier ####
        from sklearn.experimental import enable_hist_gradient_boosting  # noqa
        from sklearn.ensemble import HistGradientBoostingClassifier
        #x,y = draw_N_datapoints(dataset, N=1000)
        clf_hgb = HistGradientBoostingClassifier()
        X_train, X_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            test_size=0.33,
                                                            random_state=42)
        clf_hgb.fit(X_train, y_train)
        y_pred = clf_hgb.predict(X_test)
        print(
            classification_report(y_test,
                                  y_pred,
                                  target_names=["water", "floating objects"]))

    ######## Trained model
    # path to the model
    #model_path=os.environ['HOME'] + '/remote/floatingobjects/models/model_24_12_2020.pth.tar'
    #model_path=os.environ['HOME'] + '/remote/floatingobjects/models/model_19_01_2021.pth.tar'
    #model_path=os.environ['HOME'] + '/remote/floatingobjects/models/model_ratio10_22_01_2021.pth.tar'
    model_path = f'models/{net}-cross-val-2fold/model_{seed}.pth.tar'
    print(model_path)

    #model = UNet(n_channels=12, n_classes=1, bilinear=False).to(device)
    model = get_model(net, inchannels=12).to(device)
Example No. 26
    #%% evaluate performance with training data
    eval_reg = HistGradientBoostingRegressor(random_state=1129)
    eval_reg.fit(X_train.copy(), y_train_adr.copy())
    print("-" * 10, "regression report", "-" * 10)
    report = regression_report(
        y_test_adr.copy(), eval_reg.predict(X_test.copy()), X_test.shape[1]
    )
    print(report)

    # eval_clf = RandomForestClassifier(random_state=1129)
    eval_clf = HistGradientBoostingClassifier(random_state=1129)
    eval_clf.fit(X_train.copy(), y_train_canceled.copy())
    print("-" * 10, "classification report", "-" * 10)
    report = classification_report(
        y_test_canceled.copy(), eval_clf.predict(X_test.copy())
    )
    print(report)

    #%%
    pred_df = predict(eval_clf, eval_reg, X_test_df)
    pred_label_df = data.to_label(pred_df)
    label_df = data.get_true_label(columns=["adr", "revenue", "is_canceled", "label"])

    print("[ label evaluation ]")
    report_label = evaluate_by_label(pred_label_df, label_df, target="label")
    print(report_label)
    print("[ revenue_per_day evaluation ]")
    report_revenue = evaluate_by_label(pred_label_df, label_df, target="revenue")
    print(report_revenue)
Example No. 27
print("recall")
print(recall)
print("f1score")
print(f1)
print("Confusion Matrix(Multilabel):")
print(sm.multilabel_confusion_matrix(y_test, y_predict))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_predict))
print("Classification Report:")
print(classification_report(y_test, y_predict))
"""
#HISTOGRAMBOOSTINGCLASSIFIER
print("HISTOGRAMBOOSTING_CLASSIFIER:")
Hgb = HistGradientBoostingClassifier()
Hgb.fit(x_train, y_train)
hgb_predict = Hgb.predict(x_test)
#print(y_test.head())
#print(hgb_predict)
acc = r2_score(y_test, hgb_predict)
accuracy = accuracy_score(y_test, hgb_predict)
recall = recall_score(y_test, hgb_predict, average='macro')
precision = precision_score(y_test,
                            hgb_predict,
                            pos_label=1,
                            average='macro',
                            sample_weight=None,
                            zero_division=0)
f1 = f1_score(y_test, hgb_predict, average='macro')
print("Histogram Gradient Boosting Classifier(r2_score):-")
print(acc)
print("Accuracy:")
df[(df.Timestamp >= 200) & (df.Timestamp <= 330)].YawRate.dropna().plot()
plt.title("An period with both normal and attacks of YawRate, can you tell which is which?")
plt.show()

df[(df.Timestamp > 315) & (df.Timestamp < 316)].YawRate.dropna().plot()
plt.title("An attack window on YawRate, zooming in to show zig-zagging between real vs injected values ")
plt.show()

Let’s first try gradient-boosted trees; for example, sklearn’s HistGradientBoostingClassifier can work well on larger datasets before we bring out bigger guns.

import sklearn.metrics
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

gbc = HistGradientBoostingClassifier(max_iter=500).fit(df[FEATURES], df.Label == "Attack")

ypred = gbc.predict(df2[FEATURES])

cf = sklearn.metrics.confusion_matrix(df2.Label == "Attack", ypred)
print(sklearn.metrics.accuracy_score(df2.Label == "Attack", ypred))
print(cf)

print("Accuracy = %s " % sklearn.metrics.accuracy_score(df2.Label == "Attack", ypred))

### 2c. Deep Learning and using a H1ST Model API, organizing, importing, saving & loading

We can bring out bigger guns like bidirectional LSTMs, CNNs, or Transformers, which work well on pattern-recognition problems over sequential data such as this one. One such model is available in the full tutorial source code package, and it can reach quite impressive accuracy.

Let's see how we could use it!

import h1st as h1
h1.init()
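As a rough sketch of how such a model could be driven through the same API (reusing the load_data/prep/train/evaluate methods from the GradientBoostingMsgClassifierModel example above; the deep-learning model shipped with the full tutorial may use different class and method names):

# Hypothetical end-to-end usage; the names mirror the earlier h1.MLModel example.
model = GradientBoostingMsgClassifierModel()
data = model.load_data(num_files=10)
prepared = model.prep(data)
model.train(prepared)
model.evaluate(prepared)
print(model.metrics["accuracy"])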
Example No. 29
    categorical_columns = [
        column for column in categorical_columns
        if column != target_column_name
    ]
    model = CatBoostClassifier(cat_features=categorical_columns,
                               grow_policy='Lossguide',
                               learning_rate=0.1,
                               n_estimators=100,
                               num_leaves=255,
                               train_dir='data/catboost_info',
                               verbose=False)
    model.fit(features_train, labels_train, silent=True)

# Make predictions on the test data.
if args.library == 'h2o':
    predictions_proba = model.predict(data_test).as_data_frame()['Y']
else:
    predictions_proba = model.predict_proba(features_test)[:, 1]

# Compute metrics.
auc_roc = roc_auc_score(labels_test, predictions_proba)

# Compute memory usage.
f = open("/proc/self/status", "r")
for line in f.readlines():
    if line.startswith("VmHWM"):
        memory = line.split(":")[1].strip()

print(json.dumps({
    'auc_roc': auc_roc,
    'memory': memory,
Example No. 30
def HGB():
    actions = [
        'change_lane', 'pull_over', 'slow', 'stop', 'straight', 'turn_left',
        'turn_right', 'wait_to_turn_left'
    ]

    data_points = []
    with open('/Users/zephyryau/Documents/study/INF552/Project/data.csv',
              'r') as fd:
        for row in fd:
            row_list = row[:-1].split(',')
            sample = [float(i) for i in row_list[:-1]]
            sample.append(actions.index(row_list[-1]))
            if len(sample) == 76:
                data_points.append(sample)

    data_points_xycl = np.array(data_points)
    data_points_xyc = data_points_xycl[:, :-1]
    y = data_points_xycl[:, -1]

    # centralize datapoints and normalize
    data_points_xy_cent = []
    for row in data_points_xyc:
        # print(row)
        avg_x = row[3]
        avg_y = row[4]
        head_length = ((row[0] - row[3])**2 + (row[1] - row[4])**2)**0.5
        shoulder_length = ((row[3] - row[6])**2 + (row[4] - row[7])**2)**0.5
        new_row = []
        for i in range(16):  # first 16 points
            new_row.append((row[3 * i] - avg_x) / shoulder_length)
            new_row.append((row[3 * i + 1] - avg_y) / head_length)
            new_row.append(row[3 * i + 2])  # conf

        data_points_xy_cent.append(new_row)

    result_point = []
    with open(
            '/Users/zephyryau/Documents/study/INF552/Project/input_picture/result.csv',
            'r') as fd:
        for row in fd:
            row_list = row[:-1].split(',')
            sample = [float(i) for i in row_list[:-1]]
            sample.append(actions.index(row_list[-1]))
            if len(sample) == 76:
                result_point.append(sample)

    result_point_xycl = np.array(result_point)
    result_point_xyc = result_point_xycl[:, :-1]
    result_point_y = result_point_xycl[:, -1]

    result_point_xy_cent = []
    for row in result_point_xyc:
        # print(row)
        avg_x = row[3]
        avg_y = row[4]
        head_length = ((row[0] - row[3])**2 + (row[1] - row[4])**2)**0.5
        shoulder_length = ((row[3] - row[6])**2 + (row[4] - row[7])**2)**0.5
        new_row = []
        for i in range(16):  # first 16 points
            new_row.append((row[3 * i] - avg_x) / shoulder_length)
            new_row.append((row[3 * i + 1] - avg_y) / head_length)
            new_row.append(row[3 * i + 2])  # conf

        result_point_xy_cent.append(new_row)
    '''sum = 0
    gesture_results = []
    for i in range(100):
        data_points_xy_train, data_points_xy_test, y_train, y_test = train_test_split(data_points_xy_cent, y, test_size=0.3)
        clf = MLPClassifier(hidden_layer_sizes=(512,))
        clf.fit(data_points_xy_train, y_train)
        gesture_results.append(clf.predict([result_point_xy_cent[0]])[0])
        score = clf.score(data_points_xy_test, y_test)
        #print(score)
        sum += score'''

    X_train, X_test, y_train, y_test = train_test_split(data_points_xy_cent,
                                                        y,
                                                        test_size=0.4)
    scaler = preprocessing.StandardScaler().fit(X_train)
    #print(scaler.mean_)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    r_X_scaled = scaler.transform(result_point_xy_cent)

    sum = 0
    clf = HistGradientBoostingClassifier()
    for i in range(10):
        X_train, X_test, y_train, y_test = train_test_split(
            data_points_xy_cent, y, test_size=0.4)
        scaler = preprocessing.StandardScaler().fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        clf.fit(X_train_scaled, y_train)
        #print(clf.n_iter_, end=" ")
        score_train = clf.score(X_train_scaled, y_train)
        #print(score_train, end=" ")
        score_test = clf.score(X_test_scaled, y_test)
        sum += score_test  #print(score_test)

    tf = (clf.predict([r_X_scaled[0]])[0] == result_point_y[0])

    return clf.predict([r_X_scaled[0]])[0], sum / 10, tf
# -------------------------------------------------------
#
# The :class:`ensemble.HistGradientBoostingClassifier`
# and :class:`ensemble.HistGradientBoostingRegressor` now have native
# support for missing values (NaNs). This means that there is no need for
# imputing data when training or predicting.

from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
import numpy as np

X = np.array([0, 1, 2, np.nan]).reshape(-1, 1)
y = [0, 0, 1, 1]

gbdt = HistGradientBoostingClassifier(min_samples_leaf=1).fit(X, y)
print(gbdt.predict(X))

# %%
# Precomputed sparse nearest neighbors graph
# ------------------------------------------
# Most estimators based on nearest neighbors graphs now accept precomputed
# sparse graphs as input, to reuse the same graph for multiple estimator fits.
# To use this feature in a pipeline, one can use the `memory` parameter, along
# with one of the two new transformers,
# :class:`neighbors.KNeighborsTransformer` and
# :class:`neighbors.RadiusNeighborsTransformer`. The precomputation
# can also be performed by custom estimators to use alternative
# implementations, such as approximate nearest neighbors methods.
# See more details in the :ref:`User Guide <neighbors_transformer>`.

from tempfile import TemporaryDirectory
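A minimal sketch of the caching pattern described above; KNeighborsTransformer and the pipeline memory parameter come from the text, while the use of Isomap with metric='precomputed', make_classification, and the specific parameter values are illustrative assumptions:

from sklearn.datasets import make_classification
from sklearn.manifold import Isomap
from sklearn.neighbors import KNeighborsTransformer
from sklearn.pipeline import make_pipeline

X, y = make_classification(random_state=0)

with TemporaryDirectory(prefix="sklearn_graph_cache_") as tmpdir:
    # Cache the precomputed sparse neighbors graph so downstream fits can reuse it.
    estimator = make_pipeline(
        KNeighborsTransformer(n_neighbors=10, mode='distance'),
        Isomap(n_neighbors=10, metric='precomputed'),
        memory=tmpdir)
    estimator.fit(X)

    # Changing a parameter of the downstream step does not recompute the cached graph.
    estimator.set_params(isomap__n_neighbors=5)
    estimator.fit(X)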
        p = clf.predict_proba(data.loc[:, data.columns != 'Label'])[:, 1]  # prob for class=1 (target)
        p = pd.DataFrame({'p-value': p})
        data.reset_index(drop=True, inplace=True)
        p.reset_index(drop=True, inplace=True)
        data2 = pd.concat([data, p], axis=1)
        data2 = calcQ(data2, scoreColName="p-value")
        data2["Rank"] = 1
        # store best fit
        nXLauc, XLauc = evalXL(data2, plot=False, maxQ=0.1)
        print("pAUC(peptides), pAUC(XLs): " + str(nXLauc) + "\t" + str(XLauc))
        print("sum(pAUC): " + str(nXLauc + XLauc))
        print("Confusion matrix:")
        print(confusion_matrix(y, clf.predict(X)))

        if nXLauc + 10.0 * XLauc > best_nXLauc + 10.0 * best_XLauc:  # we weight XL auc higher than peptide auc
            best_nXLauc = nXLauc
            best_XLauc = XLauc
            best_alpha = alpha
            best_beta = beta
            best_clf = deepcopy(clf)

print("Best alpha, beta: " + str(alpha) + "\t" + str(beta))
print("pAUC(peptides), pAUC(XLs): " + str(best_nXLauc) + "\t" +
      str(best_XLauc))
print("sum(pAUC): " + str(best_nXLauc + best_XLauc))

p = best_clf.predict_proba(
    data.loc[:, data.columns != 'Label'])[:, 1]  # prob for class=1 (target)