def test_zero_sample_weights_classification():
    # Make sure setting a SW to zero amounts to ignoring the corresponding
    # sample
    X = [[1, 0],
         [1, 0],
         [1, 0],
         [0, 1]]
    y = [0, 0, 1, 0]
    # ignore the first 2 training samples by setting their weight to 0
    sample_weight = [0, 0, 1, 1]
    gb = HistGradientBoostingClassifier(loss='binary_crossentropy',
                                        min_samples_leaf=1)
    gb.fit(X, y, sample_weight=sample_weight)
    assert_array_equal(gb.predict([[1, 0]]), [1])

    X = [[1, 0],
         [1, 0],
         [1, 0],
         [0, 1],
         [1, 1]]
    y = [0, 0, 1, 0, 2]
    # ignore the first 2 training samples by setting their weight to 0
    sample_weight = [0, 0, 1, 1, 1]
    gb = HistGradientBoostingClassifier(loss='categorical_crossentropy',
                                        min_samples_leaf=1)
    gb.fit(X, y, sample_weight=sample_weight)
    assert_array_equal(gb.predict([[1, 0]]), [1])
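For context, the behavior this test pins down can be checked directly: fitting with zero weights on some rows should behave like fitting on the remaining rows alone. A minimal, self-contained sketch (the side-by-side comparison and the expected outputs are illustrative, not part of the original suite):

import numpy as np
from sklearn.experimental import enable_hist_gradient_boosting  # noqa: only needed on scikit-learn < 1.0
from sklearn.ensemble import HistGradientBoostingClassifier

X = np.array([[1, 0], [1, 0], [1, 0], [0, 1]])
y = np.array([0, 0, 1, 0])

# Zero out the first two samples...
gb_weighted = HistGradientBoostingClassifier(min_samples_leaf=1)
gb_weighted.fit(X, y, sample_weight=[0, 0, 1, 1])

# ...which should behave like training on the last two samples only.
gb_subset = HistGradientBoostingClassifier(min_samples_leaf=1)
gb_subset.fit(X[2:], y[2:])

print(gb_weighted.predict(X))  # expected: [1 1 1 0]
print(gb_subset.predict(X))    # expected to match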
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification
    pytest.importorskip("lightgbm")

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 255

    X, y = make_classification(
        n_samples=n_samples,
        n_classes=2,
        n_features=5,
        n_informative=5,
        n_redundant=0,
        random_state=0,
    )

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss="binary_crossentropy",
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        early_stopping=False,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
    )
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib="lightgbm")

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)
    np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn)

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
class GradientBoostingMsgClassifierModel(h1.MLModel):
    def load_data(self, num_files=None):
        return utils.load_data(num_files, shuffle=False)

    def prep(self, data):
        def concat_processed_files(files):
            dfs = []
            for f in files:
                z = pd.read_parquet(f)
                z = utils.compute_timediff_fillna(z, dropna_subset=FEATURES)
                dfs.append(z)
            df2 = pd.concat(dfs)
            return df2

        split = int(len(data["attack_files"]) * 0.5)
        train_files = data["attack_files"][:split]
        test_files = data["attack_files"][split:]
        result = {
            "train_files": train_files,
            "test_files": test_files,
            "train_attack_df": concat_processed_files(train_files),
            "test_attack_df": concat_processed_files(test_files)
        }
        print("len train_attack_df = %s" % len(result["train_attack_df"]))
        print("len test_attack_df = %s" % len(result["test_attack_df"]))
        return result

    def train(self, prepared_data):
        df = prepared_data["train_attack_df"]
        from sklearn.experimental import enable_hist_gradient_boosting
        from sklearn.ensemble import HistGradientBoostingClassifier
        X = df[FEATURES]
        y = df.Label == config.ATTACK_LABEL
        self.base_model = HistGradientBoostingClassifier(max_iter=500).fit(X, y)

    def evaluate(self, prepared_data):
        df = prepared_data["test_attack_df"]
        ypred = self.base_model.predict(df[FEATURES])
        import sklearn.metrics
        cf = sklearn.metrics.confusion_matrix(df.Label == config.ATTACK_LABEL, ypred)
        acc = sklearn.metrics.accuracy_score(df.Label == config.ATTACK_LABEL, ypred)
        print(cf)
        print("Accuracy = %.4f" % acc)
        self.metrics = {"confusion_matrix": cf, "accuracy": acc}

    def predict(self, data):
        df = data["df"].copy()
        df = utils.compute_timediff_fillna(df)
        df['MsgIsAttack'] = 0
        df['WindowInAttack'] = 0
        for event_result in data["event_detection_results"]:
            if event_result['WindowInAttack']:
                # print("window %s in attack: event_result = %s" % (event_result['window_start'], event_result))
                in_window = (df.Timestamp >= event_result['window_start']) & \
                            (df.Timestamp < event_result['window_start'] + config.WINDOW_SIZE)
                w_df = df[in_window]
                if len(w_df) > 0:
                    ypred = self.base_model.predict(w_df[FEATURES])
                    df.loc[in_window, "WindowInAttack"] = 1
                    df.loc[in_window, "MsgIsAttack"] = ypred.astype(int)
        return {"injection_window_results": df}
class GradientBoostingMsgClassifierModel(h1.Model):
    def load_data(self, num_samples=None):
        return util.load_data_daic(num_samples, shuffle=True)

    def prep_data(self, data):
        # concat multiple files into separate training/test pd.DataFrame
        def concat_processed_files(files):
            dfs = []
            for f in files:
                z = pd.read_csv(f)
                z.columns = ['Timestamp', 'Label', 'CarSpeed',
                             'SteeringAngle', 'YawRate', 'Gx', 'Gy']
                z = util.compute_timediff_fillna(z)
                dfs.append(z)
            df2 = pd.concat(dfs)
            return df2

        return {
            "train_attack_df": concat_processed_files(data["train_attack_files"]),
            "test_attack_df": concat_processed_files(data["test_attack_files"])
        }

    def train(self, prepared_data):
        df = prepared_data["train_attack_df"]
        from sklearn.experimental import enable_hist_gradient_boosting
        from sklearn.ensemble import HistGradientBoostingClassifier
        X = df[FEATURES]
        y = df.Label == "Tx"
        self.model = HistGradientBoostingClassifier(max_iter=500).fit(X, y)

    def evaluate(self, prepared_data):  # was `data`, but the body expects `prepared_data`
        df = prepared_data["test_attack_df"]
        ypred = self.model.predict(df[FEATURES])
        import sklearn.metrics
        cf = sklearn.metrics.confusion_matrix(df.Label == "Tx", ypred)
        acc = sklearn.metrics.accuracy_score(df.Label == "Tx", ypred)
        print(cf)
        print("Accuracy = %.4f" % acc)
        self.metrics = {"confusion_matrix": cf, "accuracy": acc}

    def predict(self, data):
        df = data["df"].copy()
        df = util.compute_timediff_fillna(df)
        df['MsgIsAttack'] = 0
        df['WindowInAttack'] = 0
        for event_result in data["event_detection_results"]:
            if event_result['WindowInAttack']:
                # print("window %s in attack: event_result = %s" % (event_result['window_start'], event_result))
                in_window = (df.Timestamp >= event_result['window_start']) & \
                            (df.Timestamp < event_result['window_start'] + WINDOW_SIZE)
                w_df = df[in_window]
                ypred = self.model.predict(w_df[FEATURES])
                df.loc[in_window, "WindowInAttack"] = 1
                df.loc[in_window, "MsgIsAttack"] = ypred.astype(int)
        return {"injection_window_results": df}
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 256

    X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=5,
                               n_informative=5, n_redundant=0, random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss='binary_crossentropy',
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        n_iter_no_change=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > .89

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)
    np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn)

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > .89

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
def k_fold_cross_val(Xs, y_var, k=10):
    clf = tree.DecisionTreeClassifier()
    clf_forest = RandomForestClassifier(n_estimators=10)
    clf_boost = HistGradientBoostingClassifier()

    num_folds = k
    N = Xs.shape[0]
    test_size = int(N / num_folds)
    test_idxs = np.random.permutation(N)[:num_folds * test_size].reshape(
        num_folds, test_size)

    total_score = np.asarray([0., 0., 0.])
    total_F1_score = np.asarray([0., 0., 0.])
    for i in range(num_folds):
        print("Iteration " + str(i) + ":")
        test_i = Xs.index.isin(test_idxs[i])
        df_train, df_test = Xs[~test_i], Xs[test_i]
        y_train, y_test = y_var[~test_i], y_var[test_i]

        clf = clf.fit(df_train.to_numpy(), y_train.to_numpy().ravel())
        score_b = clf.score(df_test.to_numpy(), y_test.to_numpy().ravel())
        clf_forest = clf_forest.fit(df_train.to_numpy(), y_train.to_numpy().ravel())
        score_f = clf_forest.score(df_test.to_numpy(), y_test.to_numpy().ravel())
        clf_boost = clf_boost.fit(df_train.to_numpy(), y_train.to_numpy().ravel())
        score_h = clf_boost.score(df_test.to_numpy(), y_test.to_numpy().ravel())

        y_hat = clf.predict(df_test.to_numpy())
        f1_b = f1_score(y_test.to_numpy().ravel(), y_hat, average='binary')
        print("F1 score (tree):", f1_b)
        y_hat = clf_forest.predict(df_test.to_numpy())
        f1_f = f1_score(y_test.to_numpy().ravel(), y_hat, average='binary')
        print("F1 score (forest):", f1_f)
        y_hat = clf_boost.predict(df_test.to_numpy())
        f1_boost = f1_score(y_test.to_numpy().ravel(), y_hat, average='binary')
        print("F1 score (boost):", f1_boost)

        print("Prediction scores for (tree,forest,boost):", score_b, score_f, score_h)
        total_score += np.asarray([score_b, score_f, score_h])
        total_F1_score += np.asarray([f1_b, f1_f, f1_boost])

    print("Avg. accuracy scores for (tree,forest,boost):", total_score / num_folds)
    print("Avg. F1 scores for (tree,forest,boost):", total_F1_score / num_folds)
    return clf, clf_forest, clf_boost
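The manual fold bookkeeping above can also be delegated to scikit-learn. A minimal sketch of the same three-model comparison with cross_validate, assuming scikit-learn >= 1.0 (where the estimator imports directly) and the same Xs/y_var inputs; note each model is cloned and refit per fold rather than one shared instance being refit in place:

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.model_selection import cross_validate

def k_fold_cross_val_builtin(Xs, y_var, k=10):
    # Score each model on accuracy and binary F1 across k folds.
    models = [("tree", tree.DecisionTreeClassifier()),
              ("forest", RandomForestClassifier(n_estimators=10)),
              ("boost", HistGradientBoostingClassifier())]
    for name, model in models:
        res = cross_validate(model, Xs.to_numpy(), y_var.to_numpy().ravel(),
                             cv=k, scoring=("accuracy", "f1"))
        print("Avg. accuracy (%s):" % name, res["test_accuracy"].mean())
        print("Avg. F1 (%s):" % name, res["test_f1"].mean())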
def k_fold_trainning(rawdata, n_folds=5):
    cv = StratifiedKFold(n_splits=n_folds, shuffle=True)
    target = np.array(rawdata[0].values)
    lure = np.array(rawdata[1].values)
    y = np.array(rawdata['label'].values)

    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)
    fig, ax = plt.subplots()
    for i, (train, test) in enumerate(cv.split(target, lure, y)):
        print('----------------Training Fold %d---------------' % (i + 1))
        X_train = pd.DataFrame({0: target[train], 1: lure[train]})
        X_test = pd.DataFrame({0: target[test], 1: lure[test]})
        pmfm = create_pmfm(X_train, y[train])
        train_feature = X_train.apply(feature_Encoding, axis=1, args=(0, 1, pmfm)).values
        test_feature = X_test.apply(feature_Encoding, axis=1, args=(0, 1, pmfm)).values
        clf = HistGradientBoostingClassifier(max_iter=500,
                                             learning_rate=0.9,
                                             max_depth=6,
                                             l2_regularization=100)
        train_data = np.matrix([train_feature[i] for i in range(train_feature.shape[0])])
        test_data = np.matrix([test_feature[i] for i in range(test_feature.shape[0])])
        clf.fit(train_data, y[train])
        pred = clf.predict(test_data)
        evaluate(y[test], pred)
        viz = plot_roc_curve(clf, test_data, y[test],
                             name='ROC fold {}'.format(i + 1),
                             alpha=0.5, lw=1, ax=ax)
        interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        aucs.append(viz.roc_auc)

    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
            label='Chance', alpha=.8)
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    ax.plot(mean_fpr, mean_tpr, color='b',
            label=r'Mean ROC (AUC = %0.3f $\pm$ %0.3f)' % (mean_auc, std_auc),
            lw=2, alpha=.8)
    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                    label=r'$\pm$ 1 std. dev.')
    ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
           title="Receiver operating characteristic curve")
    ax.legend(loc="lower right")
    plt.savefig('roc.png', dpi=300)
def clasificar_HistGradientBoostingClassifier(X, y, df, trainInputs, trainOutputs,
                                              testInputs, testOutputs, graphname):
    print("\n[" + str(graphname) + "]")
    scoreArray = np.array([])
    clf = HistGradientBoostingClassifier()
    scores = cross_val_score(clf, X, y, cv=10)
    clf = clf.fit(trainInputs, trainOutputs)
    precisionTrain = clf.score(trainInputs, trainOutputs)
    precisionTest = clf.score(testInputs, testOutputs)
    print("\tCCR train = %.2f%% | CCR test = %.2f%%" %
          (precisionTrain * 100, precisionTest * 100))
    prediccion_test = clf.predict(testInputs)
    print(prediccion_test)
    print(testOutputs)
    return precisionTest
def gradient_boost(train_data, test_data):
    train_y = train_data['state']
    train_X = train_data.iloc[:, FEATURES_INDICES]
    test_y = test_data['state']
    test_X = test_data.iloc[:, FEATURES_INDICES]
    # search(train_X, train_y)
    # search_xgboost(train_X, train_y)
    gd = HistGradientBoostingClassifier(loss='auto', max_bins=200,
                                        max_depth=10, max_leaf_nodes=35)
    # gd = XGBClassifier()
    gd.fit(train_X, train_y)
    pred_y = gd.predict(test_X)
    evaluate(gd, test_X, test_y, pred_y)
def main():
    # loading the dataset from sklearn.datasets
    df_cancer = load_breast_cancer()
    print(df_cancer.keys())
    X = df_cancer.data
    y = df_cancer.target
    print("number of classes are: ", np.unique(y))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                        random_state=0)
    # create an instance of HistGradientBoostingClassifier
    hist = HistGradientBoostingClassifier()
    # training the model
    hist.fit(X_train, y_train)
    y_pred = hist.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("accuracy of the model is: ", accuracy)
    clr = classification_report(y_test, y_pred)
    print("Classification report is:", clr)
def test_on_target(rawdata, sitename):
    print('------------Testing on %s-----------' % sitename)
    target_info = pd.read_csv("target_info.csv")
    if sitename in target_info['Site'].values:
        target_dict = target_info.set_index('Site').T.to_dict()
        sequence = target_dict[sitename]['Sequence']
        train_data = rawdata[rawdata[0] != sequence]
        test_data = rawdata[rawdata[0] == sequence]
        X_train, y_train = create_Input(train_data)
        X_test, y_test = create_Input(test_data)
        pmfm = create_pmfm(X_train, y_train)
        train_feature = X_train.apply(feature_Encoding, axis=1, args=(0, 1, pmfm)).values
        test_feature = X_test.apply(feature_Encoding, axis=1, args=(0, 1, pmfm)).values
        clf = HistGradientBoostingClassifier(max_iter=500,
                                             learning_rate=0.9,
                                             max_depth=6,
                                             l2_regularization=100)
        train_matrix = np.matrix([train_feature[i] for i in range(train_feature.shape[0])])
        test_matrix = np.matrix([test_feature[i] for i in range(test_feature.shape[0])])
        clf.fit(train_matrix, y_train)
        pred = clf.predict(test_matrix)
        evaluate(y_test, pred)
    else:
        print('ERROR: INCORRECT SITE NAME')
def automatedHistGB(train_X, train_y, test_X, test_y):
    """Executes Histogram-based Gradient Boosting Classifier.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test features.
    train_y, test_y : numpy arrays
        Train and test targets.

    Returns
    -------
    multiclass_RocAuc_Score : float
        AUC score calculated by multiclass_RocAuc_Score.
    """
    log_it('Module: automatedHistGB', 'Starting')
    param_grid = {'max_iter': [1000, 1200, 1500],
                  'learning_rate': [0.1],
                  'max_depth': [25, 50, 75]}
    model = HistGradientBoostingClassifier()
    model = run_RandomSearch(train_X, train_y, model, param_grid)
    pred = model.predict(test_X)
    return multiclass_RocAuc_Score(test_y, pred)
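run_RandomSearch and multiclass_RocAuc_Score are project helpers not shown in this excerpt. A minimal sketch of what run_RandomSearch plausibly wraps, assuming it is a thin layer over RandomizedSearchCV that returns the refitted best estimator (the name, signature, and defaults here are assumptions, not the project's actual code):

from sklearn.model_selection import RandomizedSearchCV

def run_RandomSearch(train_X, train_y, model, param_grid, n_iter=5, cv=3):
    # Sample n_iter parameter combinations from param_grid and keep the best.
    search = RandomizedSearchCV(model, param_distributions=param_grid,
                                n_iter=n_iter, cv=cv, n_jobs=-1)
    search.fit(train_X, train_y)
    return search.best_estimator_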
n_estimators = n_trees // n_classes if objective == 'categorical_crossentropy' else n_trees

print("Currently processing {}...".format(dataset))
model = HistGradientBoostingClassifier(
    max_iter=n_estimators,
    loss=objective,
    validation_fraction=None
)

tic = time.time()
model.fit(X_train, y_train)
toc = time.time()
training_time = toc - tic

tic = time.time()
y_pred = model.predict(X_test)
toc = time.time()
testing_time = toc - tic

testing_acc = accuracy_score(y_test, y_pred)
records.append((dataset, training_time, testing_time, testing_acc))

# Write a log file (the with-block closes the file automatically)
with open("all_hgbdt_classification.txt", 'w') as file:
    for dataset, training_time, testing_time, testing_acc in records:
        string = "{}\t{:.5f}\t{:.5f}\t{:.5f}\n".format(
            dataset, training_time, testing_time, testing_acc)
        file.write(string)
    max_features=max_features,
    verbose=0,
    warm_start=warm_start,
    presort='deprecated')
clf.fit(X_train, y_train)

print("n_estimators : ", n_estimators)
print("learning rate :", lr)
print("Accuracy score (training): {0:.3f}".format(
    clf.score(X_train, y_train)))
print("Accuracy score (validation): {0:.3f}".format(
    clf.score(X_test, y_test)))
print("\n")

filename = 'LGBM' + str(n_estimators) + str(lr) + str(
    max_depth) + str(max_features) + str(warm_start) + str(
    clf.score(X_test, y_test)) + "%" + '.sav'
if clf.score(X_test, y_test) > 0.93:
    pickle.dump(clf, open(filename, 'wb'))

y_predict = clf.predict(X_test)
score = accuracy_score(y_test, y_predict)
print("n_estimators = ", n_estimators)
print("max_features = ", max_features)
print("warm_start = ", warm_start)
print(score)
print(confusion_matrix(y_test, y_predict))

filename = 'model.sav'
pickle.dump(clf, open(filename, 'wb'))
n_samples, n_features = data_train.shape
print(f"Training set with {n_samples} records with {n_features} features.")

print("Fitting a sklearn model...")
tic = time()
est = HistGradientBoostingClassifier(loss='binary_crossentropy',
                                     learning_rate=lr,
                                     max_iter=n_trees,
                                     max_bins=max_bins,
                                     max_leaf_nodes=n_leaf_nodes,
                                     early_stopping=False,
                                     random_state=0,
                                     verbose=1)
est.fit(data_train, target_train)
toc = time()
predicted_test = est.predict(data_test)
predicted_proba_test = est.predict_proba(data_test)
roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1])
acc = accuracy_score(target_test, predicted_test)
print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc:.4f}")

if args.lightgbm:
    print("Fitting a LightGBM model...")
    tic = time()
    lightgbm_est = get_equivalent_estimator(est, lib='lightgbm')
    lightgbm_est.fit(data_train, target_train)
    toc = time()
    predicted_test = lightgbm_est.predict(data_test)
    predicted_proba_test = lightgbm_est.predict_proba(data_test)
    roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1])
    acc = accuracy_score(target_test, predicted_test)
col = [
    'acc', 'UP_precision', 'DOWN_precision', 'PRESERVE_precision',
    'UP_recall', 'DOWN_recall', 'PRESERVE_recall'
]
res = pd.DataFrame()
res = res.append(pd.DataFrame([accuracy_score(test['label'], pred_test)] +
                              list(precision_score(test['label'], pred_test, average=None)) +
                              list(recall_score(test['label'], pred_test, average=None))).transpose())

# histogram gradient boosting (LightGBM-style)
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier

hist = HistGradientBoostingClassifier(random_state=0)
hist.fit(train.iloc[:, :6780], train['label'])

from sklearn.metrics import accuracy_score
pred_train = hist.predict(train.iloc[:, :6780])
print(f"Accuracy in train set: {accuracy_score(train['label'], pred_train)}")
pred_cv = hist.predict(cv.iloc[:, :6780])
print(f"Accuracy in valid set: {accuracy_score(cv['label'], pred_cv)}")
pred_test = hist.predict(test.iloc[:, :6780])
print(f"Accuracy in test set: {accuracy_score(test['label'], pred_test)}")

from sklearn.metrics import confusion_matrix
confusion_mat = confusion_matrix(test['label'], pred_test)
plotCM(['UP', 'DOWN', 'PRESERVE'], confusion_mat, 'hist_confusion_matrix')

res = res.append(pd.DataFrame([accuracy_score(test['label'], pred_test)] +
                              list(precision_score(test['label'], pred_test, average=None)) +
                              list(recall_score(test['label'], pred_test, average=None))).transpose())

# mlp
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(random_state=0)
clf_bc = BaggingClassifier(base_estimator=SVC(), n_estimators=10, random_state=0)
clf_bc.fit(x_train, y_train)
bc_pred = clf_bc.predict(x_test)
bc_matrices = evaluate_preds(clf_bc, x_test, y_test, bc_pred)

# ################################################ ExtraTreesClassifier
clf_etc = ExtraTreesClassifier()
clf_etc.fit(x_train, y_train)
etc_pred = clf_etc.predict(x_test)
et_matrices = evaluate_preds(clf_etc, x_test, y_test, etc_pred)
# ############################################################

# ############################################################ HistGradientBoostingClassifier
clf_hgbc = HistGradientBoostingClassifier()
clf_hgbc.fit(x_train, y_train)
hgbc_pred = clf_hgbc.predict(x_test)
hgb_matrices = evaluate_preds(clf_hgbc, x_test, y_test, hgbc_pred)
# ############################################################

# ############################################################ LogisticRegression
clf_lr = LogisticRegression()
clf_lr.fit(x_train, y_train)
clf_pred = clf_lr.predict(x_test)
lr_matrices = evaluate_preds(clf_lr, x_test, y_test, clf_pred)
# ############################################################

# ############################################################ StackingClassifier
clf_sc = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
clf_sc.fit(x_train, y_train)
clf_pred = clf_sc.predict(x_test)
sc_matrices = evaluate_preds(clf_sc, x_test, y_test, clf_pred)
# ############################################################
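The `estimators` list handed to StackingClassifier is defined outside this excerpt. A plausible definition, purely for illustration (the names and the choice of base learners are assumptions; any list of (name, estimator) pairs works):

# Hypothetical base learners reusing the models benchmarked above.
estimators = [
    ('bagging_svc', BaggingClassifier(base_estimator=SVC(), n_estimators=10, random_state=0)),
    ('extra_trees', ExtraTreesClassifier()),
    ('hist_gb', HistGradientBoostingClassifier()),
]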
print("recall")
print(recall)
print("f1score")
print(f1)
print("Confusion Matrix(Multilabel):")
print(sm.multilabel_confusion_matrix(y_test, y_predict))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_predict))
print("Classification Report:")
print(classification_report(y_test, y_predict))
"""

# HISTOGRAM GRADIENT BOOSTING CLASSIFIER
print("HISTOGRAMBOOSTING_CLASSIFIER:")
Hgb = HistGradientBoostingClassifier()
Hgb.fit(x_train, y_train)
hgb_predict = Hgb.predict(x_test)
# print(y_test.head())
# print(hgb_predict)
acc = r2_score(y_test, hgb_predict)
accuracy = accuracy_score(y_test, hgb_predict)
recall = recall_score(y_test, hgb_predict, average='macro')
precision = precision_score(y_test, hgb_predict, pos_label=1,
                            average='macro', sample_weight=None,
                            zero_division=0)
f1 = f1_score(y_test, hgb_predict, average='macro')
print("Histogram Gradient Boosting Classifier(r2_score):-")
print(acc)
print("Accuracy:")
print("Evaluating classifiers...")
print("#" * 128)
print("Gradient Boosting Classifier:")
print("Test:")
print(metrics.classification_report(y_test, t.predict(X_test)))
print(metrics.confusion_matrix(y_test, t.predict(X_test)))
print("Training:")
print(metrics.classification_report(y_train, t.predict(X_train)))
print(metrics.confusion_matrix(y_train, t.predict(X_train)))
print("#" * 128)
print("Hist Gradient Boosting Classifier:")
print("Test:")
print(metrics.classification_report(y_test, e.predict(X_test)))
print(metrics.confusion_matrix(y_test, e.predict(X_test)))
print("Training:")
print(metrics.classification_report(y_train, e.predict(X_train)))
print(metrics.confusion_matrix(y_train, e.predict(X_train)))
print("#" * 128)
print("LightGBM Classifier:")
p = lgb_model.predict(X_test)
predictions = []
for x in p:
    predictions.append(np.argmax(x))
print("Test:")
print(metrics.classification_report(y_test, predictions))
print(metrics.confusion_matrix(y_test, predictions))
def test_same_predictions_multiclass_classification(
        seed, min_samples_leaf, n_samples, max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 256
    lr = 1

    X, y = make_classification(n_samples=n_samples, n_classes=3, n_features=5,
                               n_informative=5, n_redundant=0,
                               n_clusters_per_class=1, random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss='categorical_crossentropy',
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=lr,
        n_iter_no_change=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > .89

    proba_lightgbm = est_lightgbm.predict_proba(X_train)
    proba_sklearn = est_sklearn.predict_proba(X_train)
    # assert more than 75% of the predicted probabilities are the same up to
    # the second decimal
    assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > .75

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)
    np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > .89

        proba_lightgbm = est_lightgbm.predict_proba(X_test)  # was X_train: copy-paste slip
        proba_sklearn = est_sklearn.predict_proba(X_test)
        # assert more than 75% of the predicted probabilities are the same up
        # to the second decimal
        assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > .75

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
n_samples, n_features = data_train.shape
print(f"Training set with {n_samples} records with {n_features} features.")

print("Fitting a sklearn model...")
tic = time()
est = HistGradientBoostingClassifier(loss='binary_crossentropy',
                                     learning_rate=lr,
                                     max_iter=n_trees,
                                     max_bins=max_bins,
                                     max_leaf_nodes=n_leaf_nodes,
                                     n_iter_no_change=None,
                                     random_state=0,
                                     verbose=1)
est.fit(data_train, target_train)
toc = time()
predicted_test = est.predict(data_test)
predicted_proba_test = est.predict_proba(data_test)
roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1])
acc = accuracy_score(target_test, predicted_test)
print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc:.4f}")

if args.lightgbm:
    print("Fitting a LightGBM model...")
    tic = time()
    lightgbm_est = get_equivalent_estimator(est, lib='lightgbm')
    lightgbm_est.fit(data_train, target_train)
    toc = time()
    predicted_test = lightgbm_est.predict(data_test)
    predicted_proba_test = lightgbm_est.predict_proba(data_test)
    roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1])
    acc = accuracy_score(target_test, predicted_test)
        min_samples_leaf=msl,
    ),
    param_grid=param6,
    scoring=scoring,
    n_jobs=-1,
    cv=5)
gsearch6.fit(X_train, y_train)
print('best_params:{0} best_score:{1}'.format(gsearch6.best_params_, gsearch6.best_score_))
l2r = gsearch6.best_params_['l2_regularization']
# best_params:{'l2_regularization': 0.30000000000000004} best_score:0.9780450886460196

hgdbt = HistGradientBoostingClassifier(random_state=10,
                                       learning_rate=lr,
                                       max_iter=mi,
                                       max_leaf_nodes=mln,
                                       max_depth=md,
                                       min_samples_leaf=msl,
                                       l2_regularization=l2r)
hgdbt.fit(X_train, y_train)
y_pred = hgdbt.predict(X_test)
c_m = metrics.confusion_matrix(y_test, y_pred)
print('True negatives: {0}\nFalse negatives: {1}\nTrue positives: {2}\nFalse positives: {3}\n'
      .format(c_m[0][0], c_m[1][0], c_m[1][1], c_m[0][1]))
print("Recall: %.4f" % metrics.recall_score(y_test, y_pred))
print("Precision: %.4f" % metrics.precision_score(y_test, y_pred))
print("F1: %.4f" % metrics.f1_score(y_test, y_pred))
print("roc_auc: %.4f" % metrics.roc_auc_score(y_test, y_pred))
print("F-measure: %.4f" % (metrics.recall_score(y_test, y_pred) *
                           metrics.precision_score(y_test, y_pred)))
    scoring='roc_auc',
    cv=3,
    verbose=10,
    n_jobs=-1)

start_time = time.time()
grid_search = grid_search.fit(X_train, y_train)
print('Training time: {} minutes'.format(round((time.time() - start_time) / 60, 2)))
grid_search.best_params_, grid_search.best_score_

# last step
clf_hgb = grid_search.best_estimator_
clf_hgb.fit(X_train, y_train)
y_pred = clf_hgb.predict(X_test)
print(classification_report(y_test, y_pred))
y_pred = clf_hgb.predict_proba(X_test)[:, 1]
print('HGB AUC_ROC: %.3f' % roc_auc_score(y_test, y_pred))

# KF & RS
parameters = {'learning_rate': uniform(0, 0.1),
              'max_depth': sp_randint(3, 11),
              'max_leaf_nodes': sp_randint(2, 32),
              'min_samples_leaf': sp_randint(1, 11),
              'max_iter': [400, 600, 800, 1000, 1200],
              'l2_regularization': uniform(0, 0.1)}

rand_search = RandomizedSearchCV(estimator=clf_hgb,
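The RandomizedSearchCV call above is cut off in this excerpt. A minimal sketch of how such a search is typically completed, continuing the snippet's `parameters` and `clf_hgb` (n_iter, cv, scoring, and random_state here are assumptions, not the original settings):

from sklearn.model_selection import RandomizedSearchCV

# Assumed completion: sample 50 candidates from `parameters`, refit the best.
rand_search = RandomizedSearchCV(estimator=clf_hgb,
                                 param_distributions=parameters,
                                 n_iter=50, scoring='roc_auc',
                                 cv=3, n_jobs=-1, random_state=0)
rand_search = rand_search.fit(X_train, y_train)
print(rand_search.best_params_, rand_search.best_score_)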
x_test = scaler.transform(x_test)

# build the model
model = HistGradientBoostingClassifier(verbose=1, random_state=42,
                                       validation_fraction=0.2)
model.fit(x_train, y_train)

# model & weight save
pickle.dump(model, open('C:\\nmb\\nmb_data\\cp\\5s_last_0510_ml\\HBC_4_val2.data', 'wb'))  # wb: write
print("== save complete ==")

# model load
# model = pickle.load(open('C:\\nmb\\nmb_data\\cp\\5s_last_0510_ml\\HBC_4_val2.data', 'rb'))  # rb: read

# time >>

# evaluate
y_pred = model.predict(x_test)
# print(y_pred[:100])
# print(y_pred[100:])
accuracy = accuracy_score(y_test, y_pred)
loss = log_loss(y_test, y_pred)  # renamed so the log_loss function isn't shadowed
print("log_loss : \t", loss)  # similar in concept to cross-entropy loss
print("accuracy : \t", accuracy)

pred = ['C:\\nmb\\nmb_data\\5s_last_0510\\predict_04_26\\F',
        'C:\\nmb\\nmb_data\\5s_last_0510\\predict_04_26\\M']
count_f = 0
count_m = 0
for pred_pathAudio in pred:
print(
    classification_report(y_test,
                          y_pred,
                          target_names=["water", "floating objects"]))

#### Hist-based Gradient Boosting Classifier ####
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier

# x, y = draw_N_datapoints(dataset, N=1000)
clf_hgb = HistGradientBoostingClassifier()
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33,
                                                    random_state=42)
clf_hgb.fit(X_train, y_train)
y_pred = clf_hgb.predict(X_test)
print(
    classification_report(y_test,
                          y_pred,
                          target_names=["water", "floating objects"]))

######## Trained model
# path to the model
# model_path = os.environ['HOME'] + '/remote/floatingobjects/models/model_24_12_2020.pth.tar'
# model_path = os.environ['HOME'] + '/remote/floatingobjects/models/model_19_01_2021.pth.tar'
# model_path = os.environ['HOME'] + '/remote/floatingobjects/models/model_ratio10_22_01_2021.pth.tar'
model_path = f'models/{net}-cross-val-2fold/model_{seed}.pth.tar'
print(model_path)

# model = UNet(n_channels=12, n_classes=1, bilinear=False).to(device)
model = get_model(net, inchannels=12).to(device)
#%% evaluate performance with training data
eval_reg = HistGradientBoostingRegressor(random_state=1129)
eval_reg.fit(X_train.copy(), y_train_adr.copy())
print("-" * 10, "regression report", "-" * 10)
report = regression_report(
    y_test_adr.copy(), eval_reg.predict(X_test.copy()), X_test.shape[1]
)
print(report)

# eval_clf = RandomForestClassifier(random_state=1129)
eval_clf = HistGradientBoostingClassifier(random_state=1129)
eval_clf.fit(X_train.copy(), y_train_canceled.copy())
print("-" * 10, "classification report", "-" * 10)
report = classification_report(
    y_test_canceled.copy(), eval_clf.predict(X_test.copy())
)
print(report)

#%%
pred_df = predict(eval_clf, eval_reg, X_test_df)
pred_label_df = data.to_label(pred_df)
label_df = data.get_true_label(columns=["adr", "revenue", "is_canceled", "label"])
print("[ label evaluation ]")
report_label = evaluate_by_label(pred_label_df, label_df, target="label")
print(report_label)
print("[ revenue_per_day evaluation ]")
report_revenue = evaluate_by_label(pred_label_df, label_df, target="revenue")
print(report_revenue)
df[(df.Timestamp >= 200) & (df.Timestamp <= 330)].YawRate.dropna().plot()
plt.title("A period of YawRate with both normal and attack messages; can you tell which is which?")
plt.show()

df[(df.Timestamp > 315) & (df.Timestamp < 316)].YawRate.dropna().plot()
plt.title("An attack window on YawRate, zoomed in to show the zig-zagging between real and injected values")
plt.show()

Let's first try gradient-boosted trees; for example, sklearn's HistGradientBoostingClassifier can work well on larger datasets, before we bring out bigger guns.

import sklearn.metrics
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

gbc = HistGradientBoostingClassifier(max_iter=500).fit(df[FEATURES], df.Label == "Attack")

ypred = gbc.predict(df2[FEATURES])
cf = sklearn.metrics.confusion_matrix(df2.Label == "Attack", ypred)
print(cf)
print("Accuracy = %s" % sklearn.metrics.accuracy_score(df2.Label == "Attack", ypred))

### 2c. Deep Learning and using a H1ST Model API, organizing, importing, saving & loading

We can bring out bigger guns such as bidirectional LSTMs, CNNs, or Transformers, which work well on pattern-recognition problems over sequential data such as this one. One such model is available in the full tutorial source code package, and it can reach quite impressive accuracy. Let's see how we could use it!

import h1st as h1
h1.init()
categorical_columns = [
    column for column in categorical_columns
    if column != target_column_name
]
model = CatBoostClassifier(cat_features=categorical_columns,
                           grow_policy='Lossguide',
                           learning_rate=0.1,
                           n_estimators=100,
                           num_leaves=255,
                           train_dir='data/catboost_info',
                           verbose=False)
model.fit(features_train, labels_train, silent=True)

# Make predictions on the test data.
if args.library == 'h2o':
    predictions_proba = model.predict(data_test).as_data_frame()['Y']
else:
    predictions_proba = model.predict_proba(features_test)[:, 1]

# Compute metrics.
auc_roc = roc_auc_score(labels_test, predictions_proba)

# Compute memory usage.
f = open("/proc/self/status", "r")
for line in f.readlines():
    if line.startswith("VmHWM"):
        memory = line.split(":")[1].strip()

print(json.dumps({
    'auc_roc': auc_roc,
    'memory': memory,
def HGB():
    actions = [
        'change_lane', 'pull_over', 'slow', 'stop', 'straight', 'turn_left',
        'turn_right', 'wait_to_turn_left'
    ]
    data_points = []
    with open('/Users/zephyryau/Documents/study/INF552/Project/data.csv', 'r') as fd:
        for row in fd:
            row_list = row[:-1].split(',')
            sample = [float(i) for i in row_list[:-1]]
            sample.append(actions.index(row_list[-1]))
            if len(sample) == 76:
                data_points.append(sample)

    data_points_xycl = np.array(data_points)
    data_points_xyc = data_points_xycl[:, :-1]
    y = data_points_xycl[:, -1]

    # centralize datapoints and normalize
    data_points_xy_cent = []
    for row in data_points_xyc:
        # print(row)
        avg_x = row[3]
        avg_y = row[4]
        head_length = ((row[0] - row[3])**2 + (row[1] - row[4])**2)**0.5
        shoulder_length = ((row[3] - row[6])**2 + (row[4] - row[7])**2)**0.5
        new_row = []
        for i in range(16):  # first 16 points
            new_row.append((row[3 * i] - avg_x) / shoulder_length)
            new_row.append((row[3 * i + 1] - avg_y) / head_length)
            new_row.append(row[3 * i + 2])  # conf
        data_points_xy_cent.append(new_row)

    result_point = []
    with open('/Users/zephyryau/Documents/study/INF552/Project/input_picture/result.csv', 'r') as fd:
        for row in fd:
            row_list = row[:-1].split(',')
            sample = [float(i) for i in row_list[:-1]]
            sample.append(actions.index(row_list[-1]))
            if len(sample) == 76:
                result_point.append(sample)

    result_point_xycl = np.array(result_point)
    result_point_xyc = result_point_xycl[:, :-1]
    result_point_y = result_point_xycl[:, -1]

    result_point_xy_cent = []
    for row in result_point_xyc:
        # print(row)
        avg_x = row[3]
        avg_y = row[4]
        head_length = ((row[0] - row[3])**2 + (row[1] - row[4])**2)**0.5
        shoulder_length = ((row[3] - row[6])**2 + (row[4] - row[7])**2)**0.5
        new_row = []
        for i in range(16):  # first 16 points
            new_row.append((row[3 * i] - avg_x) / shoulder_length)
            new_row.append((row[3 * i + 1] - avg_y) / head_length)
            new_row.append(row[3 * i + 2])  # conf
        result_point_xy_cent.append(new_row)

    '''sum = 0
    gesture_results = []
    for i in range(100):
        data_points_xy_train, data_points_xy_test, y_train, y_test = train_test_split(data_points_xy_cent, y, test_size=0.3)
        clf = MLPClassifier(hidden_layer_sizes=(512,))
        clf.fit(data_points_xy_train, y_train)
        gesture_results.append(clf.predict([result_point_xy_cent[0]])[0])
        score = clf.score(data_points_xy_test, y_test)
        #print(score)
        sum += score'''

    X_train, X_test, y_train, y_test = train_test_split(data_points_xy_cent, y,
                                                        test_size=0.4)
    scaler = preprocessing.StandardScaler().fit(X_train)
    # print(scaler.mean_)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    r_X_scaled = scaler.transform(result_point_xy_cent)

    sum = 0
    clf = HistGradientBoostingClassifier()
    for i in range(10):
        X_train, X_test, y_train, y_test = train_test_split(
            data_points_xy_cent, y, test_size=0.4)
        scaler = preprocessing.StandardScaler().fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        clf.fit(X_train_scaled, y_train)
        # print(clf.n_iter_, end=" ")
        score_train = clf.score(X_train_scaled, y_train)
        # print(score_train, end=" ")
        score_test = clf.score(X_test_scaled, y_test)
        sum += score_test
        # print(score_test)

    tf = (clf.predict([r_X_scaled[0]])[0] == result_point_y[0])
    return clf.predict([r_X_scaled[0]])[0], sum / 10, tf
# -------------------------------------------------------
# The :class:`ensemble.HistGradientBoostingClassifier`
# and :class:`ensemble.HistGradientBoostingRegressor` now have native
# support for missing values (NaNs). This means that there is no need for
# imputing data when training or predicting.

from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
import numpy as np

X = np.array([0, 1, 2, np.nan]).reshape(-1, 1)
y = [0, 0, 1, 1]

gbdt = HistGradientBoostingClassifier(min_samples_leaf=1).fit(X, y)
print(gbdt.predict(X))

# %%
# Precomputed sparse nearest neighbors graph
# ------------------------------------------
# Most estimators based on nearest neighbors graphs now accept precomputed
# sparse graphs as input, to reuse the same graph for multiple estimator fits.
# To use this feature in a pipeline, one can use the `memory` parameter, along
# with one of the two new transformers,
# :class:`neighbors.KNeighborsTransformer` and
# :class:`neighbors.RadiusNeighborsTransformer`. The precomputation
# can also be performed by custom estimators to use alternative
# implementations, such as approximate nearest neighbors methods.
# See more details in the :ref:`User Guide <neighbors_transformer>`.

from tempfile import TemporaryDirectory
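The excerpt cuts off right after the TemporaryDirectory import. A minimal sketch of the kind of cached pipeline the paragraph describes, assuming scikit-learn >= 0.22 (the Isomap step, data, and parameter values are illustrative, not the release notes' exact example):

from tempfile import TemporaryDirectory
from sklearn.neighbors import KNeighborsTransformer
from sklearn.manifold import Isomap
from sklearn.pipeline import make_pipeline
import numpy as np

X = np.random.RandomState(0).rand(100, 5)

with TemporaryDirectory(prefix="sklearn_graph_cache_") as tmpdir:
    # Cache the precomputed k-neighbors graph so a refit reuses it
    # instead of recomputing the neighbors search.
    estimator = make_pipeline(
        KNeighborsTransformer(n_neighbors=10, mode="distance"),
        Isomap(n_neighbors=10, metric="precomputed"),
        memory=tmpdir,
    )
    estimator.fit(X)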
p = clf.predict_proba(
    data.loc[:, data.columns != 'Label'])[:, 1]  # prob for class=1 (target)
p = pd.DataFrame({'p-value': p})
data.reset_index(drop=True, inplace=True)
p.reset_index(drop=True, inplace=True)
data2 = pd.concat([data, p], axis=1)
data2 = calcQ(data2, scoreColName="p-value")
data2["Rank"] = 1

# store best fit
nXLauc, XLauc = evalXL(data2, plot=False, maxQ=0.1)
print("pAUC(peptides), pAUC(XLs): " + str(nXLauc) + "\t" + str(XLauc))
print("sum(pAUC): " + str(nXLauc + XLauc))
print("Confusion matrix:")
print(confusion_matrix(y, clf.predict(X)))

if nXLauc + 10.0 * XLauc > best_nXLauc + 10.0 * best_XLauc:
    # we weight XL auc higher than peptide auc
    best_nXLauc = nXLauc
    best_XLauc = XLauc
    best_alpha = alpha
    best_beta = beta
    best_clf = deepcopy(clf)

print("Best alpha, beta: " + str(alpha) + "\t" + str(beta))
print("pAUC(peptides), pAUC(XLs): " + str(best_nXLauc) + "\t" + str(best_XLauc))
print("sum(pAUC): " + str(best_nXLauc + best_XLauc))

p = best_clf.predict_proba(
    data.loc[:, data.columns != 'Label'])[:, 1]  # prob for class=1 (target)