def test_zero_sample_weights_classification():
    # Make sure setting a SW to zero amounts to ignoring the corresponding
    # sample
    X = [[1, 0],
         [1, 0],
         [1, 0],
         [0, 1]]
    y = [0, 0, 1, 0]
    # ignore the first 2 training samples by setting their weight to 0
    sample_weight = [0, 0, 1, 1]
    gb = HistGradientBoostingClassifier(loss='binary_crossentropy',
                                        min_samples_leaf=1)
    gb.fit(X, y, sample_weight=sample_weight)
    assert_array_equal(gb.predict([[1, 0]]), [1])

    X = [[1, 0],
         [1, 0],
         [1, 0],
         [0, 1],
         [1, 1]]
    y = [0, 0, 1, 0, 2]
    # ignore the first 2 training samples by setting their weight to 0
    sample_weight = [0, 0, 1, 1, 1]
    gb = HistGradientBoostingClassifier(loss='categorical_crossentropy',
                                        min_samples_leaf=1)
    gb.fit(X, y, sample_weight=sample_weight)
    assert_array_equal(gb.predict([[1, 0]]), [1])
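For context, the behavior this test pins down can be checked directly: fitting with zero weights on some rows should behave like fitting on the remaining rows alone. A minimal, self-contained sketch (the side-by-side comparison and the expected outputs are illustrative, not part of the original suite):

import numpy as np
from sklearn.experimental import enable_hist_gradient_boosting  # noqa: only needed on scikit-learn < 1.0
from sklearn.ensemble import HistGradientBoostingClassifier

X = np.array([[1, 0], [1, 0], [1, 0], [0, 1]])
y = np.array([0, 0, 1, 0])

# Zero out the first two samples...
gb_weighted = HistGradientBoostingClassifier(min_samples_leaf=1)
gb_weighted.fit(X, y, sample_weight=[0, 0, 1, 1])

# ...which should behave like training on the last two samples only.
gb_subset = HistGradientBoostingClassifier(min_samples_leaf=1)
gb_subset.fit(X[2:], y[2:])

print(gb_weighted.predict(X))  # expected: [1 1 1 0]
print(gb_subset.predict(X))    # expected to match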
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification
    pytest.importorskip("lightgbm")

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 255

    X, y = make_classification(
        n_samples=n_samples,
        n_classes=2,
        n_features=5,
        n_informative=5,
        n_redundant=0,
        random_state=0,
    )

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss="binary_crossentropy",
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        early_stopping=False,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
    )
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib="lightgbm")

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)
    np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn)

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
class GradientBoostingMsgClassifierModel(h1.MLModel):
    def load_data(self, num_files=None):
        return utils.load_data(num_files, shuffle=False)

    def prep(self, data):
        def concat_processed_files(files):
            dfs = []
            for f in files:
                z = pd.read_parquet(f)
                z = utils.compute_timediff_fillna(z, dropna_subset=FEATURES)
                dfs.append(z)
            df2 = pd.concat(dfs)
            return df2

        split = int(len(data["attack_files"]) * 0.5)
        train_files = data["attack_files"][:split]
        test_files = data["attack_files"][split:]
        result = {
            "train_files": train_files,
            "test_files": test_files,
            "train_attack_df": concat_processed_files(train_files),
            "test_attack_df": concat_processed_files(test_files)
        }
        print("len train_attack_df = %s" % len(result["train_attack_df"]))
        print("len test_attack_df = %s" % len(result["test_attack_df"]))
        return result

    def train(self, prepared_data):
        df = prepared_data["train_attack_df"]
        from sklearn.experimental import enable_hist_gradient_boosting
        from sklearn.ensemble import HistGradientBoostingClassifier
        X = df[FEATURES]
        y = df.Label == config.ATTACK_LABEL
        self.base_model = HistGradientBoostingClassifier(max_iter=500).fit(X, y)

    def evaluate(self, prepared_data):
        df = prepared_data["test_attack_df"]
        ypred = self.base_model.predict(df[FEATURES])
        import sklearn.metrics
        cf = sklearn.metrics.confusion_matrix(df.Label == config.ATTACK_LABEL, ypred)
        acc = sklearn.metrics.accuracy_score(df.Label == config.ATTACK_LABEL, ypred)
        print(cf)
        print("Accuracy = %.4f" % acc)
        self.metrics = {"confusion_matrix": cf, "accuracy": acc}

    def predict(self, data):
        df = data["df"].copy()
        df = utils.compute_timediff_fillna(df)
        df['MsgIsAttack'] = 0
        df['WindowInAttack'] = 0
        for event_result in data["event_detection_results"]:
            if event_result['WindowInAttack']:
                # print("window %s in attack: event_result = %s" % (event_result['window_start'], event_result))
                in_window = (df.Timestamp >= event_result['window_start']) & \
                            (df.Timestamp < event_result['window_start'] + config.WINDOW_SIZE)
                w_df = df[in_window]
                if len(w_df) > 0:
                    ypred = self.base_model.predict(w_df[FEATURES])
                    df.loc[in_window, "WindowInAttack"] = 1
                    df.loc[in_window, "MsgIsAttack"] = ypred.astype(int)
        return {"injection_window_results": df}
class GradientBoostingMsgClassifierModel(h1.Model):
    def load_data(self, num_samples=None):
        return util.load_data_daic(num_samples, shuffle=True)

    def prep_data(self, data):
        # concat multiple files into separate training/test pd.DataFrame
        def concat_processed_files(files):
            dfs = []
            for f in files:
                z = pd.read_csv(f)
                z.columns = ['Timestamp', 'Label', 'CarSpeed',
                             'SteeringAngle', 'YawRate', 'Gx', 'Gy']
                z = util.compute_timediff_fillna(z)
                dfs.append(z)
            df2 = pd.concat(dfs)
            return df2

        return {
            "train_attack_df": concat_processed_files(data["train_attack_files"]),
            "test_attack_df": concat_processed_files(data["test_attack_files"])
        }

    def train(self, prepared_data):
        df = prepared_data["train_attack_df"]
        from sklearn.experimental import enable_hist_gradient_boosting
        from sklearn.ensemble import HistGradientBoostingClassifier
        X = df[FEATURES]
        y = df.Label == "Tx"
        self.model = HistGradientBoostingClassifier(max_iter=500).fit(X, y)

    def evaluate(self, prepared_data):  # was `data`, but the body expects `prepared_data`
        df = prepared_data["test_attack_df"]
        ypred = self.model.predict(df[FEATURES])
        import sklearn.metrics
        cf = sklearn.metrics.confusion_matrix(df.Label == "Tx", ypred)
        acc = sklearn.metrics.accuracy_score(df.Label == "Tx", ypred)
        print(cf)
        print("Accuracy = %.4f" % acc)
        self.metrics = {"confusion_matrix": cf, "accuracy": acc}

    def predict(self, data):
        df = data["df"].copy()
        df = util.compute_timediff_fillna(df)
        df['MsgIsAttack'] = 0
        df['WindowInAttack'] = 0
        for event_result in data["event_detection_results"]:
            if event_result['WindowInAttack']:
                # print("window %s in attack: event_result = %s" % (event_result['window_start'], event_result))
                in_window = (df.Timestamp >= event_result['window_start']) & \
                            (df.Timestamp < event_result['window_start'] + WINDOW_SIZE)
                w_df = df[in_window]
                ypred = self.model.predict(w_df[FEATURES])
                df.loc[in_window, "WindowInAttack"] = 1
                df.loc[in_window, "MsgIsAttack"] = ypred.astype(int)
        return {"injection_window_results": df}
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 256

    X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=5,
                               n_informative=5, n_redundant=0, random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss='binary_crossentropy',
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        n_iter_no_change=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > .89

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)
    np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn)

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > .89

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
def k_fold_cross_val(Xs, y_var, k=10):
    clf = tree.DecisionTreeClassifier()
    clf_forest = RandomForestClassifier(n_estimators=10)
    clf_boost = HistGradientBoostingClassifier()

    num_folds = k
    N = Xs.shape[0]
    test_size = int(N / num_folds)
    test_idxs = np.random.permutation(N)[:num_folds * test_size].reshape(
        num_folds, test_size)

    total_score = np.asarray([0., 0., 0.])
    total_F1_score = np.asarray([0., 0., 0.])
    for i in range(num_folds):
        print("Iteration " + str(i) + ":")
        test_i = Xs.index.isin(test_idxs[i])
        df_train, df_test = Xs[~test_i], Xs[test_i]
        y_train, y_test = y_var[~test_i], y_var[test_i]

        clf = clf.fit(df_train.to_numpy(), y_train.to_numpy().ravel())
        score_b = clf.score(df_test.to_numpy(), y_test.to_numpy().ravel())
        clf_forest = clf_forest.fit(df_train.to_numpy(), y_train.to_numpy().ravel())
        score_f = clf_forest.score(df_test.to_numpy(), y_test.to_numpy().ravel())
        clf_boost = clf_boost.fit(df_train.to_numpy(), y_train.to_numpy().ravel())
        score_h = clf_boost.score(df_test.to_numpy(), y_test.to_numpy().ravel())

        y_hat = clf.predict(df_test.to_numpy())
        f1_b = f1_score(y_test.to_numpy().ravel(), y_hat, average='binary')
        print("F1 score (tree):", f1_b)
        y_hat = clf_forest.predict(df_test.to_numpy())
        f1_f = f1_score(y_test.to_numpy().ravel(), y_hat, average='binary')
        print("F1 score (forest):", f1_f)
        y_hat = clf_boost.predict(df_test.to_numpy())
        f1_boost = f1_score(y_test.to_numpy().ravel(), y_hat, average='binary')
        print("F1 score (boost):", f1_boost)

        print("Prediction scores for (tree,forest,boost):", score_b, score_f, score_h)
        total_score += np.asarray([score_b, score_f, score_h])
        total_F1_score += np.asarray([f1_b, f1_f, f1_boost])

    print("Avg. accuracy scores for (tree,forest,boost):", total_score / num_folds)
    print("Avg. F1 scores for (tree,forest,boost):", total_F1_score / num_folds)
    return clf, clf_forest, clf_boost
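The manual fold bookkeeping above can also be delegated to scikit-learn. A minimal sketch of the same three-model comparison with cross_validate, assuming scikit-learn >= 1.0 (where the estimator imports directly) and the same Xs/y_var inputs; note each model is cloned and refit per fold rather than one shared instance being refit in place:

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.model_selection import cross_validate

def k_fold_cross_val_builtin(Xs, y_var, k=10):
    # Score each model on accuracy and binary F1 across k folds.
    models = [("tree", tree.DecisionTreeClassifier()),
              ("forest", RandomForestClassifier(n_estimators=10)),
              ("boost", HistGradientBoostingClassifier())]
    for name, model in models:
        res = cross_validate(model, Xs.to_numpy(), y_var.to_numpy().ravel(),
                             cv=k, scoring=("accuracy", "f1"))
        print("Avg. accuracy (%s):" % name, res["test_accuracy"].mean())
        print("Avg. F1 (%s):" % name, res["test_f1"].mean())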
def k_fold_trainning(rawdata, n_folds=5):
    cv = StratifiedKFold(n_splits=n_folds, shuffle=True)
    target = np.array(rawdata[0].values)
    lure = np.array(rawdata[1].values)
    y = np.array(rawdata['label'].values)

    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)
    fig, ax = plt.subplots()
    for i, (train, test) in enumerate(cv.split(target, lure, y)):
        print('----------------Training Fold %d---------------' % (i + 1))
        X_train = pd.DataFrame({0: target[train], 1: lure[train]})
        X_test = pd.DataFrame({0: target[test], 1: lure[test]})
        pmfm = create_pmfm(X_train, y[train])
        train_feature = X_train.apply(feature_Encoding, axis=1, args=(0, 1, pmfm)).values
        test_feature = X_test.apply(feature_Encoding, axis=1, args=(0, 1, pmfm)).values
        clf = HistGradientBoostingClassifier(max_iter=500,
                                             learning_rate=0.9,
                                             max_depth=6,
                                             l2_regularization=100)
        train_data = np.matrix([train_feature[i] for i in range(train_feature.shape[0])])
        test_data = np.matrix([test_feature[i] for i in range(test_feature.shape[0])])
        clf.fit(train_data, y[train])
        pred = clf.predict(test_data)
        evaluate(y[test], pred)
        viz = plot_roc_curve(clf, test_data, y[test],
                             name='ROC fold {}'.format(i + 1),
                             alpha=0.5, lw=1, ax=ax)
        interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        aucs.append(viz.roc_auc)

    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
            label='Chance', alpha=.8)
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    ax.plot(mean_fpr, mean_tpr, color='b',
            label=r'Mean ROC (AUC = %0.3f $\pm$ %0.3f)' % (mean_auc, std_auc),
            lw=2, alpha=.8)
    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                    label=r'$\pm$ 1 std. dev.')
    ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
           title="Receiver operating characteristic curve")
    ax.legend(loc="lower right")
    plt.savefig('roc.png', dpi=300)
def clasificar_HistGradientBoostingClassifier(X, y, df, trainInputs, trainOutputs,
                                              testInputs, testOutputs, graphname):
    print("\n[" + str(graphname) + "]")
    scoreArray = np.array([])
    clf = HistGradientBoostingClassifier()
    scores = cross_val_score(clf, X, y, cv=10)
    clf = clf.fit(trainInputs, trainOutputs)
    precisionTrain = clf.score(trainInputs, trainOutputs)
    precisionTest = clf.score(testInputs, testOutputs)
    print("\tCCR train = %.2f%% | CCR test = %.2f%%" %
          (precisionTrain * 100, precisionTest * 100))
    prediccion_test = clf.predict(testInputs)
    print(prediccion_test)
    print(testOutputs)
    return precisionTest
def gradient_boost(train_data, test_data):
    train_y = train_data['state']
    train_X = train_data.iloc[:, FEATURES_INDICES]
    test_y = test_data['state']
    test_X = test_data.iloc[:, FEATURES_INDICES]
    # search(train_X, train_y)
    # search_xgboost(train_X, train_y)
    gd = HistGradientBoostingClassifier(loss='auto', max_bins=200,
                                        max_depth=10, max_leaf_nodes=35)
    # gd = XGBClassifier()
    gd.fit(train_X, train_y)
    pred_y = gd.predict(test_X)
    evaluate(gd, test_X, test_y, pred_y)
def main():
    # loading the dataset from sklearn.datasets
    df_cancer = load_breast_cancer()
    print(df_cancer.keys())
    X = df_cancer.data
    y = df_cancer.target
    print("number of classes are: ", np.unique(y))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                        random_state=0)
    # create an instance of HistGradientBoostingClassifier
    hist = HistGradientBoostingClassifier()
    # training the model
    hist.fit(X_train, y_train)
    y_pred = hist.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("accuracy of the model is: ", accuracy)
    clr = classification_report(y_test, y_pred)
    print("Classification report is:", clr)
def test_on_target(rawdata, sitename):
    print('------------Testing on %s-----------' % sitename)
    target_info = pd.read_csv("target_info.csv")
    if sitename in target_info['Site'].values:
        target_dict = target_info.set_index('Site').T.to_dict()
        sequence = target_dict[sitename]['Sequence']
        train_data = rawdata[rawdata[0] != sequence]
        test_data = rawdata[rawdata[0] == sequence]
        X_train, y_train = create_Input(train_data)
        X_test, y_test = create_Input(test_data)
        pmfm = create_pmfm(X_train, y_train)
        train_feature = X_train.apply(feature_Encoding, axis=1, args=(0, 1, pmfm)).values
        test_feature = X_test.apply(feature_Encoding, axis=1, args=(0, 1, pmfm)).values
        clf = HistGradientBoostingClassifier(max_iter=500,
                                             learning_rate=0.9,
                                             max_depth=6,
                                             l2_regularization=100)
        train_matrix = np.matrix([train_feature[i] for i in range(train_feature.shape[0])])
        test_matrix = np.matrix([test_feature[i] for i in range(test_feature.shape[0])])
        clf.fit(train_matrix, y_train)
        pred = clf.predict(test_matrix)
        evaluate(y_test, pred)
    else:
        print('ERROR: INCORRECT SITE NAME')
def automatedHistGB(train_X, train_y, test_X, test_y):
    """Executes Histogram-based Gradient Boosting Classifier.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test features.
    train_y, test_y : numpy arrays
        Train and test targets.

    Returns
    -------
    multiclass_RocAuc_Score : float
        AUC score calculated by multiclass_RocAuc_Score.
    """
    log_it('Module: automatedHistGB', 'Starting')
    param_grid = {'max_iter': [1000, 1200, 1500],
                  'learning_rate': [0.1],
                  'max_depth': [25, 50, 75]}
    model = HistGradientBoostingClassifier()
    model = run_RandomSearch(train_X, train_y, model, param_grid)
    pred = model.predict(test_X)
    return multiclass_RocAuc_Score(test_y, pred)
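run_RandomSearch and multiclass_RocAuc_Score are project helpers not shown in this excerpt. A minimal sketch of what run_RandomSearch plausibly wraps, assuming it is a thin layer over RandomizedSearchCV that returns the refitted best estimator (the name, signature, and defaults here are assumptions, not the project's actual code):

from sklearn.model_selection import RandomizedSearchCV

def run_RandomSearch(train_X, train_y, model, param_grid, n_iter=5, cv=3):
    # Sample n_iter parameter combinations from param_grid and keep the best.
    search = RandomizedSearchCV(model, param_distributions=param_grid,
                                n_iter=n_iter, cv=cv, n_jobs=-1)
    search.fit(train_X, train_y)
    return search.best_estimator_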
n_estimators = n_trees // n_classes if objective == 'categorical_crossentropy' else n_trees

print("Currently processing {}...".format(dataset))
model = HistGradientBoostingClassifier(
    max_iter=n_estimators,
    loss=objective,
    validation_fraction=None
)

tic = time.time()
model.fit(X_train, y_train)
toc = time.time()
training_time = toc - tic

tic = time.time()
y_pred = model.predict(X_test)
toc = time.time()
testing_time = toc - tic

testing_acc = accuracy_score(y_test, y_pred)
records.append((dataset, training_time, testing_time, testing_acc))

# Write a log file (the with-block closes the file automatically)
with open("all_hgbdt_classification.txt", 'w') as file:
    for dataset, training_time, testing_time, testing_acc in records:
        string = "{}\t{:.5f}\t{:.5f}\t{:.5f}\n".format(
            dataset, training_time, testing_time, testing_acc)
        file.write(string)
    max_features=max_features,
    verbose=0,
    warm_start=warm_start,
    presort='deprecated')
clf.fit(X_train, y_train)

print("n_estimators : ", n_estimators)
print("learning rate :", lr)
print("Accuracy score (training): {0:.3f}".format(
    clf.score(X_train, y_train)))
print("Accuracy score (validation): {0:.3f}".format(
    clf.score(X_test, y_test)))
print("\n")

filename = 'LGBM' + str(n_estimators) + str(lr) + str(
    max_depth) + str(max_features) + str(warm_start) + str(
    clf.score(X_test, y_test)) + "%" + '.sav'
if clf.score(X_test, y_test) > 0.93:
    pickle.dump(clf, open(filename, 'wb'))

y_predict = clf.predict(X_test)
score = accuracy_score(y_test, y_predict)
print("n_estimators = ", n_estimators)
print("max_features = ", max_features)
print("warm_start = ", warm_start)
print(score)
print(confusion_matrix(y_test, y_predict))

filename = 'model.sav'
pickle.dump(clf, open(filename, 'wb'))
n_samples, n_features = data_train.shape
print(f"Training set with {n_samples} records with {n_features} features.")

print("Fitting a sklearn model...")
tic = time()
est = HistGradientBoostingClassifier(loss='binary_crossentropy',
                                     learning_rate=lr,
                                     max_iter=n_trees,
                                     max_bins=max_bins,
                                     max_leaf_nodes=n_leaf_nodes,
                                     early_stopping=False,
                                     random_state=0,
                                     verbose=1)
est.fit(data_train, target_train)
toc = time()
predicted_test = est.predict(data_test)
predicted_proba_test = est.predict_proba(data_test)
roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1])
acc = accuracy_score(target_test, predicted_test)
print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc:.4f}")

if args.lightgbm:
    print("Fitting a LightGBM model...")
    tic = time()
    lightgbm_est = get_equivalent_estimator(est, lib='lightgbm')
    lightgbm_est.fit(data_train, target_train)
    toc = time()
    predicted_test = lightgbm_est.predict(data_test)
    predicted_proba_test = lightgbm_est.predict_proba(data_test)
    roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1])
    acc = accuracy_score(target_test, predicted_test)
col = [
    'acc', 'UP_precision', 'DOWN_precision', 'PRESERVE_precision',
    'UP_recall', 'DOWN_recall', 'PRESERVE_recall'
]
res = pd.DataFrame()
res = res.append(pd.DataFrame([accuracy_score(test['label'], pred_test)] +
                              list(precision_score(test['label'], pred_test, average=None)) +
                              list(recall_score(test['label'], pred_test, average=None))).transpose())

# histogram gradient boosting (LightGBM-style)
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier

hist = HistGradientBoostingClassifier(random_state=0)
hist.fit(train.iloc[:, :6780], train['label'])

from sklearn.metrics import accuracy_score
pred_train = hist.predict(train.iloc[:, :6780])
print(f"Accuracy in train set: {accuracy_score(train['label'], pred_train)}")
pred_cv = hist.predict(cv.iloc[:, :6780])
print(f"Accuracy in valid set: {accuracy_score(cv['label'], pred_cv)}")
pred_test = hist.predict(test.iloc[:, :6780])
print(f"Accuracy in test set: {accuracy_score(test['label'], pred_test)}")

from sklearn.metrics import confusion_matrix
confusion_mat = confusion_matrix(test['label'], pred_test)
plotCM(['UP', 'DOWN', 'PRESERVE'], confusion_mat, 'hist_confusion_matrix')

res = res.append(pd.DataFrame([accuracy_score(test['label'], pred_test)] +
                              list(precision_score(test['label'], pred_test, average=None)) +
                              list(recall_score(test['label'], pred_test, average=None))).transpose())

# mlp
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(random_state=0)
clf_bc = BaggingClassifier(base_estimator=SVC(), n_estimators=10, random_state=0)
clf_bc.fit(x_train, y_train)
bc_pred = clf_bc.predict(x_test)
bc_matrices = evaluate_preds(clf_bc, x_test, y_test, bc_pred)

# ################################################ ExtraTreesClassifier
clf_etc = ExtraTreesClassifier()
clf_etc.fit(x_train, y_train)
etc_pred = clf_etc.predict(x_test)
et_matrices = evaluate_preds(clf_etc, x_test, y_test, etc_pred)
# ############################################################

# ############################################################ HistGradientBoostingClassifier
clf_hgbc = HistGradientBoostingClassifier()
clf_hgbc.fit(x_train, y_train)
hgbc_pred = clf_hgbc.predict(x_test)
hgb_matrices = evaluate_preds(clf_hgbc, x_test, y_test, hgbc_pred)
# ############################################################

# ############################################################ LogisticRegression
clf_lr = LogisticRegression()
clf_lr.fit(x_train, y_train)
clf_pred = clf_lr.predict(x_test)
lr_matrices = evaluate_preds(clf_lr, x_test, y_test, clf_pred)
# ############################################################

# ############################################################ StackingClassifier
clf_sc = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
clf_sc.fit(x_train, y_train)
clf_pred = clf_sc.predict(x_test)
sc_matrices = evaluate_preds(clf_sc, x_test, y_test, clf_pred)
# ############################################################
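The `estimators` list handed to StackingClassifier is defined outside this excerpt. A plausible definition, purely for illustration (the names and the choice of base learners are assumptions; any list of (name, estimator) pairs works):

# Hypothetical base learners reusing the models benchmarked above.
estimators = [
    ('bagging_svc', BaggingClassifier(base_estimator=SVC(), n_estimators=10, random_state=0)),
    ('extra_trees', ExtraTreesClassifier()),
    ('hist_gb', HistGradientBoostingClassifier()),
]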
print("recall")
print(recall)
print("f1score")
print(f1)
print("Confusion Matrix(Multilabel):")
print(sm.multilabel_confusion_matrix(y_test, y_predict))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_predict))
print("Classification Report:")
print(classification_report(y_test, y_predict))
"""

# HISTOGRAM GRADIENT BOOSTING CLASSIFIER
print("HISTOGRAMBOOSTING_CLASSIFIER:")
Hgb = HistGradientBoostingClassifier()
Hgb.fit(x_train, y_train)
hgb_predict = Hgb.predict(x_test)
# print(y_test.head())
# print(hgb_predict)
acc = r2_score(y_test, hgb_predict)
accuracy = accuracy_score(y_test, hgb_predict)
recall = recall_score(y_test, hgb_predict, average='macro')
precision = precision_score(y_test, hgb_predict, pos_label=1,
                            average='macro', sample_weight=None,
                            zero_division=0)
f1 = f1_score(y_test, hgb_predict, average='macro')
print("Histogram Gradient Boosting Classifier(r2_score):-")
print(acc)
print("Accuracy:")
print("Evaluating classifiers...")
print("#" * 128)
print("Gradient Boosting Classifier:")
print("Test:")
print(metrics.classification_report(y_test, t.predict(X_test)))
print(metrics.confusion_matrix(y_test, t.predict(X_test)))
print("Training:")
print(metrics.classification_report(y_train, t.predict(X_train)))
print(metrics.confusion_matrix(y_train, t.predict(X_train)))
print("#" * 128)
print("Hist Gradient Boosting Classifier:")
print("Test:")
print(metrics.classification_report(y_test, e.predict(X_test)))
print(metrics.confusion_matrix(y_test, e.predict(X_test)))
print("Training:")
print(metrics.classification_report(y_train, e.predict(X_train)))
print(metrics.confusion_matrix(y_train, e.predict(X_train)))
print("#" * 128)
print("LightGBM Classifier:")
p = lgb_model.predict(X_test)
predictions = []
for x in p:
    predictions.append(np.argmax(x))
print("Test:")
print(metrics.classification_report(y_test, predictions))
print(metrics.confusion_matrix(y_test, predictions))
def test_same_predictions_multiclass_classification(
        seed, min_samples_leaf, n_samples, max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 256
    lr = 1

    X, y = make_classification(n_samples=n_samples, n_classes=3, n_features=5,
                               n_informative=5, n_redundant=0,
                               n_clusters_per_class=1, random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss='categorical_crossentropy',
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=lr,
        n_iter_no_change=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > .89

    proba_lightgbm = est_lightgbm.predict_proba(X_train)
    proba_sklearn = est_sklearn.predict_proba(X_train)
    # assert more than 75% of the predicted probabilities are the same up to
    # the second decimal
    assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > .75

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)
    np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > .89

        proba_lightgbm = est_lightgbm.predict_proba(X_test)  # was X_train: copy-paste slip
        proba_sklearn = est_sklearn.predict_proba(X_test)
        # assert more than 75% of the predicted probabilities are the same up
        # to the second decimal
        assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > .75

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
n_samples, n_features = data_train.shape
print(f"Training set with {n_samples} records with {n_features} features.")

print("Fitting a sklearn model...")
tic = time()
est = HistGradientBoostingClassifier(loss='binary_crossentropy',
                                     learning_rate=lr,
                                     max_iter=n_trees,
                                     max_bins=max_bins,
                                     max_leaf_nodes=n_leaf_nodes,
                                     n_iter_no_change=None,
                                     random_state=0,
                                     verbose=1)
est.fit(data_train, target_train)
toc = time()
predicted_test = est.predict(data_test)
predicted_proba_test = est.predict_proba(data_test)
roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1])
acc = accuracy_score(target_test, predicted_test)
print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc:.4f}")

if args.lightgbm:
    print("Fitting a LightGBM model...")
    tic = time()
    lightgbm_est = get_equivalent_estimator(est, lib='lightgbm')
    lightgbm_est.fit(data_train, target_train)
    toc = time()
    predicted_test = lightgbm_est.predict(data_test)
    predicted_proba_test = lightgbm_est.predict_proba(data_test)
    roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1])
    acc = accuracy_score(target_test, predicted_test)
        min_samples_leaf=msl,
    ),
    param_grid=param6,
    scoring=scoring,
    n_jobs=-1,
    cv=5)
gsearch6.fit(X_train, y_train)
print('best_params:{0} best_score:{1}'.format(gsearch6.best_params_, gsearch6.best_score_))
l2r = gsearch6.best_params_['l2_regularization']
# best_params:{'l2_regularization': 0.30000000000000004} best_score:0.9780450886460196

hgdbt = HistGradientBoostingClassifier(random_state=10,
                                       learning_rate=lr,
                                       max_iter=mi,
                                       max_leaf_nodes=mln,
                                       max_depth=md,
                                       min_samples_leaf=msl,
                                       l2_regularization=l2r)
hgdbt.fit(X_train, y_train)
y_pred = hgdbt.predict(X_test)
c_m = metrics.confusion_matrix(y_test, y_pred)
print('True negatives: {0}\nFalse negatives: {1}\nTrue positives: {2}\nFalse positives: {3}\n'
      .format(c_m[0][0], c_m[1][0], c_m[1][1], c_m[0][1]))
print("Recall: %.4f" % metrics.recall_score(y_test, y_pred))
print("Precision: %.4f" % metrics.precision_score(y_test, y_pred))
print("F1: %.4f" % metrics.f1_score(y_test, y_pred))
print("roc_auc: %.4f" % metrics.roc_auc_score(y_test, y_pred))
print("F-measure: %.4f" % (metrics.recall_score(y_test, y_pred) *
                           metrics.precision_score(y_test, y_pred)))
    scoring='roc_auc',
    cv=3,
    verbose=10,
    n_jobs=-1)

start_time = time.time()
grid_search = grid_search.fit(X_train, y_train)
print('Training time: {} minutes'.format(round((time.time() - start_time) / 60, 2)))
grid_search.best_params_, grid_search.best_score_

# last step
clf_hgb = grid_search.best_estimator_
clf_hgb.fit(X_train, y_train)
y_pred = clf_hgb.predict(X_test)
print(classification_report(y_test, y_pred))
y_pred = clf_hgb.predict_proba(X_test)[:, 1]
print('HGB AUC_ROC: %.3f' % roc_auc_score(y_test, y_pred))

# KF & RS
parameters = {'learning_rate': uniform(0, 0.1),
              'max_depth': sp_randint(3, 11),
              'max_leaf_nodes': sp_randint(2, 32),
              'min_samples_leaf': sp_randint(1, 11),
              'max_iter': [400, 600, 800, 1000, 1200],
              'l2_regularization': uniform(0, 0.1)}

rand_search = RandomizedSearchCV(estimator=clf_hgb,
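The RandomizedSearchCV call above is cut off in this excerpt. A minimal sketch of how such a search is typically completed, continuing the snippet's `parameters` and `clf_hgb` (n_iter, cv, scoring, and random_state here are assumptions, not the original settings):

from sklearn.model_selection import RandomizedSearchCV

# Assumed completion: sample 50 candidates from `parameters`, refit the best.
rand_search = RandomizedSearchCV(estimator=clf_hgb,
                                 param_distributions=parameters,
                                 n_iter=50, scoring='roc_auc',
                                 cv=3, n_jobs=-1, random_state=0)
rand_search = rand_search.fit(X_train, y_train)
print(rand_search.best_params_, rand_search.best_score_)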
x_test = scaler.transform(x_test)

# build the model
model = HistGradientBoostingClassifier(verbose=1, random_state=42,
                                       validation_fraction=0.2)
model.fit(x_train, y_train)

# model & weight save
pickle.dump(model, open('C:\\nmb\\nmb_data\\cp\\5s_last_0510_ml\\HBC_4_val2.data', 'wb'))  # wb: write
print("== save complete ==")

# model load
# model = pickle.load(open('C:\\nmb\\nmb_data\\cp\\5s_last_0510_ml\\HBC_4_val2.data', 'rb'))  # rb: read

# time >>

# evaluate
y_pred = model.predict(x_test)
# print(y_pred[:100])
# print(y_pred[100:])
accuracy = accuracy_score(y_test, y_pred)
loss = log_loss(y_test, y_pred)  # renamed so the log_loss function isn't shadowed
print("log_loss : \t", loss)  # similar in concept to cross-entropy loss
print("accuracy : \t", accuracy)

pred = ['C:\\nmb\\nmb_data\\5s_last_0510\\predict_04_26\\F',
        'C:\\nmb\\nmb_data\\5s_last_0510\\predict_04_26\\M']
count_f = 0
count_m = 0
for pred_pathAudio in pred:
print(
    classification_report(y_test,
                          y_pred,
                          target_names=["water", "floating objects"]))

#### Hist-based Gradient Boosting Classifier ####
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier

# x, y = draw_N_datapoints(dataset, N=1000)
clf_hgb = HistGradientBoostingClassifier()
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33,
                                                    random_state=42)
clf_hgb.fit(X_train, y_train)
y_pred = clf_hgb.predict(X_test)
print(
    classification_report(y_test,
                          y_pred,
                          target_names=["water", "floating objects"]))

######## Trained model
# path to the model
# model_path = os.environ['HOME'] + '/remote/floatingobjects/models/model_24_12_2020.pth.tar'
# model_path = os.environ['HOME'] + '/remote/floatingobjects/models/model_19_01_2021.pth.tar'
# model_path = os.environ['HOME'] + '/remote/floatingobjects/models/model_ratio10_22_01_2021.pth.tar'
model_path = f'models/{net}-cross-val-2fold/model_{seed}.pth.tar'
print(model_path)

# model = UNet(n_channels=12, n_classes=1, bilinear=False).to(device)
model = get_model(net, inchannels=12).to(device)
#%% evaluate performance with training data
eval_reg = HistGradientBoostingRegressor(random_state=1129)
eval_reg.fit(X_train.copy(), y_train_adr.copy())
print("-" * 10, "regression report", "-" * 10)
report = regression_report(
    y_test_adr.copy(), eval_reg.predict(X_test.copy()), X_test.shape[1]
)
print(report)

# eval_clf = RandomForestClassifier(random_state=1129)
eval_clf = HistGradientBoostingClassifier(random_state=1129)
eval_clf.fit(X_train.copy(), y_train_canceled.copy())
print("-" * 10, "classification report", "-" * 10)
report = classification_report(
    y_test_canceled.copy(), eval_clf.predict(X_test.copy())
)
print(report)

#%%
pred_df = predict(eval_clf, eval_reg, X_test_df)
pred_label_df = data.to_label(pred_df)
label_df = data.get_true_label(columns=["adr", "revenue", "is_canceled", "label"])
print("[ label evaluation ]")
report_label = evaluate_by_label(pred_label_df, label_df, target="label")
print(report_label)
print("[ revenue_per_day evaluation ]")
report_revenue = evaluate_by_label(pred_label_df, label_df, target="revenue")
print(report_revenue)
df[(df.Timestamp >= 200) & (df.Timestamp <= 330)].YawRate.dropna().plot()
plt.title("A period of YawRate with both normal and attack messages; can you tell which is which?")
plt.show()

df[(df.Timestamp > 315) & (df.Timestamp < 316)].YawRate.dropna().plot()
plt.title("An attack window on YawRate, zoomed in to show the zig-zagging between real and injected values")
plt.show()

Let's first try gradient-boosted trees; for example, sklearn's HistGradientBoostingClassifier can work well on larger datasets, before we bring out bigger guns.

import sklearn.metrics
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

gbc = HistGradientBoostingClassifier(max_iter=500).fit(df[FEATURES], df.Label == "Attack")

ypred = gbc.predict(df2[FEATURES])
cf = sklearn.metrics.confusion_matrix(df2.Label == "Attack", ypred)
print(cf)
print("Accuracy = %s" % sklearn.metrics.accuracy_score(df2.Label == "Attack", ypred))

### 2c. Deep Learning and using a H1ST Model API, organizing, importing, saving & loading

We can bring out bigger guns such as bidirectional LSTMs, CNNs, or Transformers, which work well on pattern-recognition problems over sequential data such as this one. One such model is available in the full tutorial source code package, and it can reach quite impressive accuracy. Let's see how we could use it!

import h1st as h1
h1.init()
categorical_columns = [
    column for column in categorical_columns
    if column != target_column_name
]
model = CatBoostClassifier(cat_features=categorical_columns,
                           grow_policy='Lossguide',
                           learning_rate=0.1,
                           n_estimators=100,
                           num_leaves=255,
                           train_dir='data/catboost_info',
                           verbose=False)
model.fit(features_train, labels_train, silent=True)

# Make predictions on the test data.
if args.library == 'h2o':
    predictions_proba = model.predict(data_test).as_data_frame()['Y']
else:
    predictions_proba = model.predict_proba(features_test)[:, 1]

# Compute metrics.
auc_roc = roc_auc_score(labels_test, predictions_proba)

# Compute memory usage.
f = open("/proc/self/status", "r")
for line in f.readlines():
    if line.startswith("VmHWM"):
        memory = line.split(":")[1].strip()

print(json.dumps({
    'auc_roc': auc_roc,
    'memory': memory,
def HGB():
    actions = [
        'change_lane', 'pull_over', 'slow', 'stop', 'straight', 'turn_left',
        'turn_right', 'wait_to_turn_left'
    ]
    data_points = []
    with open('/Users/zephyryau/Documents/study/INF552/Project/data.csv', 'r') as fd:
        for row in fd:
            row_list = row[:-1].split(',')
            sample = [float(i) for i in row_list[:-1]]
            sample.append(actions.index(row_list[-1]))
            if len(sample) == 76:
                data_points.append(sample)

    data_points_xycl = np.array(data_points)
    data_points_xyc = data_points_xycl[:, :-1]
    y = data_points_xycl[:, -1]

    # centralize datapoints and normalize
    data_points_xy_cent = []
    for row in data_points_xyc:
        # print(row)
        avg_x = row[3]
        avg_y = row[4]
        head_length = ((row[0] - row[3])**2 + (row[1] - row[4])**2)**0.5
        shoulder_length = ((row[3] - row[6])**2 + (row[4] - row[7])**2)**0.5
        new_row = []
        for i in range(16):  # first 16 points
            new_row.append((row[3 * i] - avg_x) / shoulder_length)
            new_row.append((row[3 * i + 1] - avg_y) / head_length)
            new_row.append(row[3 * i + 2])  # conf
        data_points_xy_cent.append(new_row)

    result_point = []
    with open('/Users/zephyryau/Documents/study/INF552/Project/input_picture/result.csv', 'r') as fd:
        for row in fd:
            row_list = row[:-1].split(',')
            sample = [float(i) for i in row_list[:-1]]
            sample.append(actions.index(row_list[-1]))
            if len(sample) == 76:
                result_point.append(sample)

    result_point_xycl = np.array(result_point)
    result_point_xyc = result_point_xycl[:, :-1]
    result_point_y = result_point_xycl[:, -1]

    result_point_xy_cent = []
    for row in result_point_xyc:
        # print(row)
        avg_x = row[3]
        avg_y = row[4]
        head_length = ((row[0] - row[3])**2 + (row[1] - row[4])**2)**0.5
        shoulder_length = ((row[3] - row[6])**2 + (row[4] - row[7])**2)**0.5
        new_row = []
        for i in range(16):  # first 16 points
            new_row.append((row[3 * i] - avg_x) / shoulder_length)
            new_row.append((row[3 * i + 1] - avg_y) / head_length)
            new_row.append(row[3 * i + 2])  # conf
        result_point_xy_cent.append(new_row)

    '''sum = 0
    gesture_results = []
    for i in range(100):
        data_points_xy_train, data_points_xy_test, y_train, y_test = train_test_split(data_points_xy_cent, y, test_size=0.3)
        clf = MLPClassifier(hidden_layer_sizes=(512,))
        clf.fit(data_points_xy_train, y_train)
        gesture_results.append(clf.predict([result_point_xy_cent[0]])[0])
        score = clf.score(data_points_xy_test, y_test)
        #print(score)
        sum += score'''

    X_train, X_test, y_train, y_test = train_test_split(data_points_xy_cent, y,
                                                        test_size=0.4)
    scaler = preprocessing.StandardScaler().fit(X_train)
    # print(scaler.mean_)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    r_X_scaled = scaler.transform(result_point_xy_cent)

    sum = 0
    clf = HistGradientBoostingClassifier()
    for i in range(10):
        X_train, X_test, y_train, y_test = train_test_split(
            data_points_xy_cent, y, test_size=0.4)
        scaler = preprocessing.StandardScaler().fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        clf.fit(X_train_scaled, y_train)
        # print(clf.n_iter_, end=" ")
        score_train = clf.score(X_train_scaled, y_train)
        # print(score_train, end=" ")
        score_test = clf.score(X_test_scaled, y_test)
        sum += score_test
        # print(score_test)

    tf = (clf.predict([r_X_scaled[0]])[0] == result_point_y[0])
    return clf.predict([r_X_scaled[0]])[0], sum / 10, tf
# -------------------------------------------------------
# The :class:`ensemble.HistGradientBoostingClassifier`
# and :class:`ensemble.HistGradientBoostingRegressor` now have native
# support for missing values (NaNs). This means that there is no need for
# imputing data when training or predicting.

from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
import numpy as np

X = np.array([0, 1, 2, np.nan]).reshape(-1, 1)
y = [0, 0, 1, 1]

gbdt = HistGradientBoostingClassifier(min_samples_leaf=1).fit(X, y)
print(gbdt.predict(X))

# %%
# Precomputed sparse nearest neighbors graph
# ------------------------------------------
# Most estimators based on nearest neighbors graphs now accept precomputed
# sparse graphs as input, to reuse the same graph for multiple estimator fits.
# To use this feature in a pipeline, one can use the `memory` parameter, along
# with one of the two new transformers,
# :class:`neighbors.KNeighborsTransformer` and
# :class:`neighbors.RadiusNeighborsTransformer`. The precomputation
# can also be performed by custom estimators to use alternative
# implementations, such as approximate nearest neighbors methods.
# See more details in the :ref:`User Guide <neighbors_transformer>`.

from tempfile import TemporaryDirectory
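The excerpt cuts off right after the TemporaryDirectory import. A minimal sketch of the kind of cached pipeline the paragraph describes, assuming scikit-learn >= 0.22 (the Isomap step, data, and parameter values are illustrative, not the release notes' exact example):

from tempfile import TemporaryDirectory
from sklearn.neighbors import KNeighborsTransformer
from sklearn.manifold import Isomap
from sklearn.pipeline import make_pipeline
import numpy as np

X = np.random.RandomState(0).rand(100, 5)

with TemporaryDirectory(prefix="sklearn_graph_cache_") as tmpdir:
    # Cache the precomputed k-neighbors graph so a refit reuses it
    # instead of recomputing the neighbors search.
    estimator = make_pipeline(
        KNeighborsTransformer(n_neighbors=10, mode="distance"),
        Isomap(n_neighbors=10, metric="precomputed"),
        memory=tmpdir,
    )
    estimator.fit(X)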
p = clf.predict_proba(
    data.loc[:, data.columns != 'Label'])[:, 1]  # prob for class=1 (target)
p = pd.DataFrame({'p-value': p})
data.reset_index(drop=True, inplace=True)
p.reset_index(drop=True, inplace=True)
data2 = pd.concat([data, p], axis=1)
data2 = calcQ(data2, scoreColName="p-value")
data2["Rank"] = 1

# store best fit
nXLauc, XLauc = evalXL(data2, plot=False, maxQ=0.1)
print("pAUC(peptides), pAUC(XLs): " + str(nXLauc) + "\t" + str(XLauc))
print("sum(pAUC): " + str(nXLauc + XLauc))
print("Confusion matrix:")
print(confusion_matrix(y, clf.predict(X)))

if nXLauc + 10.0 * XLauc > best_nXLauc + 10.0 * best_XLauc:
    # we weight XL auc higher than peptide auc
    best_nXLauc = nXLauc
    best_XLauc = XLauc
    best_alpha = alpha
    best_beta = beta
    best_clf = deepcopy(clf)

print("Best alpha, beta: " + str(alpha) + "\t" + str(beta))
print("pAUC(peptides), pAUC(XLs): " + str(best_nXLauc) + "\t" + str(best_XLauc))
print("sum(pAUC): " + str(best_nXLauc + best_XLauc))

p = best_clf.predict_proba(
    data.loc[:, data.columns != 'Label'])[:, 1]  # prob for class=1 (target)