def predictMTX(path):
    """Score three persisted classifiers on a saved test matrix.

    The archive at ``path`` must provide the arrays ``savedX`` and
    ``savedY``.  ``savedX`` is reshaped so that its second and third axes
    become a single feature axis, scaled with the scaler stored in
    ``bag_scaler.pkl`` and fed to the models stored in ``bag_model.pkl``,
    ``bag.pkl`` and ``std.pkl``.  For each model the confusion matrix, the
    imbalanced classification report and the Matthews correlation
    coefficient are printed.

    :param path: location of the ``.npz`` archive to evaluate.
    """
    # The context manager closes the archive's file handle once both arrays
    # have been read into memory.
    with np.load(path) as mtxCompressed:
        X = mtxCompressed['savedX']
        Y = mtxCompressed['savedY']
    # A single pre-formatted argument prints identically on Python 2 and 3.
    print("{} {}".format(X.shape, Y.shape))
    mod_x = np.reshape(X, (X.shape[0], X.shape[1] * X.shape[2]))

    print("Loading model")
    bbc = joblib.load('bag_model.pkl')
    bag = joblib.load('bag.pkl')
    std = joblib.load('std.pkl')
    scaler = joblib.load('bag_scaler.pkl')

    scaledTestX = scaler.transform(mod_x)
    # All predictions are made up front, before anything is printed.
    predictions = (
        ("Balanced Bagging MLP", bbc.predict(scaledTestX)),
        ("Balanced Bagging", bag.predict(scaledTestX)),
        ("Standard MLP", std.predict(scaledTestX)),
    )

    # Classification metric display
    for title, predY in predictions:
        print(title)
        print(confusion_matrix(Y, predY))
        print(classification_report_imbalanced(Y, predY))
        print(matthews_corrcoef(Y, predY))
def predictOrganelle(path, seq_path):
    """Score the persisted ``org_*`` classifiers on a saved test matrix.

    Sequences are read from the FASTA file at ``seq_path`` and handed to
    ``buildPredict``.  Row ``i`` of its output is repeated ``len(sequence_i)``
    times, and the repeated rows are stacked horizontally with the
    ``predict_proba`` output of the model in ``org_base.pkl``.  The combined
    matrix is scaled with ``org_scaler.pkl`` and evaluated with the four
    models ``org_bbc.pkl``, ``org_bag.pkl``, ``org_std.pkl`` and
    ``org_svc.pkl``.

    :param path: ``.npz`` archive providing ``savedX`` and ``savedY``.
    :param seq_path: FASTA file with the sequences behind the archive.
    """
    seqs = [str(seq_record.seq)
            for seq_record in SeqIO.parse(seq_path, "fasta")]
    lengths = [len(seq) for seq in seqs]

    temp_org = np.array(buildPredict(seqs))
    probs_org = []
    for i, length in enumerate(lengths):
        probs_org.extend([temp_org[i]] * length)
    probs_org = np.array(probs_org)
    print("Len Probs Org: {} ".format(len(probs_org)))

    # The context manager closes the archive's file handle once both arrays
    # have been read into memory.
    with np.load(path) as mtxCompressed:
        X = mtxCompressed['savedX']
        Y = mtxCompressed['savedY']
    mod_x = np.reshape(X, (X.shape[0], X.shape[1] * X.shape[2]))

    base = joblib.load('org_base.pkl')
    scaler = joblib.load('bag_scaler.pkl')
    scaledTestX = scaler.transform(mod_x)
    base_probs = base.predict_proba(scaledTestX)

    org_X = np.hstack((base_probs, probs_org))
    print(org_X.shape)

    bbc = joblib.load('org_bbc.pkl')
    bag = joblib.load('org_bag.pkl')
    std = joblib.load('org_std.pkl')
    svc = joblib.load('org_svc.pkl')
    org_scaler = joblib.load('org_scaler.pkl')

    scaled_org_test = org_scaler.transform(org_X)
    # All predictions are made up front, before anything is printed.
    predictions = (
        ("Balanced Bagging MLP", bbc.predict(scaled_org_test)),
        ("Balanced Bagging", bag.predict(scaled_org_test)),
        ("Standard MLP", std.predict(scaled_org_test)),
        ("SVC", svc.predict(scaled_org_test)),
    )

    # Classification metric display
    for title, predY in predictions:
        print(title)
        print(confusion_matrix(Y, predY))
        print(classification_report_imbalanced(Y, predY))
        print(matthews_corrcoef(Y, predY))
def test_classification_report_imbalanced_multiclass_with_digits():
    """Compare the report text for ``digits=5`` and for the defaults."""
    iris = datasets.load_iris()
    y_true, y_pred, _ = make_prediction(dataset=iris, binary=False)

    # Explicit labels, class names and five decimals.
    class_ids = np.arange(len(iris.target_names))
    wanted_named = ('pre rec spe f1 geo iba sup setosa 0.82609 0.79167 '
                    '0.92157 0.80851 0.86409 0.74085 24 versicolor '
                    '0.33333 0.09677 0.86364 0.15000 0.43809 0.18727 31 '
                    'virginica 0.41860 0.90000 0.54545 0.57143 0.62645 '
                    '0.37208 20 avg / total 0.51375 0.53333 0.79733 '
                    '0.47310 0.62464 0.41370 75')
    got_named = classification_report_imbalanced(
        y_true,
        y_pred,
        labels=class_ids,
        target_names=iris.target_names,
        digits=5)
    assert _format_report(got_named) == wanted_named

    # Labels detected from the data, default number of digits.
    wanted_detected = ('pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 '
                       '0.86 0.74 24 1 0.33 0.10 0.86 0.15 0.44 0.19 31 2 '
                       '0.42 0.90 0.55 0.57 0.63 0.37 20 avg / total 0.51 '
                       '0.53 0.80 0.47 0.62 0.41 75')
    got_detected = classification_report_imbalanced(y_true, y_pred)
    assert _format_report(got_detected) == wanted_detected
def test_classification_report_imbalanced_multiclass():
    """Compare the multiclass report text with and without class names."""
    iris = datasets.load_iris()
    y_true, y_pred, _ = make_prediction(dataset=iris, binary=False)

    # Report built with explicit labels and class names.
    named_kwargs = {
        'labels': np.arange(len(iris.target_names)),
        'target_names': iris.target_names,
    }
    wanted_named = ('pre rec spe f1 geo iba sup setosa 0.83 0.79 0.92 '
                    '0.81 0.86 0.74 24 versicolor 0.33 0.10 0.86 0.15 '
                    '0.44 0.19 31 virginica 0.42 0.90 0.55 0.57 0.63 '
                    '0.37 20 avg / total 0.51 0.53 0.80 0.47 0.62 0.41 75')
    got_named = classification_report_imbalanced(y_true, y_pred,
                                                 **named_kwargs)
    assert_equal(_format_report(got_named), wanted_named)

    # Report built with labels detected from the data.
    wanted_detected = ('pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 '
                       '0.86 0.74 24 1 0.33 0.10 0.86 0.15 0.44 0.19 31 2 '
                       '0.42 0.90 0.55 0.57 0.63 0.37 20 avg / total 0.51 '
                       '0.53 0.80 0.47 0.62 0.41 75')
    got_detected = classification_report_imbalanced(y_true, y_pred)
    assert_equal(_format_report(got_detected), wanted_detected)
def test_classification_report_imbalanced_multiclass_with_digits():
    """Compare the report text for ``digits=5`` and for the defaults."""
    iris = datasets.load_iris()
    y_true, y_pred, _ = make_prediction(dataset=iris, binary=False)
    class_names = iris.target_names

    # Explicit labels, class names and five decimals.
    five_digit_text = _format_report(
        classification_report_imbalanced(
            y_true,
            y_pred,
            labels=np.arange(len(class_names)),
            target_names=class_names,
            digits=5,
        )
    )
    assert five_digit_text == (
        'pre rec spe f1 geo iba sup setosa 0.82609 0.79167 '
        '0.92157 0.80851 0.85415 0.72010 24 versicolor '
        '0.33333 0.09677 0.86364 0.15000 0.28910 0.07717 '
        '31 virginica 0.41860 0.90000 0.54545 0.57143 0.70065 '
        '0.50831 20 avg / total 0.51375 0.53333 0.79733 '
        '0.47310 0.57966 0.39788 75'
    )

    # Labels detected from the data, default number of digits.
    default_text = _format_report(
        classification_report_imbalanced(y_true, y_pred))
    assert default_text == (
        'pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 '
        '0.85 0.72 24 1 0.33 0.10 0.86 0.15 0.29 0.08 31 '
        '2 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total 0.51 '
        '0.53 0.80 0.47 0.58 0.40 75'
    )
def test_classification_report_imbalanced_multiclass():
    """Compare the multiclass report text with and without class names."""
    iris = datasets.load_iris()
    y_true, y_pred, _ = make_prediction(dataset=iris, binary=False)
    class_names = iris.target_names

    # Report built with explicit labels and class names.
    named_text = _format_report(
        classification_report_imbalanced(
            y_true,
            y_pred,
            labels=np.arange(len(class_names)),
            target_names=class_names,
        )
    )
    assert named_text == (
        "pre rec spe f1 geo iba sup setosa 0.83 0.79 0.92 "
        "0.81 0.85 0.72 24 versicolor 0.33 0.10 0.86 0.15 "
        "0.29 0.08 31 virginica 0.42 0.90 0.55 0.57 0.70 "
        "0.51 20 avg / total 0.51 0.53 0.80 0.47 0.58 0.40 75"
    )

    # Report built with labels detected from the data.
    detected_text = _format_report(
        classification_report_imbalanced(y_true, y_pred))
    assert detected_text == (
        "pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 "
        "0.85 0.72 24 1 0.33 0.10 0.86 0.15 0.29 0.08 31 "
        "2 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total "
        "0.51 0.53 0.80 0.47 0.58 0.40 75"
    )
def test_classification_report_imbalanced_multiclass_with_digits():
    """Check the report when more floating point digits are requested."""
    iris = datasets.load_iris()
    y_true, y_pred, _ = make_prediction(dataset=iris, binary=False)

    # Explicit labels, class names and five decimals.
    class_ids = np.arange(len(iris.target_names))
    wanted_named = ('pre rec spe f1 geo iba sup setosa 0.82609 0.79167 '
                    '0.92157 0.80851 0.86409 0.74085 24 versicolor '
                    '0.33333 0.09677 0.86364 0.15000 0.43809 0.18727 31 '
                    'virginica 0.41860 0.90000 0.54545 0.57143 0.62645 '
                    '0.37208 20 avg / total 0.51375 0.53333 0.79733 '
                    '0.47310 0.62464 0.41370 75')
    got_named = classification_report_imbalanced(
        y_true,
        y_pred,
        labels=class_ids,
        target_names=iris.target_names,
        digits=5)
    assert_equal(_format_report(got_named), wanted_named)

    # Labels detected from the data, default number of digits.
    wanted_detected = ('pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 '
                       '0.86 0.74 24 1 0.33 0.10 0.86 0.15 0.44 0.19 31 2 '
                       '0.42 0.90 0.55 0.57 0.63 0.37 20 avg / total 0.51 '
                       '0.53 0.80 0.47 0.62 0.41 75')
    got_detected = classification_report_imbalanced(y_true, y_pred)
    assert_equal(_format_report(got_detected), wanted_detected)
def test_classification_report_imbalanced_multiclass_with_digits():
    """Compare the report text for ``digits=5`` and for the defaults."""
    iris = datasets.load_iris()
    y_true, y_pred, _ = make_prediction(dataset=iris, binary=False)

    # Explicit labels, class names and five decimals.
    named_options = dict(
        labels=np.arange(len(iris.target_names)),
        target_names=iris.target_names,
        digits=5,
    )
    wanted_named = ("pre rec spe f1 geo iba sup setosa 0.82609 0.79167 "
                    "0.92157 0.80851 0.85415 0.72010 24 versicolor "
                    "0.33333 0.09677 0.86364 0.15000 0.28910 0.07717 "
                    "31 virginica 0.41860 0.90000 0.54545 0.57143 0.70065 "
                    "0.50831 20 avg / total 0.51375 0.53333 0.79733 "
                    "0.47310 0.57966 0.39788 75")
    got_named = classification_report_imbalanced(y_true, y_pred,
                                                 **named_options)
    assert _format_report(got_named) == wanted_named

    # Labels detected from the data, default number of digits.
    wanted_detected = ("pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 "
                       "0.85 0.72 24 1 0.33 0.10 0.86 0.15 0.29 0.08 31 "
                       "2 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total 0.51 "
                       "0.53 0.80 0.47 0.58 0.40 75")
    got_detected = classification_report_imbalanced(y_true, y_pred)
    assert _format_report(got_detected) == wanted_detected
def train(self, train_data, qtz, auto):
    """Fit a fastText supervised model and print train/test reports.

    Each item of ``train_data`` is split at its first space into a label and
    a text, with newlines stripped from the text.  The samples are split
    with ``self.train_test_split_rate`` and written to two intermediate
    files under ``WORK_PATH`` for fastText to read.

    :param train_data: iterable of ``"<label> <text>"`` strings.
    :param qtz: when truthy, quantize (with retraining) after training.
    :param auto: when truthy, train with autotuning against the test file
        instead of with ``self.params``.
    :return: ``self``.
    """
    labels, texts = [], []
    for line in train_data:
        label, _, remainder = line.partition(" ")
        labels.append(label)
        texts.append(remainder.replace('\n', ''))

    x_train, x_test, y_train, y_test = train_test_split(
        texts, labels, test_size=self.train_test_split_rate, random_state=0)

    train_interm_path = f"{WORK_PATH}/interm_data/train_interm.txt"
    test_interm_path = f"{WORK_PATH}/interm_data/test_interm.txt"
    train_ret = [y_ + " " + x_ + "\n" for x_, y_ in zip(x_train, y_train)]
    test_ret = [y_ + " " + x_ + "\n" for x_, y_ in zip(x_test, y_test)]
    with open(train_interm_path, "w", encoding="utf-8") as tr:
        tr.writelines(train_ret)
    with open(test_interm_path, "w", encoding="utf-8") as te:
        te.writelines(test_ret)

    started = time.time()
    if auto:
        self.model = fasttext.train_supervised(
            input=train_interm_path,
            thread=CPUs,
            verbose=2,
            autotuneValidationFile=test_interm_path)
    else:
        self.model = fasttext.train_supervised(input=train_interm_path,
                                               **self.params)
    print("Train Time: ", round(time.time() - started, 3), " s")

    if qtz:
        started = time.time()
        self.model.quantize(train_interm_path, thread=CPUs, verbose=2,
                            retrain=True)
        print("Retrain Time: ", round(time.time() - started, 3), " s")

    # Training-set report.
    train_hits = self.model.predict(x_train)[0]
    y_train_pred = [hit[0] for hit in train_hits]
    print("train acc:")
    self._print_results(*self.model.test(train_interm_path))
    print("train label report:")
    print(metrics.classification_report_imbalanced(y_train, y_train_pred))

    # Test-set report; this one passes the model's own label list.
    test_hits = self.model.predict(x_test)[0]
    y_test_pred = [hit[0] for hit in test_hits]
    print("test acc:")
    self._print_results(*self.model.test(test_interm_path))
    print("test label report:")
    print(metrics.classification_report_imbalanced(
        y_test, y_test_pred, labels=self.model.labels))
    return self
def plot_metrics(parameters):
    """Report baseline scores next to scores on real + fake data.

    For both the baseline and the GAN-augmented predictions this prints the
    imbalanced classification report, accuracy, precision, recall and F1,
    then draws a 2x2 figure with both confusion matrices and both curves
    produced by ``plot_aucprc``.

    :param parameters: sequence of exactly five items, in this order:
        ``y_test_baseline``, ``y_pred_baseline``, ``scores_baseline``,
        ``y_pred_gan``, ``scores_gan``.  Both prediction sets are scored
        against ``y_test_baseline``.
    """
    (y_test_baseline, y_pred_baseline, scores_baseline,
     y_pred_gan, scores_gan) = parameters

    sections = (
        ('############################################# BASELINE REPORT #############################################',
         y_pred_baseline),
        ('############################################# GAN (DATA AUGMENTATION) REPORT ##############################',
         y_pred_gan),
    )
    for banner, y_pred in sections:
        print('\n', banner)
        print('Classification Report:', '\n',
              classification_report_imbalanced(y_test_baseline, y_pred))
        # scikit-learn metrics take (y_true, y_pred).  Passing them the
        # other way round swaps precision and recall.
        print('Accuracy score: {}'.format(
            accuracy_score(y_test_baseline, y_pred)))
        precision = precision_score(y_test_baseline, y_pred)
        print('Precision score: {}'.format(precision))
        recall = recall_score(y_test_baseline, y_pred)
        print('Recall score: {}'.format(recall))
        print('F1 score: {}'.format(compute_F1(precision, recall)))

    fig = plt.figure(figsize=(8, 8))
    fig.subplots_adjust(hspace=.5)

    plt.subplot(2, 2, 1)
    plot_cm(y_test_baseline, y_pred_baseline)
    plt.subplot(2, 2, 2)
    plot_cm(y_test_baseline, y_pred_gan)

    plt.subplot(2, 2, 3)
    plot_aucprc(y_test_baseline, scores_baseline)
    plt.subplot(2, 2, 4)
    plot_aucprc(y_test_baseline, scores_gan)

    plt.show()
def buildModel(X, y):
    """Fit five classifiers on a 70/30 split and print their test metrics.

    ``X`` is standardised, split with ``random_state=19`` and used to fit a
    balanced bagging classifier, a class-weighted SVC, an MLP, AdaBoost and
    a logistic regression.  Output is grouped by metric: first every
    Matthews correlation coefficient, then every confusion matrix, then
    every imbalanced classification report, always in that model order.

    :param X: feature matrix accepted by ``StandardScaler``.
    :param y: target vector aligned with ``X``.
    """
    # A single pre-formatted argument prints identically on Python 2 and 3.
    print("{} {}".format(X.shape, y.shape))

    scaler = StandardScaler()
    print(scaler.fit(X))
    # NOTE(review): the scaler is fitted on all of X before the split, so
    # test rows influence the scaling - confirm this is intended.
    scaled_train_x = scaler.transform(X)

    X_train, X_test, y_train, y_test = train_test_split(
        scaled_train_x, y, random_state=19, test_size=0.3)

    models = [
        BalancedBaggingClassifier(n_estimators=200, random_state=19),
        SVC(class_weight='balanced', random_state=19,
            decision_function_shape='ovo'),
        MLPClassifier(max_iter=500, random_state=19, solver='lbfgs',
                      alpha=1e-5, hidden_layer_sizes=(49, 8, 4)),
        AdaBoostClassifier(n_estimators=100, random_state=19),
        LogisticRegression(solver='lbfgs', max_iter=500),
    ]
    for model in models:
        model.fit(X_train, y_train)

    predictions = [model.predict(X_test) for model in models]

    for y_pred in predictions:
        print(matthews_corrcoef(y_test, y_pred))
    for y_pred in predictions:
        print(confusion_matrix(y_test, y_pred))
    for y_pred in predictions:
        print(classification_report_imbalanced(y_test, y_pred))
def report_scores(parameters):
    """Report accuracy, precision, recall and F1 scores.

    Always prints the imbalanced classification report and the four scalar
    scores.  When ``show_graph`` is truthy a two-panel figure is shown with
    the confusion matrix and the curve produced by ``plot_aucprc``;
    otherwise the confusion matrix is printed as text.

    :param parameters: sequence of exactly four items, in this order:
        ``y_test``, ``y_pred``, ``scores``, ``show_graph``.
    :return: None
    """
    y_test, y_pred, scores, show_graph = parameters

    print('Classification Report:', '\n',
          classification_report_imbalanced(y_test, y_pred))
    # scikit-learn metrics take (y_true, y_pred).  Passing them the other
    # way round swaps precision and recall.
    print('Accuracy score: {}'.format(accuracy_score(y_test, y_pred)))
    precision = precision_score(y_test, y_pred)
    print('Precision score: {}'.format(precision))
    recall = recall_score(y_test, y_pred)
    print('Recall score: {}'.format(recall))
    print('F1 score: {}'.format(compute_F1(precision, recall)))

    if show_graph:
        fig = plt.figure(figsize=(6, 5))
        fig.subplots_adjust(hspace=.5)

        plt.subplot(2, 1, 1)
        plot_cm(y_test, y_pred)

        plt.subplot(2, 1, 2)
        plot_aucprc(y_test, scores)

        plt.show()
    else:
        print('Confusion Matrix: ', '\n',
              confusion_matrix(y_test, y_pred), '\n')
def evaluate_model(model_str):
    """Fit the model named ``model_str`` and summarise its test scores.

    The train and test sets are concatenated, standardised together and
    re-split 70/30 without shuffling before the model is fitted.

    :param model_str: model identifier forwarded to ``fit_model`` and
        ``make_simple_report``.
    :return: tuple ``(simple, imb_report)`` - the output of
        ``make_simple_report`` and the imbalanced classification report.
    """
    loaded_x, loaded_y = load_train()
    held_x, held_y = load_test()
    features = np.concatenate([loaded_x, held_x], axis=0)
    targets = np.concatenate([loaded_y, held_y], axis=0)

    # NOTE(review): scaling is fitted on the pooled data before the split,
    # so held-out rows influence the scaler - confirm this is intended.
    features = StandardScaler().fit_transform(features)
    X, test_x, y, test_y = train_test_split(
        features, targets, test_size=0.3, shuffle=False)

    model = fit_model(X, y, model=model_str)
    y_hat = model.predict(test_x)

    # ROC-AUC from the second column of predict_proba.
    positive_scores = model.predict_proba(test_x)[:, 1]
    auc_score = roc_auc_score(test_y, positive_scores)

    # Mean 10-fold cross-validation score.  No ``scoring`` is passed, so
    # this is the estimator's default scorer, not AUC.
    mean_cv = np.mean(cross_val_score(model, X, y, cv=10))

    accu = accuracy_score(test_y, y_hat)
    balanced_accuracy = balanced_accuracy_score(test_y, y_hat)

    simple = make_simple_report(
        model_str, accu, balanced_accuracy, auc_score, mean_cv)
    imb_report = classification_report_imbalanced(test_y, y_hat)
    return simple, imb_report
def main():
    """Plot normalised confusion matrices for three resampling strategies.

    Each strategy function is called once; its imbalanced classification
    report is printed and its confusion matrix is drawn as a heatmap titled
    with the accuracy.  The figure is saved to ``different_resampling.pdf``
    and then shown.
    """
    fig, axes = plt.subplots(1, 3, sharey=True, sharex=True)

    experiments = (
        ('No resample', no_resample),
        ('Oversample', oversample),
        ('Undersample', undersample),
    )
    for ax, (title, model) in zip(axes.flat, experiments):
        y, y_pred, _ = model()
        print(title)
        print(imetrics.classification_report_imbalanced(y, y_pred))

        acc = metrics.accuracy_score(y, y_pred)
        cm = norm_cm(metrics.confusion_matrix(y, y_pred, labels=labels))
        cm = pd.DataFrame(cm, index=labels, columns=labels)
        sns.heatmap(
            cm,
            vmin=0,
            vmax=1,
            annot=True,
            fmt='.2f',
            cmap='Greys',
            ax=ax,
            cbar=False,
            square=True,
        )
        ax.set_title(f'{title}\naccuracy={acc:.3f}')

    # Class shares are taken from the targets of the last strategy run.
    count = class_counter(y)
    shares = [f'{key}: {count[key]*100:.1f}%' for key in labels]
    fig.suptitle('Population: ' + ', '.join(shares))
    fig.tight_layout()
    fig.savefig('./different_resampling.pdf', dpi=92, bbox_inches='tight')
    plt.show()
def svm(X_tr, Y_tr, X_te, Y_te):
    """Grid-search an RBF SVC, print its test metrics and return accuracy.

    Labels with more than one column are collapsed with ``argmax`` along
    axis 1; 1-D labels are used as they are.  The ROC curve is computed
    from the hard class predictions, not from decision scores.

    :param X_tr: training features.
    :param Y_tr: training labels (ndarray, 1-D or 2-D).
    :param X_te: test features.
    :param Y_te: test labels, same layout as ``Y_tr``.
    :return: accuracy on the test set.
    """
    # Imported locally because this function's own name ``svm`` hides any
    # module of the same name, so ``svm.SVC`` would look the attribute up
    # on the function and fail.
    from sklearn.svm import SVC

    # Check ``ndim`` first so 1-D label vectors do not raise IndexError.
    if Y_tr.ndim > 1 and Y_tr.shape[1] > 1:
        Y_tr = np.argmax(Y_tr, axis=1)
        Y_te = np.argmax(Y_te, axis=1)

    parameters = [{
        'kernel': ['rbf'],
        'gamma': [1e-3, 1e-2, 1e-1, 1],
        'C': [1]
    }]
    clf = GridSearchCV(SVC(), parameters, cv=5)
    clf.fit(X_tr, Y_tr)
    y_pred = clf.predict(X_te)

    acc = accuracy_score(Y_te, y_pred)
    fpr_vot, tpr_vot, _ = roc_curve(Y_te, y_pred, pos_label=1,
                                    drop_intermediate=False)
    roc_auc_vot = auc(fpr_vot, tpr_vot)

    print("SVM")
    print(classification_report_imbalanced(Y_te, y_pred))
    print(confusion_matrix(Y_te, y_pred))

    geo = geometric_mean_score(Y_te, y_pred)
    f1 = f1_score(Y_te, y_pred, average='micro')
    print('The geometric mean is {}'.format(geo))
    print('The auc is {}'.format(roc_auc_vot))
    print('The f1 is {}'.format(f1))
    return acc
def apply_ml_model(X_train_input, y_train_input, X_test_input, y_test_input):
    """Fit four classifiers and print each one's test-set evaluation.

    For logistic regression, a random forest, a decision tree and a
    balanced random forest (all with ``random_state=78``) this prints the
    confusion matrix as a 2x2 DataFrame, the balanced accuracy and the
    imbalanced classification report.

    :param X_train_input: training features.
    :param y_train_input: training targets.
    :param X_test_input: test features.
    :param y_test_input: test targets; the 2x2 confusion-matrix labels
        assume exactly two classes.
    """
    candidates = [
        ('LREG',
         LogisticRegression(solver='lbfgs', max_iter=500, random_state=78)),
        ('RFC',
         RandomForestClassifier(n_estimators=128, random_state=78)),
        ('Tree',
         tree.DecisionTreeClassifier(random_state=78)),
        ('Balanced RFC',
         BalancedRandomForestClassifier(n_estimators=128, random_state=78)),
    ]

    for model, model_select in candidates:
        model_select.fit(X_train_input, y_train_input)
        y_pred = model_select.predict(X_test_input)

        cm = confusion_matrix(y_test_input, y_pred)

        # Score against the y_test_input argument, not a module-level
        # ``y_test`` that may be undefined or hold different data.
        acc_score = balanced_accuracy_score(y_test_input, y_pred)

        print(f"Model: {model}")

        # Displaying results
        print("Confusion Matrix")
        cm_df = pd.DataFrame(
            cm,
            index=["Actual 0", "Actual 1"],
            columns=["Predicted 0", "Predicted 1"])
        print(cm_df)
        print(f"Accuracy Score : {acc_score}\n")
        print("Classification Report")
        print(classification_report_imbalanced(y_test_input, y_pred))
def svm(X_tr, Y_tr, X_te, Y_te):
    """Fit a ``LinearSVC`` and return its test AUC and per-sample latency.

    Features are first passed through ``normalize_data(..., "minmax")``.
    Labels with more than one column are collapsed with ``argmax`` along
    axis 1; 1-D labels are used as they are.  The ROC curve is computed
    from the hard class predictions, not from decision scores.

    :param X_tr: training features.
    :param Y_tr: training labels (ndarray, 1-D or 2-D).
    :param X_te: test features; must be non-empty, since the elapsed time
        is divided by ``len(X_te)``.
    :param Y_te: test labels, same layout as ``Y_tr``.
    :return: tuple ``(roc_auc, seconds_per_test_sample)``.
    """
    X_tr, X_te = normalize_data(X_tr, X_te, "minmax")

    # Check ``ndim`` first so 1-D label vectors do not raise IndexError.
    if Y_tr.ndim > 1 and Y_tr.shape[1] > 1:
        Y_tr = np.argmax(Y_tr, axis=1)
        Y_te = np.argmax(Y_te, axis=1)

    clf = LinearSVC(random_state=0)
    clf.fit(X_tr, Y_tr)

    # Only prediction is timed, averaged over the test samples.
    start = time.time()
    y_pred = clf.predict(X_te)
    end = time.time()
    elapsed = (end - start) / float(len(X_te))

    fpr_vot, tpr_vot, _ = roc_curve(Y_te, y_pred, pos_label=1,
                                    drop_intermediate=False)
    roc_auc_vot = auc(fpr_vot, tpr_vot)

    print("SVM")
    print('The auc is {} '.format(roc_auc_vot))
    return roc_auc_vot, elapsed
def test_classification_report_imbalanced_dict():
    """Check the key layout of the report returned with ``output_dict``."""
    iris = datasets.load_iris()
    y_true, y_pred, _ = make_prediction(dataset=iris, binary=False)

    class_ids = np.arange(len(iris.target_names))
    report = classification_report_imbalanced(
        y_true,
        y_pred,
        labels=class_ids,
        target_names=iris.target_names,
        output_dict=True,
    )

    # One entry per class label plus the aggregate entries.
    wanted_outer = {
        0,
        1,
        2,
        "avg_pre",
        "avg_rec",
        "avg_spe",
        "avg_f1",
        "avg_geo",
        "avg_iba",
        "total_support",
    }
    # Every per-class entry carries the seven metric columns.
    wanted_inner = {"spe", "f1", "sup", "rec", "geo", "iba", "pre"}

    assert set(report.keys()) == wanted_outer
    assert set(report[0].keys()) == wanted_inner
def randomforest(X_tr, Y_tr, X_te, Y_te):
    """Grid-search a random forest and report its test-set metrics.

    Labels with more than one column are collapsed with ``argmax`` along
    axis 1; 1-D labels are used as they are.  The grid covers
    ``n_estimators`` in ``[40, 100]``.  The ROC curve is computed from the
    hard class predictions, not from probabilities.

    :param X_tr: training features.
    :param Y_tr: training labels (ndarray, 1-D or 2-D).
    :param X_te: test features.
    :param Y_te: test labels, same layout as ``Y_tr``.
    :return: tuple ``(fitted_grid_search, fpr, tpr, roc_auc)``.
    """
    # Check ``ndim`` first so 1-D label vectors do not raise IndexError.
    if Y_tr.ndim > 1 and Y_tr.shape[1] > 1:
        Y_tr = np.argmax(Y_tr, axis=1)
        Y_te = np.argmax(Y_te, axis=1)

    rfc = RandomForestClassifier(n_jobs=-1, max_features='sqrt',
                                 n_estimators=40, oob_score=True)
    param_grid = {'n_estimators': [40, 100]}

    CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid)
    CV_rfc.fit(X_tr, Y_tr)
    y_pred = CV_rfc.predict(X_te)

    fpr_vot, tpr_vot, _ = roc_curve(Y_te, y_pred, pos_label=1,
                                    drop_intermediate=False)
    roc_auc_vot = auc(fpr_vot, tpr_vot)

    print(classification_report_imbalanced(Y_te, y_pred))
    print('The geometric mean is {}'.format(
        geometric_mean_score(Y_te, y_pred)))
    print('The auc is {}'.format(roc_auc_vot))
    print('The f1 is {}'.format(f1_score(Y_te, y_pred, average='weighted')))
    return CV_rfc, fpr_vot, tpr_vot, roc_auc_vot
def decisiontree(X_tr, Y_tr, X_te, Y_te):
    """Grid-search a decision tree, print test metrics, return accuracy.

    Labels with more than one column are collapsed with ``argmax`` along
    axis 1; 1-D labels are used as they are.  The grid covers ``max_depth``
    in ``[5, 6, 7, 8, 9, 10, 50, 100]``.  The ROC curve is computed from
    the hard class predictions, not from probabilities.

    :param X_tr: training features.
    :param Y_tr: training labels (ndarray, 1-D or 2-D).
    :param X_te: test features.
    :param Y_te: test labels, same layout as ``Y_tr``.
    :return: accuracy on the test set.
    """
    # Check ``ndim`` first so 1-D label vectors do not raise IndexError.
    if Y_tr.ndim > 1 and Y_tr.shape[1] > 1:
        Y_tr = np.argmax(Y_tr, axis=1)
        Y_te = np.argmax(Y_te, axis=1)

    param_grid = {'max_depth': [5, 6, 7, 8, 9, 10, 50, 100]}
    search = GridSearchCV(DecisionTreeClassifier(), param_grid)
    search.fit(X_tr, Y_tr)
    y_pred = search.predict(X_te)

    acc = accuracy_score(Y_te, y_pred)
    fpr_vot, tpr_vot, _ = roc_curve(Y_te, y_pred, pos_label=1,
                                    drop_intermediate=False)
    roc_auc_vot = auc(fpr_vot, tpr_vot)

    print("Decision tree")
    print(classification_report_imbalanced(Y_te, y_pred))
    print(confusion_matrix(Y_te, y_pred))

    geo = geometric_mean_score(Y_te, y_pred)
    f1 = f1_score(Y_te, y_pred, average='micro')
    print('The geometric mean is {}'.format(geo))
    print('The auc is {}'.format(roc_auc_vot))
    print('The f1 is {}'.format(f1))
    return acc
def decisiontree(X_tr, Y_tr, X_te, Y_te):
    """Grid-search a decision tree; return test AUC and per-sample latency.

    Labels with more than one column are collapsed with ``argmax`` along
    axis 1; 1-D labels are used as they are.  The grid covers ``max_depth``
    in ``np.arange(3, 6)``.  The ROC curve is computed from the hard class
    predictions, not from probabilities.

    :param X_tr: training features.
    :param Y_tr: training labels (ndarray, 1-D or 2-D).
    :param X_te: test features; must be non-empty, since the elapsed time
        is divided by ``len(X_te)``.
    :param Y_te: test labels, same layout as ``Y_tr``.
    :return: tuple ``(roc_auc, seconds_per_test_sample)``.
    """
    # Check ``ndim`` first so 1-D label vectors do not raise IndexError.
    if Y_tr.ndim > 1 and Y_tr.shape[1] > 1:
        Y_tr = np.argmax(Y_tr, axis=1)
        Y_te = np.argmax(Y_te, axis=1)

    param_grid = {'max_depth': np.arange(3, 6)}
    search = GridSearchCV(DecisionTreeClassifier(), param_grid)
    search.fit(X_tr, Y_tr)

    # Only prediction is timed, averaged over the test samples.
    start = time.time()
    y_pred = search.predict(X_te)
    end = time.time()
    elapsed = (end - start) / float(len(X_te))

    fpr_vot, tpr_vot, _ = roc_curve(Y_te, y_pred, pos_label=1,
                                    drop_intermediate=False)
    roc_auc_vot = auc(fpr_vot, tpr_vot)

    print("Decision tree")
    print('The auc is {} '.format(roc_auc_vot))
    return roc_auc_vot, elapsed
def test_classification_report_imbalanced_multiclass_with_unicode_label():
    """Check the report when class labels contain a non-ASCII character."""
    y_true, y_pred, _ = make_prediction(binary=False)

    labels = np.array([u"blue\xa2", u"green\xa2", u"red\xa2"])
    y_true, y_pred = labels[y_true], labels[y_pred]

    # On NumPy older than 1.7.0 the call must fail with RuntimeError.
    if np_version[:3] < (1, 7, 0):
        with pytest.raises(RuntimeError, match="NumPy < 1.7.0"):
            classification_report_imbalanced(y_true, y_pred)
        return

    wanted = (u'pre rec spe f1 geo iba sup blue¢ 0.83 0.79 0.92 0.81 '
              u'0.85 0.72 24 green¢ 0.33 0.10 0.86 0.15 0.29 0.08 31 '
              u'red¢ 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total '
              u'0.51 0.53 0.80 0.47 0.58 0.40 75')
    got = classification_report_imbalanced(y_true, y_pred)
    assert _format_report(got) == wanted
def makePipelineMultinomialNB(X_train, Y_train, X_test, Y_test):
    """Fit a TF-IDF + multinomial naive Bayes pipeline and print scores.

    Prints the test accuracy followed by the imbalanced classification
    report.
    """
    vectorizer = TfidfVectorizer()
    classifier = MultinomialNB()
    text_model = make_pipeline(vectorizer, classifier)
    text_model.fit(X_train, Y_train)

    predicted = text_model.predict(X_test)
    accuracy = accuracy_score(Y_test, predicted)
    print(accuracy)
    print(classification_report_imbalanced(Y_test, predicted))
def print_classification_report(clf, X_train, X_test, y_train, y_test):
    """Fit ``clf``, then print its class name and imbalanced report.

    The class name is printed between two rows of ``=`` of the same width,
    followed by the report for the predictions on ``X_test``.
    """
    clf.fit(X_train, y_train)
    report = classification_report_imbalanced(y_test, clf.predict(X_test))

    clf_name = clf.__class__.__name__
    div = '=' * len(clf_name)
    print(f'\n{div}\n{clf_name}\n{div}\n', report)
def test_classification_report_imbalanced_multiclass_with_unicode_label():
    """Check the report when class labels contain a non-ASCII character."""
    y_true, y_pred, _ = make_prediction(binary=False)

    labels = np.array([u"blue\xa2", u"green\xa2", u"red\xa2"])
    y_true, y_pred = labels[y_true], labels[y_pred]

    # On NumPy older than 1.7.0 the call must fail with RuntimeError.
    if np_version[:3] < (1, 7, 0):
        with raises(RuntimeError, match="NumPy < 1.7.0"):
            classification_report_imbalanced(y_true, y_pred)
        return

    wanted = (u'pre rec spe f1 geo iba sup blue\xa2 0.83 0.79 '
              u'0.92 0.81 0.86 0.74 24 green\xa2 0.33 0.10 0.86 '
              u'0.15 0.44 0.19 31 red\xa2 0.42 0.90 0.55 0.57 0.63 '
              u'0.37 20 avg / total 0.51 0.53 0.80 0.47 0.62 0.41 75')
    got = classification_report_imbalanced(y_true, y_pred)
    assert _format_report(got) == wanted
def test_classification_report_imbalanced_multiclass_with_unicode_label():
    """Check the report when class labels contain a non-ASCII character."""
    y_true, y_pred, _ = make_prediction(binary=False)

    labels = np.array(["blue\xa2", "green\xa2", "red\xa2"])
    y_true, y_pred = labels[y_true], labels[y_pred]

    # On NumPy older than 1.7.0 the call must fail with RuntimeError.
    if np_version[:3] < (1, 7, 0):
        with pytest.raises(RuntimeError, match="NumPy < 1.7.0"):
            classification_report_imbalanced(y_true, y_pred)
        return

    wanted = ('pre rec spe f1 geo iba sup blue¢ 0.83 0.79 0.92 0.81 '
              '0.85 0.72 24 green¢ 0.33 0.10 0.86 0.15 0.29 0.08 31 '
              'red¢ 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total '
              '0.51 0.53 0.80 0.47 0.58 0.40 75')
    got = classification_report_imbalanced(y_true, y_pred)
    assert _format_report(got) == wanted
def makePipelineBernoulliNB(X_train, Y_train, X_test, Y_test, binarize):
    """Fit a TF-IDF + Bernoulli naive Bayes pipeline and print scores.

    ``binarize`` is forwarded to ``BernoulliNB`` and echoed next to the
    test accuracy; the imbalanced classification report follows.
    """
    vectorizer = TfidfVectorizer()
    classifier = BernoulliNB(binarize=binarize)
    text_model = make_pipeline(vectorizer, classifier)
    text_model.fit(X_train, Y_train)

    predicted = text_model.predict(X_test)
    accuracy = accuracy_score(Y_test, predicted)
    print('binarize', binarize, accuracy)
    print(classification_report_imbalanced(Y_test, predicted))
def Print_Result_Metrics(labels_test, predicted, targetnames, model_name):
    '''
    Print a header naming the model, then the accuracy, the imbalanced
    classification report (using ``targetnames`` as class names) and the
    confusion matrix for ``predicted`` against ``labels_test``.
    '''
    print('\n- - - - - RESULT METRICS', model_name, '- - - - -')

    accuracy = metrics.accuracy_score(labels_test, predicted)
    print('Exact Accuracy: ', accuracy)

    report = classification_report_imbalanced(
        labels_test, predicted, target_names=targetnames)
    print(report)

    matrix = metrics.confusion_matrix(labels_test, predicted)
    print(matrix)
def makePipelineImpGaussianNB(X_train, Y_train, X_test, Y_test):
    """Fit TF-IDF -> random under-sampling -> Gaussian NB and print scores.

    Prints the test accuracy followed by the imbalanced classification
    report.

    :param X_train: training documents.
    :param Y_train: training labels.
    :param X_test: test documents.
    :param Y_test: test labels.
    """
    # Local import keeps the new dependency inside this function.
    from sklearn.preprocessing import FunctionTransformer

    def _to_dense(matrix):
        """Convert the sparse TF-IDF matrix to a dense array."""
        return matrix.toarray()

    # TfidfVectorizer emits a sparse matrix and GaussianNB rejects sparse
    # input, so a densifying step is required.  It sits after the sampler
    # so resampling still runs on the sparse matrix.
    pipe = make_pipeline_imb(
        TfidfVectorizer(),
        RandomUnderSampler(),
        FunctionTransformer(_to_dense, accept_sparse=True),
        GaussianNB(),
    )
    pipe.fit(X_train, Y_train)
    y_pred = pipe.predict(X_test)
    print(accuracy_score(Y_test, y_pred))
    print(classification_report_imbalanced(Y_test, y_pred))
def print_evaluation_results(y_test, predictions):
    """Print accuracy, Hamming loss, Jaccard score, confusion matrix and
    the imbalanced classification report, framed by blank lines."""
    print()
    print("!!!!!!!!!!!!!!!!!!!!! EVALUATION RESULTS !!!!!!!!!!!!!!!!!!!!!!")

    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy Score ", accuracy)

    hamming = hamming_loss(y_test, predictions)
    print("Hamming Loss ", hamming)

    jaccard = jaccard_similarity_score(y_test, predictions)
    print("Jaccard Similarity Score ", jaccard)

    print(confusion_matrix(y_test, predictions))
    print(classification_report_imbalanced(y_test, predictions))
    print()
def test_classification_report_imbalanced_multiclass_with_string_label():
    """Check the report for string labels, with and without renaming."""
    y_true, y_pred, _ = make_prediction(binary=False)

    palette = np.array(["blue", "green", "red"])
    y_true, y_pred = palette[y_true], palette[y_pred]

    # Rows are named after the string labels themselves.
    wanted_plain = ('pre rec spe f1 geo iba sup blue 0.83 0.79 0.92 0.81 '
                    '0.85 0.72 24 green 0.33 0.10 0.86 0.15 0.29 0.08 31 '
                    'red 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total '
                    '0.51 0.53 0.80 0.47 0.58 0.40 75')
    got_plain = classification_report_imbalanced(y_true, y_pred)
    assert _format_report(got_plain) == wanted_plain

    # ``target_names`` replaces the row names.
    wanted_renamed = ('pre rec spe f1 geo iba sup a 0.83 0.79 0.92 0.81 0.85 '
                      '0.72 24 b 0.33 0.10 0.86 0.15 0.29 0.08 31 c 0.42 '
                      '0.90 0.55 0.57 0.70 0.51 20 avg / total 0.51 0.53 '
                      '0.80 0.47 0.58 0.40 75')
    got_renamed = classification_report_imbalanced(
        y_true, y_pred, target_names=["a", "b", "c"])
    assert _format_report(got_renamed) == wanted_renamed
def evaluate_performance(y_pred, y_label):
    """Score predictions against labels, print a summary, return the scores.

    Prints the imbalanced classification report and the F1 / F0.5 values.

    :param y_pred: predicted labels.
    :param y_label: true labels.
    :return: tuple ``(precision, recall, f1, f05, conf, report)`` where
        ``conf`` is the confusion matrix divided by ``len(y_pred)``.
    """
    precision = precision_score(y_label, y_pred)
    recall = recall_score(y_label, y_pred)
    f1 = f1_score(y_label, y_pred)
    f05 = fbeta_score(y_label, y_pred, beta=0.5)

    raw_counts = confusion_matrix(y_label, y_pred)
    conf = raw_counts / len(y_pred)

    report = classification_report_imbalanced(y_true=y_label, y_pred=y_pred)
    print(report)
    print(f'f1: {f1} // f0.5: {f05}')

    return precision, recall, f1, f05, conf, report
def confusion_plot(pred, y_true):
    """Show a confusion-matrix heatmap and print the imbalanced report.

    The unique values of ``y_true`` are printed and used as the row and
    column labels of the matrix.

    :param pred: predicted labels.
    :param y_true: true labels.
    :return: the ``plt`` module object.
    """
    sns.set(rc={'figure.figsize': (5, 4)})

    fault_labels = np.unique(y_true)
    print(fault_labels)

    counts = confusion_matrix(y_true, pred, labels=fault_labels)
    sns.heatmap(
        pd.DataFrame(counts, index=fault_labels, columns=fault_labels),
        annot=True,
    )
    plt.show()

    truth_array = np.array(y_true)
    pred_array = np.array(pred)
    print(classification_report_imbalanced(truth_array, pred_array))
    return plt
def test_classification_report_imbalanced_multiclass_with_string_label():
    """Check the report for string labels, with and without renaming."""
    y_true, y_pred, _ = make_prediction(binary=False)

    palette = np.array(["blue", "green", "red"])
    y_true, y_pred = palette[y_true], palette[y_pred]

    # Rows are named after the string labels themselves.
    wanted_plain = ('pre rec spe f1 geo iba sup blue 0.83 0.79 0.92 '
                    '0.81 0.86 0.74 24 green 0.33 0.10 0.86 0.15 0.44 '
                    '0.19 31 red 0.42 0.90 0.55 0.57 0.63 0.37 20 '
                    'avg / total 0.51 0.53 0.80 0.47 0.62 0.41 75')
    got_plain = classification_report_imbalanced(y_true, y_pred)
    assert_equal(_format_report(got_plain), wanted_plain)

    # ``target_names`` replaces the row names.
    wanted_renamed = ('pre rec spe f1 geo iba sup a 0.83 0.79 0.92 0.81 '
                      '0.86 0.74 24 b 0.33 0.10 0.86 0.15 0.44 0.19 31 '
                      'c 0.42 0.90 0.55 0.57 0.63 0.37 20 avg / total '
                      '0.51 0.53 0.80 0.47 0.62 0.41 75')
    got_renamed = classification_report_imbalanced(
        y_true, y_pred, target_names=["a", "b", "c"])
    assert_equal(_format_report(got_renamed), wanted_renamed)
def test_classification_report_imbalanced_multiclass_with_long_string_label():
    """Check the report when one class label is a long string."""
    y_true, y_pred, _ = make_prediction(binary=False)

    labels = np.array(["blue", "green" * 5, "red"])
    y_true, y_pred = labels[y_true], labels[y_pred]

    report_text = _format_report(
        classification_report_imbalanced(y_true, y_pred))
    assert report_text == (
        'pre rec spe f1 geo iba sup blue 0.83 0.79 0.92 0.81 '
        '0.85 0.72 24 greengreengreengreengreen 0.33 0.10 '
        '0.86 0.15 0.29 0.08 31 red 0.42 0.90 0.55 0.57 0.70 '
        '0.51 20 avg / total 0.51 0.53 0.80 0.47 0.58 0.40 75'
    )
def test_classification_report_imbalanced_multiclass():
    """Compare the multiclass report text with and without class names."""
    iris = datasets.load_iris()
    y_true, y_pred, _ = make_prediction(dataset=iris, binary=False)

    # Report built with explicit labels and class names.
    class_ids = np.arange(len(iris.target_names))
    wanted_named = ('pre rec spe f1 geo iba sup setosa 0.83 0.79 0.92 '
                    '0.81 0.85 0.72 24 versicolor 0.33 0.10 0.86 0.15 '
                    '0.29 0.08 31 virginica 0.42 0.90 0.55 0.57 0.70 '
                    '0.51 20 avg / total 0.51 0.53 0.80 0.47 0.58 0.40 75')
    got_named = classification_report_imbalanced(
        y_true,
        y_pred,
        labels=class_ids,
        target_names=iris.target_names)
    assert _format_report(got_named) == wanted_named

    # Report built with labels detected from the data.
    wanted_detected = ('pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 '
                       '0.85 0.72 24 1 0.33 0.10 0.86 0.15 0.29 0.08 31 '
                       '2 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total '
                       '0.51 0.53 0.80 0.47 0.58 0.40 75')
    got_detected = classification_report_imbalanced(y_true, y_pred)
    assert _format_report(got_detected) == wanted_detected
# Compare a plain BaggingClassifier with a BalancedBaggingClassifier on the
# ``ozone`` dataset (loaded earlier in the script, outside this excerpt).
X, y = ozone.data, ozone.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Both ensembles share the same seed so only the balancing differs.
bagging = BaggingClassifier(random_state=0)
balanced_bagging = BalancedBaggingClassifier(random_state=0)

print('Class distribution of the training set: {}'.format(Counter(y_train)))

bagging.fit(X_train, y_train)
balanced_bagging.fit(X_train, y_train)

print('Class distribution of the test set: {}'.format(Counter(y_test)))

# Report and confusion matrix for the plain bagging classifier.
print('Classification results using a bagging classifier on imbalanced data')
y_pred_bagging = bagging.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred_bagging))
cm_bagging = confusion_matrix(y_test, y_pred_bagging)
# A fresh figure is opened before each confusion-matrix plot.
plt.figure()
plot_confusion_matrix(cm_bagging, classes=np.unique(ozone.target), title='Confusion matrix using BaggingClassifier')

# Report and confusion matrix for the balanced bagging classifier.
print('Classification results using a bagging classifier on balanced data')
y_pred_balanced_bagging = balanced_bagging.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred_balanced_bagging))
cm_balanced_bagging = confusion_matrix(y_test, y_pred_balanced_bagging)
plt.figure()
plot_confusion_matrix(cm_balanced_bagging, classes=np.unique(ozone.target), title='Confusion matrix using BalancedBaggingClassifier')

###############################################################################
# Turning the balanced bagging classifier into a balanced random forest
# Excerpt of a script: ``load_iris`` and ``Counter`` are used below but are
# imported outside this excerpt.
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from imblearn.datasets import make_imbalance
from imblearn.under_sampling import NearMiss
from imblearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced

print(__doc__)

RANDOM_STATE = 42

# Load iris and make it imbalanced: 25 samples of class 0, 50 each of
# classes 1 and 2.
# NOTE(review): newer imbalanced-learn releases renamed ``ratio`` to
# ``sampling_strategy`` - confirm against the installed version.
iris = load_iris()
X, y = make_imbalance(iris.data, iris.target, ratio={0: 25, 1: 50, 2: 50}, random_state=0)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE)

print('Training target statistics: {}'.format(Counter(y_train)))
print('Testing target statistics: {}'.format(Counter(y_test)))

# Create a pipeline: NearMiss (version 2) under-sampling, then a linear SVC.
pipeline = make_pipeline(NearMiss(version=2, random_state=RANDOM_STATE), LinearSVC(random_state=RANDOM_STATE))
pipeline.fit(X_train, y_train)

# Classify and report the results
print(classification_report_imbalanced(y_test, pipeline.predict(X_test)))
# Excerpt of a script: ``datasets``, ``LinearSVC`` and ``train_test_split``
# are used below but are imported outside this excerpt.
# Note that ``os`` here is an alias for imblearn.over_sampling, not the
# standard-library module.
from imblearn import over_sampling as os
from imblearn import pipeline as pl
from imblearn.metrics import classification_report_imbalanced

print(__doc__)

RANDOM_STATE = 42

# Generate a two-class dataset of 5000 samples with class weights 0.1 / 0.9.
X, y = datasets.make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=10, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=4, n_samples=5000, random_state=RANDOM_STATE)

# SMOTE over-sampling followed by a linear SVC.
pipeline = pl.make_pipeline(os.SMOTE(random_state=RANDOM_STATE), LinearSVC(random_state=RANDOM_STATE))

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE)

# Train the classifier with balancing
pipeline.fit(X_train, y_train)

# Test the classifier and get the prediction
y_pred_bal = pipeline.predict(X_test)

# Show the classification report
print(classification_report_imbalanced(y_test, y_pred_bal))