def LogisticGridSearch_OLD():
    """Grid-search the L1 logistic-regression ``C`` by ROC AUC.

    One ``LogisticRegression(penalty='l1')`` is fit per candidate ``C`` on the
    module-level ``f_train``/``y_train`` and scored on ``f_test``/``y_test``;
    every ROC curve is drawn through ``st.plotROC``.  The candidate with the
    highest AUC (the first one on ties) is then refit, drawn with a legend
    entry, and the plot is shown.

    Returns:
        The classifier refit with the best ``C``.
    """
    # Original author's note: C=1 is best
    title = 'Grid Search - Logistic Regression ROC Curve'
    cs = 10.0**np.arange(-1, 2, 0.25)

    aucs = []
    for c in cs:
        clf = LogisticRegression(penalty='l1', C=c).fit(f_train, y_train)
        probs = clf.predict_proba(f_test)
        fpr, tpr, _ = roc_curve(y_true=y_test, y_score=probs[:, 1])
        roc_auc = auc(fpr, tpr)
        st.plotROC(fpr, tpr, roc_auc, figure=False, show=False,
                   returnplt=True, showlegend=False, title=title)
        aucs.append(roc_auc)

    # Index of the highest AUC; max() keeps the first winner on ties, exactly
    # like a manual "strictly greater" scan.
    best = max(range(len(aucs)), key=aucs.__getitem__)
    c = cs[best]

    clf = LogisticRegression(penalty='l1', C=c).fit(f_train, y_train)
    probs = clf.predict_proba(f_test)
    fpr, tpr, _ = roc_curve(y_true=y_test, y_score=probs[:, 1])
    # Recompute the AUC for the refit model: the value left over from the
    # loop belongs to the *last* candidate, not the best one.
    roc_auc = auc(fpr, tpr)
    myplt = st.plotROC(fpr, tpr, roc_auc, legendlabel='Best C = %0.2e' % c,
                       figure=False, show=False, returnplt=True,
                       showlegend=True, title=title)
    myplt.show()
    return clf
def MultinomialNaiveBayesGridSearch_OLD():
    """Grid-search the ``MultinomialNB`` smoothing ``alpha`` by ROC AUC.

    One model is fit per candidate ``alpha`` on the module-level
    ``f_train``/``y_train`` and scored on ``f_test``/``y_test``; every ROC
    curve is drawn through ``st.plotROC``.  The candidate with the highest
    AUC (the first one on ties) is refit, drawn with a legend entry, and the
    plot is shown.

    Returns:
        The classifier refit with the best ``alpha``.
    """
    # Original author's note (wording apparently copied from the logistic
    # search): "C=1 is best".
    title = 'Grid Search - Multinomial Naive Bayes ROC Curve'
    cs = 10.0**np.arange(-9, 2, 0.5)

    aucs = []
    for c in cs:
        clf = MultinomialNB(alpha=c).fit(f_train, y_train)
        probs = clf.predict_proba(f_test)
        fpr, tpr, _ = roc_curve(y_true=y_test, y_score=probs[:, 1])
        roc_auc = auc(fpr, tpr)
        st.plotROC(fpr, tpr, roc_auc, figure=False, show=False,
                   returnplt=True, showlegend=False, title=title)
        aucs.append(roc_auc)

    # First index holding the highest AUC.
    best = max(range(len(aucs)), key=aucs.__getitem__)
    c = cs[best]

    clf = MultinomialNB(alpha=c).fit(f_train, y_train)
    probs = clf.predict_proba(f_test)
    fpr, tpr, _ = roc_curve(y_true=y_test, y_score=probs[:, 1])
    # The loop variable still holds the AUC of the last candidate; recompute
    # it so the highlighted curve is labelled with its own AUC.
    roc_auc = auc(fpr, tpr)
    myplt = st.plotROC(fpr, tpr, roc_auc, legendlabel='Best alpha = %0.2e' % c,
                       figure=False, show=False, returnplt=True,
                       showlegend=True, title=title)
    myplt.show()
    return clf
def svm_model(train_x_pca_df, train_y, validation_x_pca_df, validation_y):
    """Fit a linear SVM with and without class weighting and plot both ROC curves.

    Both models use ``C=0.001`` with probability estimates enabled; the
    weighted one additionally sets ``class_weight="auto"``.  Each is trained
    on the training data, scored on the validation data, and the two
    validation ROC curves (with their AUC in the legend) are shown in one
    figure.  Nothing is returned.
    """
    common = dict(C=0.001, kernel="linear", probability=True)
    plain_svm = svm.SVC(**common)
    weighted_svm = svm.SVC(class_weight="auto", **common)

    # Positive-class probabilities on the validation set (unweighted model
    # is fitted first, then the weighted one).
    plain_scores = plain_svm.fit(train_x_pca_df, train_y).predict_proba(validation_x_pca_df)[:, 1]
    weighted_scores = weighted_svm.fit(train_x_pca_df, train_y).predict_proba(validation_x_pca_df)[:, 1]

    plain_auc = roc_auc_score(validation_y, plain_scores)
    weighted_auc = roc_auc_score(validation_y, weighted_scores)

    weighted_fpr, weighted_tpr, _ = roc_curve(validation_y, weighted_scores)
    plain_fpr, plain_tpr, _ = roc_curve(validation_y, plain_scores)

    figure = plt.figure()
    axes = figure.add_subplot(111)
    axes.set_xlabel("False Positive Rate")
    axes.set_ylabel("True Positive Rate")
    plt.title("ROC Curve for SVM with & without setting class weight")
    plt.plot(weighted_fpr, weighted_tpr, "grey", lw=2.0,
             label="SVM_w ({0:.3f})".format(weighted_auc))
    plt.plot(plain_fpr, plain_tpr, "g", lw=2.0,
             label="SVM_uw ({0:.3f})".format(plain_auc))
    plt.legend(loc=4)
    plt.show()
def SGDGridSearch_OLD():
    """Grid-search the L1 ``SGDClassifier`` regularisation ``alpha`` by ROC AUC.

    One model is fit per candidate ``alpha`` on the module-level
    ``f_train``/``y_train``; its ``decision_function`` on ``f_test`` is scored
    against ``y_test`` and the ROC curve is drawn through ``st.plotROC``.
    The candidate with the highest AUC (the first one on ties) is refit,
    drawn with a legend entry, and the plot is shown.

    Returns:
        ``(clf, aucs)``: the classifier refit with the best ``alpha`` and the
        list of AUC values, one per candidate, in grid order.
    """
    # Original author's note (wording apparently copied from the logistic
    # search): "C=1 is best".
    title = 'Grid Search - SGD Classifier ROC Curve'
    cs = 10.0**np.arange(-9, 9, 1)

    aucs = []
    for c in cs:
        clf = SGDClassifier(penalty='l1', alpha=c).fit(f_train, y_train)
        probs = clf.decision_function(f_test)
        fpr, tpr, _ = roc_curve(y_true=y_test, y_score=probs)
        roc_auc = auc(fpr, tpr)
        st.plotROC(fpr, tpr, roc_auc, figure=False, show=False,
                   returnplt=True, showlegend=False, title=title)
        aucs.append(roc_auc)

    # First index holding the highest AUC.
    best = max(range(len(aucs)), key=aucs.__getitem__)
    c = cs[best]

    clf = SGDClassifier(penalty='l1', alpha=c).fit(f_train, y_train)
    probs = clf.decision_function(f_test)
    fpr, tpr, _ = roc_curve(y_true=y_test, y_score=probs)
    # Recompute for the refit model; the leftover loop value is the AUC of
    # the last candidate and would mislabel the highlighted curve.
    roc_auc = auc(fpr, tpr)
    myplt = st.plotROC(fpr, tpr, roc_auc, legendlabel='Best C = %0.2e' % c,
                       figure=False, show=False, returnplt=True,
                       showlegend=True, title=title)
    myplt.show()
    return clf, aucs
def test_roc_curve_hard():
    """Check ``roc_curve`` on hard decisions: constant ones, constant zeros, real predictions."""
    y_true, pred, probas_pred = make_prediction(binary=True)

    scenarios = (
        (np.ones(y_true.shape), 0.50),   # always predict one
        (np.zeros(y_true.shape), 0.50),  # always predict zero
        (pred, 0.78),                    # hard decisions
    )
    for decisions, expected_auc in scenarios:
        fpr, tpr, thresholds = roc_curve(y_true, decisions)
        assert_array_almost_equal(auc(fpr, tpr), expected_auc, decimal=2)
        # The three returned arrays must always be aligned.
        assert_equal(fpr.shape, tpr.shape)
        assert_equal(fpr.shape, thresholds.shape)
def run_regression(X_train, Y_train, X_test, Y_test, lead, lag):
    """Train a scaled logistic regression and report train/test/CV ROC AUC.

    A ``StandardScaler`` + ``LogisticRegression`` pipeline is cross-validated
    (10 folds, ``roc_auc`` scoring) on the training data, then fit on all of
    it.  AUC is computed with label ``0`` as the positive class.  Any AUC
    that cannot be computed is reported as ``0.0`` (best effort).

    Args:
        X_train, Y_train: training features and labels.
        X_test, Y_test: held-out features and labels.
        lead, lag: only used in the progress message.

    Returns:
        ``(auc_train, auc_test, auc_crossval)``.

    Raises:
        ValueError: if label ``0`` is not among the fitted classes.
    """
    num_crossval = 10
    start_time = time.time()
    logreg = Pipeline([('scale', StandardScaler()),
                       ('logreg', linear_model.LogisticRegression())])

    # Cross-validation is best effort; ``except Exception`` (rather than a
    # bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
    try:
        auc_crossval = np.mean(cross_validation.cross_val_score(
            logreg, np.array(X_train), np.array(Y_train),
            scoring='roc_auc', cv=num_crossval))
    except Exception:
        auc_crossval = 0.0

    logreg.fit(X_train, Y_train)

    desired_label = 0  # want to predict if student will dropout
    desired_label_index = logreg.steps[-1][1].classes_.tolist().index(desired_label)

    def _auc_on(features, targets):
        """Return the AUC for ``desired_label`` on one split, or 0.0 on failure."""
        try:
            predicted_probs = logreg.predict_proba(features)
            fpr, tpr, _ = roc_curve(targets,
                                    predicted_probs[:, desired_label_index],
                                    pos_label=desired_label)
            return float(auc(fpr, tpr))
        except Exception:
            return 0.0

    auc_train = _auc_on(X_train, Y_train)
    auc_test = _auc_on(X_test, Y_test)

    print("ran logistic regression for lead %s lag %s in %s seconds" % (lead, lag, time.time() - start_time))
    return (auc_train, auc_test, auc_crossval)
def eva(fff1, fff2, fff3, fff4, rocfile):
    """Plot the ROC curves of two prediction files and save the figure.

    Args:
        fff1: truth file for the first curve; the label is the first
            space-separated token of every line.
        fff2: prediction file for the first curve, one score per line.
        fff3: truth file for the second curve (same format as ``fff1``).
        fff4: prediction file for the second curve (same format as ``fff2``).
        rocfile: path the figure is saved to.

    The AUC of each curve is printed as it is computed.
    """
    def _roc_from_files(truth_path, pred_path):
        """Read labels/scores, print the AUC, and return ``(fpr, tpr)``."""
        # ``with`` closes the handles; they used to be left open.
        with open(truth_path) as truth:
            y = [float(line.split(' ', 1)[0]) for line in truth]
        with open(pred_path) as pred:
            p = [float(line) for line in pred]
        fpr, tpr, _ = roc_curve(y, p, pos_label=1)
        print(auc(fpr, tpr))
        return fpr, tpr

    fpr, tpr = _roc_from_files(fff1, fff2)

    plt.figure(figsize=(4, 4), dpi=80)
    x = [0.0, 1.0]
    plt.plot(x, x, linestyle='dashed', color='red', linewidth=2, label='random')
    plt.xlim(0.0, 1.0)
    plt.ylim(0.0, 1.0)
    plt.xlabel("FPR", fontsize=14)
    plt.ylabel("TPR", fontsize=14)
    plt.title("ROC Curve", fontsize=14)
    plt.plot(fpr, tpr, linewidth=2, label="adaboost_fea1")

    fpr, tpr = _roc_from_files(fff3, fff4)
    plt.plot(fpr, tpr, linewidth=2, label="adaboost_fea2")

    plt.legend(fontsize=10, loc='best')
    plt.tight_layout()
    plt.savefig(rocfile)
def roc_calculation(y_pred, y_test, model, type = sys.argv[2]):
    """Plot one-vs-rest ROC curves for every class, save the figure, then show it.

    Args:
        y_pred: predicted values passed to ``roc_curve`` as the score.
        y_test: true labels.
        model: name fragment used in the output file name.
        type: ``'gender'`` plots classes 0-1 labelled from ``GENDER_CLASSES``;
            anything else plots classes 0-4 labelled from ``AGE_CLASSES``.
            NOTE: the default is read from ``sys.argv[2]`` once, when the
            function is defined.
    """
    plt.figure()
    if type == 'gender':
        class_names, class_ids = GENDER_CLASSES, [0, 1]
    else:
        class_names, class_ids = AGE_CLASSES, [0, 1, 2, 3, 4]

    for i in class_ids:
        fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred, pos_label=i)
        roc_auc = metrics.auc(fpr, tpr)
        plt.plot(fpr, tpr,
                 label='ROC curve of class {0} (area = {1:0.2f})'
                       ''.format(class_names[i], roc_auc))

    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    # Save BEFORE showing: once show() returns the figure has usually been
    # torn down and savefig would write an empty image.
    plt.savefig('experiments/fensemble-roc-'+model+'.png')
    plt.show()
def main(): (X,y) = skd.make_classification() N = X.shape[0] X = np.append(X,np.ones((N,1)),axis=1) y = 2*y-1 skf = StratifiedKFold(y,5) for train,test in skf: X_train = X[train,:] y_train = y[train] X_test = X[test,:] y_test = y[test] C = 0.01 # dual co-ordinate descent SVM clf = SVMCD(C) clf.fit(X_train,y_train,w_prior=np.ones(21)) pred = clf.decision_function(X_test) score = clf.score(X_test,y_test) fpr, tpr, thresholds = metrics.roc_curve(y_test, pred) print score, metrics.auc(fpr, tpr), "//", w1 = clf.w; # standard svm clf = SVC(C=C,kernel='linear') clf.fit(X_train, y_train) pred = clf.decision_function(X_test) score = clf.score(X_test,y_test) fpr, tpr, thresholds = metrics.roc_curve(y_test, pred) print score, metrics.auc(fpr, tpr) w2 = clf.coef_ w2.shape = (21,)
def plotROC(y_score, labels, outpdf):
    """Plot the ROC curves of the first four score columns and save them.

    Args:
        y_score: 2-D array of scores, one column per class/classifier.
        labels: 2-D binary indicator array with the same layout.
        outpdf: path passed to ``savefig``.

    Curve names come from the module-level ``classifiers`` sequence.  The
    figure is saved and then shown.
    """
    n_classes = labels.shape[1]

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(labels[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Micro-average over all columns (computed but not plotted).
    fpr["micro"], tpr["micro"], _ = roc_curve(labels.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Create exactly one figure.  A preceding bare ``plt.figure()`` used to
    # leave an empty window that ``plt.show()`` displayed as well.
    plt.figure(figsize = (6,6))

    # Plot ROC curve
    for i in range(4):
        plt.plot(fpr[i], tpr[i],
                 label='' + classifiers[i] + ' AUC={1:0.2f}'.format(i, roc_auc[i]))
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False positive rate(1-Specificity)')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc="lower right")
    savefig(outpdf)
    plt.show()
def makeROCPlot(self, filename, title, labels, roc_data):
    """Plot per-class and micro-averaged ROC curves and save them as a PNG.

    Args:
        filename: base name; the plot is written to ``figs/<filename>.png``.
        title: figure title.
        labels: labels converted with ``self.create_binary_label_matrix``.
        roc_data: 2-D array of scores, one column per class.

    Returns:
        dict mapping each class index and ``"micro"`` to its AUC.
    """
    y = np.array(self.create_binary_label_matrix(labels))
    n_classes = y.shape[1]

    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y[:, i], roc_data[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    fpr["micro"], tpr["micro"], _ = roc_curve(y.ravel(), roc_data.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Plot ROC curve
    fig = plt.figure()
    plt.plot(fpr["micro"], tpr["micro"],
             label='Average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]))
    for i in range(n_classes):
        plt.plot(fpr[i], tpr[i],
                 label='ROC curve of class {0} (area = {1:0.2f})'.format(i+1, roc_auc[i]))
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc="lower right")
    plt.savefig("figs/"+filename+'.png', bbox_inches='tight')
    # Close (not merely clear) the figure: every call creates a new one, so
    # clearing alone lets figures pile up in pyplot across calls.
    plt.close(fig)
    return roc_auc
def calculate_roc(truth, predictions):
    """Compute per-class, micro- and macro-averaged ROC curves.

    The last column of ``truth`` and of ``predictions`` (both indexed with
    ``.iloc``) is cast to int and binarised over ``np.arange(n_classes)``,
    where ``n_classes`` is a module-level value.

    Returns:
        ``(fpr, tpr, roc_auc)``: dicts keyed by class index plus ``"micro"``
        and ``"macro"``.
    """
    class_ids = np.arange(n_classes)
    # ``classes`` is passed by keyword so the call does not depend on its
    # position in the signature.
    lb_truth = label_binarize(truth.iloc[:, -1].astype(int), classes=class_ids)
    lb_prediction = label_binarize(predictions.iloc[:, -1].astype(int), classes=class_ids)

    # Compute ROC curve and ROC area for each class.  The bound is
    # ``n_classes`` (it used to be ``len(letter_set)``) so this loop fills
    # exactly the keys that the macro-average below reads.
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(lb_truth[:, i], lb_prediction[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(lb_truth.ravel(), lb_prediction.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Macro-average: pool every false-positive rate, interpolate each class
    # curve on that common grid, and average the results.
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
    return fpr, tpr, roc_auc
def compute_rocauc(self):
    """Store per-class and micro-averaged ROC data in ``self.report["roc_auc"]``.

    Scores come from ``self.clf.predict_proba(self.X_test)``; the true labels
    ``self.y_test`` are binarised over ``range(self.n_classes)``.  The stored
    entry has the keys ``fpr``, ``tpr`` and ``roc_auc``; each maps the
    stringified class index (and ``"micro"``) to plain-Python values.

    :return: None
    """
    class_ids = list(range(self.n_classes))
    truth = label_binarize(self.y_test, classes=class_ids)
    scores = self.clf.predict_proba(self.X_test)

    fpr, tpr, roc_auc = {}, {}, {}
    for idx in class_ids:
        fpr[idx], tpr[idx], _ = roc_curve(truth[:, idx], scores[:, idx])
        roc_auc[idx] = auc(fpr[idx], tpr[idx])

    # Micro-average: every (sample, class) pair counts as one decision.
    fpr["micro"], tpr["micro"], _ = roc_curve(truth.ravel(), scores.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    def _as_lists(mapping):
        # String keys, values converted with .tolist().
        return {str(key): value.tolist() for key, value in mapping.items()}

    self.report["roc_auc"] = dict(
        fpr=_as_lists(fpr),
        tpr=_as_lists(tpr),
        roc_auc=_as_lists(roc_auc),
    )
def classification_metrics(targets, preds, probs=None):
    """Compute standard binary-classification metrics.

    Args:
        targets: true labels.
        preds: predicted labels.
        probs: optional 2-D array of class probabilities; when given, column
            1 is used as the ROC score, otherwise ``preds`` is.

    Returns:
        ``(acc, sens, spec, prec, f1, fpr, tpr, roc_auc)``.  ``spec`` is the
        true-negative rate ``tn / (tn + fp)`` taken from a 2x2 confusion
        matrix and ``0.0`` when the matrix has another size.
    """
    # ``is not None``: ``!=`` on an ndarray compares element-wise and cannot
    # be used as a condition.
    scores = probs[:, 1] if probs is not None else preds
    fpr, tpr, thresholds = roc_curve(targets, scores, pos_label=1)
    roc_auc = auc(fpr, tpr)

    cm = confusion_matrix(targets, preds)
    acc = accuracy_score(targets, preds)
    # True Positive Rate / Sensitivity / Recall
    sens = recall_score(targets, preds)
    prec = precision_score(targets, preds)
    f1 = f1_score(targets, preds, labels=np.unique(targets), pos_label=1)

    # True Negative Rate / Specificity.  Defaulting to 0.0 keeps the return
    # statement valid when the confusion matrix is not 2x2 (``spec`` used to
    # be undefined in that case and raised NameError).
    spec = 0.0
    if len(cm) == 2:
        spec = float(cm[0, 0]) / (cm[0, 0] + cm[0, 1])

    return acc, sens, spec, prec, f1, fpr, tpr, roc_auc
def GetReport(model, PlotROC, X_test, y_test):
    """Print a classification report, accuracy and AUC; optionally plot the ROC curve.

    When the first label is the string ``'0b0'`` or ``'0b1'`` the labels are
    parsed as base-2 integers before the ROC computation.  The ROC score is
    column 1 of ``model.predict_proba(X_test)``.

    Returns:
        ``(accuracy, confusion_matrix, report_text, roc_auc)``.
    """
    accuracy = model.score(X_test, y_test)
    y_hat = model.predict(X_test)
    conf_mat = metrics.confusion_matrix(y_test, y_hat)
    report = metrics.classification_report(y_test, y_hat)

    if (y_test[0]=='0b0') | (y_test[0]=='0b1'):
        # Labels are binary-literal strings; convert them element-wise.
        roc_truth = np.vectorize(int)(y_test, 2)
    else:
        roc_truth = y_test
    fp_rate, tp_rate, _ = roc_curve(roc_truth, model.predict_proba(X_test)[:,1])
    roc_auc = auc(fp_rate, tp_rate)

    print(report)
    print('Accuracy = '+str(accuracy))
    print('AUC = '+str(roc_auc))

    if PlotROC:
        plt.figure(11)
        plt.clf()
        plt.title('Receiver Operating Characteristic')
        plt.plot(fp_rate, tp_rate, 'b', label='AUC = %0.2f'% roc_auc)
        plt.legend(loc='lower right', fontsize=20)
        plt.plot([0,1],[0,1],'r--')
        plt.xlim([-0.05,1.05])
        plt.ylim([-0.05,1.05])
        plt.ylabel('True Positive Rate', fontsize=20)
        plt.xlabel('False Positive Rate', fontsize=20)
        plt.xticks(fontsize=12)
        plt.yticks(fontsize=12)
        plt.show()

    return accuracy, conf_mat, report, roc_auc
def CV(clf, X, y, n_folds=10):
    """Cross-validate ``clf`` and return its Gini values together with the classifier.

    Each stratified fold is scored by ROC AUC on column 1 of
    ``predict_proba``.  If anything raises during the fold loop, one extra
    score from a fixed 70/30 hold-out split is appended instead.  Gini is
    ``2 * AUC - 1``.

    Returns:
        ``(gini, clf)``: list of Gini values and the (last fitted) classifier.
    """
    from sklearn.cross_validation import StratifiedKFold, train_test_split
    from sklearn.metrics import roc_curve, auc
    import pandas as pd

    folds = StratifiedKFold(y, n_folds=n_folds)
    auccka = []
    try:
        for train_ix, test_ix in folds:
            clf.fit(X.ix[train_ix, :], y[train_ix])
            fold_scores = clf.predict_proba(X.ix[test_ix, :])[:, 1]
            fpr, tpr, _ = roc_curve(y[test_ix], fold_scores)
            auccka.append(auc(fpr, tpr))
    except Exception:
        # e.g. when log(0) comes up in some fold
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=123)
        # adjustment: train_test_split returns ndarrays, but DataFrames with
        # the original column names are wanted here
        X_train = pd.DataFrame(X_train)
        X_test = pd.DataFrame(X_test)
        X_train.columns = X.columns
        X_test.columns = X.columns
        # end of adjustment
        clf.fit(X_train, y_train)
        holdout_scores = clf.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, holdout_scores)
        auccka.append(auc(fpr, tpr))

    gini = [2*fold_auc-1 for fold_auc in auccka]
    return gini, clf
def AUC(test_labels, predicted_labels, n_classes):
    """Compute per-class, micro- and macro-averaged ROC AUC.

    ``test_labels`` is expanded with ``testProbVector(n_classes, ...)`` and
    compared column by column against ``predicted_labels``.

    Returns:
        ``np.asarray`` applied to the dict that maps each class index,
        ``"micro"`` and ``"macro"`` to its AUC.
    """
    truth = testProbVector(n_classes, test_labels)
    class_ids = range(n_classes)

    fpr, tpr, roc_auc = {}, {}, {}
    for c in class_ids:
        fpr[c], tpr[c], _ = roc_curve(truth[:, c], predicted_labels[:, c])
        roc_auc[c] = auc(fpr[c], tpr[c])

    # Micro-average: flatten all columns into one binary problem.
    fpr["micro"], tpr["micro"], _ = roc_curve(truth.ravel(), predicted_labels.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Macro-average: interpolate every class curve on the union of all
    # false-positive rates, then take the mean.
    fpr_grid = np.unique(np.concatenate([fpr[c] for c in class_ids]))
    tpr_mean = np.zeros_like(fpr_grid)
    for c in class_ids:
        tpr_mean += interp(fpr_grid, fpr[c], tpr[c])
    tpr_mean /= n_classes

    fpr["macro"] = fpr_grid
    tpr["macro"] = tpr_mean
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    return np.asarray(roc_auc)
def experiment_anomaly_detection(train, test, comb, num_train, anom_prob, labels):
    """Score three anomaly detectors by ROC AUC on the samples after ``num_train``.

    The detectors are a fixed all-ones weight vector on the feature vectors
    ("bayes"), a one-class SVM trained on the first ``num_train`` feature
    vectors, and a ``StructuredOCSVM`` trained on ``train`` and applied to
    ``test``.  For the last two an AUC below 0.5 is reported as ``1 - AUC``.

    Returns:
        ``(auc, base_auc, bayes_auc)``: structured, one-class SVM and bayes
        AUC respectively.
    """
    phi = calc_feature_vecs(comb.X)
    print(phi.size)
    eval_labels = labels[num_train:]

    # bayes classifier
    (DIMS, N) = phi.size
    w_bayes = co.matrix(1.0, (DIMS, 1))
    bayes_scores = w_bayes.trans()*phi[:,num_train:]
    (fpr, tpr, thres) = metric.roc_curve(eval_labels, bayes_scores.trans())
    bayes_auc = metric.auc(fpr, tpr)

    # train one-class svm
    train_kernel = Kernel.get_kernel(phi[:,0:num_train], phi[:,0:num_train])
    ocsvm = OCSVM(train_kernel, C=1.0/(num_train*anom_prob))
    ocsvm.train_dual()
    full_kernel = Kernel.get_kernel(phi, phi)
    (oc_as, foo) = ocsvm.apply_dual(full_kernel[num_train:,ocsvm.get_support_dual()])
    (fpr, tpr, thres) = metric.roc_curve(eval_labels, oc_as)
    base_auc = metric.auc(fpr, tpr)
    base_auc = 1.0-base_auc if (base_auc<0.5) else base_auc

    # train structured anomaly detection
    #sad = StructuredOCSVM(train, C=1.0/(num_train*anom_prob))
    sad = StructuredOCSVM(train, C=1.0/(num_train*0.5))
    (lsol, lats, thres) = sad.train_dc(max_iter=50)
    (pred_vals, pred_lats) = sad.apply(test)
    (fpr, tpr, thres) = metric.roc_curve(eval_labels, pred_vals)
    auc = metric.auc(fpr, tpr)
    auc = 1.0-auc if (auc<0.5) else auc

    return (auc, base_auc, bayes_auc)
def generate_roc_graph(data=None):
    """Generate the ROC graph comparing SHIVA and spamassassin scores.

    Args:
        data: sequence of records whose items 0, 1 and 2 are the SHIVA score,
            the spamassassin score and the derived (true) result.  Nothing is
            drawn when it is empty or ``None``.

    The figure is written to ``../../../web/images/roc_graph.png``.
    """
    from sklearn import metrics

    # ``None`` replaces the former mutable ``[]`` default; both are falsy, so
    # callers that omit the argument see the same early return.
    if not data:
        return

    # Real lists (not lazy ``map`` objects) so the columns can be handed to
    # ``roc_curve`` on Python 3 as well.
    shiva_score_probs = [row[0] for row in data]
    spamass_score_probs = [row[1] for row in data]
    derived_results = [row[2] for row in data]

    fpr_shiva, tpr_shiva, _ = metrics.roc_curve(derived_results, shiva_score_probs, pos_label=1)
    fpr_spamass, tpr_spamass, _ = metrics.roc_curve(derived_results, spamass_score_probs, pos_label=1)
    roc_auc_shiva = metrics.auc(fpr_shiva, tpr_shiva)
    roc_auc_spamass = metrics.auc(fpr_spamass, tpr_spamass)

    plot.figure()
    plot.plot(fpr_shiva, tpr_shiva, label="ROC curve SHIVA (area = %0.2f)" % roc_auc_shiva)
    plot.plot(fpr_spamass, tpr_spamass, label="ROC curve spamassassin (area = %0.2f)" % roc_auc_spamass)
    plot.plot([0, 1], [0, 1], "k--")
    plot.xlim([0.0, 1.0])
    plot.ylim([0.0, 1.05])
    plot.xlabel("False Positive Rate")
    plot.ylabel("True Positive Rate")
    plot.title("Shiva honeypot classification ROC")
    plot.legend(loc="lower right")
    plot.savefig("../../../web/images/roc_graph.png", bbox_inches="tight")
    plot.close()
def calc_auc(model, y_test, y_score, auctype = "ROC"):
    """Return the micro-averaged ROC or precision-recall AUC.

    Args:
        model: unused; kept for interface compatibility.
        y_test: 2-D binary indicator array (samples x classes).
        y_score: raw scores of the same shape; a sigmoid is applied first.
        auctype: ``"ROC"`` or ``"PR"``.

    Returns:
        The micro-averaged AUC, or ``None`` for any other ``auctype``.
    """
    y_score = 1 / ( 1 + np.exp(-y_score) )  # sigmoid it!

    # Only the micro-average is returned, so the per-class curves that used
    # to be computed for every column (and then discarded) are skipped.
    flat_truth = y_test.ravel()
    flat_score = y_score.ravel()

    if auctype == "ROC":
        fpr, tpr, _ = roc_curve(flat_truth, flat_score)
        return auc(fpr, tpr)
    if auctype == "PR":
        prec, rec, _ = precision_recall_curve(flat_truth, flat_score)
        return auc(rec, prec)
    return None
def roc_curve_metric(targets, predictions, **multi_optional_args):
    """Compute ROC curves for a binary or multi-class classifier.

    Args:
        targets: 2-D array; a single column for the binary case, otherwise
            one column per class (the true class is its arg-max).
        predictions: 2-D array of scores with the same shape.
        **multi_optional_args: accepted and ignored.

    Returns:
        ``([roc_for_data], ['roc_curve'])``.  In the binary case
        ``roc_for_data`` is the ``roc_curve`` result.  Otherwise it maps each
        class index to two curves:

        * ``'specific_classification'`` - scores are that class's own column.
        * ``'general_classification'`` - the score is that class's prediction
          for samples where it is the arg-max class and 0 elsewhere.
    """
    assert targets.ndim == predictions.ndim == 2
    assert targets.shape == predictions.shape

    if targets.shape[1] == 1:
        predictions = predictions.flatten()
        targets = targets.flatten()
        roc_for_data = roc_curve(targets, predictions)
    else:
        roc_for_data = {}
        targets = np.argmax(targets, axis=1).flatten()
        highest_class = np.argmax(predictions, axis=1)
        for i in range(predictions.shape[1]):
            roc_for_data[i] = {}
            roc_for_data[i]['specific_classification'] = roc_curve(
                targets, predictions[:, i], pos_label=i)

            # Keep the class-i score only where class i wins.  The original
            # right-hand side ``predictions[:, highest_class]`` is an (N, N)
            # array and cannot be assigned into the masked 1-D slice.
            wins = highest_class == i
            predictions_for_class = np.zeros(len(predictions))
            predictions_for_class[wins] = predictions[wins, i]
            roc_for_data[i]['general_classification'] = roc_curve(
                targets, predictions_for_class, pos_label=i)

    return [roc_for_data], ['roc_curve']
def make_roc(name, clf, ytest, xtest, ax=None, labe=5, proba=True, skip=0):
    """Draw the ROC curve of ``clf`` on ``ax`` and annotate it with thresholds.

    Args:
        name: classifier name used in the legend entry.
        clf: fitted classifier.
        ytest, xtest: evaluation labels and features.
        ax: axes to draw on; the current axes are used (and decorated with
            diagonal, limits, axis labels and title) when it is falsy.
        labe: every ``labe``-th curve point is annotated with its threshold.
        proba: use column 1 of ``predict_proba`` when true, otherwise
            ``decision_function``.
        skip: when non-zero, plot only every ``skip``-th point.

    Returns:
        The axes that were drawn on.
    """
    owns_axes = not ax
    if owns_axes:
        ax = plt.gca()

    scores = clf.predict_proba(xtest)[:,1] if proba else clf.decision_function(xtest)
    fpr, tpr, thresholds = roc_curve(ytest, scores)
    roc_auc = auc(fpr, tpr)

    # Optionally thin out the plotted points.
    shown = slice(0, fpr.shape[0], skip) if skip else slice(None)
    ax.plot(fpr[shown], tpr[shown], '.-', alpha=0.3,
            label='ROC curve for %s (area = %0.2f)' % (name, roc_auc))

    # from https://gist.github.com/podshumok/c1d1c9394335d86255b8
    label_kwargs = {'bbox': dict(boxstyle='round,pad=0.3', alpha=0.2)}
    for k in range(0, fpr.shape[0], labe):
        ax.annotate(str(np.round(thresholds[k], 2)), (fpr[k], tpr[k]), **label_kwargs)

    if owns_axes:
        ax.plot([0, 1], [0, 1], 'k--')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('ROC')
    ax.legend(loc="lower right")
    return ax
def plotROCCurve(y_test,y_score,fileStorePath,fileName):
    """Plot a binary ROC curve and save it as ``<fileStorePath>/ROC<fileName>.png``.

    Args:
        y_test: true binary labels.
        y_score: predicted scores.
        fileStorePath: output directory; created when missing.
        fileName: suffix of the output file name.
    """
    fpr, tpr, _ = roc_curve(y_test[:], y_score[:])
    roc_auc = auc(fpr, tpr)

    fig = plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")

    if not os.path.exists(fileStorePath):
        os.makedirs(fileStorePath)
    plt.savefig(fileStorePath+'/'+'ROC'+fileName+'.png')
    # Close the figure instead of just clearing it: a new figure is created
    # on every call, so clearing alone leaks one figure per call.
    plt.close(fig)
def write_score(name, gold_labels, pred_scores, classes, average_classes):
    """Write a text score report to ``<name>.txt`` and ROC curves to ``<name>.pdf``.

    Args:
        name: base name of both output files.
        gold_labels: true labels.
        pred_scores: 2-D score array, one column per entry of ``classes``.
        classes: class labels in column order.
        average_classes: subset of ``classes`` whose F1 scores are averaged.
    """
    classes = np.array(classes)
    average_classes = np.array(average_classes)
    gold_scores = LabelBinarizer().fit(classes).transform(gold_labels)
    pred_labels = classes[np.argmax(pred_scores, axis=1)]
    n_cls = len(classes)

    with closing(Tee('{}.txt'.format(name), 'w')):
        precision, recall, fscore, _ = precision_recall_fscore_support(
            gold_labels, pred_labels, labels=classes)
        for cls, p, r, f in zip(classes, precision, recall, fscore):
            print('{}: P={:.2f}, R={:.2f}, F1={:.2f}'.format(cls, p, r, f))
        print('Accuracy: {:.4f}'.format(accuracy_score(gold_labels, pred_labels)))
        averaged_idx = LabelEncoder().fit(classes).transform(average_classes)
        print('F1 average: {:.4f}'.format(np.mean(fscore[averaged_idx])))

    with PdfPages('{}.pdf'.format(name)) as pdf:
        fpr, tpr, roc_auc = {}, {}, {}
        for col in range(n_cls):
            fpr[col], tpr[col], _ = roc_curve(gold_scores[:, col], pred_scores[:, col])
            roc_auc[col] = auc(fpr[col], tpr[col])
        fpr['micro'], tpr['micro'], _ = roc_curve(gold_scores.ravel(), pred_scores.ravel())
        roc_auc['micro'] = auc(fpr['micro'], tpr['micro'])

        plt.figure()
        plt.plot(fpr['micro'], tpr['micro'],
                 label='micro-average (area = {:.2f})'.format(roc_auc['micro']))
        for col in range(n_cls):
            plt.plot(fpr[col], tpr[col],
                     label='{0} (area = {1:.2f})'.format(col, roc_auc[col]))
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curves')
        plt.legend(loc='lower right')
        pdf.savefig()
def plotAUC(yhatNN, yhatRF, yhatET, y):
    """
    Draw the ROC curves of three models in one figure and save it as ROC.png.

    :param yhatNN: Neural Net predictions
    :param yhatRF: Random Forest predictions
    :param yhatET: Extra-Trees predictions
    :param y: target labels
    :return: None
    """
    candidates = (
        ('Neural Net', yhatNN),
        ('Random Forest', yhatRF),
        ('Extra-Trees', yhatET),
    )
    # All curves are computed before the figure is created.
    curves = [(caption, metrics.roc_curve(y, yhat)) for caption, yhat in candidates]

    plt.figure()
    for caption, (fpr, tpr, _) in curves:
        plt.plot(fpr, tpr, label=caption)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC')
    plt.legend(loc="lower right")
    plt.savefig('ROC.png')
    plt.close()
def multiclass_AUC(clf, X, Y):
    """Print the micro-averaged one-vs-rest ROC AUC of ``clf``.

    The labels are binarised, the data is split 50/50 with
    ``random_state=0``, ``clf`` is wrapped in ``OneVsRestClassifier`` and
    trained on the first half, and its ``predict`` output on the second half
    is scored.
    """
    X, Y = np.array(X), np.array(Y)
    Y = label_binarize(Y, classes=list(set(Y)))
    n_classes = Y.shape[1]

    # shuffle and split training and test sets
    X_fit, X_eval, Y_fit, Y_eval = train_test_split(X, Y, test_size=.5, random_state=0)

    # Learn to predict each class against the others
    ovr = OneVsRestClassifier(clf)
    Y_hat = ovr.fit(X_fit, Y_fit).predict(X_eval)

    fpr, tpr, roc_auc = {}, {}, {}
    for col in range(n_classes):
        fpr[col], tpr[col], _ = roc_curve(Y_eval[:, col], Y_hat[:, col])
        roc_auc[col] = auc(fpr[col], tpr[col])

    # Micro-average over every (sample, class) decision.
    fpr["micro"], tpr["micro"], _ = roc_curve(Y_eval.ravel(), Y_hat.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    print("AUC for multiclass {}: {}".format(clf.__class__.__name__, roc_auc["micro"]))
def main():
    """Print the ROC AUC of the sorted and the unsorted 'german' spreadsheets.

    Each workbook is loaded with ``readxlsx``; row 0 holds the scores and
    row 1 the actual classes.
    """
    datasets = (
        ('sorted', 'german-sorted.xlsx'),
        ('unsorted', 'german-unsorted.xlsx'),
    )
    for position, (tag, workbook) in enumerate(datasets):
        if position:
            # Blank line between the two reports.
            print('')
        data = readxlsx(workbook)
        score = data[0, :]
        act_class = data[1, :]

        # calculating ROC AUC
        fpr, tpr, thresholds = metrics.roc_curve(act_class, score)
        auc_value = metrics.auc(fpr, tpr)

        # The heading names the dataset it belongs to; the unsorted result
        # used to be printed under the "sorted data" heading.
        print('AUC value of %s data' % tag)
        print(auc_value)
def calculate_draw_roc(classifier, data, features, label, cv_Flod, original_data, original_label):
    """Plot per-fold, hold-out and mean ROC curves and return feature importances.

    Args:
        classifier: estimator exposing ``fit``, ``predict_proba`` and
            ``feature_importances_``.
        data: feature array indexed by the fold index arrays.
        features: feature names, indexed by column position.
        label: label array indexed by the fold index arrays.
        cv_Flod: sized iterable of ``(train, test)`` index pairs.
        original_data, original_label: extra data; the first 3000 rows are
            scored with the last fitted classifier and plotted as "ROC test".

    Returns:
        dict mapping feature name to importance.

    The figure is saved to ``ROC_GB_user_all_solved_lr(0.05).png``.
    """
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    my_test = original_data[:3000]
    my_label = original_label[:3000]
    features_importance = dict()
    for i, (train, test) in enumerate(cv_Flod):
        fitted_classifier = classifier.fit(data[train], label[train])
        probas_ = fitted_classifier.predict_proba(data[test])
        # Dump the predictions and labels of the second fold (i == 1) only.
        if i == 1:
            save_result(probas_, "predict_result.csv")
            save_result(label[test], "original_result.csv")
        # Compute ROC curve and area under the curve
        fpr, tpr, thresholds = roc_curve(label[test], probas_[:, 1])
        # Accumulate this fold's TPR on the common FPR grid for the mean curve.
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
    # NOTE(review): the original indentation was lost; the feature ranking and
    # the hold-out curve below are assumed to run once, after the fold loop,
    # using the classifier from the last fold - confirm.
    importances = fitted_classifier.feature_importances_
    indices = np.argsort(importances)[::-1]  # most important first
    print("Feature ranking: ")
    for f in range(data.shape[1]):
        print("%s. %d (%f)" % (features[indices[f]], indices[f], importances[indices[f]]))
        features_importance[features[indices[f]]] = importances[indices[f]]
    test_probs = fitted_classifier.predict_proba(my_test)
    test_fpr, test_tpr, test_thresholds = roc_curve(my_label, test_probs[:, 1])
    roc_auc = auc(test_fpr, test_tpr)
    plt.plot(test_fpr, test_tpr, lw=1, label='ROC test (area = %0.2f)' % (roc_auc))
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    # Average the accumulated TPR over the folds and pin the curve's end to 1.
    mean_tpr /= len(cv_Flod)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.savefig("ROC_GB_user_all_solved_lr(0.05).png")
    return features_importance
def train(train_data, test_data, fold): x_train = train_data[:, :-1] y_train = train_data[:, -1:] x_test = test_data[:, :-1] y_test = test_data[:, -1:] theta_vector = np.zeros(shape=(len(x_train[0]), 1)) theta_vector = np.matrix(theta_vector) # update gradient descent each time m = len(x_train[0]) learning_curve = {} learning_errors = {} learning_accuracy = {} ll_count = 1 sigmod = lambda z: 1.0 / (1.0 + np.exp(-z)) while ll_count < 2: learning_rate = 10**(-ll_count) alpha_by_m = learning_rate / m old_theta = theta_vector j = {} for i in range(0, 1000): h_x = np.dot(x_train, old_theta) diff_theta = np.dot(x_train.T, (sigmod(h_x) - y_train)) n_theta = learning_rate * diff_theta old_theta = old_theta - n_theta #j[iterator] = compute_error(h_x, y_train, y_train, y_train) solution_vector = old_theta # #calculate mean squared error train_error = compute_error(sigmod(h_x), y_train) test_error = compute_error((sigmod(x_test.dot(solution_vector))), y_test) tpr, fpr, train_accuracy = compute_accuracy((sigmod(x_train.dot(solution_vector))), y_train) tpr, fpr, test_accuracy = compute_accuracy((sigmod(x_test.dot(solution_vector))), y_test) learning_accuracy[(train_accuracy, test_accuracy)] = ll_count learning_errors[(train_error, test_error)] = ll_count learning_curve[ll_count] = j ll_count += 1 if fold is 4: roc_curve(y_train, (sigmod(x_train.dot(solution_vector)))) # pl.plot(np.arange(iterator), # learning_curve[learning_erros.get(min(learning_erros.keys()))].values()) # pl.xlabel('Iterations') # pl.ylabel('Cost Function') #pl.show() print "Best Learning rate is ", 10**(- learning_errors.get(min(learning_errors.keys()))), \ "with mse values:", min(learning_errors.keys()) print "Accuracy is ", max(learning_accuracy.keys()) return min(learning_errors.keys()), max(learning_accuracy.keys())
def getROC(clf,probstype):
    """Return ``(fpr, tpr)`` of ``clf`` on the module-level ``f_test``/``y_test``.

    With ``probstype == 1`` the score is column 1 of ``predict_proba``;
    otherwise it is the output of ``decision_function``.
    """
    use_probabilities = probstype == 1
    if use_probabilities:
        scores = clf.predict_proba(f_test)[:,1]
    else:
        scores = clf.decision_function(f_test)
    false_pos, true_pos, _ = roc_curve(y_true=y_test,y_score=scores)
    return false_pos, true_pos
def explain(self, param, label='', auc_plot=False):
    """Extract rules from the fitted classifier and report their quality.

    Runs the Extractor -> Z3Process -> FormulaeEstimator pipeline on
    ``self._clf``, then compares the extracted rule set against the
    classifier on ``self._X_test`` / ``self._y_test``.  Every figure is
    printed and also written to ``self._file``.

    Parameters
    ----------
    param : sequence
        ``param[0..2]`` (phi, theta, psi) are passed to ``Extractor``;
        ``param[3]`` (k) is passed to ``Z3Process``.
    label : optional
        Value passed to ``roc_curve`` as ``pos_label``.  The empty-string
        default selects ``self._clf.classes_[0]``.
    auc_plot : bool
        When ``True`` both ROC curves are plotted and shown.
    """
    print('------------ Explanation -------------')
    self._file.write('------------ Explanation -------------\n')
    phi = param[0]
    theta = param[1]
    psi = param[2]
    k = param[3]

    # --- stage 1: path extraction and rule filtering (timed) ---
    start1 = time()
    ex = Extractor(self._clf, phi, theta, psi)
    ex.extract_forest_paths()
    ex.rule_filter()
    print('max_rule', ex.max_rule, 'max_node', ex.max_node)
    print('min_rule', ex.min_rule, 'min_node', ex.min_node)
    end1 = time()
    print("EX Running time: %s seconds" % (end1 - start1))
    print("original path number: ", ex.n_original_leaves_num)
    print("original scale: ", ex.scale)
    print("path number after rule filter: ", len(ex._forest_values))
    self._file.write('original path number: {}\n'.format(
        ex.n_original_leaves_num))
    self._file.write('original scale: {}\n'.format(ex.scale))
    self._file.write('path number after rule filter: {}\n'.format(
        len(ex._forest_values)))

    # --- stage 2: Z3 processing, with or without MAX-SAT (timed) ---
    start2 = time()
    sat = Z3Process(ex, k)
    sat.leaves_partition()
    if self._maxsat_on is True:
        sat.maxsat()
        print("path number after maxsat: ", sat.n_rules_after_max,
              " after filter: ", sat.n_rules_after_filter, '\n')
        self._file.write(
            'path number after maxsat: {}\tafter filter: {}\n\nclasses:\t{}\n\n'
            .format(sat.n_rules_after_max, sat.n_rules_after_filter,
                    self._clf.classes_))
    else:
        print('no maxsat')
        self._file.write('/no MAX-SAT\n')
        # NOTE(review): the nesting of this call was reconstructed; confirm
        # that run_filter() belongs to the no-MAX-SAT branch only.
        sat.run_filter()
    end2 = time()
    print("SAT Running time: %s seconds" % (end2 - start2))
    print('classes:', self._clf.classes_)

    # --- stage 3: build the formulae and classify the test set (timed) ---
    start3 = time()
    f = FormulaeEstimator(sat,
                          conjunction=self._conjunction,
                          classes=self._clf.classes_)
    f.get_formulae_text(self._file)
    print('\n------------ Performance -------------')
    self._file.write('\n------------ Performance -------------\n')
    c_ans = self._clf.predict(self._X_test)
    ans = f.classify_samples(self._X_test)
    end3 = time()
    print("ET Running time: %s seconds" % (end3 - start3))

    RF_accuracy = accuracy_score(self._y_test, c_ans)
    EX_accuracy = accuracy_score(self._y_test, ans)
    # Agreement between the extracted rules and the classifier itself.
    performance = accuracy_score(c_ans, ans)

    # Count the entries of f.sat_group that hold more than one element
    # (overlap) and those that hold none (no answer).
    no_ans = 0
    overlap = 0
    for each in f.sat_group:
        if len(each) > 1:
            overlap += 1
        elif len(each) == 0:
            no_ans += 1

    if label == '':
        label = self._clf.classes_[0]
    # Compute AUC.
    # NOTE(review): the score is column 1 of predict_proba while the default
    # pos_label is classes_[0]; confirm this pairing is intended.
    fpr, tpr, thresholds = roc_curve(self._y_test,
                                     self._clf.predict_proba(
                                         self._X_test)[:, 1],
                                     pos_label=label)
    ori_auc = auc(fpr, tpr)
    ex_test = f.classify_samples_values(self._X_test)
    efpr, etpr, ethresholds = roc_curve(self._y_test,
                                        ex_test[:, 1],
                                        pos_label=label)
    ex_auc = auc(efpr, etpr)

    # Each figure is echoed to stdout and to the report file.
    print('sample size:\t', len(self._y_test))
    self._file.write('sample size:\t{}\n'.format(len(self._y_test)))
    print('RF accuracy:\t', RF_accuracy)
    self._file.write('RF accuracy:\t{}\n'.format(RF_accuracy))
    print('RF AUC:\t\t\t', ori_auc)
    self._file.write('RF AUC:\t\t\t{:.2f}\n'.format(ori_auc))
    # print('wrong-result coverage:', f_count)
    print('EX accuracy:\t', EX_accuracy)
    self._file.write('EX accuracy:\t{}\n'.format(EX_accuracy))
    print('EX AUC:\t\t\t', ex_auc)
    self._file.write('EX AUC:\t\t\t{:.2f}\n'.format(ex_auc))
    print('Coverage:\t\t', (len(self._y_test) - no_ans) / len(self._y_test))
    self._file.write('Coverage:\t\t{}\n'.format(
        (len(self._y_test) - no_ans) / len(self._y_test)))
    print('Overlap:\t\t', overlap / len(self._y_test))
    self._file.write('Overlap:\t\t{}\n'.format(overlap / len(self._y_test)))
    print('*Performance:\t', performance)
    self._file.write('*Performance:\t{}\n'.format(performance))

    if auc_plot is True:
        plt.plot(fpr,
                 tpr,
                 linewidth=2,
                 label="RF ROC curve (area = {:.2f})".format(ori_auc))
        plt.plot(efpr,
                 etpr,
                 linewidth=2,
                 label="Explain ROC curve (area = {:.2f})".format(ex_auc))
        plt.xlabel("false positive rate")
        plt.ylabel("true positive rate")
        plt.ylim(0, 1.05)
        plt.xlim(0, 1.05)
        plt.legend(loc=4)  # legend position
        plt.show()
# Per-class precision / recall / F1 for the predictions in y_pred_rnd.
print(
    classification_report(y_true=y_test,
                          y_pred=y_pred_rnd,
                          target_names=['normal', 'covid']))

# Annotated confusion-matrix heatmap; the colour scale is capped at half the
# number of test samples.
fig1 = plt.figure()
sns.heatmap(data=cm,
            cmap='Blues',
            annot=True,
            annot_kws={'size': 14},
            fmt='d',
            vmin=0,
            vmax=len(y_test) / 2.)
plt.title('annotated heatmap for confusion matrix')
plt.show()
# fig1.savefig('./checkpoints/densenet121/cm_heatmap.png')

# ROC curve and AUC from the scores in y_pred (note: y_pred, not y_pred_rnd).
fpr, tpr, _ = roc_curve(y_true=y_test, y_score=y_pred, pos_label=None)
roc_auc = auc(x=fpr, y=tpr)
fig2 = plt.figure()
plt.plot(fpr, tpr, 'b', label='AUC = %0.4f' % roc_auc)
plt.title('Receiver Operating Characteristic')
plt.legend()
# Dashed red diagonal from (0, 0) to (1, 1) as a reference line.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# fig2.savefig('./checkpoints/densenet121/roc.png')
true_class_rev = 1 - true_class total_test_images = len(filenames) pred_prob = model_use.predict_generator(data_test, steps=total_test_images, verbose=1) pred_prob_rev = 1 - pred_prob # Prediction probability for each image being a dog results_dict = dict(zip(filenames, pred_prob_rev)) # Confusion matrix conf_mat = confusion_matrix(true_class, pred_prob > 0.5) print(conf_mat) # Get ROC AUC score and arrays for building ROC curve fpr, tpr, thresholds = roc_curve(true_class, pred_prob) auc = roc_auc_score(true_class, pred_prob > 0.5) # ROC curve plt.figure() plt.plot(np.linspace(0, 1, num=50), '--', color='gray') plt.plot(fpr, tpr, '-', color='red') plt.xlabel('FPR', fontsize=14) plt.ylabel('TPR', fontsize=14) plt.show() # Get a random image in the test data and display it with it's associated # predicted probability. def rand_check(index=None): # Get random image if no index is supplied
graphviz.Source(dot_graph).view() ########################################################################## ########################################################################## # Finally, let’s evaluate the tree’s performance on the test data. The predict() function can be used for # this purpose. We can then build a confusion matrix # 86+59/200 = 0.725 ########################################################################## ############ Here we construct the ROC curve for the tree ################ ########################################################################## y_score = clf.predict_proba(X_test) fpr, tpr, _ = roc_curve(y_test, y_score[:, 1]) roc_auc = auc(fpr, tpr) plt.figure() plt.plot(fpr, tpr, color='orange', label='ROC curve (area = {:0.2f})'.format(roc_auc)) plt.plot([0, 1], [0, 1], color='blue', linestyle='--') plt.xlim([-0.05, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('ROC curve for our Decision Tree') plt.legend(loc="lower right") ##########################################################################
allLTPC += 1 if true_label == pred_label: corrctLTPC += 1 elif true_label == 1: allTPC += 1 if true_label == pred_label: corrctTPC += 1 acc_lst.append(acc) class2_acc_lst.append( [corrctLTPC / float(allLTPC), corrctTPC / float(allTPC)]) # auc roc true_class = np.array(test_label_set) # true_class为数据的真实标签 pred_scores = np.array([a[0] for a in result1]) # scores为分类其预测的得分 fpr, tpr, thresholds = metrics.roc_curve(true_class, pred_scores, pos_label=0) # bcc AUC = auc(fpr, tpr) # tpr fpr yuedeng = [] for i in range(len(fpr)): yuedeng.append(tpr[i] - fpr[i]) yuedeng_index = yuedeng.index(max(yuedeng)) # print 'the best TPR FPR in subset-%d'%testIndex, tpr[yuedeng_index], fpr[yuedeng_index] auc_lst.append(AUC) trueAllLst += test_label_set scoreAllLst += [a[0] for a in result1] true_class = np.array(test_label_set) # true_class为数据的真实标签 pred_scores = np.array([a[1] for a in result1]) # scores为分类其预测的得分
r'D:\Users\zcguo\PycharmProjects\credit_score\data\test.csv') test_X = test_data.iloc[:, 2:] test_y = test_data.iloc[:, 1] test_X = trans_woe(test_X, x1_name, x1_woe, x1_cut) test_X = trans_woe(test_X, x2_name, x2_woe, x2_cut) test_X = trans_woe(test_X, x3_name, x3_woe, x3_cut) test_X = trans_woe(test_X, x7_name, x7_woe, x7_cut) test_X = trans_woe(test_X, x9_name, x9_woe, x9_cut) test_X = test_X.iloc[:, -5:] # gbdt model roc X3 = sm.add_constant(test_X) resuG = gbm.predict(X3) recall1 = metrics.recall_score(test_y, resuG.round()) acc1 = metrics.accuracy_score(test_y, resuG.round()) print(recall1) print(acc1) fpr1, tpr1, threshold1 = metrics.roc_curve(test_y, resuG) rocauc1 = metrics.auc(fpr1, tpr1) plt.plot(fpr1, tpr1, 'b', label='AUC = %0.2f' % rocauc1) plt.legend(loc='lower right') plt.plot([0, 1], [0, 1], 'r--') plt.xlim([0, 1]) plt.ylim([0, 1]) plt.ylabel('TPR') plt.xlabel('FPR') plt.show()
def cal_auc_ks_iv(df,
                  targets=(0, 1, 3, 7, 14, 30),
                  text='',
                  max_depth=2,
                  plot=True,
                  precision=3):
    """Compute AUC, KS and IV of every feature column against each target.

    For each ``n`` in ``targets`` the column ``f'{n}d'`` is used as the
    label.  Every other column of ``df`` is treated as a score, with rows
    where that score is null dropped.  When a score's AUC is below 0.5 the
    score is negated and the ROC recomputed, so reported AUC and KS are for
    the better orientation.

    Parameters
    ----------
    df : pandas.DataFrame
        Holds the score columns and the ``f'{n}d'`` label columns.
    targets : iterable of int
        Horizons ``n`` selecting the label columns ``f'{n}d'``.
    text : str
        Prefix for the plot titles.
    max_depth : int
        Forwarded to ``cal_woe_iv``.
    plot : bool
        When true, one ROC figure per target is drawn.
    precision : int
        Number of decimals in the returned tables and in legend labels.

    Returns
    -------
    (ac, ks, iv) : tuple of pandas.DataFrame
        One row per score column and one column per target.
    """
    target_cols = [f'{n}d' for n in targets]
    excluded = set(target_cols)
    # Keep the frame's own column order; a plain set made the row order of
    # the result tables arbitrary.
    cols = list(dict.fromkeys(c for c in df.columns if c not in excluded))

    ks = pd.DataFrame()
    ac = pd.DataFrame()
    iv = pd.DataFrame()
    for target_col in target_cols:
        auc_value = []
        ks_value = []
        iv_value = []
        # Figures are only created when they will actually be drawn.
        if plot:
            plt.figure(figsize=(6, 4), dpi=100)
        for var in cols:
            valid = df[df[var].notnull()]
            y_true = valid[target_col]
            y_pred = valid[var]

            fpr, tpr, thr = roc_curve(y_true, y_pred, pos_label=1)
            ac_single = auc(fpr, tpr)
            if ac_single < 0.5:
                # Score is inversely related to the label: flip its sign.
                fpr, tpr, thr = roc_curve(y_true, -y_pred, pos_label=1)
                ac_single = auc(fpr, tpr)
            auc_value.append(ac_single)

            # KS statistic: largest gap between TPR and FPR.
            ks_value.append((tpr - fpr).max())

            # cal_woe_iv returns a sequence whose element 1 is the IV.
            iv_value.append(cal_woe_iv(y_pred, y_true, max_depth=max_depth)[1])

            if plot:
                # Each curve is drawn once (it used to be plotted twice).
                plt.plot(fpr,
                         tpr,
                         lw=1,
                         label=f'{var}(auc=' +
                         str(round(ac_single, precision)) + ')')

        if plot:
            plt.grid()
            plt.plot([0, 1], [0, 1], linestyle='--', color=(0.6, 0.6, 0.6))
            plt.plot([0, 0, 1], [0, 1, 1], lw=1, linestyle=':', color='black')
            plt.xlabel('false positive rate')
            plt.ylabel('true positive rate')
            plt.title(f'{text}ROC for {target_col}')
            plt.legend(loc='best')

        ac = pd.concat(
            [ac, pd.DataFrame(auc_value, columns=[target_col], index=cols)],
            axis=1)
        ks = pd.concat(
            [ks, pd.DataFrame(ks_value, columns=[target_col], index=cols)],
            axis=1)
        iv = pd.concat(
            [iv, pd.DataFrame(iv_value, columns=[target_col], index=cols)],
            axis=1)

    iv = np.round(iv, precision)
    ac = np.round(ac, precision)
    ks = np.round(ks, precision)
    return ac, ks, iv
min_samples_leaf=1, ##叶节点所需的最小样本数 如果是浮点数代表是百分比 max_features=None, ##在寻找最佳分割点要考虑的特征数量auto全选/sqrt开方/log2对数/None全选/int自定义几个/float百分比 max_leaf_nodes=None, ##叶节点的数量 None不限数量 min_impurity_decrease=1e-7, ##停止分裂叶子节点的阈值 verbose=0, ##打印输出 大于1打印每棵树的进度和性能 warm_start=False, ##True在前面基础上增量训练(重设参数减少训练次数) False默认擦除重新训练 random_state=0 ##随机种子-方便重现 )##多类别回归建议使用随机森林 model.fit(X_train,y_train) y_pred=model.predict(X_test) grd_enc = OneHotEncoder() grd_lm = LogisticRegression(solver='lbfgs', max_iter=1000) model.fit(X_train, y_train) grd_enc.fit(model.apply(X_train)[:, :, 0]) grd_lm.fit(grd_enc.transform(model.apply(X_test)[:, :, 0]), y_train_lr) y_pred_grd_lm = grd_lm.predict_proba( grd_enc.transform(model.apply(X_test)[:, :, 0]))[:, 1] fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm) # submission["Survived"]=y_pred # submission.to_csv(pre_path+'GBDT_LR/GradientBoostingClassifier.csv',index=None) # accuracy_score=metrics.accuracy_score(y_pred,y_test) # print("GradientBoostingClassifier : ",accuracy_score) print(submission.head(100))
if m_el_isprompt == 0: histo_tmva_bkg.Fill(bdtOutput) elif m_el_isprompt == 1: histo_tmva_sig.Fill(bdtOutput) else: print "Warning: m_mu_isprompt is not 0 or 1!!!" file.Close() X_test = np.array(_X_test) y_test = np.array(_y_test) # sklearn tpr and tpr sk_y_predicted = bdt.predict_proba(X_test)[:, 1] fpr, tpr, _ = roc_curve(y_test, sk_y_predicted) sig_eff = array.array('f', [rate for rate in tpr]) bkg_rej = array.array('f', [(1 - rate) for rate in fpr]) # roc_curve_sk() - skTMVA version of roc_curve from mva_tools.build_roc_simple import roc_curve_sk fpr_comp, tpr_comp, _ = roc_curve_sk(y_test, sk_y_predicted) sig_eff_comp = array.array('f', [rate for rate in tpr_comp]) bkg_rej_comp = array.array('f', [(1 - rate) for rate in fpr_comp]) # Stack for keeping plots plots = [] # Getting ROC-curve for skTMVA
def train_5_cross(df_pre, X, y, X_test_v1, y_test_v1, thresholds=0.45, id_1='id', csv_name=0):
    """Train a LightGBM classifier with 5-fold stratified CV and build a score list.

    Each fold's model is evaluated on its validation split and on the
    hold-out set; the hold-out probabilities of the five models are averaged
    into the final list.

    Parameters
    ----------
    df_pre : DataFrame
        Supplies the identifier column ``id_1`` for the output list.
    X, y : DataFrame / Series
        Training features and labels.
    X_test_v1, y_test_v1 : DataFrame / Series
        Hold-out features and labels.
    thresholds : float
        Probability above which a sample is flagged positive (default 0.45).
    id_1 : str
        Name of the identifier column in ``df_pre``.
    csv_name : str or 0
        Prefix of the CSV file to write; ``0`` (default) skips saving.

    Returns
    -------
    DataFrame
        The list that is also printed/saved: id, averaged probability and
        the thresholded flag.
    """
    n_splits = 5

    def recall_precision(y_true, y_flag):
        # Recall and precision of class 1 from the confusion matrix, as
        # 3-decimal strings (the per-fold report prints them in that form).
        cnf_matrix = confusion_matrix(y_true, y_flag)
        np.set_printoptions(precision=2)
        recall = '{0:.3f}'.format(
            cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))
        precision = '{0:.3f}'.format(
            cnf_matrix[1, 1] / (cnf_matrix[0, 1] + cnf_matrix[1, 1]))
        return recall, precision

    vali_auc_num = 0        # running sum of validation AUC
    vali_recall_num = 0     # running sum of validation recall
    vali_precision_num = 0  # running sum of validation precision
    test_auc_num = 0        # running sum of hold-out AUC
    test_recall_num = 0     # running sum of hold-out recall
    test_precision_num = 0  # running sum of hold-out precision
    y_pred_input = np.zeros(len(X_test_v1))  # summed hold-out probabilities

    print("=============开始训练================")
    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1234)
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
        print("第 {} 次训练...".format(fold_ + 1))
        # split() yields positional indices, so select with iloc; loc only
        # worked when the frame had a default RangeIndex.
        train_x, trai_y = X.iloc[trn_idx], y.iloc[trn_idx]
        vali_x, vali_y = X.iloc[val_idx], y.iloc[val_idx]

        # Pre-tuned LightGBM model.
        clf = lgb.LGBMClassifier(max_depth=20, min_data_in_bin=5, max_bin=200,
                                 min_child_samples=90, num_leaves=20, n_estimators=20000,
                                 objective='binary', boosting_type='gbdt', learning_rate=0.02,
                                 lambda_l2=5)
        clf.fit(train_x, trai_y, eval_set=[(train_x, trai_y), (vali_x, vali_y)], verbose=0,
                early_stopping_rounds=100, eval_metric='f1')

        # ---- validation AUC ----
        y_prb = clf.predict_proba(vali_x)[:, 1]
        fpr, tpr, thres = roc_curve(vali_y, y_prb)
        vali_roc_auc = auc(fpr, tpr)
        vali_auc_num += vali_roc_auc
        print("vali auc = {0:.4}".format(vali_roc_auc))

        # ---- hold-out AUC ----
        y_prb_test = clf.predict_proba(X_test_v1)[:, 1]
        fpr, tpr, thres = roc_curve(y_test_v1, y_prb_test)
        test_roc_auc = auc(fpr, tpr)
        test_auc_num += test_roc_auc
        print("test auc = {0:.4}".format(test_roc_auc))

        # ---- validation recall / precision at the threshold ----
        # The probabilities computed above are reused instead of predicting
        # a second time.
        vali_recall, vali_precision = recall_precision(vali_y, y_prb > thresholds)
        print("vali_metric: ", vali_recall, vali_precision)
        vali_recall_num += float(vali_recall)
        vali_precision_num += float(vali_precision)

        # ---- hold-out recall / precision at the threshold ----
        test_recall, test_precision = recall_precision(y_test_v1, y_prb_test > thresholds)
        print("test_metric: ", test_recall, test_precision)
        test_recall_num += float(test_recall)
        test_precision_num += float(test_precision)

        y_pred_input += y_prb_test

    # Averages over the folds.  The two precision lines used to print the
    # recall sums by mistake.
    print("5折泛化,验证集AUC:{0:.3f}".format(vali_auc_num / n_splits))
    print("5折泛化,预测集AUC:{0:.3f}".format(test_auc_num / n_splits))
    print("5折泛化,验证集recall:{0:.3f}".format(vali_recall_num / n_splits))
    print("5折泛化,验证集precision:{0:.3f}".format(vali_precision_num / n_splits))
    print("5折泛化,预测集recall:{0:.3f}".format(test_recall_num / n_splits))
    print("5折泛化,预测集precision:{0:.3f}".format(test_precision_num / n_splits))

    print("================开始输出名单==================")
    y_pred_input_end = y_pred_input / n_splits
    y_pred_input_precision = y_pred_input_end > thresholds
    submission = pd.DataFrame({"id": df_pre[id_1],
                               "概率": y_pred_input_end,
                               "高精确": y_pred_input_precision})
    if csv_name != 0:
        submission.to_csv("%s预测名单.csv" % csv_name, index=False)
    print("================输出名单名单==================")
    print(submission.head(5))
    return submission
def simple_CV_evaluation(model, X, y, k, random_state):
    """Evaluate one scoring method over the splits of ``random_sample_balance``.

    Parameters
    ----------
    model : estimator
        Deep-copied and fitted per split when ``"RF"`` occurs in ``k``;
        unused otherwise.
    X : pandas object indexed like ``y``
        Features.  When ``k`` is neither an RF name nor ``"LS-GKM"`` the test
        slice of ``X`` is used directly as the prediction score.
    y : pandas.Series
        Labels.
    k : str
        Method name; stored in the ``label`` column of the result.
    random_state
        Forwarded to ``random_sample_balance``.

    Returns
    -------
    df : DataFrame with columns ``true``, ``pred``, ``label``
    auROC_list, auPRC_list : per-split scores (failed splits are omitted)
    [auROC_x_list, auROC_y_list] : concatenated piecewise ROC points
    [auPRC_x_list, auPRC_y_list] : concatenated piecewise PR points
    """
    my_pred = []
    my_true = []
    auPRC_list = []
    auROC_list = []
    auROC_x_list = []
    auROC_y_list = []
    auPRC_x_list = []
    auPRC_y_list = []
    for train_index, test_index in random_sample_balance(y, random_state):
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]
        if "RF" in k:
            current_model = dp(model)
            current_model.fit(X_train, y_train)
            pred_y = current_model.predict_proba(X_test)
            pred_y = [x[1] for x in pred_y]
        elif k == "LS-GKM":
            pos_train_file, neg_train_file, pos_test_file, neg_test_file, addon_string = get_fasta_file(
                X_train, y_train, X_test, y_test)
            pred_y = gkm_SVM_fit_transform(pos_train_file, neg_train_file,
                                           pos_test_file, neg_test_file)
            pred_y = pred_y.loc[y_test.index.tolist()]['pred'].tolist()
            # Remove the temporary files sharing the addon_string prefix.
            os.system("rm %s*" % (addon_string))
        else:
            pred_y = X_test.tolist()

        # y_test stays a Series so the diagnostics below can still filter it;
        # the metrics receive a plain list, as before.
        true_y = y_test.tolist()
        my_pred += pred_y
        my_true += true_y
        try:
            auROC = roc_auc_score(true_y, pred_y)
            auPRC = average_precision_score(true_y, pred_y)
            auPRC_list.append(auPRC)
            auROC_list.append(auROC)
            x_predict, y_predict, _ = roc_curve(true_y, pred_y)
            x_predict, y_predict = piecewise_roc(x_predict, y_predict)
            auROC_x_list += list(x_predict)
            auROC_y_list += list(y_predict)
            y_predict, x_predict, _ = precision_recall_curve(true_y, pred_y)
            x_predict, y_predict = piecewise_prc(x_predict, y_predict)
            auPRC_x_list += list(x_predict)
            auPRC_y_list += list(y_predict)
        except Exception as err:
            # Best effort: a split whose metrics cannot be computed is
            # reported and skipped.  The handler used to crash itself because
            # y_test had already been turned into a list.
            print("metrics skipped for this split:", err)
            print("y_test pos :", y_test[y_test == 1].shape)
            print("y_test neg :", y_test[y_test == 0].shape)
            print("y_train pos :", y_train[y_train == 1].shape)
            print("y_train neg :", y_train[y_train == 0].shape)
    df = pd.DataFrame()
    df['true'] = my_true
    df['pred'] = my_pred
    df['label'] = k
    return df, auROC_list, auPRC_list, [auROC_x_list, auROC_y_list
                                        ], [auPRC_x_list, auPRC_y_list]
def metrics(X, Y, A, B, N):
    """Score the softmax classifier ``(A, B)`` on the samples in ``X``.

    Each sample's prediction is ``argmax(stable_softmax(x, A, B))`` and its
    true label is ``argmax`` of the matching row of ``Y``.  The 2x2
    confusion matrix and the AUC are printed.

    Parameters
    ----------
    X : iterable of samples
    Y : indexable of label rows, aligned with ``X``
    A, B : parameters forwarded to ``stable_softmax``
    N : number used as the denominator of the classification error

    Returns
    -------
    (classification_error, precision, recall, F1, roc_auc, fpr, tpr)
        Precision, recall and F1 are reported as ``0.0`` whenever their
        denominator is zero.
    """
    y_true = []
    y_pred = []
    for i, x in enumerate(X):
        y_pred.append(np.argmax(stable_softmax(x, A, B)))
        y_true.append(np.argmax(Y[i]))
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    incorrect = int(np.sum(y_pred != y_true))
    true_pos = int(np.sum((y_pred == 1) & (y_true == 1)))
    false_pos = int(np.sum((y_pred == 1) & (y_true == 0)))
    true_neg = int(np.sum((y_pred == 0) & (y_true == 0)))
    false_neg = int(np.sum((y_pred == 0) & (y_true == 1)))

    print("confusion matrix: ")
    print("[ ", true_neg, false_pos, " ]")
    print("[ ", false_neg, true_pos, " ]")

    # Compute fpr, tpr, thresholds and roc auc.  The scores here are the
    # hard 0/1 predictions, not probabilities.
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    roc_auc = auc(fpr, tpr)
    print("AUC score: ", roc_auc)

    predicted_pos = true_pos + false_pos
    actual_pos = true_pos + false_neg
    if predicted_pos == 0:
        print("WARNING::True pos and False pos both zero")
    # Guard every ratio.  The old "both zero" branch always raised
    # ZeroDivisionError on F1 (0 / 0), and the regular branch could do the
    # same for recall and F1.
    precision = true_pos / predicted_pos if predicted_pos else 0.0
    recall = true_pos / actual_pos if actual_pos else 0.0  # true positive rate
    if precision + recall:
        F1 = 2 * ((precision * recall) / (precision + recall))
    else:
        F1 = 0.0
    classification_error = incorrect / N
    print()
    return classification_error, precision, recall, F1, roc_auc, fpr, tpr
# For every candidate k: load the true and predicted phenotypes, compute the
# ROC AUC and add the ROC curve to the current figure.
# NOTE(review): the extent of the loop body was reconstructed; confirm that
# all statements down to the two prints belong inside the loop.
for bestK in lbestK:
    # call([mathematica, '-script', str(predictPhenotype), str(dirShared),str(bestK) ])
    fphenotypes=matchNamePattern(patternPhenotypes)
    # The first row of the phenotype file is skipped; neither file has a
    # header row.
    dfphenotypes=pd.read_csv(fphenotypes, skiprows=1, header=None)
    dfpredicted=pd.read_csv(fpredicted, header=None)
    roc_auc=metrics.roc_auc_score(dfphenotypes, dfpredicted)
    fpr, tpr, thresholds=metrics.roc_curve(dfphenotypes, dfpredicted)
    # Plot ROC curve
    plt.plot(fpr, tpr, label='ROC curve (area = %0.3f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')  # random predictions curve
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate or (1 - Specifity)')
    plt.ylabel('True Positive Rate or (Sensitivity)')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    print("k: ", bestK)
    print("auc =", roc_auc)
label=u'预测值,$R^2$=%.3f' % lr2.score(X1_train, Y1_train)) plt.legend(loc='upper left') plt.xlabel(u'数据编号', fontsize=18) plt.ylabel(u'葡萄酒质量', fontsize=18) plt.title(u'葡萄酒质量预测统计(降维处理)', fontsize=20) plt.show() ### 从auc角度看效果===>效果不错 from sklearn.preprocessing import label_binarize from sklearn import metrics y_test_hot = label_binarize(Y_test, classes=(3, 4, 5, 6, 7, 8, 9)).ravel() ### 计算原始数据模型 ## 得到预测的损失值 lr_y_score = lr.decision_function(X_test).ravel() ## 计算roc的值 lr_fpr, lr_tpr, lr_threasholds = metrics.roc_curve(y_test_hot, lr_y_score) ## 计算auc的值 lr_auc = metrics.auc(lr_fpr, lr_tpr) ## 计算降维后的数据模型 lr2_y_score = lr2.decision_function(X1_test).ravel() ## 计算roc的值 lr2_fpr, lr2_tpr, lr2_threasholds = metrics.roc_curve(y_test_hot, lr2_y_score) ## 计算auc的值 lr2_auc = metrics.auc(lr2_fpr, lr2_tpr) print("原始数据AUC值:", lr_auc) print("降维数据AUC值:", lr2_auc)
# plt.show() # np.argmax() : 최댓값의 첫번째 인덱스 반환 # threshold_90_precision = thresholds[np.argmax(precisions >= 0.90)] # 훈련 세트에 대한 예측 만듬 # y_train_pred_90 = (y_scores >= threshold_90_precision) # 0.9000345901072293 # 0.4799852425751706 # print(precision_score(y_train_5, y_train_pred_90)) # print(recall_score(y_train_5, y_train_pred_90)) # ROC 곡선 # fpr: 거짓 양성 비율 / tpr: 진짜 양성 비율 fpr, tpr, thresholds = roc_curve(y_train_5, y_scores) def plot_roc_curve(fpr, tpr, label=None): plt.plot(fpr, tpr, linewidth=2, label=label) plt.plot([0, 1], [0, 1], 'k--') # ROC곡선: 민감도(재현율)에 대한 1-특이도 그래프 # plot_roc_curve(fpr, tpr) # plt.show() # 0.9604938554008616 # print(roc_auc_score(y_train_5, y_scores)) forest_clf = RandomForestClassifier(random_state=42) y_probas_forest = cross_val_predict(forest_clf,
#%% #X_train : original , prediction_tn, non_need = ml_algorithms(X_train, y_train, X_test, y_test) print(y_test.value_counts() / y_test.shape[0]) #data: embeddings #ml_algorithms(data,y_train,data_test,y_test) #X_train2: concatnate = original + embeddings''' prediction_tp, prediction_tpol_prob = ml_algorithms(X_train2, y_train, X_test2, y_test) #%% from sklearn.metrics import roc_curve, auc fpr_pol, tpr_pol, _ = roc_curve((y_test == True).apply(int), prediction_tpol_prob[:, 1]) fpr, tpr, _ = roc_curve((y_test == True).apply(int), prediction_tp[:, 1]) fprn, tprn, _ = roc_curve((y_test == True).apply(int), prediction_tn[:, 1]) print('AUC for Node2Vec Logistic + Poly features + Normal Features : ', auc(fpr_pol, tpr_pol)) print('AUC for Node2Vec Logistic + Linear Features + Normal Features : ', auc(fpr, tpr)) print('AUC for Normal Features LogisticNormal Features Logistic : ', auc(fprn, tprn)) plt.plot(fpr_pol, tpr_pol, 'g', label='Node2Vec Logistic + Poly features + Normal Features')
def evaluate(self):
    """Evaluate every model of every task and persist the results.

    For each ``(task, modelIndex)`` whose evaluation, prediction and table
    files do not all exist yet, the model is scored on ``self.XTest[task]``
    against ``self.yTest[task]``.  The metrics dict and the raw predictions
    are pickled, and a summary table is printed and written to the table
    file.  Combinations whose three files are all present are skipped.
    """
    print("Evaluating")
    na = ' '

    def pr_points(y, s):
        # precision_recall_curve returns (precision, recall, thresholds);
        # the curve is handed on as (x, y) = (recall, precision).
        precision, recall, _ = precision_recall_curve(y, s)
        return recall, precision

    def roc_points(y, s):
        # (x, y) = (false positive rate, true positive rate).
        fpr, tpr, _ = roc_curve(y, s)
        return fpr, tpr

    for task in self._tasks:
        print(task)
        for modelIndex, currModel in enumerate(self.model[task]):
            # The _calculateFile* helpers are assumed to only build paths --
            # TODO confirm they have no side effects.
            evaluation_file = self._calculateFileEvaluation(task, modelIndex)
            prediction_file = self._calculateFilePrediction(task, modelIndex)
            table_file = self._calculateFileTable(task, modelIndex)
            if (os.path.isfile(evaluation_file)
                    and os.path.isfile(prediction_file)
                    and os.path.isfile(table_file)):
                continue

            table = [[
                "task", "average", "MAPs", "MAPc", "accur.", "kappa",
                "prec.", "recall", "f1score"
            ]]
            table.append([" ", " ", " ", " ", " ", " ", " ", " "])

            prediction = {}
            yp = currModel.predict_proba(self.XTest[task])
            yt = self.yTest[task]
            prediction['yp'] = yp
            prediction['yt'] = yt
            ytn = self.lb[task].inverse_transform(yt)

            # One-hot encode the arg-max class of every prediction.
            # ``np.int`` was removed from NumPy; the builtin int is the same
            # dtype.
            yc = np.zeros(yt.shape, int)
            for i, p in enumerate(yp):
                yc[i][np.argmax(p)] = 1
            ycn = self.lb[task].inverse_transform(yc)

            metrics = {}
            metrics['MAPs'] = MAPScorer().samplesScore(yt, yp)
            metrics['MAPc'] = MAPScorer().classesScore(yt, yp)
            metrics['accuracy'] = accuracy_score(yt, yc)
            metrics['kappa'] = cohen_kappa_score(ytn, ycn)
            metrics['precision'] = {}
            metrics['recall'] = {}
            metrics['f1score'] = {}
            table.append([
                task, na, "{:.3f}".format(metrics['MAPs']),
                "{:.3f}".format(metrics['MAPc']),
                "{:.3f}".format(metrics['accuracy']),
                "{:.3f}".format(metrics['kappa']), na, na, na
            ])
            for avg in ['micro', 'macro', 'weighted']:
                (metrics['precision'][avg], metrics['recall'][avg],
                 metrics['f1score'][avg],
                 _) = precision_recall_fscore_support(yt, yc, average=avg)
                table.append([
                    task, avg, na, na, na, na,
                    "{:.3f}".format(metrics['precision'][avg]),
                    "{:.3f}".format(metrics['recall'][avg]),
                    "{:.3f}".format(metrics['f1score'][avg])
                ])

            metrics['pr-curve'] = {}
            (metrics['pr-curve']['x'], metrics['pr-curve']['y'],
             metrics['pr-curve']['auc']) = self._calculateMicroMacroCurve(
                 pr_points, yt, yp)
            metrics['roc-curve'] = {}
            (metrics['roc-curve']['x'], metrics['roc-curve']['y'],
             metrics['roc-curve']['auc']) = self._calculateMicroMacroCurve(
                 roc_points, yt, yp)

            # Context managers close the pickle files deterministically;
            # they used to be left to the garbage collector.
            with open(evaluation_file, "wb") as fid:
                pickle.dump(metrics, fid)
            with open(prediction_file, "wb") as fid:
                pickle.dump(prediction, fid)

            tableString = tabulate(table)
            print(tableString)
            with open(table_file, "w") as fid:
                fid.write(tableString + "\n")
# Export one tree of the forest (index 5) to DOT and render it as PNG.
tree_small = rf.estimators_[5]
export_graphviz(tree_small,
                out_file='rfsampletree.dot',
                feature_names=feature_list,
                rounded=True,
                precision=1)
(graph, ) = pydot.graph_from_dot_file('rfsampletree.dot')
graph.write_png('rfsampletree.png')

# Making ROC curve
from sklearn.metrics import roc_curve, auc

# Column 1 of predict_proba is used as the ranking score.
Y_score = rf.predict_proba(test_features)[:, 1]
# fpr, tpr and roc_auc are assigned directly; the dict() placeholders that
# preceded them were overwritten immediately and never read.
fpr, tpr, _ = roc_curve(test_labels, Y_score)
roc_auc = auc(fpr, tpr)

# make the plot
plt.figure(figsize=(10, 10))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid(True)
plt.plot(fpr, tpr, label='AUC = {0}'.format(roc_auc))
plt.legend(loc="lower right", shadow=True, fancybox=True)
plt.show()
# Compile and train the network; 25% of the training data is held out for
# validation and progress is reported through the tensorboard callback.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
history = model.fit(x_train,
                    y_train,
                    batch_size=40,
                    epochs=epochs,
                    validation_split=0.25,
                    verbose=1,
                    callbacks=[tensorboard])

# Prediction and ROC/ AUC curve plotting
y_pred = model.predict(x_test)
# Labels and predictions are flattened to 1-D before being passed on.
fpr_keras, tpr_keras, thresholds_keras = roc_curve(np.ravel(y_test),
                                                   np.ravel(y_pred))
auc_keras = auc(fpr_keras, tpr_keras)
plt.figure(1)
# Dashed diagonal from (0, 0) to (1, 1) as a reference line.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_keras,
         tpr_keras,
         label='Keras (area = {:.3f})'.format(auc_keras))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()

# Final test-set loss/accuracy, then persist the trained model.
# NOTE(review): evaluation uses the external batch_size while training used
# a literal 40 -- confirm this is intended.
test_loss, test_acc = model.evaluate(x_test, y_test, batch_size=batch_size)
model.save("CNN.h5")
#threshold curve fig = plt.figure(figsize=(10, 7)) fig.subplots_adjust(hspace=0.35) fig.add_subplot(221) plt.plot(thresholds, recalls, 'r-', label='recall') plt.plot(thresholds, precisions, 'b-', label='precision') plt.xlabel("z (score)") plt.legend() #PR-curve ppred = cross_val_predict(pl, X, y, cv=5, method="predict_proba")[:, 1] precisions, recalls, thresholds = precision_recall_curve(y, ppred) precisions, recalls = (a[:-1] for a in (precisions, recalls)) fig.add_subplot(222) plt.plot(precisions, recalls, label="PR-curve") plt.xlabel("precision") plt.ylabel("recall") plt.legend() #ROC-curve FPR, TPR, thresholds = roc_curve(y, ppred) AUC = roc_auc_score(y, ppred) fig.add_subplot(223) plt.plot(FPR, TPR, label="ROC-curve") plt.xlabel("False Positives Rate") plt.ylabel("True Positives Rate") plt.text(0.2, 0.7, f"AUC = {AUC.round(2)}") plt.plot([0, 1], [0, 1], '--', color='gray') plt.legend()
def mc_cv(model, xFeat, y, testSize, s):
    """
    Monte Carlo cross validation: draw ``s`` random train/test splits of the
    data (hold-out fraction ``testSize``, fixed seed 0), refit the model on
    each training part and average the AUC over all splits.

    Parameters
    ----------
    model : sktree.DecisionTreeClassifier
        Decision tree model
    xFeat : nd-array with shape n x d
        Features of the dataset
    y : 1-array with shape n x 1
        Labels of the dataset
    testSize : float
        Portion of the dataset to serve as a holdout.
    s : int
        Number of random splits to draw.

    Returns
    -------
    trainAuc : float
        Average AUC of the model on the training dataset
    testAuc : float
        Average AUC of the model on the validation dataset
    timeElapsed: float
        Time it took to run this function
    """
    startedAt = time.time()
    features = np.asarray(xFeat)
    labels = np.asarray(y)

    def aucOf(fitted, xPart, yPart):
        # AUC of the column-1 probabilities on one part of the data.
        scores = fitted.predict_proba(xPart)[:, 1]
        fprPart, tprPart, _ = metrics.roc_curve(yPart, scores)
        return metrics.auc(fprPart, tprPart)

    trainAuc = 0
    testAuc = 0
    splitter = ShuffleSplit(n_splits=s, test_size=testSize, random_state=0)
    for trainIdx, testIdx in splitter.split(features):
        xTrain, yTrain = features[trainIdx], labels[trainIdx]
        xTest, yTest = features[testIdx], labels[testIdx]
        fitted = model.fit(xTrain, yTrain)
        trainAuc += aucOf(fitted, xTrain, yTrain)
        testAuc += aucOf(fitted, xTest, yTest)

    trainAuc /= splitter.get_n_splits(features)
    testAuc /= splitter.get_n_splits(features)
    timeElapsed = time.time() - startedAt
    return trainAuc, testAuc, timeElapsed
def kfold_cv(model, xFeat, y, k):
    """
    k-fold cross validation: each of the ``k`` consecutive folds serves once
    as the validation set while the model is refit on the remaining folds;
    the train and validation AUC are averaged over the folds.

    Parameters
    ----------
    model : sktree.DecisionTreeClassifier
        Decision tree model
    xFeat : nd-array with shape n x d
        Features of the dataset
    y : 1-array with shape n x 1
        Labels of the dataset
    k : int
        Number of folds or groups (approximately equal size)

    Returns
    -------
    trainAuc : float
        Average AUC of the model on the training dataset
    testAuc : float
        Average AUC of the model on the validation dataset
    timeElapsed: float
        Time it took to run this function
    """
    tic = time.time()
    data = np.asarray(xFeat)
    target = np.asarray(y)
    folds = KFold(n_splits=k)

    trainAuc, testAuc = 0, 0
    for fitIdx, holdIdx in folds.split(data):
        fitted = model.fit(data[fitIdx], target[fitIdx])

        # Score the training part first, then the held-out fold; column 1 of
        # predict_proba is the score in both cases.
        probaFit = fitted.predict_proba(data[fitIdx])[:, 1]
        fprFit, tprFit, _ = metrics.roc_curve(target[fitIdx], probaFit)
        trainAuc += metrics.auc(fprFit, tprFit)

        probaHold = fitted.predict_proba(data[holdIdx])[:, 1]
        fprHold, tprHold, _ = metrics.roc_curve(target[holdIdx], probaHold)
        testAuc += metrics.auc(fprHold, tprHold)

    nFolds = folds.get_n_splits(data)
    trainAuc /= nFolds
    testAuc /= nFolds
    timeElapsed = time.time() - tic
    return trainAuc, testAuc, timeElapsed
# For each model's result frame, column 0 goes to the *_t name and column 1
# to the *_p name; they are passed to roc_curve below as (y_true, y_score).
dt_t = dt.iloc[:, [0]]
dt_p = dt.iloc[:, [1]]
knn_t = knn.iloc[:, [0]]
knn_p = knn.iloc[:, [1]]
lr_t = lr.iloc[:, [0]]
lr_p = lr.iloc[:, [1]]
rf_t = rf.iloc[:, [0]]
rf_p = rf.iloc[:, [1]]

import sklearn.metrics as metrics

# calculate the fpr and tpr for all thresholds of the classification
fpr1, tpr1, threshold1 = metrics.roc_curve(ann_t, ann_p)
roc_auc1 = metrics.auc(fpr1, tpr1)
fpr2, tpr2, threshold2 = metrics.roc_curve(dt_t, dt_p)
roc_auc2 = metrics.auc(fpr2, tpr2)
# The KNN thresholds get their own name; they used to overwrite threshold1,
# which holds the ANN thresholds.
fpr3, tpr3, threshold3 = metrics.roc_curve(knn_t, knn_p)
roc_auc3 = metrics.auc(fpr3, tpr3)
fpr4, tpr4, threshold4 = metrics.roc_curve(lr_t, lr_p)
roc_auc4 = metrics.auc(fpr4, tpr4)
fpr5, tpr5, threshold5 = metrics.roc_curve(rf_t, rf_p)
roc_auc5 = metrics.auc(fpr5, tpr5)

# method I: plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr1, tpr1, 'r', label = 'ANN(AUC = %0.2f)' % roc_auc1)
plt.plot(fpr2, tpr2, 'g', label = 'DT(AUC = %0.2f)' % roc_auc2)
plt.plot(fpr3, tpr3, 'y', label = 'KNN(AUC = %0.2f)' % roc_auc3)
def avaliacao_PerformanceC(df_train_class, predicted_train, predicted_prob_train,
                           df_test_class, predicted_test, predicted_prob_test,
                           roc_y_n):
    """
    Print a classification performance report for train and test predictions.

    For the training data and then the test data this prints the confusion
    matrix, accuracy, precision, recall, classification report, F1 and
    weighted F1. When ``roc_y_n`` equals ``'y'`` it additionally prints the
    ROC AUC and draws one ROC figure per dataset. ``plt.show()`` is not
    called here.

    Parameters
    ----------
    df_train_class, df_test_class
        True labels for the training and test data.
    predicted_train, predicted_test
        Predicted labels for the training and test data.
    predicted_prob_train, predicted_prob_test
        2-D score arrays; column 1 is used as the ROC score.
    roc_y_n
        The ROC section runs only when this is exactly ``'y'``.

    Returns
    -------
    None
    """
    # --- Confusion matrix ---
    cm_train = confusion_matrix(df_train_class, predicted_train)
    cm_test = confusion_matrix(df_test_class, predicted_test)
    print("\nTraining Confusion Matrix:\n ", cm_train)
    print("\nTesting Confusion Matrix:\n ", cm_test)

    # --- Accuracy ---
    acc_train = accuracy_score(df_train_class, predicted_train)
    acc_test = accuracy_score(df_test_class, predicted_test)
    print("\nTraining Accuracy Score: ", acc_train)
    print("\nTesting Accuracy Score: ", acc_test)

    # --- Precision ---
    prec_train = precision_score(df_train_class, predicted_train)
    prec_test = precision_score(df_test_class, predicted_test)
    print("\nTraining Precision: ", prec_train)
    print("\nTesting Precision: ", prec_test)

    # --- Recall ---
    rec_train = recall_score(df_train_class, predicted_train)
    rec_test = recall_score(df_test_class, predicted_test)
    print("\nTraining Recall: ", rec_train)
    print("\nTesting Recall: ", rec_test)

    # --- Classification report ---
    report_train = classification_report(df_train_class, predicted_train)
    print("\nTrain Classification Report: \n", report_train)
    report_test = classification_report(df_test_class, predicted_test)
    print("\nTest Classification Report: \n", report_test)

    # --- F1 with the default averaging ---
    f1_train = f1_score(df_train_class, predicted_train)
    f1_test = f1_score(df_test_class, predicted_test)
    print("\nTraining F1score: ", f1_train)
    print("\nTesting F1score: ", f1_test)

    # --- F1 with average='weighted' ---
    weighted_f1_train = f1_score(df_train_class, predicted_train, average='weighted')
    weighted_f1_test = f1_score(df_test_class, predicted_test, average='weighted')
    print("\nTraining Weigted F1score: ", weighted_f1_train)
    print("\nTesting Weighted F1score: ", weighted_f1_test)

    # --- ROC-AUC (optional) ---
    if roc_y_n != 'y':
        return

    roc_jobs = (
        (df_train_class, predicted_prob_train,
         "\nTraining AUC for ROC: ",
         'Training - Receiver Operating Characteristic'),
        (df_test_class, predicted_prob_test,
         "\nTesting AUC for ROC: ",
         'Testing - Receiver Operating Characteristic'),
    )
    for truth, probabilities, auc_caption, figure_title in roc_jobs:
        fpr, tpr, _ = roc_curve(truth, probabilities[:, 1])
        area = auc(fpr, tpr)
        print(auc_caption, area)

        plt.figure()
        plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % area)
        # Diagonal reference line from (0, 0) to (1, 1).
        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlim([0, 1])
        plt.ylim([0, 1])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.legend(loc='lower right')
        plt.title(figure_title)
# Label-encode the EMERGENCY column (cast to str first): this is the column
# that was reported as containing an unsupported data type.
data_x['EMERGENCY'] = lbl.fit_transform(data_x['EMERGENCY'].astype(str))
data_y = data.iloc[:, [0]]

# Prepare a train/test split for building the model.
x_train, x_test, y_train, y_test = train_test_split(
    data_x,
    data_y,
    test_size=0.2,
    random_state=52,
)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

feature_ami = [
    'gender', 'admission', 'ICD9_CODE', 'Age', 'GCS_min',
    'Urine_max', 'Urine_min', 'Urine_mean', 'PaO2_mean', 'Abnormal_HR_P',
    'WBC_max', 'WBC_min',
    'Tep_max', 'Tep_min', 'Tep_range', 'Tep_var',
    'USBP_max', 'USBP_min', 'USBP_range', 'USBP_var',
    'HR_range', 'HR_max', 'HR_min', 'HR_var',
    'Bil_max', 'Bil_min', 'K_max', 'K_min', 'Na_max', 'Na_min',
    'urea_max', 'urea_min', 'SBL_max', 'SBL_min',
    'SBP_max', 'SBP_min', 'SBP_range', 'SBP_var',
    'creatinine',
]

gbm = lgb.LGBMClassifier(
    boosting_type='gbdt',
    num_leaves=5,
    max_depth=5,
    learning_rate=0.03,
    n_estimators=400,
    feature_fraction=0.9,
    min_data_in_leaf=4,
)
# Disabled alternative that names the features explicitly:
# gbm.fit(x_train, y_train, feature_name=feature_ami, categorical_feature=['gender', 'admission'])
# NOTE(review): the active call marks columns 1 and 2 as categorical by
# position, while the disabled call names 'gender' and 'admission' (positions
# 0 and 1 of feature_ami) - confirm against the actual column order of data_x.
gbm.fit(x_train, y_train, categorical_feature=[1, 2])

y_pred_gbm = gbm.predict(x_test)
y_pred_gbm_pr = gbm.predict_proba(x_test)[:, 1]
fpr_gbm, tpr_gbm, thresholds = roc_curve(y_test, y_pred_gbm_pr)

# Evaluation metrics: AUC, precision, recall, accuracy, F1.
print("auc面积:", roc_auc_score(y_test, y_pred_gbm_pr))
print("精确率:", precision_score(y_test, y_pred_gbm))
print("召回率:", recall_score(y_test, y_pred_gbm))
print("正确率:", accuracy_score(y_test, y_pred_gbm))
print("F1值:", f1_score(y_test, y_pred_gbm))
tra_label = classifier.predict(train_data)  # predicted labels for the training set
tes_label = classifier.predict(test_data)  # predicted labels for the test set
print("训练集:", accuracy_score(train_label, tra_label))
print("测试集:", accuracy_score(test_label, tes_label))

# 2x2 confusion matrix of the TRAINING predictions, flattened row by row as
# [0, 0], [0, 1], [1, 0], [1, 1].
# NOTE(review): sn / sp below therefore describe the training set, while the
# ROC curve further down uses the test set - confirm this is intended.
matrix = confusion_matrix(train_label, tra_label, labels=[0, 1])
TN, FP, FN, TP = matrix.ravel()
sn = TP / (TP + FN)  # sensitivity
sp = TN / (TN + FP)  # specificity

# ROC curve on the test set, scored with column 1 of predict_proba.
decision_score = classifier.predict_proba(test_data)
fprs, tprs, thresholds = roc_curve(test_label, decision_score[:, 1])
roc_auc = auc(fprs, tprs)

plt.figure()
lw = 2
plt.plot(
    fprs,
    tprs,
    color='darkorange',
    lw=lw,
    label='ROC curve (area = %0.2f)' % roc_auc,
)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
def sim_same_and_diff_category_samples(self, df, cat_index=1, dist_type='cosine',
                                       equal_var=False, plot_roc=True,
                                       precalc_dist=False, calc_roc=True):
    '''
    Compare the similarity of sample pairs from the same category with the
    similarity of sample pairs from different categories.

    Samples are the columns of df. Every unordered pair of columns gets a
    similarity value and is labelled "same" or "different" depending on
    whether the two column labels agree at position cat_index.

    Parameters
    ----------
    df : DataFrame
        Columns are samples. Each column label must be indexable so that
        label[cat_index] yields the category (e.g. tuple labels).
    cat_index : int
        Position of the category inside a column label, where 1 is the
        first category.
    dist_type : str
        Metric handed to pdist; similarity is computed as 1 - distance.
    equal_var : bool
        Forwarded to ttest_ind.
    plot_roc : bool
        When calc_roc is true, also draw the ROC curve and print the AUC.
    precalc_dist : bool or array-like
        Any non-bool value is used as the pairwise value array as-is (no
        "1 - x" conversion is applied) instead of calling pdist. It must be
        ordered like combinations(range(n_samples), 2).
    calc_roc : bool
        Compute ROC/AUC for separating same-category from
        different-category pairs by their similarity.

    Returns
    -------
    dict
        'sim_dict'  : {'same': Series, 'diff': Series} of pair similarities,
                      indexed by '<pair number>_same' / '<pair number>_different'.
        'pval_dict' : {'ttest': p-value, 'mannwhitney': p-value}.
        'roc_data'  : {} when calc_roc is false, otherwise 'true', 'score',
                      'fpr', 'tpr', 'thresholds' and 'auc'.
    '''
    cols = df.columns.tolist()

    if isinstance(precalc_dist, bool):
        # compute distance between samples (transpose to get cols as rows)
        dist_arr = 1 - pdist(df.transpose(), metric=dist_type)
    else:
        dist_arr = precalc_dist

    # One name per sample pair; the running number keeps the names unique so
    # they can serve as index labels.
    sample_combos = list(combinations(range(df.shape[1]), 2))
    sample_names = [
        str(ind) + ('_same' if cols[first][cat_index] == cols[second][cat_index]
                    else '_different')
        for ind, (first, second) in enumerate(sample_combos)
    ]

    ser_dist = pd.Series(data=dist_arr, index=sample_names)

    same_cat = [name for name in sample_names if name.split('_')[1] == 'same']
    diff_cat = [name for name in sample_names if name.split('_')[1] == 'different']

    ser_same = ser_dist[same_cat]
    ser_same.name = 'Same Category'
    ser_diff = ser_dist[diff_cat]
    ser_diff.name = 'Different Category'

    sim_dict = {'same': ser_same, 'diff': ser_diff}
    roc_data = {}

    # Only the p-values are kept; the test statistics are discarded.
    pval_dict = {}
    _, pval_dict['ttest'] = ttest_ind(ser_diff, ser_same, equal_var=equal_var)
    _, pval_dict['mannwhitney'] = mannwhitneyu(ser_diff, ser_same)

    if calc_roc:
        # Same-category pairs are the positive class (1), different-category
        # pairs the negative class (0); the similarity is the score.
        true_index = list(np.ones(ser_same.shape[0]))
        false_index = list(np.zeros(ser_diff.shape[0]))
        y_true = true_index + false_index

        # Series.get_values() was removed in pandas 1.0; .values returns the
        # same ndarray on old and new pandas alike.
        true_val = list(ser_same.values)
        false_val = list(ser_diff.values)
        y_score = true_val + false_val

        fpr, tpr, thresholds = roc_curve(y_true, y_score)
        inst_auc = auc(fpr, tpr)

        if plot_roc:
            plt.figure()
            plt.plot(fpr, tpr)
            plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
            # NOTE(review): this opens a second, empty 10x10 figure after the
            # ROC plot has been drawn; kept as in the original. It may have
            # been meant to size the ROC figure itself.
            plt.figure(figsize=(10, 10))
            print('AUC', inst_auc)

        roc_data['true'] = y_true
        roc_data['score'] = y_score
        roc_data['fpr'] = fpr
        roc_data['tpr'] = tpr
        roc_data['thresholds'] = thresholds
        roc_data['auc'] = inst_auc

    sim_data = {}
    sim_data['sim_dict'] = sim_dict
    sim_data['pval_dict'] = pval_dict
    sim_data['roc_data'] = roc_data
    return sim_data
def plot_roc_curve(y_test, y_pred, title=None, micro=False, macro=True, per_class=False):
    """
    Plot per-class, micro-average and/or macro-average ROC curves.

    Parameters
    ----------
    y_test : array-like
        Either a 2-D indicator matrix with one column per class, or a 1-D
        label vector. For 1-D labels with more than one score column, the
        labels are assumed to be the integers 0 .. n_classes - 1 (the same
        assumption the binarisation in the original code made).
    y_pred : array-like
        Scores with one column per class. A 1-D vector is treated as a
        single column.
    title : str, optional
        Figure title. Defaults to 'ROC Curves'; a non-string value prints a
        message and falls back to the default.
    micro, macro, per_class : bool
        Which curves to draw.

    Returns
    -------
    None
        The figure is displayed with plt.show().
    """
    y_test = np.asarray(y_test)
    y_pred = np.asarray(y_pred)
    if y_pred.ndim == 1:
        y_pred = y_pred.reshape(-1, 1)

    if y_test.ndim == 2:
        num_classes = y_test.shape[1]
        bi_y_test = y_test
    else:
        # Fixed: 1-D labels used to force num_classes = 1, so label_binarize
        # received a single class and only score column 0 was evaluated.
        # The class count now comes from the score matrix.
        num_classes = y_pred.shape[1]
        if num_classes == 1:
            bi_y_test = y_test.reshape(-1, 1)
        else:
            bi_y_test = label_binarize(y_test, classes=list(range(num_classes)))
            if bi_y_test.shape[1] == 1:
                # label_binarize returns a single column for two classes;
                # expand it to one indicator column per class.
                bi_y_test = np.hstack([1 - bi_y_test, bi_y_test])

    fpr = {}
    tpr = {}
    roc_auc = {}
    for i in range(num_classes):
        fpr[i], tpr[i], _ = roc_curve(bi_y_test[:, i], y_pred[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Micro-average: pool every (indicator, score) pair. Fixed: this used the
    # raw y_test, whose length does not match y_pred.ravel() for 1-D labels.
    fpr['micro'], tpr['micro'], _ = roc_curve(bi_y_test.ravel(), y_pred.ravel())
    roc_auc['micro'] = auc(fpr['micro'], tpr['micro'])

    # Macro-average: interpolate every class curve on the union of all false
    # positive rates, then average the true positive rates.
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(num_classes)]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(num_classes):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= num_classes

    fpr['macro'] = all_fpr
    tpr['macro'] = mean_tpr
    roc_auc['macro'] = auc(fpr['macro'], tpr['macro'])

    # Plot all requested ROC curves
    plt.figure(figsize=(10, 10))
    if per_class:
        for i in range(num_classes):
            plt.plot(fpr[i], tpr[i], alpha=0.2,
                     label='ROC curve of class {0} (area = {1:0.4f})'
                     ''.format(i+1, roc_auc[i]))
    if micro:
        plt.plot(fpr['micro'], tpr['micro'],
                 label='micro-average ROC curve (area = {0:0.4f})'
                 ''.format(roc_auc['micro']),
                 color='orangered', linestyle=':', linewidth=3)
    if macro:
        plt.plot(fpr['macro'], tpr['macro'],
                 label='macro-average ROC curve (area = {0:0.4f})'
                 ''.format(roc_auc['macro']),
                 color='navy', linestyle=':', linewidth=3)

    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xticks(fontsize=13)
    # Fixed: this was a second plt.xticks call, leaving the y tick labels
    # at their default size.
    plt.yticks(fontsize=13)
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)

    if title is not None and not isinstance(title, str):
        print('Title must be a string.')
        title = None
    plt.title('ROC Curves' if title is None else title, fontsize=16)

    plt.legend(loc=4)
    plt.show()
# Remove the target column from both feature frames; Test_Response keeps the
# test targets.
del Train_Final['Response']
Test_Response = test_numeric['Response']
del test_numeric['Response']

X = Train_Final
y = Train_Response
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

from sklearn.ensemble import RandomForestClassifier

clf1 = RandomForestClassifier(n_estimators=100)
clf1.fit(X_train, y_train)
Ans = pd.DataFrame(clf1.predict(X_test))
print(Ans)
# Bare expression: the value is only displayed when run interactively.
clf1.score(X_test, y_test)
#clf1.score(test_numeric,Test_Response)

# Fixed: the ROC curve used to be computed from the hard class predictions in
# `Ans`, which collapses it to a single operating point. roc_curve expects
# continuous scores, so column 1 of predict_proba is used instead.
y_score = clf1.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score)
print(fpr)
print(tpr)
print(thresholds)

# FPR and TPR plotted against the threshold index.
FPR1 = plt.plot(fpr, label="FPR") #Blue
TPR1 = plt.plot(tpr, label="TPR") #Green
#plt.legend(handles=[FPR1, TPR1],loc='best')
plt.show()

# Bare expression: the value is only displayed when run interactively.
precision_score(y_test, Ans, average='macro')

# Fixed: the curve now carries a label, so the legend call below has an
# entry to show instead of producing an empty legend.
plt.plot(fpr, tpr, label='ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Curve')
plt.legend(loc="lower right")
plt.show()
# Collect per-run accuracy, confusion matrix and ROC statistics from the
# saved .npz archives, then aggregate them across runs.
for run in range(n_runs):
    results_path = path_to_results + 'results_{}_run.npz'.format(run)
    data_path = path_to_data + 'train_test_data_{}.npz'.format(run)

    # np.load on an .npz archive keeps the file open until it is closed.
    # The with-block releases both handles on every iteration instead of
    # leaking two open files per run. Arrays read inside the block are
    # ordinary in-memory arrays and stay valid afterwards.
    with np.load(results_path) as cur_results, np.load(data_path) as cur_data:
        all_train_accuracy[run] = cur_results['train_accuracy']
        all_test_accuracy[run] = cur_results['test_accuracy']
        # Read each array once; it was previously fetched from the archive
        # separately for the confusion matrix and for the ROC curve.
        test_labels = cur_data['test_labels']
        test_predicted_probs = cur_results['test_predicted_probs']

    # argmax over axis 1 turns the label matrix (presumably one-hot - verify
    # against the code that writes the archives) into class indices.
    true_classes = np.argmax(test_labels, axis=1)
    predicted_classes = np.argmax(test_predicted_probs, axis=1)

    all_test_cm[run] = compute_confusion_matrix(
        true_classes, predicted_classes, normalise=True)

    # Compute ROC curve and area under the curve, scoring with column 1
    fpr, tpr, thresholds = roc_curve(true_classes, test_predicted_probs[:, 1])
    # Resample the curve on the shared mean_fpr grid so runs can be averaged,
    # and pin the first point to a true positive rate of 0.
    tprs.append(interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    all_test_auc[run] = auc(fpr, tpr)

# Mean and standard deviation across runs
mean_train_accuracy = np.mean(all_train_accuracy, axis=0)
mean_test_accuracy = np.mean(all_test_accuracy, axis=0)
mean_test_cm = np.mean(all_test_cm, axis=0)
mean_test_auc = np.mean(all_test_auc)

std_train_accuracy = np.std(all_train_accuracy, axis=0)
std_test_accuracy = np.std(all_test_accuracy, axis=0)
std_test_cm = np.std(all_test_cm, axis=0)
std_test_auc = np.std(all_test_auc)