def calculate_auroc(self, X, y_gt, n_steps=100, make_plot=True):
    '''
    Compute the area under the ROC curve (AUROC) and optionally plot the curve.

    Args:
        X (array-like): Input testing data. It is transposed before being
            passed to ``self.forward``.
        y_gt (array-like): Ground truth labels for the testing data.
        n_steps (int, default=100): Not used by this implementation; kept so
            that existing callers passing it keep working.
        make_plot (bool, default=True): When truthy, draw the ROC curve and
            display it with ``plt.show()``.

    Returns:
        roc_auc (float): Area under the receiver operating characteristic
            curve, as computed by ``skmetrics.auc``.

    Raises:
        AssertionError: If the model has not been trained yet.
    '''
    # Raise explicitly rather than with an `assert` statement: asserts are
    # removed when Python runs with -O, which would silently skip this guard.
    # The exception type is unchanged for callers.
    if not self._model_trained:
        raise AssertionError('Model must be trained prior to calculatin auroc')

    # forward pass for prediction
    y_hat, _ = self.forward(X.T)
    y_hat = np.squeeze(y_hat)
    y_gt = np.squeeze(y_gt)

    fpr, tpr, _ = skmetrics.roc_curve(y_gt, y_hat)
    roc_auc = skmetrics.auc(fpr, tpr)

    # Only build the display object when a plot was actually requested.
    if make_plot:
        skmetrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot()
        plt.show()

    return roc_auc
def plot_roc_curve(self, title, outname, **kwargs):
    """Draw a ROC curve, give it a title and save it below ``self.outpath``.

    This method works only for binary classification.

    Args:
        title: Title set on the axes of the plot.
        outname: File name of the image; joined to ``self.outpath`` with "/".
        **kwargs: Forwarded unchanged to ``metrics.RocCurveDisplay``.
    """
    disp = metrics.RocCurveDisplay(**kwargs)
    # `ax_` is only created by plot(); reading it before plotting raises
    # AttributeError, so the curve is drawn first and titled afterwards.
    disp.plot()
    disp.ax_.set_title(title)
    fname = "/".join((self.outpath, outname))
    plt.savefig(fname)
def SVM_algorithm(X_train, X_test, y_train, y_test: pd.DataFrame):
    """Fit a linear-kernel SVM, print test-set metrics and show its ROC curve.

    Args:
        X_train: Training features passed to ``SVC.fit``.
        X_test: Test features used for the predictions and probabilities.
        y_train: Training labels; must expose ``.values`` because it is
            flattened with ``.values.ravel()`` before fitting.
        y_test: True test labels the predictions are compared against.

    Returns:
        None. Metrics are printed and the ROC figure is shown with
        ``plt.show()``.
    """
    from sklearn.svm import SVC

    svm_classifier = SVC(kernel='linear', probability=True)
    svm_classifier.fit(X_train, y_train.values.ravel())
    svm_predi = svm_classifier.predict(X_test)

    print('confiusion matrix :\n {} \n '.format(
        m.confusion_matrix(y_test, svm_predi)))
    print('Classification report SVM \n : {} \n'.format(
        m.classification_report(y_test, svm_predi)))
    print("SVMs accuracy :", m.accuracy_score(y_test, svm_predi))
    print("SVM precision : ",
          m.precision_score(y_test, svm_predi, pos_label=1))
    print("SVM recall: ",
          m.recall_score(y_test, svm_predi, average='binary', pos_label=1))
    print(
        "SVM f1 score: ",
        m.f1_score(y_test,
                   svm_predi,
                   labels=np.unique(svm_predi),
                   pos_label=1))

    ######### ROC CURVE FOR SVM ################
    # Score the test set once and reuse the result for both the AUC and the
    # curve instead of running predict_proba twice on the same data.
    # Column 1 is taken as the score of the positive class (label 1).
    positive_scores = svm_classifier.predict_proba(X_test)[:, 1]
    svm_roc = m.roc_auc_score(y_test, positive_scores)
    print("roc curve accuracy : ", svm_roc)
    fpr, tpr, _ = m.roc_curve(y_test, positive_scores, pos_label=1)
    figure_svm = m.RocCurveDisplay(fpr=fpr,
                                   tpr=tpr,
                                   roc_auc=svm_roc,
                                   estimator_name="SVM")
    figure_svm.plot()
    plt.show()
def plot_auc(best_model, y_test, y_hat, output_type='save'):
    '''
    Draw the ROC curve for ``y_hat`` against ``y_test`` and save or show it.

    The plot is labelled with the text of ``str(best_model)`` that precedes
    the first "(". With ``output_type='save'`` the figure is written to
    ``ROC_<label>.png``; with ``output_type='show'`` it is displayed. Any
    other value only draws the figure. The current figure is closed at the end.
    '''
    # Label the plot with the leading part of the model's string form.
    plot_name = str(best_model).split('(')[0]

    # Curve coordinates and the area underneath them.
    false_pos, true_pos, _ = metrics.roc_curve(y_test, y_hat)
    area = metrics.auc(false_pos, true_pos)

    metrics.RocCurveDisplay(
        fpr=false_pos,
        tpr=true_pos,
        roc_auc=area,
        estimator_name=plot_name,
    ).plot()

    # Save or show plot
    if output_type == 'save':
        plt.savefig('ROC_' + str(plot_name) + '.png')
    elif output_type == 'show':
        plt.show()

    plt.close()
def roc_curve(
    y_pred: np.ndarray,
    y_labels: np.ndarray,
    fp: FreePlot,
    index: Union[Tuple[int], str] = (0, 0),
    name: Optional[str] = None,
    estimator_name: Optional[str] = None,
    style: Union[str, Iterable[str]] = "whitegrid",
    dict_: Optional[Dict] = None,
) -> "tpr, fpr, roc_auc":
    """Draw a ROC curve on one axes of ``fp`` and return its coordinates.

    y_pred: the prediction scores
    y_labels: the corresponding labels of instances
    fp: plot container; ``fp[index]`` must yield the axes to draw on
    index: key used to select the axes from ``fp``
    name: label for the ROC curve; if None, ``estimator_name`` is used
    estimator_name: the name of the classifier
    style: the seaborn axes style applied while plotting
    dict_: the corresponding style properties dict passed to seaborn

    Returns ``(tpr, fpr, roc_auc)`` -- note that the true positive rate comes
    first, unlike ``sklearn.metrics.roc_curve``.
    """
    from sklearn import metrics

    fpr, tpr, _ = metrics.roc_curve(y_labels, y_pred)
    roc_auc = metrics.auc(fpr, tpr)
    display = metrics.RocCurveDisplay(fpr=fpr,
                                      tpr=tpr,
                                      roc_auc=roc_auc,
                                      estimator_name=estimator_name)
    with sns.axes_style(style, dict_):
        # `name` is keyword-only in current scikit-learn releases; passing it
        # positionally raises TypeError, so both arguments go by keyword.
        display.plot(ax=fp[index], name=name)
    return tpr, fpr, roc_auc
def iris_data_metrics_k_fold(trainX, trainY, threshold, classnames):
    """Run 5-fold cross-validation of the logistic regression and report each fold.

    For every fold the model is trained on that fold's training part and
    evaluated on its validation part. Precision, recall and false positive
    rate are printed, then the ROC curve and the confusion matrix are plotted.

    Args:
        trainX: Feature data handed to ``k_fold_split``.
        trainY: Label data handed to ``k_fold_split``.
        threshold: Model outputs strictly above this value are predicted as 1.
        classnames: Class labels forwarded to ``plot_confusion_matrix``.
    """
    print(
        "======================Classification - K-Fold Iterations (K=5)=========================\n"
    )
    k = 5
    for i in range(k):
        print("=============Iteration Number " + str(i + 1) +
              "================")
        k_trainX, k_trainY, k_validationX, k_validationY = k_fold_split(
            trainX, trainY, k, i + 1)
        # Train and evaluate on this fold's split. Previously the split was
        # computed but ignored: the model was fitted on the full training set
        # and scored on a module-level validation set, so every iteration
        # produced the same numbers.
        model = LogisticRegression(k_trainX, k_trainY, np.array([0, 0, 0, 0]),
                                   0.05, 1e-5, 1e-12, 'Iris Data')
        model.train()
        # np.asarray accepts pandas objects as well as plain arrays.
        # NOTE(review): assumes k_fold_split returns the same kind of objects
        # it was given -- confirm against its implementation.
        fold_truth = np.asarray(k_validationY)
        prediction = np.where(
            model.predict(k_validationX).values > threshold, 1, 0)
        confusion_matrix = metrics.confusion_matrix(fold_truth, prediction)
        tn, fp, fn, tp = confusion_matrix.ravel()
        print('Precision: ', metrics.precision_score(fold_truth, prediction))
        print('Recall: ', metrics.recall_score(fold_truth, prediction))
        print('FPR: ', fp / (fp + tn))
        # NOTE(review): the curve is built from thresholded 0/1 predictions,
        # so it has a single operating point; passing the raw scores would
        # give the full curve.
        fpr, tpr, _ = metrics.roc_curve(fold_truth, prediction)
        roc_auc = metrics.auc(fpr, tpr)
        display = metrics.RocCurveDisplay(fpr=fpr,
                                          tpr=tpr,
                                          roc_auc=roc_auc,
                                          estimator_name='Iris Data')
        display.plot()
        plt.show()
        # Use the parameter; the old code referenced `class_names`, which is
        # not defined in this function.
        plot_confusion_matrix(confusion_matrix, classnames, threshold)
def _plot_roc_fold(fprs, tprs, i, ax, alpha=.5):
    """Draw the ROC curve of fold ``i`` (0-based) on ``ax``.

    ``fprs[i]`` and ``tprs[i]`` hold the curve of that fold. The legend entry
    uses the 1-based fold number and ``alpha`` is forwarded to the plot call.
    """
    fold_fpr = fprs[i]
    fold_tpr = tprs[i]
    fold_auc = metrics.auc(fold_fpr, fold_tpr)
    fold_display = metrics.RocCurveDisplay(
        fpr=fold_fpr,
        tpr=fold_tpr,
        roc_auc=fold_auc,
        estimator_name=f'Fold {i+1}',
    )
    fold_display.plot(ax=ax, alpha=alpha)
def add_entropy_roc(run, plots_dir):
    """Compute the entropy-based ROC AUC of a run and optionally save the plot.

    In-distribution entropies are labelled 0 and out-of-distribution
    entropies 1. The AUC is written to ``run.summary['valid/entropy_auc']``
    only when that key is missing. When ``plots_dir`` is truthy, the ROC
    curve is saved as a PDF under ``<plots_dir>/entropy_roc``.
    """
    # Using the undocumented summary_metrics, because summary doesn't contain
    # histograms.
    hist_id = run.summary_metrics['valid/entropy']
    hist_ood = run.summary_metrics['valid/entropy_ood']

    scores_id = histogram_to_preds(hist_id)
    scores_ood = histogram_to_preds(hist_ood)
    labels_id = np.zeros_like(scores_id)
    labels_ood = np.ones_like(scores_ood)

    scores = np.concatenate([scores_id, scores_ood])
    labels = np.concatenate([labels_id, labels_ood])

    fpr, tpr, _ = metrics.roc_curve(labels, scores)
    roc_auc = metrics.auc(fpr, tpr)

    already_logged = 'valid/entropy_auc' in run.summary
    if not already_logged:
        tqdm.write(f" + ROC AUC: {roc_auc}")
        run.summary['valid/entropy_auc'] = roc_auc

    if not plots_dir:
        return

    fig, ax = plt.subplots()
    curve = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc)
    curve.plot(ax, name=run.name)

    # FIXME: unfortunately adding plots or images in retrospect is not supported
    # run.summary['valid/entropy_roc'] = wandb.Image(fig)

    # Save the ROC plot locally
    roc_dir = os.path.join(plots_dir, 'entropy_roc')
    os.makedirs(roc_dir, exist_ok=True)
    target_file = os.path.join(roc_dir, f'{run.id}_{run.name}.pdf')
    fig.savefig(target_file)
    plt.close(fig)
def GAUSSIAN_algorithm(X_train, X_test, y_train, y_test: pd.DataFrame):
    """Fit a Gaussian naive Bayes model, print test metrics and show its ROC curve.

    Args:
        X_train: Training features passed to ``GaussianNB.fit``.
        X_test: Test features used for the predictions and probabilities.
        y_train: Training labels; must expose ``.values`` because it is
            flattened with ``.values.ravel()`` before fitting.
        y_test: True test labels the predictions are compared against.

    Returns:
        None. Metrics are printed and the ROC figure is shown with
        ``plt.show()``.
    """
    from sklearn.naive_bayes import GaussianNB

    gnb = GaussianNB()
    gnb.fit(X_train, y_train.values.ravel())
    gnb_predctions = gnb.predict(X_test)

    print('confiusion matrix from gaussianNB :\n {} \n '.format(
        m.confusion_matrix(y_test, gnb_predctions)))
    print('Classification report GaussianNB \n : {} \n'.format(
        m.classification_report(y_test, gnb_predctions)))
    print("GaussianNB accuracy :", m.accuracy_score(y_test, gnb_predctions))
    print("GaussianNB precision : ",
          m.precision_score(y_test, gnb_predctions))
    print("GaussianNB recall: ", m.recall_score(y_test, gnb_predctions))
    print("GaussianNB f1 score: ",
          m.f1_score(y_test, gnb_predctions,
                     labels=np.unique(gnb_predctions)))

    # Score the test set once and reuse the result for both the curve and the
    # AUC instead of running predict_proba twice on the same data.
    # Column 1 is taken as the score of the positive class.
    positive_scores = gnb.predict_proba(X_test)[:, 1]
    fpr_gnb, tpr_gnb, _ = m.roc_curve(y_test, positive_scores)
    gnb_roc_acc = m.roc_auc_score(y_test, positive_scores)
    print("roc curve accuracy : ", gnb_roc_acc)
    bayes_fig = m.RocCurveDisplay(fpr=fpr_gnb,
                                  tpr=tpr_gnb,
                                  estimator_name="BAYES",
                                  roc_auc=gnb_roc_acc)
    bayes_fig.plot()
    plt.show()
def KNN_algorithm(X_train, X_test, y_train, y_test: pd.DataFrame):
    """Fit a 5-nearest-neighbours classifier, print test metrics and show its ROC curve.

    Args:
        X_train: Training features passed to ``KNeighborsClassifier.fit``.
        X_test: Test features used for the predictions and probabilities.
        y_train: Training labels; must expose ``.values`` because it is
            flattened with ``.values.ravel()`` before fitting.
        y_test: True test labels the predictions are compared against.

    Returns:
        None. Metrics are printed and the ROC figure is shown with
        ``plt.show()``.
    """
    KNN = KNeighborsClassifier(n_neighbors=5, weights='distance', leaf_size=20)
    KNN.fit(X_train, y_train.values.ravel())
    predictions_from_knn = KNN.predict(X_test)

    print('confiusion matrix from knn with k=5 :\n {} \n '.format(
        m.confusion_matrix(y_test, predictions_from_knn)))
    print('Classification report from knn with k=5 \n : {} \n'.format(
        m.classification_report(y_test, predictions_from_knn)))
    print("knn k= accuracy :", m.accuracy_score(y_test, predictions_from_knn))
    print("precision : ", m.precision_score(y_test, predictions_from_knn))
    print("recall: ", m.recall_score(y_test, predictions_from_knn))
    print(
        "f1 score: ",
        m.f1_score(y_test,
                   predictions_from_knn,
                   labels=np.unique(predictions_from_knn)))

    ########## ROC CURVE FOR KNN ################
    # Score the test set once and reuse the result for both the AUC and the
    # curve instead of running predict_proba twice on the same data.
    # Column 1 is taken as the score of the positive class.
    positive_scores = KNN.predict_proba(X_test)[:, 1]
    roc_acc_knn_re = m.roc_auc_score(y_test, positive_scores)
    print("roc curve accuracy : ", roc_acc_knn_re)
    fpr, tpr, _ = m.roc_curve(y_test, positive_scores)
    figure = m.RocCurveDisplay(fpr=fpr,
                               tpr=tpr,
                               roc_auc=roc_acc_knn_re,
                               estimator_name='KNN')
    figure.plot()
    plt.title("roc curve kNN")
    plt.show()
def plot_ROC(fpr, tpr):
    """Show the ROC curve given by ``fpr``/``tpr``, labelled 'TransposonFinder'."""
    area = metrics.auc(fpr, tpr)
    metrics.RocCurveDisplay(
        fpr=fpr,
        tpr=tpr,
        roc_auc=area,
        estimator_name='TransposonFinder',
    ).plot()
    plt.show()
def eval_from_save(output_folder):
    """Evaluate saved novelty-detection scores with ROC and precision-recall curves.

    Loads ``test_novel_true.pt`` and ``test_novel_score.pt`` from
    ``output_folder``, computes sample-weighted ROC and precision-recall
    curves, saves them as ``roc.png`` and ``prc.png`` in the same folder and
    prints several operating-point statistics.

    Args:
        output_folder: Directory containing the two saved tensors; the plots
            are written there as well.

    Returns:
        Tuple ``(fpr, tpr, auroc, precision, recall, av_p)``.
    """
    folder_path = Path(output_folder)
    novel_true = torch.load(folder_path / "test_novel_true.pt")
    novel_score = torch.load(folder_path / "test_novel_score.pt")

    # upsample normal data so it accounts for 3/4 of the weight, roughly the split of an episode
    # should affect PRC but not ROC
    norm_count = torch.sum(novel_true == 0)
    novel_count = torch.sum(novel_true == 1)
    weight = torch.ones_like(novel_score)
    # Each normal sample gets weight 3 * n_novel / n_normal; novel samples keep weight 1.
    weight[novel_true == 0] = 3 * novel_count / norm_count

    # ROC with 1 as novel target
    fpr, tpr, roc_threshs = metrics.roc_curve(novel_true,
                                              novel_score,
                                              sample_weight=weight)
    auroc = metrics.roc_auc_score(novel_true,
                                  novel_score,
                                  sample_weight=weight)
    metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=auroc).plot()
    plt.savefig(folder_path / "roc.png")
    plt.close()
    print(f"AUROC: {auroc}")

    # TNR at TPR 95%
    # First ROC point whose TPR reaches 0.95; argwhere(...)[0] is a length-1
    # index array, hence the extra [0] when values are read back.
    tpr_95_ind = np.argwhere(tpr >= .95)[0]
    # NOTE(review): the value printed before "%" is the raw rate, not a
    # percentage -- confirm the intended output format.
    print(f"TNR @ TPR {tpr[tpr_95_ind][0]}%: {1 - fpr[tpr_95_ind][0]}")

    # PRC with 1 as novel target
    precision, recall, prc_threshs = metrics.precision_recall_curve(
        novel_true, novel_score, sample_weight=weight)
    prc_threshs = np.hstack([prc_threshs, prc_threshs[-1] + 1e-4
                             ])  # extra thresh to match lens
    av_p = metrics.average_precision_score(novel_true,
                                           novel_score,
                                           sample_weight=weight)
    metrics.PrecisionRecallDisplay(precision=precision,
                                   recall=recall,
                                   average_precision=av_p).plot()
    plt.savefig(folder_path / "prc.png")
    plt.close()
    print(f"Average Precision: {av_p}")

    # recall at precision 80%
    # First PR point whose precision reaches 0.8.
    precision_80_ind = np.argwhere(precision >= .8)[0]
    print(f"Recall(TPR) @ Precision {precision[precision_80_ind][0]}%: " +
          f"{recall[precision_80_ind][0]}")

    # precision at TPR 95%
    # Map the ROC operating point onto the PR curve through its threshold:
    # first PR threshold that is >= the ROC threshold at TPR 95%.
    prc_tpr_95_ind = np.argwhere(prc_threshs >= roc_threshs[tpr_95_ind])[0]
    print(
        f"Precision @ TPR {tpr[tpr_95_ind][0]}%: {precision[prc_tpr_95_ind][0]}"
    )

    # TNR at precision 80%
    # Reverse mapping: last ROC threshold that is still >= the PR threshold
    # at precision 80%.
    roc_precision_80_ind = np.argwhere(
        roc_threshs >= prc_threshs[precision_80_ind])[-1]
    print(f"TNR @ Precision {precision[precision_80_ind][0]}%: " +
          f"{1 - fpr[roc_precision_80_ind][0]}")
    print(f"TPR @ Precision {precision[precision_80_ind][0]}%: " +
          f"{tpr[roc_precision_80_ind][0]}")
    return fpr, tpr, auroc, precision, recall, av_p
def plot_roc_curve(validations, predictions):
    """Show one ROC curve per column of ``predictions``.

    Column ``i`` of ``predictions`` is scored against column ``i`` of
    ``validations``; each curve is drawn in its own figure and shown.
    """
    n_outputs = predictions.shape[1]
    for col in range(n_outputs):
        truth_col = validations[:, col]
        score_col = predictions[:, col]
        FP_rates, TP_rates, _ = roc_curve(truth_col, score_col)
        area = metrics.auc(FP_rates, TP_rates)
        metrics.RocCurveDisplay(
            fpr=FP_rates,
            tpr=TP_rates,
            roc_auc=area,
            estimator_name='example estimator',
        ).plot()
        plt.show()
def plot_roc(t_pos, f_pos, t_neg, f_neg):
    """
    Build the ROC figure from previously determined true/false positive and
    negative counts and return the matplotlib figure.
    """
    fpr = ratio(f_pos, t_neg)
    tpr = ratio(t_pos, f_neg)
    curve = metrics.RocCurveDisplay(
        fpr=fpr,
        tpr=tpr,
        roc_auc=metrics.auc(fpr, tpr),
    )
    # plot() hands back the display, which then exposes the drawn figure.
    plotted = curve.plot()
    return plotted.figure_
def plot_aurocs(runs):
    """Overlay the ROC curves of several runs on one axes and show the figure.

    ``runs`` maps a legend label to a run name. For each run the file
    ``../model_checkpoints/<run name>/meme_dev_seen_preds.csv`` is read; its
    ``gt`` column holds the labels and its ``proba`` column the scores.
    """
    _, shared_ax = plt.subplots()
    for label, run_name in runs.items():
        preds_path = f"../model_checkpoints/{run_name}/meme_dev_seen_preds.csv"
        dev_preds = pd.read_csv(preds_path)
        fpr, tpr, _ = metrics.roc_curve(dev_preds['gt'], dev_preds['proba'])
        curve = metrics.RocCurveDisplay(
            fpr=fpr,
            tpr=tpr,
            roc_auc=metrics.auc(fpr, tpr),
            estimator_name=label,
        )
        curve.plot(shared_ax)
    plt.show()
def AUROC_cruve(trained_NN, inputs, outputs, Fig = False):
    '''
    Score every input with the trained neural network and evaluate the
    scores against ``outputs``.

    Returns the AUROC score when ``Fig`` equals False, otherwise the
    (not yet plotted) ROC display object.
    '''
    # trained_NN.test() returns a nested result; element [0][0] is the score.
    scores = [trained_NN.test(inputs[idx])[0][0] for idx in range(len(inputs))]

    # curve coordinates for the display object
    false_pos, true_pos, _ = skl_metrics.roc_curve(outputs, scores, pos_label=1)
    curve_display = skl_metrics.RocCurveDisplay(fpr=false_pos, tpr=true_pos)

    # hand back either the score or the plot object
    if Fig == False:
        return skl_metrics.roc_auc_score(outputs, scores)
    return curve_display
def iris_data_metrics(trainX, trainY, threshold, classnames):
    """Train the logistic regression on the full training set and report metrics.

    Precision, recall and false positive rate on the validation data are
    printed, then the ROC curve and the confusion matrix are plotted.

    Args:
        trainX: Training features handed to ``LogisticRegression``.
        trainY: Training labels handed to ``LogisticRegression``.
        threshold: Model outputs strictly above this value are predicted as 1.
        classnames: Class labels forwarded to ``plot_confusion_matrix``.

    Note:
        ``validationX`` and ``validationY`` are not parameters; they are read
        from the enclosing module.
    """
    print(
        "======================Classification - Full Training Set=========================\n"
    )
    model = LogisticRegression(trainX, trainY, np.array([0, 0, 0, 0]), 0.05,
                               1e-5, 1e-12, 'Iris Data')
    model.train()
    prediction = np.where(model.predict(validationX).values > threshold, 1, 0)
    confusion_matrix = metrics.confusion_matrix(validationY.values, prediction)
    tn, fp, fn, tp = confusion_matrix.ravel()
    print('Precision: ',
          metrics.precision_score(validationY.values, prediction))
    print('Recall: ', metrics.recall_score(validationY.values, prediction))
    print('FPR: ', fp / (fp + tn))
    # NOTE(review): the curve is built from thresholded 0/1 predictions, so it
    # has a single operating point; passing the raw scores would give the
    # full curve.
    fpr, tpr, _ = metrics.roc_curve(validationY.values, prediction)
    roc_auc = metrics.auc(fpr, tpr)
    display = metrics.RocCurveDisplay(fpr=fpr,
                                      tpr=tpr,
                                      roc_auc=roc_auc,
                                      estimator_name='Iris Data')
    display.plot()
    plt.show()
    # Use the parameter; the old code referenced `class_names`, which is not
    # defined in this function, and ignored the argument the caller passed.
    plot_confusion_matrix(confusion_matrix, classnames, threshold)
def NN_algorithm(X_train, X_test, y_train, y_test: pd.DataFrame):
    """Fit a three-hidden-layer MLP, print test metrics and show its ROC curve.

    Args:
        X_train: Training features passed to ``MLPClassifier.fit``.
        X_test: Test features used for the predictions and probabilities.
        y_train: Training labels; must expose ``.values`` because it is
            flattened with ``.values.ravel()`` before fitting.
        y_test: True test labels the predictions are compared against.

    Returns:
        None. Metrics are printed and the ROC figure is shown with
        ``plt.show()``.
    """
    from sklearn.neural_network import MLPClassifier

    # 'lbfgs' is an optimizer in the family of quasi-Newton methods.
    # 'relu', the rectified linear unit function, returns f(x) = max(0, x).
    # hidden_layer_sizes: the ith element is the number of neurons in the
    # ith hidden layer.
    nn = MLPClassifier(hidden_layer_sizes=(30, 30, 30),
                       activation="relu",
                       solver='lbfgs',
                       alpha=1e-5,
                       random_state=1,
                       max_iter=1000)
    nn.fit(X_train, y_train.values.ravel())
    nn_predictions = nn.predict(X_test)

    print('confiusion matrix from NNs :\n {} \n '.format(
        m.confusion_matrix(y_test, nn_predictions)))
    print('NNs report \n : {} \n'.format(
        m.classification_report(y_test, nn_predictions)))
    print("NNs accuracy :", m.accuracy_score(y_test, nn_predictions))
    print("NNs precision : ", m.precision_score(y_test, nn_predictions))
    print("NNs recall: ", m.recall_score(y_test, nn_predictions))
    print("NNs f1 score: ",
          m.f1_score(y_test, nn_predictions,
                     labels=np.unique(nn_predictions)))

    ##################### ROC CURVE ACCURACY #############################################
    # Score the test set once and reuse the result for both the curve and the
    # AUC instead of running predict_proba twice on the same data.
    # Column 1 is taken as the score of the positive class.
    positive_scores = nn.predict_proba(X_test)[:, 1]
    fpr_nn, tpr_nn, _ = m.roc_curve(y_test, positive_scores)
    nn_roc_acc = m.roc_auc_score(y_test, positive_scores)
    print("roc curve accuracy : ", nn_roc_acc)
    nn_fig = m.RocCurveDisplay(fpr=fpr_nn,
                               tpr=tpr_nn,
                               estimator_name="NNs",
                               roc_auc=nn_roc_acc)
    nn_fig.plot()
    plt.show()
def construct_eval_model(xtrn,
                         ytrn,
                         xtest,
                         ytest,
                         max_depth,
                         option=3,
                         attribute_value_pairs=None,
                         bag_size=1,
                         type=None):
    """
    Create the requested model, train and test it, and print the results.

    Args:
        xtrn, ytrn: Training features and labels.
        xtest, ytest: Test features and labels.
        max_depth: Depth limit of the base decision trees (options 0-3).
        option: Which model to build:
            0 - this project's ``bagging``; 1 - this project's ``boosting``;
            2 - scikit-learn ``BaggingClassifier``;
            3 - scikit-learn ``AdaBoostClassifier``;
            5 - a single ``id3`` tree whose depth limit is ``bag_size``.
            Any other value only prints the two separator lines.
        attribute_value_pairs: Forwarded to ``bagging``/``boosting``/``id3``.
        bag_size: Number of bags/learners (options 0-3) or tree depth (5).
        type: Label used as ``estimator_name`` of the ROC display
            (options 0 and 1 only).

    Returns:
        For options 0 and 1, an un-plotted ``metrics.RocCurveDisplay``;
        for every other option, ``None``.
    """
    print('-' * 30)

    # use our bagging or boosting function
    if option == 0 or option == 1:
        # create ensemble model
        if option == 0:
            start = time.process_time()
            model = bagging(xtrn, ytrn, max_depth, attribute_value_pairs,
                            bag_size)
            end = time.process_time() - start
        else:
            start = time.process_time()
            model = boosting(xtrn, ytrn, max_depth, bag_size,
                             attribute_value_pairs)
            end = time.process_time() - start

        # Compute the test error and display the confusion matrix
        y_pred = [predict_example(x, model, probMode=True) for x in xtest]
        modelName = 'Bagging' if option == 0 else 'AdaBoost'
        # probMode is hard-coded, so the ROC branch below always runs.
        probMode = True
        if probMode:
            fpr, tpr, thresholds = metrics.roc_curve(list(ytest), y_pred)
            roc_auc = metrics.auc(fpr, tpr)
            display = metrics.RocCurveDisplay(fpr=fpr,
                                              tpr=tpr,
                                              roc_auc=roc_auc,
                                              estimator_name=type)
            plot = display
        numberOf = ': Number of bags =' if option == 0 else ": Number of learners ="
        print(modelName, numberOf, bag_size, ", Max Depth =", max_depth)
        tst_err = compute_error(list(ytest), y_pred, probMode=True)
        print('Test Error = {0:4.2f}%.'.format(tst_err * 100))
        # Early return: the closing separator line at the end of the function
        # is not printed for options 0 and 1.
        return plot
        #print('CPU Runtime: {0}'.format(end))

    # use scikit learners
    if option == 2 or option == 3:
        if option == 2:
            # bagging classifier
            start = time.process_time()
            model = BaggingClassifier(
                base_estimator=DecisionTreeClassifier(max_depth=max_depth),
                n_estimators=bag_size,
                random_state=0).fit(xtrn, ytrn)
            end = time.process_time() - start
        else:
            # boosting classifier
            start = time.process_time()
            model = AdaBoostClassifier(
                base_estimator=DecisionTreeClassifier(max_depth=max_depth),
                n_estimators=bag_size,
                random_state=0).fit(xtrn, ytrn)
            end = time.process_time() - start

        # Compute the test error
        y_pred = model.predict(xtest)
        modelName = 'Scikit-Learn Bagging' if option == 2 else 'SciKit-Learn AdaBoostClassifier'
        numberOf = ': Number of bags =' if option == 2 else ": Number of learners ="
        print(modelName, numberOf, bag_size, ", Max Depth =", max_depth)
        tst_err = compute_error(list(ytest), y_pred)
        print('Test Error = {0:4.2f}%.'.format(tst_err * 100))
        #print('CPU Runtime: {0}'.format(end))

    if option == 5:
        # Single decision tree; note that bag_size is used as the depth limit.
        tree = id3(np.transpose(xtrn),
                   ytrn,
                   attribute_value_pairs=attribute_value_pairs,
                   max_depth=bag_size)
        # Wrap the tree as a one-member ensemble so predict_example accepts it.
        model = [[1, tree]]
        y_pred = [predict_example(x, model) for x in xtest]
        modelName = 'Decision Tree Classifier, '
        numberOf = 'max depth of the tree:'
        print(modelName, numberOf, bag_size)
        tst_err = compute_error(list(ytest), y_pred)
        print('Test Error = {0:4.2f}%.'.format(tst_err * 100))
        print("-+-" * 5)
        print(tree)
        print("-+-" * 5, '\n')

    print('-' * 30)
#from sklearn.metrics import roc_curve, roc_auc_score import matplotlib.pyplot as plt from sklearn import metrics pred = [.99, .98, .72, .70, .65, .51, .39, .24, .11, .01] y = [1, 1, 0, 1, 1, 0, 0, 1, 0, 0] """ score = roc_auc_score(y, pred) fpr, tpr, _ = roc_curve(y, pred, drop_intermediate=False) print(score, fpr, tpr) plt.plot(fpr, tpr, marker='.', label='Curve') plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.legend() plt.show() """ fpr, tpr, thresholds = metrics.roc_curve(y, pred) roc_auc = metrics.auc(fpr, tpr) display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name='example estimator') display.plot() plt.show()
"""## 4. Metrics & Error Measures 4.1 Importing matplotlib for curve visualization """ from sklearn import metrics """4.2 Performing predcition on trainset by trained model""" y_train_pred = modelKM.predict(X_train) """4.3 Visualizing ROC Curve of trained model on trainset""" fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_pred) roc_auc = metrics.auc(fpr, tpr) display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name='demo estimator') display.plot() plt.show() """4.4 Calculating the accuracy on trainset""" print("Accuracy on trainset: ",metrics.accuracy_score(y_train, y_train_pred)) """4.5 Viewing confusion matirx on trainset""" print (metrics.confusion_matrix(y_train, y_train_pred)) """4.6 Viewing homogeneity score on trainset""" from sklearn.metrics.cluster import homogeneity_score print('Homogeneity score: ', homogeneity_score(y_train, y_train_pred))
def train_eval(config, exp_path):
    """Train and evaluate one model per marker and write all reports to ``exp_path``.

    Steps, in order:
      1. Build a ``MarkerExpressionDataset`` from ``config`` and, when
         configured, write ``dirty_data.txt`` (outlier samples) and
         ``feature_selection_and_transformation.txt``.
      2. For every marker: optionally grid-search model parameters, then fit
         a fresh clone of the model on each split returned by
         ``dataset.get_split_data`` and collect the scores of all folds.
      3. Compute train/test metrics with ``eval_results``, print them to the
         console and to ``metrics.txt``, and fill one column of a 9-row figure.
      4. Print the averages over all markers, save ``best_params.yaml`` and
         the figure ``conf_mat.png``.

    Args:
        config: Experiment configuration. Keys read directly in this function
            are 'threshold' (default 'roc_optimal'), 'model_kwargs_search'
            (optional) and 'model_kwargs'.
        exp_path: Output directory for all files written here.
    """
    dataset = MarkerExpressionDataset(config)

    # Report the samples flagged by data cleaning, grouped by marker and class.
    if dataset.data_clean is not None:
        with open(os.path.join(exp_path, 'dirty_data.txt'), 'w') as f:
            f.write('---data clean method: %s---\n' % dataset.data_clean)
            for marker, item in dataset.outlier_samples.items():
                f.write('marker %s:\n' % marker)
                for class_id in dataset.classes:
                    f.write('class %s:\n' % class_id)
                    for sample_id in item.keys():
                        if item[sample_id]['class'] == class_id:
                            f.write('\t%s\n' % sample_id)

    # Report the feature selection / transformation setup and its per-marker results.
    if dataset.feature_selection is not None or dataset.feature_transformation is not None:
        with open(
                os.path.join(exp_path,
                             'feature_selection_and_transformation.txt'),
                'w') as f:
            if dataset.feature_selection is not None:
                f.write('---feature selection method: %s---\n' %
                        dataset.feature_selection['method'])
                if 'kwargs' in dataset.feature_selection:
                    f.write('---feature selection kwargs: %s---\n' %
                            str(dataset.feature_selection['kwargs']))
            if dataset.feature_transformation is not None:
                f.write('---feature transformation method: %s---\n' %
                        dataset.feature_transformation['method'])
                if 'kwargs' in dataset.feature_transformation:
                    f.write('---feature transformation kwargs: %s---\n' %
                            str(dataset.feature_transformation['kwargs']))
            for marker in dataset.markers:
                f.write('marker %s:\n' % marker)
                if dataset.fs_metric_params is not None:
                    f.write(
                        '---feature selection and transformation kwargs: %s---\n'
                        % str(dataset.fs_metric_params[marker]))
                if dataset.feature_selection is not None:
                    features = dataset.features
                    feature_index = 0
                    f.write('---selected features---\n')
                    if dataset.feature_selection['method'] == 'custom':
                        support_flags = dataset.feature_selection['selection'][
                            marker]
                    else:
                        support_flags = dataset.feature_selector[
                            marker].get_support()
                    for flag in support_flags:
                        f.write('%s:\t%s\n' % (features[feature_index], flag))
                        # Wrap around: there may be more flags than feature names.
                        feature_index = (feature_index + 1) % len(features)
                if dataset.feature_transformation is not None:
                    components = dataset.feature_transformer[
                        marker].components_
                    f.write('---feature transformation components---:\n%s' %
                            components.tolist())
                    # if 'feature_mean' in config:
                    #     feature_mean = config['feature_mean']
                    #     coefficients = np.abs(feature_mean*components.sum(axis=0)).\
                    #         reshape([len(dataset.features), -1]).sum(axis=0)
                    # else:
                    #     coefficients = np.abs(components.sum(axis=0)).reshape([len(dataset.features), -1]).sum(axis=0)
                    # coefficients = coefficients / coefficients.sum()
                    #
                    # f.write('---feature transformation coefficients---:\n%s' % coefficients.tolist())

    threshold = config.get('threshold', 'roc_optimal')
    metrics_names = ['sensitivity', 'specificity', 'roc_auc_score']
    metrics_avg_names = ['roc_auc_score_avg', 'roc_auc_score_avg_std']

    # One figure column per marker, nine rows of plots/tables per column.
    fig, ax = plt.subplots(9,
                           len(dataset.markers),
                           squeeze=False,
                           figsize=(6 * len(dataset.markers), 40))
    # NOTE(review): opened without a context manager; the handle is only
    # closed at the end of the function, so an exception in between leaves it open.
    metrics_file = open(os.path.join(exp_path, 'metrics.txt'), 'w')
    metrics_fig_filename = os.path.join(exp_path, 'conf_mat.png')
    best_params = dict()
    all_marker_train_metrics = []
    all_marker_test_metrics = []

    for i, marker in enumerate(dataset.markers):
        model = get_model(config)
        if 'model_kwargs_search' in config:
            # parameter search
            print('parameter search for marker %s...' % marker)
            all_x, all_y, cv_index = dataset.get_all_data(marker)
            best_model = GridSearchCV(model,
                                      param_grid=config['model_kwargs_search'],
                                      cv=cv_index,
                                      scoring='roc_auc_ovr')
            best_model.fit(all_x, all_y)
            best_params[marker] = best_model.best_params_
            print('search done')
        else:
            # NOTE(review): best_model is this original model object; the fold
            # loop below only fits clones, so it looks like best_model is
            # never fitted in this branch -- confirm before it is used for
            # the contour plot.
            best_model = model
            best_params[marker] = config['model_kwargs']

        # run train and test
        train_xs = []
        train_ys = []
        train_ys_score = []
        test_xs = []
        test_ys = []
        test_ys_score = []
        for fold_i, (train_x, train_y, test_x,
                     test_y) in enumerate(dataset.get_split_data(marker)):
            # Fresh clone per fold, configured with the selected parameters.
            model = base.clone(model)
            model.set_params(**best_params[marker])
            model.fit(train_x, train_y)
            # model.classes_ = dataset.classes
            # Accumulate samples and scores of all folds.
            # assumes the split data are lists, so += extends -- TODO confirm
            train_xs += train_x
            train_ys += train_y
            test_xs += test_x
            test_ys += test_y
            train_y_score = model.predict_proba(train_x).tolist()
            train_ys_score += train_y_score
            test_y_score = model.predict_proba(test_x).tolist()
            test_ys_score += test_y_score
            # model_filename = os.path.join(exp_path, 'model', '%s_%s_fold_%d.pkl'
            #                               % (config['model'], marker, fold_i))
            # maybe_create_path(os.path.dirname(model_filename))
            # with open(model_filename, 'wb') as f:
            #     pickle.dump(model, f)

        train_metrics = eval_results(train_ys,
                                     train_ys_score,
                                     labels=dataset.classes,
                                     average='macro',
                                     threshold=threshold,
                                     num_fold=dataset.num_fold)
        # The test set is evaluated with the thresholds used on the train set.
        test_metrics = eval_results(test_ys,
                                    test_ys_score,
                                    labels=dataset.classes,
                                    average='macro',
                                    threshold=train_metrics['used_threshold'],
                                    num_fold=dataset.num_fold)
        all_marker_train_metrics.append(train_metrics)
        all_marker_test_metrics.append(test_metrics)

        # print metrics to console and file
        double_print('marker: %s' % marker, metrics_file)
        double_print('metrics on training set:', metrics_file)
        for j, class_j in enumerate(dataset.classes):
            log_str = '[class: %s. threshold: %1.1f] ' % (
                class_j, 100 * train_metrics['used_threshold'][j])
            for metrics_name in metrics_names:
                log_str += '%s: %1.1f. ' % (metrics_name,
                                            train_metrics[metrics_name][j])
            double_print(log_str, metrics_file)
        for metrics_name in metrics_avg_names:
            double_print(
                '%s: %1.1f' % (metrics_name, train_metrics[metrics_name]),
                metrics_file)
        double_print('metrics on test set:', metrics_file)
        for j, class_j in enumerate(dataset.classes):
            log_str = '[class: %s. threshold: %1.1f] ' % (
                class_j, 100 * test_metrics['used_threshold'][j])
            for metrics_name in metrics_names:
                log_str += '%s: %1.1f. ' % (metrics_name,
                                            test_metrics[metrics_name][j])
            double_print(log_str, metrics_file)
        for metrics_name in metrics_avg_names:
            double_print(
                '%s: %1.1f' % (metrics_name, test_metrics[metrics_name]),
                metrics_file)

        # generate figure
        # Row 0: data-cleaning distribution.
        current_ax = ax[0, i]
        dataset.plot_data_clean_distribution(current_ax, marker)
        current_ax.set_title('data cleaning on marker %s' % marker)

        # Row 1: feature distribution, with contours only for 2-D features.
        current_ax = ax[1, i]
        contour_flag = len(train_xs[0]) == 2
        # dup_reduced = list(tuple(tuple([train_xs[j] + [train_ys[j]] for j in range(len(train_xs))])))
        # dup_reduced_train_xs = [item[:-1] for item in dup_reduced]
        # dup_reduced_train_ys = [item[-1] for item in dup_reduced]
        # dup_reduced_train_ys_str = [str(item) for item in dup_reduced_train_ys]
        # train_x/test_x/train_y/test_y are the values left over from the last
        # iteration of the fold loop above.
        dup_reduced_train_xs = train_x + test_x
        dup_reduced_train_ys = train_y + test_y
        dup_reduced_train_ys_str = [str(item) for item in dup_reduced_train_ys]
        classes_str = [str(item) for item in dataset.classes]
        plot_feature_distribution(
            dup_reduced_train_xs,
            ax=current_ax,
            t_sne=True,
            hue=dup_reduced_train_ys_str,
            hue_order=classes_str,
            style=dup_reduced_train_ys_str,
            style_order=classes_str,
            # x_lim='box', y_lim='box',
            x_lim='min_max_extend',
            y_lim='min_max_extend',
            contour=contour_flag,
            z_generator=best_model.predict)
        current_ax.set_title('%s trained on whole set' % marker)

        # Row 2: confusion matrix on the train data of all folds.
        current_ax = ax[2, i]
        metrics.ConfusionMatrixDisplay(
            train_metrics['conf_mat'],
            display_labels=dataset.classes).plot(ax=current_ax)
        current_ax.set_title('%s on train set of all folds' % marker)

        # Row 3: per-class ROC curves (train) with the chosen operating point.
        current_ax = ax[3, i]
        for j in range(len(dataset.classes)):
            roc_curve = train_metrics['roc_curve'][j]
            roc_auc_score = train_metrics['roc_auc_score'][j]
            class_id = dataset.classes[j]
            # Sensitivity/specificity are divided by 100 to get rates for the plot.
            sen = train_metrics['sensitivity'][j] / 100
            spe = train_metrics['specificity'][j] / 100
            metrics.RocCurveDisplay(fpr=roc_curve[0],
                                    tpr=roc_curve[1],
                                    roc_auc=roc_auc_score,
                                    estimator_name='class %s' %
                                    class_id).plot(ax=current_ax)
            current_ax.scatter(1 - spe, sen)

        # Row 4: table of train metrics plus the selected parameters.
        current_ax = ax[4, i]
        table_val_list = [
            dataset.classes,
            [100 * item for item in train_metrics['used_threshold']]
        ]
        row_labels = ['cls', 'thr']
        for metrics_name in metrics_names:
            table_val_list.append(train_metrics[metrics_name])
            # Row label is the first three characters of the metric name.
            row_labels.append(metrics_name[:min(3, len(metrics_name))])
        additional_text = []
        for metrics_name in metrics_avg_names:
            additional_text.append('%s: %1.1f' %
                                   (metrics_name, train_metrics[metrics_name]))
        additional_text.append(best_params[marker])
        plot_table(table_val_list,
                   row_labels,
                   ax=current_ax,
                   additional_text=additional_text)

        # Row 5: feature distribution of the last fold's test split, using the
        # model fitted in that last fold.
        current_ax = ax[5, i]
        contour_flag = len(train_xs[0]) == 2
        test_y_str = [str(item) for item in test_y]
        classes_str = [str(item) for item in dataset.classes]
        plot_feature_distribution(
            test_x,
            ax=current_ax,
            t_sne=True,
            hue=test_y_str,
            hue_order=classes_str,
            style=test_y_str,
            style_order=classes_str,
            # x_lim='box', y_lim='box',
            x_lim='min_max_extend',
            y_lim='min_max_extend',
            contour=contour_flag,
            z_generator=model.predict)
        current_ax.set_title('%s on test set of the last fold' % marker)

        # Row 6: confusion matrix on the test data of all folds.
        current_ax = ax[6, i]
        metrics.ConfusionMatrixDisplay(
            test_metrics['conf_mat'],
            display_labels=dataset.classes).plot(ax=current_ax)
        current_ax.set_title('%s on test set of all folds' % marker)

        # Row 7: per-class ROC curves (test) with the chosen operating point.
        current_ax = ax[7, i]
        for j in range(len(dataset.classes)):
            roc_curve = test_metrics['roc_curve'][j]
            roc_auc_score = test_metrics['roc_auc_score'][j]
            class_id = dataset.classes[j]
            sen = test_metrics['sensitivity'][j] / 100
            spe = test_metrics['specificity'][j] / 100
            metrics.RocCurveDisplay(fpr=roc_curve[0],
                                    tpr=roc_curve[1],
                                    roc_auc=roc_auc_score,
                                    estimator_name='class %s' %
                                    class_id).plot(ax=current_ax)
            current_ax.scatter(1 - spe, sen)

        # Row 8: table of test metrics.
        current_ax = ax[8, i]
        table_val_list = [
            dataset.classes,
            [100 * item for item in test_metrics['used_threshold']]
        ]
        row_labels = ['cls', 'thr']
        for metrics_name in metrics_names:
            table_val_list.append(test_metrics[metrics_name])
            row_labels.append(metrics_name[:min(3, len(metrics_name))])
        additional_text = []
        for metrics_name in metrics_avg_names:
            additional_text.append('%s: %1.1f' %
                                   (metrics_name, test_metrics[metrics_name]))
        plot_table(table_val_list,
                   row_labels,
                   ax=current_ax,
                   additional_text=additional_text)

    # Averages of the per-marker summary metrics.
    for metrics_name in metrics_avg_names:
        all_marker_values = [
            item[metrics_name] for item in all_marker_train_metrics
        ]
        double_print(
            'overall train %s: %1.1f' %
            (metrics_name, sum(all_marker_values) / len(all_marker_values)),
            metrics_file)
    for metrics_name in metrics_avg_names:
        all_marker_values = [
            item[metrics_name] for item in all_marker_test_metrics
        ]
        double_print(
            'overall test %s: %1.1f' %
            (metrics_name, sum(all_marker_values) / len(all_marker_values)),
            metrics_file)
    metrics_file.close()
    save_yaml(os.path.join(exp_path, 'best_params.yaml'), best_params)
    fig.savefig(metrics_fig_filename, bbox_inches='tight', pad_inches=1)
roc_auc = metrics.auc(fpr, tpr) treeDict[str(depth) + 'Entropy'] = { 'fpr': fpr, 'tpr': tpr, 'auc': roc_auc } print("") ax = plt.gca() for key in treeDict: info = treeDict[key] fpr = info['fpr'] tpr = info['tpr'] roc_auc = info['auc'] display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name=key + fileKey) display.plot(ax=ax) plt.savefig('../6_output/Scikit/rocCurves/' + fileKey + '_SK_DTs') plt.close() # Create Neural Networks max_epoch = 500 step_sizes = [0.001, 0.01, 0.1, 1] print("************************************") print("Scikit's Neural Networks:") NNDict = {} for step in step_sizes:
# Collect one score per test sample from both models, then show a ROC curve
# for each. `conv_preds` and `lr_preds` must already exist as lists.
targets = []
with torch.no_grad():
    for image, label in test_dataset:
        # In-place: insert a new leading dimension of size 1 before the
        # tensor is passed to the models.
        image.unsqueeze_(0)
        conv_pred = conv_net(image)
        lr_pred = lr_model(image)
        # Largest softmax value along dim 1.
        # NOTE(review): this is the highest class probability, not the
        # probability of a fixed positive class -- confirm that is the
        # intended ROC score.
        conv_pred = torch.max(torch.softmax(conv_pred, dim=1),
                              dim=1)[0].squeeze()
        lr_pred = torch.sigmoid(lr_pred).squeeze()
        conv_preds.append(conv_pred.item())
        lr_preds.append(lr_pred.item())
        targets.append(label)

# ROC curve of the convolutional network.
fpr, tpr, thresholds = metrics.roc_curve(targets, conv_preds)
roc_auc = metrics.auc(fpr, tpr)
display = metrics.RocCurveDisplay(fpr=fpr,
                                  tpr=tpr,
                                  roc_auc=roc_auc,
                                  estimator_name='ConvNet')
display.plot()
plt.show()

# ROC curve of the logistic-regression model.
fpr, tpr, thresholds = metrics.roc_curve(targets, lr_preds)
roc_auc = metrics.auc(fpr, tpr)
display = metrics.RocCurveDisplay(fpr=fpr,
                                  tpr=tpr,
                                  roc_auc=roc_auc,
                                  estimator_name='LR')
display.plot()
plt.show()
# Report the binary results computed before this excerpt.
print('Binary-accuracy:\n{}'.format(binary_accuracy))
print('Balanced Binary-accuracy:\n{}'.format(binary_balanced_accuracy))
print('Binary-confusion matrix:\n{}'.format(binary_confusion_matrix))
print('Precision:\n{}'.format(precision))
print('Recall:\n{}'.format(recall))
print('F1 score:\n{}'.format(f1_score))

# calculate the accuracy, balanced accuracy score and confusion matrix of the 5-class classification
multi_accuracy = metrics.accuracy_score(multi_true, multi_pred)
multi_balanced_accuracy = metrics.balanced_accuracy_score(
    multi_true, multi_pred)
multi_confusion_matrix = metrics.confusion_matrix(multi_true, multi_pred)

# output the result of the 5-class classification
print('Multi-accuracy:\n{}'.format(multi_accuracy))
print('Balanced Multi-accuracy:\n{}'.format(multi_balanced_accuracy))
print('Multi-confusion matrix:\n{}'.format(multi_confusion_matrix))
print('-------------------------------------')

# plot the ROC and PRC of the ensemble learning model
# Build a score in [0, 1] for the binary curves: the mean regression output
# shifted by 0.5, or otherwise the predicted class index, divided by 4 and clipped.
# NOTE(review): if both lists are empty and `pred` was not assigned before
# this excerpt, np.squeeze(pred) below raises NameError.
if regression_predictions_list != []:
    pred = np.clip((np.mean(regression_predictions_list, axis=0) + 0.5) / 4.0,
                   a_min=0.0,
                   a_max=1.0).astype(np.float64)
elif multi_predictions_list != []:
    pred = np.clip(multi_pred.astype(np.float64) / 4.0, a_min=0.0,
                   a_max=1.0).astype(np.float64)
pred = np.squeeze(pred)
true = binary_true
fpr, tpr, _ = metrics.roc_curve(true, pred, pos_label=1.0)
roc_display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr).plot()
precision_list, recall_list, _ = metrics.precision_recall_curve(
    true, pred, pos_label=1.0)
pr_display = metrics.PrecisionRecallDisplay(precision=precision_list,
                                            recall=recall_list).plot()
plt.show()