def compare_predictions(prediction_df, s=''):
    '''
    Compare the different models.
    Calculate the squared errors (y' - y)^2 between each prediction and the
    real value --> pred_errs.csv
    :param prediction_df: dataframe containing all the predictions and real probabilities.
    :param s: suffix for the output file names when running multiple dataframes.
    :return: writes bic<s>.csv (model comparison) and pred_errs<s>.csv (squared errors).
    '''
    # Columns loaded from csv may hold bracketed strings like '[0.42]';
    # strip the brackets and cast to float. Non-string columns are skipped.
    for c in prediction_df.columns:
        try:
            prediction_df[c] = prediction_df[c].str.replace(
                '[', '', regex=False).str.replace(']', '', regex=False).astype('float')
        except (AttributeError, ValueError):
            pass
    ### --> from here, combine the separate probability predictions into one.
    pa_columns = prediction_df.columns.str.contains('p_a_')
    pb_columns = prediction_df.columns.str.contains('p_b_')
    pac = prediction_df.columns[pa_columns]
    pbc = prediction_df.columns[pb_columns]
    nonrelevant_columns = prediction_df.columns[~pa_columns & ~pb_columns]
    df1 = pd.concat((prediction_df[nonrelevant_columns], prediction_df[pac]), axis=1)
    df2 = pd.concat((prediction_df[nonrelevant_columns], prediction_df[pbc]), axis=1)
    df2.rename(columns=dict(zip(pbc, pac)), inplace=True)
    df = pd.concat((df1, df2), axis=0)
    df.columns = df.columns.str.replace('p_a_', '')
    ### dataframe with the squared errors
    df_pred_errs = df.copy()
    real_prob = df['real']
    df_bic = pd.DataFrame()
    for i, pred in enumerate(list(pac.str.replace('p_a_', ''))):
        if pred == 'real':
            continue
        p = dof_per_mode[pred]
        cbic = bic.bic(real_prob, df[pred], p)
        crmse = np.sqrt(mean_squared_error(real_prob, df[pred]))
        df_bic.loc[i, 'bic'] = cbic
        df_bic.loc[i, 'rmse'] = crmse
        df_bic.loc[i, 'model'] = pred
        df_bic.loc[i, 'dof'] = p
        print('model = %s | dof = %d | bic = %.2f' % (pred, p, cbic))
        df_pred_errs['err_' + pred] = (df[pred] - real_prob) ** 2
    df_bic.to_csv('data/predictions/bic%s.csv' % s, index=False)
    df_pred_errs.to_csv('data/predictions/pred_errs%s.csv' % s, index=False)
def test_exp_result1(self):
    """ Test if bic() gives the correct result. """
    y = [1, 2, 3, 4]
    y_pred = [5, 6, 7, 8]
    p = 3
    ob = round(bic(y, y_pred, p), 3)
    exp = 15.249
    assert ob == exp, ('The BIC given y = [1,2,3,4], y_pred = [5,6,7,8], and p = 3 '
                       'should be 15.249 (applying the statistical formula in the main '
                       'README, rounded to 3 decimals)')
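# The expected value above can be checked by hand, assuming the residual-based
# BIC formula referenced in the main README, n * ln(RSS / n) + p * ln(n):
# every residual is 4, so RSS = 4 * 4**2 = 64, and
# 4 * ln(64 / 4) + 3 * ln(4) = 11.090 + 4.159 = 15.249.
import math
assert round(4 * math.log(64 / 4) + 3 * math.log(4), 3) == 15.249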
def compare_predictions(prediction_df):
    '''
    Compare the different models.
    :param prediction_df: dataframe containing all the predictions and real probabilities.
    :return: model_comparison.csv --> dataframe of comparison between the models.
    '''
    # Columns loaded from csv may hold bracketed strings like '[0.42]';
    # strip the brackets and cast to float. Non-string columns are skipped.
    for c in prediction_df.columns:
        try:
            prediction_df[c] = prediction_df[c].str.replace(
                '[', '', regex=False).str.replace(']', '', regex=False).astype('float')
        except (AttributeError, ValueError):
            pass
    ### --> from here, combine the separate probability predictions into one.
    pa_columns = prediction_df.columns.str.contains('p_a_')
    pb_columns = prediction_df.columns.str.contains('p_b_')
    pac = prediction_df.columns[pa_columns]
    pbc = prediction_df.columns[pb_columns]
    nonrelevant_columns = prediction_df.columns[~pa_columns & ~pb_columns]
    df1 = pd.concat((prediction_df[nonrelevant_columns], prediction_df[pac]), axis=1)
    df2 = pd.concat((prediction_df[nonrelevant_columns], prediction_df[pbc]), axis=1)
    df2.rename(columns=dict(zip(pbc, pac)), inplace=True)
    df = pd.concat((df1, df2), axis=0)
    df.columns = df.columns.str.replace('p_a_', '')
    real_prob = df['real']
    df_bic = pd.DataFrame()
    for i, pred in enumerate(list(pac.str.replace('p_a_', ''))):
        if pred == 'real':
            continue
        p = dof_per_mode[pred]
        cbic = bic.bic(real_prob, df[pred], p)
        df_bic.loc[i, 'bic'] = cbic
        df_bic.loc[i, 'model'] = pred
        df_bic.loc[i, 'dof'] = p
        print('model = %s | dof = %d | bic = %.2f' % (pred, p, cbic))
    df_bic.to_csv('data/predictions/model_comparison.csv', index=False)
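# A minimal usage sketch for either compare_predictions variant above. The
# column names follow the convention the function parses (p_a_<model> /
# p_b_<model> plus a 'real' column); the model name, probabilities, and dof
# value are made up, dof_per_mode is the module-level mapping the function
# reads, and data/predictions/ must exist for the output csv to be written.
dof_per_mode = {'logistic': 3}  # degrees of freedom per model (illustrative)
preds = pd.DataFrame({
    'real': [0.2, 0.8, 0.4],
    'p_a_logistic': ['[0.25]', '[0.70]', '[0.35]'],  # bracketed strings, as dumped from arrays
    'p_b_logistic': ['[0.30]', '[0.75]', '[0.45]'],
})
compare_predictions(preds)  # writes data/predictions/model_comparison.csv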
def test_p(self):
    """ Raise TypeError if p is not an integer. """
    with pytest.raises(TypeError):
        bic([1, 2, 3, 4], [5, 6, 7, 8], "c")
def test_len_grt1_2(self):
    """ Raise TypeError if y_pred is empty. """
    with pytest.raises(TypeError):
        bic([1, 2, 3, 4], [], 3)
def test_len_grt1(self):
    """ Raise TypeError if y is empty. """
    with pytest.raises(TypeError):
        bic([], [1, 2, 3, 4], 3)
def test_len_ypred_elements(self):
    """ Raise TypeError if elements of y_pred are not real numbers. """
    with pytest.raises(TypeError):
        bic([1, 2, 3, 5], [1, 2, complex(1, 2), 4], 3)
def test_type_y_dict(self):
    """ Raise TypeError if y is a dict rather than a vector. """
    with pytest.raises(TypeError):
        bic(dict(), [1, 2, 3, 4], 3)
def test_type_y_ypred4(self):
    """ Raise TypeError if y_pred is not a vector. """
    with pytest.raises(TypeError):
        bic([1, 2], complex(3, 4), 3.4)
def test_p3(self):
    """ Raise TypeError if p is not an integer. """
    with pytest.raises(TypeError):
        bic([1, 2, 3, 4], [5, 6, 7, 8], 2.86)
def test_type_y_ypred3(self):
    """ Raise TypeError if y or y_pred are matrices. """
    with pytest.raises(TypeError):
        bic([[1, 2, 3, 4], [5, 6, 6, 7]], [1, 2, 3, 4], 3)
def test_input(self):
    """ Raise TypeError if more arguments than required are passed. """
    with pytest.raises(TypeError):
        bic([1], [3], [2], 3, 2)
def test_p6(self):
    """ Raise TypeError if p is complex rather than an integer. """
    with pytest.raises(TypeError):
        bic([1, 2, 3, 4], [5, 6, 7, 8], complex(1, 2))
def test_type_y_ypred(self):
    """ Raise TypeError if y and y_pred are not vectors. """
    with pytest.raises(TypeError):
        bic("a", [[1, 2], [2, 2]], 3.3)
def test_p4(self):
    """ Raise TypeError if p is zero (p must be greater than 0). """
    with pytest.raises(TypeError):
        bic([1, 2, 3, 4], [5, 6, 7, 8], 0)
def test_p2(self):
    """ Raise TypeError if p is not greater than 0. """
    with pytest.raises(TypeError):
        bic([1, 2, 3, 4], [5, 6, 7, 8], -1)
def test_type_y_ypred2(self):
    """ Raise TypeError if y is a string rather than a vector. """
    with pytest.raises(TypeError):
        bic("c", [1, 2], 3.4)
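# A minimal sketch of a bic() implementation consistent with the tests above,
# assuming the residual-based formula n * ln(RSS / n) + p * ln(n) from the
# main README. This is illustrative, not the package's actual source.
import math
import numbers

def bic(y, y_pred, p):
    # y and y_pred must be flat, non-empty vectors of real numbers.
    for v in (y, y_pred):
        if not isinstance(v, (list, tuple)):
            raise TypeError('y and y_pred must be vectors (lists or tuples)')
        if len(v) == 0:
            raise TypeError('y and y_pred must be non-empty')
        if any(not isinstance(e, numbers.Real) for e in v):
            raise TypeError('elements of y and y_pred must be real numbers')
    # p must be a positive integer (the tests expect TypeError for 0 and -1 too).
    if not isinstance(p, int) or p <= 0:
        raise TypeError('p must be a positive integer')
    n = len(y)
    rss = sum((yt - yp) ** 2 for yt, yp in zip(y, y_pred))
    return n * math.log(rss / n) + p * math.log(n)

# Sanity check against the expected value in test_exp_result1:
assert round(bic([1, 2, 3, 4], [5, 6, 7, 8], 3), 3) == 15.249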
def run_classifier(path, model='logistic_regression', seven_features=False,
                   filterSpO2=True, load_model=False):
    # note: the kwarg is spelled 'fitlerSpO2' to match read_data's signature
    if model == 'multi_class':
        data, df = read_data(path, multi_class=True, seven_features=seven_features,
                             fitlerSpO2=filterSpO2)
    else:
        data, df = read_data(path, seven_features=seven_features, fitlerSpO2=filterSpO2)
    if filterSpO2:
        subSpO2Folder = 'FilteredSpO2/'
    else:
        subSpO2Folder = 'NoSpO2Filtering/'
    if seven_features:
        n_features = 7
    else:
        n_features = 3
    plot_data_file = ('Plots/data/precision_recall_' + str(n_features) + 'Features_' +
                      'filtered_' + str(filterSpO2) + '.csv')
    if not os.path.exists(plot_data_file):
        with open(plot_data_file, 'w', newline='') as file:
            csvwriter = csv.writer(file)
            csvwriter.writerow(['model', 'precisions', 'recalls', 'thresholds'])
    print("-----------------------------------------------")
    if model == 'multi_class':
        print(" MULTI CLASS CLASSIFIER")
    elif model == 'svc':
        print(" SUPPORT VECTOR CLASSIFIER")
    elif model == 'mlp':
        print(" MULTI-LAYER PERCEPTRON")
    else:
        print(" LOGISTIC REGRESSION")
    print("-----------------------------------------------")
    X = data[:, 0:-1]
    y = data[:, -1]
    n_split = 10
    kFold = model_selection.KFold(n_splits=n_split, shuffle=True, random_state=1)
    f1_scores = []
    bic_scores = []
    i = 0
    res = np.zeros((8, n_split))
    tprs = []
    prec_recall_curves = {'precision': [], 'recall': [], 'threshold': []}
    mean_fpr = np.linspace(0, 1, 100)
    for train_index, test_index in kFold.split(X, y):
        print('Fold:', i + 1)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # note: reset every fold, so the precision-recall curve written at the
        # end of the function reflects the last fold only
        y_preds_prob_total = []
        y_trues = []
        X_train_scale, X_test_scale = normalize(X_train, X_test,
                                                seven_features=seven_features)
        print('x train:', X_train_scale.shape)
        if model == 'svc':
            model_name = 'SVC'
            classifier = svm.SVC(gamma='auto', kernel='rbf', probability=True)
        elif model == 'sgd':
            model_name = 'SGD Classifier'
            classifier = linear_model.SGDClassifier(loss='log')
        elif model == 'ridge':
            model_name = 'Ridge Classifier'
            classifier = linear_model.RidgeClassifier()
        elif model == 'mlp':
            model_name = 'Neural Network'
            if seven_features:
                # current BEST: 0.7798 accuracy
                classifier = MLPClassifier((12, 8, 6, 4, 4), max_iter=200,
                                           activation='tanh', solver='adam',
                                           random_state=1, momentum=0.8)
            else:
                # current best
                classifier = MLPClassifier((6, 3), max_iter=200, activation='tanh',
                                           solver='adam', random_state=1, momentum=0.6)
        else:
            model_name = 'Logistic Regression'
            classifier = linear_model.LogisticRegression()
        final_classifier = base.clone(classifier)
        if load_model:
            # load a pretrained model
            model_path = ('saved_models/classifer/' + subSpO2Folder +
                          model_name.replace(' ', '_') + '_' + str(n_features) +
                          '_features.joblib')
            print('load file from:', model_path)
            classifier = load(model_path)
        else:
            # train a new model
            classifier.fit(X_train_scale, y_train)
        y_pred = classifier.predict(X_test_scale)
        y_pred_prob = classifier.predict_proba(X_test_scale)[:, 1]
        y_preds_prob_total.append(y_pred_prob)
        y_trues.append(y_test)
        f1_scores.append(metrics.f1_score(y_test, y_pred))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        res[0][i] = tp / (tp + fp)  # PPV
        res[1][i] = tp / (tp + fn)  # sensitivity
        res[2][i] = tn / (tn + fp)  # specificity
        res[3][i] = tn / (fn + tn)  # NPV
        roc_auc = roc_auc_score(y_test, y_pred_prob)
        res[4][i] = roc_auc
        res[5][i] = res[1][i] / (1 - res[2][i])  # positive likelihood ratio
        res[6][i] = (1 - res[1][i]) / res[2][i]  # negative likelihood ratio
        res[7][i] = metrics.accuracy_score(y_test, y_pred)
        bic_val = bic.bic(y_test, y_pred_prob, n_features)
        bic_scores.append(bic_val)
        aupr = metrics.average_precision_score(y_test, y_pred_prob)
        fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_prob)
        tprs.append(interp(mean_fpr, fpr, tpr))
        # plt.plot(fpr, tpr, lw=2, alpha=0.3,
        #          label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
        print('current f1:', f1_scores[-1], ' PPV:', res[0][i],
              ' Sensitivity:', res[1][i], ' Specificity:', res[2][i],
              ' NPV:', res[3][i], ' BIC:', bic_val,
              'ROAUC:', res[4][i], 'Accuracy:', res[7][i])
        i += 1
    # train a final model using the entire set
    if not load_model:
        X, X_copy = normalize(X, X, seven_features=seven_features)
        final_classifier.fit(X, y)
        final_pred = final_classifier.predict(X)
        print('Final F1 ', metrics.f1_score(y, final_pred))
        if not os.path.exists('saved_models'):
            os.mkdir('saved_models')
        if not os.path.exists('saved_models/classifer'):
            os.mkdir('saved_models/classifer')
        model_output_path = ('saved_models/classifer/' + subSpO2Folder +
                             model_name.replace(' ', '_') + '_' + str(n_features) +
                             '_features.joblib')
        dump(final_classifier, model_output_path)
    res = np.mean(res, axis=1)
    print('\n=================================================')
    print(' RESULT')
    print("--------------------------------------------------")
    print(' ', model_name)
    print(' Total cases:', y.shape)
    print('Seven Features:', seven_features, ' FilterSpO2:', filterSpO2)
    print(' Mean F1:', np.mean(np.array(f1_scores)))
    print(' PPV:', res[0])
    print(' Sensitivity:', res[1])
    print(' Specificity:', res[2])
    print(' NPV:', res[3])
    print(' ROAUC:', res[4])
    print(' Positive LR:', res[5])
    print(' Negative LR:', res[6])
    print(' BIC:', np.mean(np.array(bic_scores)))
    print(' Accuracy:', res[7])
    print(' AUPR:', aupr)
    print("\nBaseline")
    get_baseline(np.array([df['Spo2'], df['Fio2']]).T, y)
    print('==================================================')
    if model == 'multi_class':
        return
    precision_array, recall_array, thresholds = metrics.precision_recall_curve(
        np.array(y_trues).flatten(), np.array(y_pred_prob).flatten())
    with open(plot_data_file, 'a+', newline='') as file:
        csvwriter = csv.writer(file)
        csvwriter.writerow([model_name, str(precision_array), str(recall_array),
                            str(thresholds)])
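# A minimal invocation sketch for run_classifier. The data path below is a
# placeholder (not a file shipped with the repo), and the Plots/data/
# directory must already exist so the precision-recall csv can be created.
run_classifier('data/cohort.csv', model='mlp', seven_features=True,
               filterSpO2=True, load_model=False)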
start = time.time()
synthetic_dataset['aic_linear'] = synthetic_dataset.apply(
    lambda x: aic.aic(y=x.y_array, y_pred=x.y_pred_linear, p=2), axis=1)
synthetic_dataset['aic_logistic'] = synthetic_dataset.apply(
    lambda x: aic.aic(y=x.y_array, y_pred=x.y_pred_logistic, p=3), axis=1)
end = time.time()
time_dict["aic"] = [end - start]

# Calculating bic
start = time.time()
synthetic_dataset['bic_linear'] = synthetic_dataset.apply(
    lambda x: bic.bic(y=x.y_array, y_pred=x.y_pred_linear, p=2), axis=1)
synthetic_dataset['bic_logistic'] = synthetic_dataset.apply(
    lambda x: bic.bic(y=x.y_array, y_pred=x.y_pred_logistic, p=3), axis=1)
end = time.time()
time_dict["bic"] = [end - start]

time_df = pd.DataFrame.from_dict(time_dict)
pickle.dump(time_df, file)
file.close()

# Calculating Shannon standardized BIC and AIC
synthetic_dataset['shanon_bic_logistic'] = synthetic_dataset.apply(
    lambda x: shanon_bic(std_dev_error=np.std(x.y_array - x.y_pred_logistic),
                         bic=x.bic_logistic,