Example No. 1
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

import bic  # project module providing bic()
# `dof_per_mode` (model name -> degrees of freedom) is assumed to be defined
# at module level elsewhere in the project.


def compare_predictions(prediction_df, s=''):
    '''
    Compare the different models.
    Also calculate the squared errors (y' - y)^2 between each prediction and
    the real value --> pred_errs.csv
    :param prediction_df: dataframe containing all the predictions and real probabilities.
    :param s: suffix so that, when running multiple dataframes, each is saved under a different name.
    :return: model_compare.csv --> dataframe of comparison between the models.
    '''

    # Strip the '[' / ']' left over from stringified arrays and cast to float;
    # columns that are not strings of this form are skipped.
    for c in prediction_df.columns:
        try:
            prediction_df[c] = prediction_df[c].str.replace(
                '[', '', regex=False).str.replace(']', '', regex=False).astype('float')
        except (AttributeError, ValueError):
            pass

    ### From here on, combine the separate probability predictions into one frame.
    pa_columns = prediction_df.columns.str.contains('p_a_')
    pb_columns = prediction_df.columns.str.contains('p_b_')
    pac = prediction_df.columns[pa_columns]
    pbc = prediction_df.columns[pb_columns]
    nonrelevant_columns = prediction_df.columns[~pa_columns & ~pb_columns]

    df1 = pd.concat((prediction_df[nonrelevant_columns], prediction_df[pac]),
                    axis=1)
    df2 = pd.concat((prediction_df[nonrelevant_columns], prediction_df[pbc]),
                    axis=1)
    df2.rename(columns=dict(zip(pbc, pac)), inplace=True)

    df = pd.concat((df1, df2), axis=0)

    df.columns = df.columns.str.replace('p_a_', '')

    ### dataframe with the errors
    df_pred_errs = df.copy()

    real_prob = df['real']

    df_bic = pd.DataFrame()

    for i, pred in enumerate(list(pac.str.replace('p_a_', ''))):
        if pred == 'real':
            continue
        p = dof_per_mode[pred]  # degrees of freedom for this model
        cbic = bic.bic(real_prob, df[pred], p)
        crmse = np.sqrt(mean_squared_error(real_prob, df[pred]))
        df_bic.loc[i, 'bic'] = cbic
        df_bic.loc[i, 'rmse'] = crmse
        df_bic.loc[i, 'model'] = pred
        df_bic.loc[i, 'dof'] = p
        print('model = %s | dof = %d | bic = %.2f' % (pred, p, cbic))

        df_pred_errs['err_' + pred] = (df[pred] - real_prob)**2

    df_bic.to_csv('data/predictions/bic%s.csv' % s, index=False)
    df_pred_errs.to_csv('data/predictions/pred_errs%s.csv' % s, index=False)
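For orientation, a minimal hypothetical usage sketch: the `p_a_*`/`p_b_*` column pairs, the `real` column, and the `dof_per_mode` mapping below are illustrative assumptions inferred from the function body, and the data/predictions/ output folder must already exist.

# Hypothetical input: stringified one-element arrays, as the bracket-stripping
# loop above expects. `dof_per_mode` here is an illustrative stand-in.
dof_per_mode = {'logistic': 3, 'linear': 2}

toy = pd.DataFrame({
    'real':         ['[0.2]',  '[0.8]',  '[0.5]'],
    'p_a_logistic': ['[0.25]', '[0.70]', '[0.40]'],
    'p_b_logistic': ['[0.30]', '[0.75]', '[0.45]'],
    'p_a_linear':   ['[0.10]', '[0.90]', '[0.60]'],
    'p_b_linear':   ['[0.15]', '[0.85]', '[0.55]'],
})
compare_predictions(toy, s='_toy')  # writes bic_toy.csv and pred_errs_toy.csv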
Example No. 2
 def test_exp_result1(self):
     """
     Test that bic() gives the correct result.
     """
     y = [1, 2, 3, 4]
     y_pred = [5, 6, 7, 8]
     p = 3
     ob = round(bic(y, y_pred, p), 3)
     exp = 15.249
     assert ob == exp, 'The BIC given y = [1,2,3,4], y_pred = [5,6,7,8], and p = 3 should be 15.249 (applying the statistical formula in the main README, rounded to 3 decimals)'
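The expected value pins down the formula variant: with n = 4 and RSS = sum((y_i - y_pred_i)^2) = 64, n*ln(RSS/n) + p*ln(n) = 4*ln(16) + 3*ln(4) ≈ 15.249. Below is a minimal sketch consistent with this test; the project's actual bic() may differ in its validation and details.

import numpy as np

def bic_sketch(y, y_pred, p):
    # BIC variant implied by the expected value above: n*ln(RSS/n) + p*ln(n)
    y, y_pred = np.asarray(y, dtype=float), np.asarray(y_pred, dtype=float)
    n = len(y)
    rss = np.sum((y - y_pred) ** 2)
    return n * np.log(rss / n) + p * np.log(n)

round(bic_sketch([1, 2, 3, 4], [5, 6, 7, 8], 3), 3)  # 15.249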
Example No. 3
# A variant of Example No. 1 (same pandas import, `bic` module and
# `dof_per_mode` mapping assumed).
def compare_predictions(prediction_df):
    '''
    Compare the different models.
    :param prediction_df: dataframe containing all the predictions and real probabilities.
    :return: model_compare.csv --> dataframe of comparison between the models.
    '''

    # Strip the '[' / ']' left over from stringified arrays and cast to float;
    # columns that are not strings of this form are skipped.
    for c in prediction_df.columns:
        try:
            prediction_df[c] = prediction_df[c].str.replace(
                '[', '', regex=False).str.replace(']', '', regex=False).astype('float')
        except (AttributeError, ValueError):
            pass

    ### From here on, combine the separate probability predictions into one frame.
    pa_columns = prediction_df.columns.str.contains('p_a_')
    pb_columns = prediction_df.columns.str.contains('p_b_')
    pac = prediction_df.columns[pa_columns]
    pbc = prediction_df.columns[pb_columns]
    nonrelevant_columns = prediction_df.columns[~pa_columns & ~pb_columns]

    df1 = pd.concat((prediction_df[nonrelevant_columns], prediction_df[pac]),
                    axis=1)
    df2 = pd.concat((prediction_df[nonrelevant_columns], prediction_df[pbc]),
                    axis=1)
    df2.rename(columns=dict(zip(pbc, pac)), inplace=True)

    df = pd.concat((df1, df2), axis=0)

    df.columns = df.columns.str.replace('p_a_', '')

    real_prob = df['real']

    df_bic = pd.DataFrame()

    for i, pred in enumerate(list(pac.str.replace('p_a_', ''))):
        if pred == 'real':
            continue
        p = dof_per_mode[pred]
        cbic = bic.bic(real_prob, df[pred], p)
        df_bic.loc[i, 'bic'] = cbic
        df_bic.loc[i, 'model'] = pred
        df_bic.loc[i, 'dof'] = p
        print('model = %s | dof = %d | bic = %.2f' % (pred, p, cbic))

    df_bic.to_csv('data/predictions/model_comparison.csv', index=False)
Example No. 4
 def test_p(self):
     """
     Raise TypeError if p is not an integer.
     """
     with pytest.raises(TypeError):
         bic([1, 2, 3, 4], [5, 6, 7, 8], "c")
Example No. 5
 def test_len_grt1_2(self):
     """
     Raise TypeError if y and y_pred do not have the same length.
     """
     with pytest.raises(TypeError):
         bic([1, 2, 3, 4], [], 3)
Example No. 6
 def test_len_grt1(self):
     """
     Raise TypeError if y or y_pred has fewer than one element.
     """
     with pytest.raises(TypeError):
         bic([], [1, 2, 3, 4], 3)
Example No. 7
 def test_len_ypred_elements(self):
     """
     Raise TypeError if elements of y or y_pred are not real numbers.
     """
     with pytest.raises(TypeError):
         bic([1, 2, 3, 5], [1, 2, complex(1, 2), 4], 3)
Example No. 8
 def test_type_y_dict(self):
     """
     Raise TypeError if y is not a vector (e.g. a dict).
     """
     with pytest.raises(TypeError):
         bic(dict(), [1, 2, 3, 4], 3)
Example No. 9
 def test_type_y_ypred4(self):
     """
     Raise TypeError if y_pred is not a vector of real numbers.
     """
     with pytest.raises(TypeError):
         bic([1, 2], complex(3, 4), 3.4)
Example No. 10
 def test_p3(self):
     """
     Raise TypeError if p is not an integer.
     """
     with pytest.raises(TypeError):
         bic([1, 2, 3, 4], [5, 6, 7, 8], 2.86)
Example No. 11
 def test_type_y_ypred3(self):
     """
     Raise TypeError if y or y_pred are matrices.
     """
     with pytest.raises(TypeError):
         bic([[1, 2, 3, 4], [5, 6, 6, 7]], [1, 2, 3, 4], 3)
Example No. 12
 def test_input(self):
     """
     Raise TypeError if more arguments than required are passed.
     """
     with pytest.raises(TypeError):
         bic([1], [3], [2], 3, 2)
Example No. 13
 def test_p6(self):
     """
     Raise TypeError if p is not an integer (e.g. a complex number).
     """
     with pytest.raises(TypeError):
         bic([1, 2, 3, 4], [5, 6, 7, 8], complex(1, 2))
Example No. 14
 def test_type_y_ypred(self):
     """
     Raise TypeError if y and y_pred are not vectors.
     """
     with pytest.raises(TypeError):
         bic("a", [[1, 2], [2, 2]], 3.3)
Example No. 15
 def test_p4(self):
     """
     Raise TypeError if p is not greater than 0.
     """
     with pytest.raises(TypeError):
         bic([1, 2, 3, 4], [5, 6, 7, 8], 0)
Example No. 16
 def test_p2(self):
     """
     Raise TypeError if p is not greater than 0
     """
     with pytest.raises(TypeError):
         bic([1, 2, 3, 4], [5, 6, 7, 8], -1)
Example No. 17
 def test_type_y_ypred2(self):
     """
     Raise TypeError if y is not a numeric vector.
     """
     with pytest.raises(TypeError):
         bic("c", [1, 2], 3.4)
Example No. 18
import csv
import os

import numpy as np
from joblib import dump, load
from numpy import interp
from sklearn import base, linear_model, metrics, model_selection, svm
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.neural_network import MLPClassifier

import bic  # project module providing bic()
# read_data, normalize and get_baseline are helpers assumed to be defined
# elsewhere in this module.


def run_classifier(path, model='logistic_regression', seven_features=False, filterSpO2=True, load_model=False):

    if model == 'multi_class':
        data, df = read_data(path, multi_class=True, seven_features=seven_features, fitlerSpO2=filterSpO2)  # 'fitlerSpO2' spelling matches read_data's keyword
    else:
        data, df = read_data(path, seven_features=seven_features, fitlerSpO2=filterSpO2)

    if filterSpO2:
        subSpO2Folder = 'FilteredSpO2/'
    else:
        subSpO2Folder = 'NoSpO2Filtering/'

    if seven_features:
        n_features = 7
    else:
        n_features = 3

    plot_data_file = 'Plots/data/precision_recall_' + str(n_features) + 'Features_' + 'filtered_' + str(filterSpO2) + '.csv'
    if not os.path.exists(plot_data_file):
        with open(plot_data_file, 'w', newline='') as file:
            csvwriter = csv.writer(file)
            csvwriter.writerow(['model', 'precisions', 'recalls', 'thresholds'])

    print("-----------------------------------------------")
    if model == 'multi_class':
        print("        MULTI CLASS CLASSIFIER")
    elif model == 'svc':
        print("        SUPPORT VECTOR CLASSIFIER")
    elif model == 'mlp':
        print("   MULTI-LAYER PERCEPTRON")
    else:
        print("             LOGISTIC REGRESSION")
    print("-----------------------------------------------")


    X = data[:, 0:-1]
    y = data[:, -1]
    n_split = 10
    kFold = model_selection.KFold(n_splits=n_split, shuffle=True, random_state=1)
    f1_scores = []
    bic_scores = []
    i = 0
    res = np.zeros((8, n_split))
    tprs = []
    y_preds_prob_total = []  # predicted probabilities, accumulated across folds
    y_trues = []             # true labels, accumulated across folds

    prec_recall_curves = {'precision': [], 'recall': [], 'threshold': []}

    mean_fpr = np.linspace(0, 1, 100)
    for train_index, test_index in kFold.split(X, y):
        print('Fold:',i + 1)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        X_train_scale, X_test_scale = normalize(X_train, X_test, seven_features=seven_features)
        print('x train:',X_train_scale.shape)
        if model == 'svc':
            model_name = 'SVC'
            classifier = svm.SVC(gamma='auto', kernel='rbf', probability=True)
        elif model == 'sgd':
            model_name = 'SGD Classifier'
            classifier = linear_model.SGDClassifier(loss='log')  # renamed to 'log_loss' in scikit-learn >= 1.1
        elif model == 'ridge':
            model_name = 'Ridge Classifier'
            classifier = linear_model.RidgeClassifier()
        elif model == 'mlp':
            model_name = 'Neural Network'
            if seven_features:
                classifier = MLPClassifier((12, 8, 6, 4, 4), max_iter=200, activation='tanh', solver='adam', random_state=1, momentum=0.8) # current BEST 0.7798 accuracy
            else:
                classifier = MLPClassifier((6, 3), max_iter=200, activation='tanh', solver='adam', random_state=1, momentum=0.6) # current best

        else:
            model_name = 'Logistic Regression'
            classifier = linear_model.LogisticRegression()

        final_classifier = base.clone(classifier)
        if load_model: # load a pretrained model
            model_path = 'saved_models/classifer/' + subSpO2Folder + model_name.replace(' ', '_') + '_'  + str(n_features) + '_features.joblib'
            print('load file from:', model_path)
            classifier = load(model_path)
        else: # train a new model
            classifier.fit(X_train_scale, y_train)

        y_pred = classifier.predict(X_test_scale)
        # note: RidgeClassifier has no predict_proba, so model='ridge' would fail here
        y_pred_prob = classifier.predict_proba(X_test_scale)[:, 1]

        y_preds_prob_total.append(y_pred_prob)
        y_trues.append(y_test)

        f1_scores.append(metrics.f1_score(y_test, y_pred))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        res[0][i] = tp / (tp + fp) # PPV
        res[1][i] = tp / (tp + fn) # sensitivity
        res[2][i] = tn / (tn + fp) # specificity
        res[3][i] = tn / (fn + tn) # NPV
        roc_auc = roc_auc_score(y_test, y_pred_prob)
        res[4][i] = roc_auc
        res[5][i] = res[1][i] / (1 - res[2][i])  # positive likelihood ratio
        res[6][i] = (1 - res[1][i]) / res[2][i]  # negative likelihood ratio
        res[7][i] = metrics.accuracy_score(y_test, y_pred)
        bic_val = bic.bic(y_test, y_pred_prob, n_features)
        bic_scores.append(bic_val)
        aupr = metrics.average_precision_score(y_test, y_pred_prob)  # only the last fold's AUPR is reported below
        fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_prob)

        tprs.append(interp(mean_fpr, fpr, tpr))
        #plt.plot(fpr, tpr, lw=2, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

        print('current f1:', f1_scores[-1], ' PPV:', res[0][i], ' Sensitivity:', res[1][i],
              ' Specificity:', res[2][i], ' NPV:', res[3][i], ' BIC:', bic_val,
              'ROAUC:',res[4][i], 'Accuracy:',res[7][i])
        i += 1

    # train a final model using entire set
    if not load_model:
        X, X_copy = normalize(X, X, seven_features=seven_features)
        final_classifier.fit(X, y)
        final_pred = final_classifier.predict(X)
        print('Final F1 ', metrics.f1_score(y, final_pred))
        if not os.path.exists('saved_models'):
            os.mkdir('saved_models')

        if not os.path.exists('saved_models/classifer'):
            os.mkdir('saved_models/classifer')

        model_output_path = 'saved_models/classifer/' + subSpO2Folder + model_name.replace(' ', '_') + '_' + str(n_features) + '_features.joblib'
        dump(final_classifier, model_output_path)

    res = np.mean(res, axis=1)

    print('\n=================================================')
    print('                      RESULT')
    print("--------------------------------------------------")
    print('  ', model_name)
    print('   Total cases:', y.shape)
    print('Seven Features:', seven_features, '  FilterSpO2:', filterSpO2)
    print('       Mean F1:', np.mean(np.array(f1_scores)))
    print('           PPV:', res[0])
    print('   Sensitivity:', res[1])
    print('   Specificity:', res[2])
    print('           NPV:', res[3])
    print('         ROAUC:', res[4])
    print('   Positive LR:', res[5])
    print('   Negative LR:', res[6])
    print('           BIC:', np.mean(np.array(bic_scores)))
    print('      Accuracy:', res[7])
    print('          AUPR:', aupr)
    print("\nBaseline")
    get_baseline(np.array([df['Spo2'], df['Fio2']]).T, y)
    print('==================================================')

    if model == 'multi_class':
        return

    precision_array, recall_array, thresholds = metrics.precision_recall_curve(
        np.concatenate(y_trues), np.concatenate(y_preds_prob_total))

    with open(plot_data_file, 'a+', newline='') as file:
        csvwriter = csv.writer(file)
        csvwriter.writerow([model_name, str(precision_array), str(recall_array), str(thresholds)])
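A hypothetical invocation sketch; the CSV path below is a placeholder, and the function assumes Plots/data/ already exists while creating saved_models/ itself:

# Hypothetical paths: train/evaluate a 7-feature MLP with SpO2 filtering,
# then re-run the evaluation against the saved model without retraining.
run_classifier('data/cohort.csv', model='mlp', seven_features=True, filterSpO2=True)
run_classifier('data/cohort.csv', model='mlp', seven_features=True, filterSpO2=True,
               load_model=True)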
Example No. 19
# Snippet from a benchmarking script: `synthetic_dataset`, `time_dict`, `file`
# and `shanon_bic` are assumed to be defined earlier in the script.

# Calculating aic
start = time.time()

synthetic_dataset['aic_linear'] = synthetic_dataset.apply(
    lambda x: aic.aic(y=x.y_array, y_pred=x.y_pred_linear, p=2), axis=1)

synthetic_dataset['aic_logistic'] = synthetic_dataset.apply(
    lambda x: aic.aic(y=x.y_array, y_pred=x.y_pred_logistic, p=3), axis=1)

end = time.time()
time_dict["aic"] = [end - start]

# Calculating bic
start = time.time()

synthetic_dataset['bic_linear'] = synthetic_dataset.apply(
    lambda x: bic.bic(y=x.y_array, y_pred=x.y_pred_linear, p=2), axis=1)

synthetic_dataset['bic_logistic'] = synthetic_dataset.apply(
    lambda x: bic.bic(y=x.y_array, y_pred=x.y_pred_logistic, p=3), axis=1)

end = time.time()
time_dict["bic"] = [end - start]

time_df = pd.DataFrame.from_dict(time_dict)
pickle.dump(time_df, file)
file.close()

# Calculating Shannon-standardized BIC and AIC
synthetic_dataset['shanon_bic_logistic'] = synthetic_dataset.apply(
    lambda x: shanon_bic(std_dev_error=np.std(x.y_array - x.y_pred_logistic),
                         bic=x.bic_logistic,