Example #1
import numpy as np
import torch
from tqdm import tqdm
from sklearn.metrics import confusion_matrix

# relies on project-level helpers (cal_loss, cal_statistic, plot_roc, class_num)
def test_epoch(valid_loader, device, model, total_num):
    all_labels = []
    all_res = []
    all_pres = []
    all_recs = []
    all_pred = []
    model.eval()
    total_loss = 0
    total_correct = 0
    cnt_per_class = np.zeros(class_num)
    with torch.no_grad():
        for batch in tqdm(valid_loader,
                          mininterval=0.5,
                          desc='- (Validation)  ',
                          leave=False):
            sig, fea_plus, label = map(lambda x: x.to(device), batch)
            # forward
            pred = model(
                sig,
                fea_plus)  # emd.contiguous().view(len(label), fea_num, -1)
            all_labels.extend(label.cpu().numpy())
            all_res.extend(pred.max(1)[1].cpu().numpy())
            all_pred.extend(pred.cpu().numpy())
            loss, n_correct, cnt = cal_loss(pred, label, device)

            total_loss += loss.item()
            total_correct += n_correct
            cnt_per_class += cnt

    # np.savetxt('all_pres.txt',all_pres)
    # np.savetxt('all_recs.txt', all_recs)
    np.savetxt('all_pred.txt', all_pred)
    np.savetxt('all_label.txt', all_labels)
    all_pred = np.array(all_pred)
    plot_roc(all_labels, all_pred)
    cm = confusion_matrix(all_labels, all_res)
    print(cm)
    acc_SP, pre_i, rec_i, F1_i = cal_statistic(cm)
    print('acc_SP is : {acc_SP}'.format(acc_SP=acc_SP))
    print('pre_i is : {pre_i}'.format(pre_i=pre_i))
    print('rec_i is : {rec_i}'.format(rec_i=rec_i))
    print('F1_i is : {F1_i}'.format(F1_i=F1_i))
    test_acc = total_correct / total_num
    print('test_acc is : {test_acc}'.format(test_acc=test_acc))
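
For context, a minimal sketch of how test_epoch might be driven; the dataset, model class, batch size, and checkpoint path below are placeholders, not part of the original example:

# Hypothetical driver; valid_set, MyModel, and 'checkpoint.pt' are assumptions.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
valid_loader = torch.utils.data.DataLoader(valid_set, batch_size=32, shuffle=False)
model = MyModel().to(device)
model.load_state_dict(torch.load('checkpoint.pt', map_location=device))
test_epoch(valid_loader, device, model, total_num=len(valid_set))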
Example #2
    # (snippet begins mid-loop: n iterates over the candidate tree counts in num_trees)
    tot = 0
    for i in range(5):
        # vary the seed across the five runs; a fixed seed would make them identical
        rf = RandomForestClassifier(n_estimators=n, random_state=i)
        rf.fit(X_train, y_train)
        tot += rf.score(X_test, y_test)
    accuracies.append(tot / 5)
    fig, ax = plt.subplots()
    ax.plot(num_trees, accuracies)
    ax.set_xlabel("Number of Trees")
    ax.set_ylabel("Accuracy")
    ax.set_title('Accuracy vs Num Trees')


    # Adjust the kwargs/parameters to see whether we can get a better model
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    # adjust these parameters
    num_estimators = 10
    rf = RandomForestClassifier(n_estimators=num_estimators)
    rf.fit(X_train, y_train)
    predictions = rf.predict(X_test)
    score = rf.score(X_test, y_test)
    precision = metrics.precision_score(y_test, predictions)
    recall = metrics.recall_score(y_test, predictions)
    conf_matrix = confusion_matrix(y_true = y_test, y_pred = predictions)
    print(f'At {num_estimators}: \n Confusion Matrix: {conf_matrix}, \n Accuracy = {score}, \n Precision = {precision}, \n Recall = {recall}')


    from roc import plot_roc
    plot_roc(X, y, RandomForestClassifier, 'Random_Forest', n_estimators=25, max_features=5)
    plot_roc(X, y, LogisticRegression, 'Logistic_Regression')
    plot_roc(X, y, DecisionTreeClassifier, 'Decision_Tree')
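
The imported roc.plot_roc is not shown in this example; the following is a minimal sketch consistent with how it is called above (the holdout split and styling are assumptions, not the original helper):

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve

def plot_roc(X, y, clf_class, name, **kwargs):
    # fit the classifier on a holdout split and plot its ROC curve
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    model = clf_class(**kwargs)
    model.fit(X_train, y_train)
    probs = model.predict_proba(X_test)[:, 1]  # score for the positive class
    fpr, tpr, _ = roc_curve(y_test, probs)
    plt.plot(fpr, tpr, label=name)
    plt.plot([0, 1], [0, 1], 'k--')  # chance line
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()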
def uploaded_file(filename):
    if request.method == 'POST':
        # check if the post request has the file part
        if 'file' not in request.files:
            flash('No file part')
            return redirect(request.url)
        file = request.files['file']
        # if the user does not select a file, the browser may
        # submit an empty part without a filename
        if file.filename == '':
            flash('No selected file')
            return redirect(request.url)
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            print(filename)
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
            return redirect(url_for('uploaded_HTS', filename=filename))
    elif request.method == 'GET':
        # make a pd.dataframe of training data
        df = pd.read_csv(os.path.join(app.config['UPLOAD_FOLDER'], filename))
        # use all features and yfill (no NaNs, filled with 0)
        features, yfill = proc.features_yfill(df)
        #train test split at 20%
        X_train, X_test, y_train, y_test = rf.train_test_split(features,
                                                               yfill,
                                                               test_size=0.20,
                                                               random_state=1,
                                                               stratify=yfill)

        #Optional: oversampling of minority class for training purposes
        #X_train_over, y_train_over = proc.oversample(X_train,y_train, r = 0.3)
        #rffit, y_predict = rf.randomforest(X_train_over, X_test, y_train_over, y_test, num_est=50, cls_w = 'balanced_subsample')

        #fit the Random Forest classifier: would like to add in a grid search
        rffit, y_predict = rf.randomforest(X_train.values, X_test,
                                           y_train.values, y_test)

        # Use below to run a grid search; it takes too long to run right now
        #rffit, y_predict = rf.randomforest(X_train.values, X_test, y_train.values, y_test, grid_search = 'small')

        #pickle the fit model for use with test data
        proc._pickle(rffit, 'RFC_fit.pkl')

        # set_threshold_recall picks the threshold that optimizes recall:
        # the median of the thresholds returning the second-best recall
        # (excluding the trivial 1.0)
        precision_list, recall_list, median_recall_index, medianrecall_threshold = rf.set_threshold_recall(
            rffit, X_train, X_test, y_train, y_test)

        # print_threshold applies the trained model at the selected threshold
        # (here recall-optimized) and returns the listed statistics
        precision, recall, fpr, fpr_test, tpr_test, cm = rf.print_threshold(
            rffit, X_train, X_test, y_train, y_test, medianrecall_threshold)
        r_cm = pd.DataFrame(cm)
        proc._pickle(medianrecall_threshold, 'medianrecall_threshold.pkl')

        #make a pd.dataframe of the stats for display
        recall_opt_stats = pd.DataFrame([[
            format(medianrecall_threshold, '.2f'),
            format(recall, '.2f'),
            format(fpr, '.2f'),
            format(precision, '.2f'),
        ]],
                                        columns=[
                                            'Suggested Threshold',
                                            'True Positive Rate (Recall)',
                                            'False Positive Rate (Fall-out)',
                                            'Precision'
                                        ])

        # repeat the threshold selection process for precision optimization
        p_precision, p_recall, p_median_precision, threshold_precision = rf.set_threshold_precision(
            rffit, X_train, X_test, y_train, y_test)
        p_precision, p_recall, p_fpr, p_fpr_test, p_tpr_test, p_cm = rf.print_threshold(
            rffit, X_train, X_test, y_train, y_test, threshold_precision)
        p_cm = pd.DataFrame(p_cm)
        precision_opt_stats = pd.DataFrame(
            [[
                format(threshold_precision, '.2f'),
                format(p_recall, '.2f'),
                format(p_fpr, '.2f'),
                format(p_precision, '.2f'),
            ]],
            columns=[
                'Suggested Threshold', 'True Positive Rate (Recall)',
                'False Positive Rate (Fall-out)', 'Precision'
            ])

        #produce a ROC plot
        test_prob = rffit.predict_proba(X_test)
        roc.plot_roc(X_train.values,
                     y_train.values,
                     y_test,
                     test_prob,
                     'Test',
                     RandomForestClassifier,
                     max_depth=10,
                     max_features=30,
                     min_samples_leaf=2,
                     min_samples_split=2)
        feature_description = rf.plot_features(features,
                                               rffit,
                                               'Identifier',
                                               n=10)
        # option for oversampled training data
        #roc.plot_roc(X_train_over, y_train_over, y_test, test_prob, 'Test', RandomForestClassifier,  max_depth = 10, max_features= 30, min_samples_leaf= 2, min_samples_split = 2)
        #roc.simple_roc(y_test, test_prob, 'ROC_RFC')
        pd.set_option('display.max_colwidth', None)  # -1 is deprecated in newer pandas

        return render_template("rock.html",
                               data_recall_opt=recall_opt_stats.to_html(
                                   index=False, classes="data_recall_opt"),
                               data_precision_opt=precision_opt_stats.to_html(
                                   index=False, classes="data_precision_opt"),
                               rocname='Test',
                               f_descrip=feature_description.to_html(
                                   index=False, classes="f_descrip"),
                               recall_cm=r_cm.to_html(classes="cm"),
                               precision_cm=p_cm.to_html(classes="cm"))
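
The view above is missing its route registration; a plausible decorator, assuming the usual Flask upload pattern (the URL rule is a guess, not from the original):

# Hypothetical registration for the view above; the URL rule is an assumption.
# @app.route('/uploads/<filename>', methods=['GET', 'POST'])
# def uploaded_file(filename):
#     ...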
plt.plot(num_features, accuracies)
plt.show()
## Levels off around 5-6

# 16. Run all the other classifiers that we have learned so far in class
def get_scores(classifier, X_train, X_test, y_train, y_test, **kwargs):
    model = classifier(**kwargs)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    return model.score(X_test, y_test), \
           precision_score(y_test, y_predict), \
           recall_score(y_test, y_predict)

print "16. Model, Accuracy, Precision, Recall"
print "    Random Forest:", get_scores(RandomForestClassifier, X_train, X_test, y_train, y_test, n_estimators=25, max_features=5)
print "    Logistic Regression:", get_scores(LogisticRegression, X_train, X_test, y_train, y_test)
print "    Decision Tree:", get_scores(DecisionTreeClassifier, X_train, X_test, y_train, y_test)
print "    SVM:", get_scores(SVC, X_train, X_test, y_train, y_test)
print "    Naive Bayes:", get_scores(MultinomialNB, X_train, X_test, y_train, y_test)
## MODEL               ACCURACY PRECISION    RECALL
## Random Forest         0.9508    0.8817    0.7321
## Logistic Regression   0.8741    0.6129    0.1696
## Decision Tree         0.9209    0.6949    0.7321

print "17. Use the included `plot_roc` function to visualize the roc curve of each model"
plot_roc(X, y, RandomForestClassifier, n_estimators=25, max_features=5)
plot_roc(X, y, LogisticRegression)
plot_roc(X, y, DecisionTreeClassifier)
plot_roc(X, y, SVC, probability=True)  # SVC needs probability=True to expose predict_proba
plot_roc(X, y, MultinomialNB)
Example #5
logit_model.fit(X_train, y_train)

'''At an FPR of 0.2 we should prefer the RandomForest model over logistic regression,
because RandomForest has the higher TPR: 0.85 versus 0.64.'''
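
To make that comparison concrete, the TPR at a fixed FPR can be read off a model's ROC curve by interpolation; a small helper sketch (the function name is illustrative, not from the original):

import numpy as np
from sklearn.metrics import roc_curve

def tpr_at_fpr(y_true, scores, target_fpr=0.2):
    # interpolate the ROC curve at a fixed false-positive rate
    fpr, tpr, _ = roc_curve(y_true, scores)
    return np.interp(target_fpr, fpr, tpr)  # fpr from roc_curve is sorted ascending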


if __name__ == '__main__':
    print "Accuracy score for random forest: ", acc_score
    print "Precision score: ", prec_score
    print "Recall score: ", rec_score
    print "Confusion matrix: ", conf_m
    print "Out-of-bag score: ", acc_score_oob
    print "Feature importances: ", importances
    print "Important features: ", imp_features

    plot_roc(X_train, y_train, RandomForestClassifier, n_estimators=20)
    plot_roc(X_train, y_train, LogisticRegression)
    plt.show()

    # Plot the feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(10), importances[indices], yerr=std[indices], color="r", align="center")
    plt.xticks(range(10), indices)
    plt.xlim([-1, 10])
    plt.savefig('13_Feature_ranking.png')
    plt.close()
    print('\nPlotted 13) feature importances')

    plt.plot(num_features, accuracies)
    plt.savefig('15_accuracy_vs_num_features.png')
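
The importances, std, and indices used in the feature-importance plot above are computed elsewhere; the standard scikit-learn recipe looks like this (assumes forest is a fitted RandomForestClassifier):

import numpy as np

importances = forest.feature_importances_  # mean decrease in impurity per feature
std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
indices = np.argsort(importances)[::-1]  # feature indices ranked by importance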
Example #6
for num in num_features:
    res = fit_forest(X_train, y_train, X_test, y_test,
                     max_features=num)
    accuracies_features.append(res[2])

plt.plot(num_features, accuracies_features, "o")
plt.show()

model_names = [
    LogisticRegression, KNeighborsClassifier, DecisionTreeClassifier
]

models = [
    fit_model(X_train, y_train, X_test, y_test, model) for model in model_names
]

forest = fit_forest(X_train,
                    y_train,
                    X_test,
                    y_test,
                    num_trees=300,
                    oob_score=False,
                    max_features=7)

for model in model_names:
    plot_roc(X, y, model)
plot_roc(X,
         y,
         RandomForestClassifier,
         n_estimators=300,
         oob_score=False,
         max_features=7)
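
The fit_model helper is not shown; a minimal sketch consistent with how it is called above (the printout and return value are assumptions):

def fit_model(X_train, y_train, X_test, y_test, model_class, **kwargs):
    # fit one classifier and report its holdout accuracy
    model = model_class(**kwargs)
    model.fit(X_train, y_train)
    print(model_class.__name__, 'accuracy:', model.score(X_test, y_test))
    return model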
Example #7
samp_table = np.apply_along_axis(
    lambda p: np.random.multinomial(n=2000, pvals=p), axis=1, arr=pvals)
mrsamp_table = multiplicative_replacement(samp_table)
lrsamp_table = coverage_replacement(samp_table)
rrsamp_table = coverage_replacement(samp_table, uncovered_estimator=robbins)

pearson_corr_mat = abs(np.corrcoef(samp_table.T))
spearman_corr_mat = abs(spearmanr(samp_table)[0])
zheng_corr_mat = get_corr_matrix(samp_table, zheng)
rrzheng_corr_mat = get_corr_matrix(rrsamp_table, zheng)

metric_df = confusion_evaluate(
    corr_mat,
    [pearson_corr_mat, spearman_corr_mat, zheng_corr_mat, rrzheng_corr_mat],
    ['Pearson', 'Spearman', 'Lovell', 'Robbins Corrected Lovell'])

roc_fig = plot_roc(metric_df, ['-ob', '-og', '-or', '-om'])
prec_fig = plot_recall(metric_df, ['-ob', '-og', '-or', '-om'])

roc_fig.savefig('../results/zeros/uniform_rare_eco_roc_curve.png')
prec_fig.savefig('../results/zeros/uniform_rare_eco_pre_recall_curve.png')

#######################################################################
#                   Random rarefaction correlation                    #
#######################################################################
font = {'family': 'normal', 'weight': 'normal', 'size': 13}  # 'normal' is not a real font family; matplotlib falls back with a warning

matplotlib.rc('font', **font)

samp_table = np.apply_along_axis(lambda p: np.random.multinomial(
    n=np.random.geometric(1 / 2000) + 2000, pvals=p),
                                 axis=1, arr=pvals)
Example #8
def get_scores(classifier, X_train, X_test, y_train, y_test, **kwargs):
    model = classifier(**kwargs)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    return model.score(X_test, y_test), \
           precision_score(y_test, y_predict), \
           recall_score(y_test, y_predict)

print "16. Model, Accuracy, Precision, Recall"
print "    Random Forest:", get_scores(RandomForestClassifier, X_train, X_test, y_train, y_test, n_estimators=25, max_features=5)
print "    Logistic Regression:", get_scores(LogisticRegression, X_train, X_test, y_train, y_test)
print "    Decision Tree:", get_scores(DecisionTreeClassifier, X_train, X_test, y_train, y_test)
print "    SVM:", get_scores(SVC, X_train, X_test, y_train, y_test)
print "    Naive Bayes:", get_scores(MultinomialNB, X_train, X_test, y_train, y_test)
## MODEL               ACCURACY PRECISION    RECALL
## Random Forest         0.9508    0.8817    0.7321
## Logistic Regression   0.8741    0.6129    0.1696
## Decision Tree         0.9209    0.6949    0.7321

print "17. Use the included `plot_roc` function to visualize the roc curve of each model"

#
# relies on the classifiers imported at the top of the file
#

plot_roc(X, y, RandomForestClassifier, n_estimators=25, max_features=5)
plot_roc(X, y, LogisticRegression)
plot_roc(X, y, DecisionTreeClassifier)
plot_roc(X, y, SVC, probability=True)
#plot_roc(X, y, MultinomialNB)
plot_roc(X, y, GaussianNB)

pearson_corr_mat = abs(np.corrcoef(table.T))
spearman_corr_mat = abs(spearmanr(table)[0])
zheng_corr_mat = get_corr_matrix(table, zheng)
# Can insert sparcc_corr_mat right here.  Just need to:
# 1. read the text file of correlations into a pandas DataFrame
# 2. extract the matrix (as_matrix() is removed in modern pandas; use to_numpy())
# 3. take the absolute value with abs()
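# A minimal sketch of those three steps (the file name and delimiter are
# assumptions):
# sparcc_df = pd.read_csv('sparcc_correlations.txt', sep='\t', index_col=0)
# sparcc_corr_mat = abs(sparcc_df.to_numpy())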

metric_df = confusion_evaluate(corr_mat, [pearson_corr_mat,
                                          spearman_corr_mat,
                                          zheng_corr_mat],
                                         ['Pearson',
                                          'Spearman',
                                          'Lovell'])

roc_fig = plot_roc(metric_df, ['-ob', '-og', '-or'])
prec_fig = plot_recall(metric_df, ['-ob', '-og', '-or'])


roc_fig.savefig('%s/simple_nonzero_eco_roc_curve.png' % res_folder)
prec_fig.savefig('%s/simple_nonzero_eco_pre_recall_curve.png' % res_folder)

#######################################################################
#                        Uniform rarefaction                          #
#######################################################################
pvals = np.apply_along_axis(lambda x: x / x.sum(), axis=1, arr=table)
samp_table = np.apply_along_axis(
    lambda p: np.random.multinomial(n=10**9, pvals=p),
    axis=1, arr=pvals)

pearson_corr_mat = abs(np.corrcoef(samp_table.T))