def test_epoch(valid_loader, device, model, total_num):
    all_labels = []
    all_res = []
    all_pres = []
    all_recs = []
    all_pred = []
    model.eval()
    total_loss = 0
    total_correct = 0
    cnt_per_class = np.zeros(class_num)
    with torch.no_grad():
        for batch in tqdm(valid_loader, mininterval=0.5,
                          desc='- (Validation) ', leave=False):
            sig, fea_plus, label = map(lambda x: x.to(device), batch)

            # forward
            pred = model(sig, fea_plus)  # emd.contiguous().view(len(label), fea_num, -1)
            all_labels.extend(label.cpu().numpy())
            all_res.extend(pred.max(1)[1].cpu().numpy())
            all_pred.extend(pred.cpu().numpy())
            loss, n_correct, cnt = cal_loss(pred, label, device)

            total_loss += loss.item()
            total_correct += n_correct
            cnt_per_class += cnt

    # np.savetxt('all_pres.txt', all_pres)
    # np.savetxt('all_recs.txt', all_recs)
    np.savetxt('all_pred.txt', all_pred)
    np.savetxt('all_label.txt', all_labels)
    all_pred = np.array(all_pred)
    plot_roc(all_labels, all_pred)
    cm = confusion_matrix(all_labels, all_res)
    print(cm)
    acc_SP, pre_i, rec_i, F1_i = cal_statistic(cm)
    print('acc_SP is : {acc_SP}'.format(acc_SP=acc_SP))
    print('pre_i is : {pre_i}'.format(pre_i=pre_i))
    print('rec_i is : {rec_i}'.format(rec_i=rec_i))
    print('F1_i is : {F1_i}'.format(F1_i=F1_i))
    test_acc = total_correct / total_num
    print('test_acc is : {test_acc}'.format(test_acc=test_acc))
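# The function above relies on an external cal_statistic(cm) helper whose
# source is not shown. A minimal sketch of what such a helper might look like,
# assuming it derives per-class specificity, precision, recall, and F1 from
# the confusion matrix (the name and return order come from the call site;
# the body below is an assumption, not the original implementation):
import numpy as np

def cal_statistic(cm):
    cm = np.asarray(cm, dtype=float)
    total = cm.sum()
    tp = np.diag(cm)             # correctly predicted counts per class
    fp = cm.sum(axis=0) - tp     # predicted as class i, actually another class
    fn = cm.sum(axis=1) - tp     # class i instances missed
    tn = total - tp - fp - fn
    eps = 1e-12                  # guard against division by zero
    acc_SP = tn / (tn + fp + eps)            # specificity per class
    pre_i = tp / (tp + fp + eps)             # precision per class
    rec_i = tp / (tp + fn + eps)             # recall per class
    F1_i = 2 * pre_i * rec_i / (pre_i + rec_i + eps)
    return acc_SP, pre_i, rec_i, F1_i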
# Imports assumed from the top of the original file.
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Average test accuracy over five fits for each forest size. The enclosing
# loop over num_trees was cut off in the original and is restored here.
# Note: with a fixed random_state and fixed data, the five fits are identical.
accuracies = []
for n in num_trees:
    tot = 0
    for i in range(5):
        rf = RandomForestClassifier(n_estimators=n, random_state=42)
        rf.fit(X_train, y_train)
        tot += rf.score(X_test, y_test)
    accuracies.append(tot / 5)

fig, ax = plt.subplots()
ax.plot(num_trees, accuracies)
ax.set_xlabel("Number of Trees")
ax.set_ylabel("Accuracy")
ax.set_title('Accuracy vs Num Trees')

# Adjust the kwargs/parameters accordingly to see if we can get a better model
X_train, X_test, y_train, y_test = train_test_split(X, y)

# adjust this parameter
num_estimators = 10
rf = RandomForestClassifier(n_estimators=num_estimators)
rf.fit(X_train, y_train)
predictions = rf.predict(X_test)  # was rd.predict: typo for the rf model
score = rf.score(X_test, y_test)
precision = metrics.precision_score(y_test, predictions)
recall = metrics.recall_score(y_test, predictions)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=predictions)
print(f'At {num_estimators}: \n Confusion Matrix: {conf_matrix}, \n Accuracy = {score}, \n Precision = {precision}, \n Recall = {recall}')

from roc import plot_roc
plot_roc(X, y, RandomForestClassifier, 'Random_Forest', n_estimators=25, max_features=5)
plot_roc(X, y, LogisticRegression, 'Logistic_Regression')
plot_roc(X, y, DecisionTreeClassifier, 'Decision_Tree')
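# The imported roc.plot_roc is a provided helper whose source is not shown
# here. A minimal sketch matching the call signature used above,
# plot_roc(X, y, clf_class, plot_name, **kwargs); the body is an assumption,
# not the provided implementation:
import matplotlib.pyplot as plt
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import train_test_split

def plot_roc(X, y, clf_class, plot_name, **kwargs):
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
    model = clf_class(**kwargs).fit(X_tr, y_tr)
    # score with predicted probabilities for the positive class
    probs = model.predict_proba(X_te)[:, 1]
    fpr, tpr, _ = roc_curve(y_te, probs)
    plt.plot(fpr, tpr, label='%s (AUC = %.2f)' % (plot_name, auc(fpr, tpr)))
    plt.plot([0, 1], [0, 1], 'k--')  # chance line
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc='lower right')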
def uploaded_file(filename):
    if request.method == 'POST':
        # check if the post request has the file part
        if 'file' not in request.files:
            flash('No file part')
            return redirect(request.url)
        file = request.files['file']
        # if the user does not select a file, the browser may also
        # submit an empty part without a filename
        if file.filename == '':
            flash('No selected file')
            return redirect(request.url)
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            print(filename)
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
            return redirect(url_for('uploaded_HTS', filename=filename))
    elif request.method == 'GET':
        # make a pd.DataFrame of the training data
        df = pd.read_csv(os.path.join(app.config['UPLOAD_FOLDER'], filename))
        # use all features and yfill (no NaNs, filled with 0)
        features, yfill = proc.features_yfill(df)
        # train/test split at 20%
        X_train, X_test, y_train, y_test = rf.train_test_split(
            features, yfill, test_size=0.20, random_state=1, stratify=yfill)

        # Optional: oversampling of the minority class for training purposes
        # X_train_over, y_train_over = proc.oversample(X_train, y_train, r=0.3)
        # rffit, y_predict = rf.randomforest(X_train_over, X_test, y_train_over, y_test, num_est=50, cls_w='balanced_subsample')

        # fit the Random Forest classifier; would like to add in a grid search
        rffit, y_predict = rf.randomforest(X_train.values, X_test, y_train.values, y_test)
        # Use below to run a grid search ... takes too long to work right now
        # rffit, y_predict = rf.randomforest(X_train.values, X_test, y_train.values, y_test, grid_search='small')

        # pickle the fitted model for use with test data
        proc._pickle(rffit, 'RFC_fit.pkl')

        # set_threshold_recall determines the threshold to set such that recall
        # is optimized (the median of the available thresholds that return the
        # second-best recall, i.e. not 1.0)
        precision_list, recall_list, median_recall_index, medianrecall_threshold = rf.set_threshold_recall(
            rffit, X_train, X_test, y_train, y_test)
        # print_threshold uses the trained model and the selected threshold
        # (here recall-optimized) to return the listed statistics
        precision, recall, fpr, fpr_test, tpr_test, cm = rf.print_threshold(
            rffit, X_train, X_test, y_train, y_test, medianrecall_threshold)
        r_cm = pd.DataFrame(cm)
        proc._pickle(medianrecall_threshold, 'medianrecall_threshold.pkl')

        # make a pd.DataFrame of the stats for display
        recall_opt_stats = pd.DataFrame([[
            format(medianrecall_threshold, '.2f'),
            format(recall, '.2f'),
            format(fpr, '.2f'),
            format(precision, '.2f'),
        ]], columns=[
            'Suggested Threshold', 'True Positive Rate (Recall)',
            'False Positive Rate (Fall-out)', 'Precision'
        ])

        # repeat the threshold selection process for precision optimization
        p_precision, p_recall, p_median_precision, threshold_precision = rf.set_threshold_precision(
            rffit, X_train, X_test, y_train, y_test)
        p_precision, p_recall, p_fpr, p_fpr_test, p_tpr_test, p_cm = rf.print_threshold(
            rffit, X_train, X_test, y_train, y_test, threshold_precision)
        p_cm = pd.DataFrame(p_cm)
        precision_opt_stats = pd.DataFrame([[
            format(threshold_precision, '.2f'),
            format(p_recall, '.2f'),
            format(p_fpr, '.2f'),
            format(p_precision, '.2f'),
        ]], columns=[
            'Suggested Threshold', 'True Positive Rate (Recall)',
            'False Positive Rate (Fall-out)', 'Precision'
        ])

        # produce a ROC plot
        test_prob = rffit.predict_proba(X_test)
        roc.plot_roc(X_train.values, y_train.values, y_test, test_prob, 'Test',
                     RandomForestClassifier, max_depth=10, max_features=30,
                     min_samples_leaf=2, min_samples_split=2)
        feature_description = rf.plot_features(features, rffit, 'Identifier', n=10)

        # option for oversampled data
        # roc.plot_roc(X_train_over, y_train_over, y_test, test_prob, 'Test', RandomForestClassifier, max_depth=10, max_features=30, min_samples_leaf=2, min_samples_split=2)
        # roc.simple_roc(y_test, test_prob, 'ROC_RFC')

        pd.set_option('display.max_colwidth', None)  # -1 is deprecated in newer pandas
        return render_template("rock.html",
                               data_recall_opt=recall_opt_stats.to_html(
                                   index=False, classes="data_recall_opt"),
                               data_precision_opt=precision_opt_stats.to_html(
                                   index=False, classes="data_precision_opt"),
                               rocname='Test',
                               f_descrip=feature_description.to_html(
                                   index=False, classes="f_descrip"),
                               recall_cm=r_cm.to_html(classes="cm"),
                               precision_cm=p_cm.to_html(classes="cm"))
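# rf.set_threshold_recall above comes from the project's own module and its
# source is not shown. A minimal sketch of the threshold sweep it describes,
# assuming a fitted binary classifier with predict_proba (the function name
# below is illustrative, not the project's implementation):
import numpy as np
from sklearn.metrics import precision_recall_curve

def recall_optimized_threshold(model, X_test, y_test):
    probs = model.predict_proba(X_test)[:, 1]
    precision_list, recall_list, thresholds = precision_recall_curve(y_test, probs)
    # second-best recall: the best recall strictly below 1.0
    candidate = recall_list[recall_list < 1.0].max()
    # median of the thresholds that achieve that recall
    # (recall_list has one more entry than thresholds, so drop the last)
    idx = np.where(recall_list[:-1] == candidate)[0]
    return float(np.median(thresholds[idx]))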
plt.plot(num_features, accuracies)
plt.show()
## Levels off around 5-6

# 16. Run all the other classifiers that we have learned so far in class
def get_scores(classifier, X_train, X_test, y_train, y_test, **kwargs):
    model = classifier(**kwargs)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    return model.score(X_test, y_test), \
        precision_score(y_test, y_predict), \
        recall_score(y_test, y_predict)

print("16. Model, Accuracy, Precision, Recall")
print("    Random Forest:", get_scores(RandomForestClassifier, X_train, X_test, y_train, y_test, n_estimators=25, max_features=5))
print("    Logistic Regression:", get_scores(LogisticRegression, X_train, X_test, y_train, y_test))
print("    Decision Tree:", get_scores(DecisionTreeClassifier, X_train, X_test, y_train, y_test))
print("    SVM:", get_scores(SVC, X_train, X_test, y_train, y_test))
print("    Naive Bayes:", get_scores(MultinomialNB, X_train, X_test, y_train, y_test))

## MODEL                 ACCURACY   PRECISION   RECALL
## Random Forest         0.9508     0.8817      0.7321
## Logistic Regression   0.8741     0.6129      0.1696
## Decision Tree         0.9209     0.6949      0.7321

print("17. Use the included `plot_roc` function to visualize the roc curve of each model")
plot_roc(X, y, RandomForestClassifier, n_estimators=25, max_features=5)
plot_roc(X, y, LogisticRegression)
plot_roc(X, y, DecisionTreeClassifier)
plot_roc(X, y, SVC)  # may need probability=True if plot_roc uses predict_proba
plot_roc(X, y, MultinomialNB)
logit_model.fit(X_train, y_train)

'''With an fpr of 0.2, we should prefer the RandomForest model over logistic
regression, because RandomForest has a higher tpr (0.85 vs. 0.64).'''

if __name__ == '__main__':
    print("Accuracy score for random forest: ", acc_score)
    print("Precision score: ", prec_score)
    print("Recall score: ", rec_score)
    print("Confusion matrix: ", conf_m)
    print("Out-of-bag score: ", acc_score_oob)
    print("Feature importances: ", importances)
    print("Important features: ", imp_features)

    plot_roc(X_train, y_train, RandomForestClassifier, n_estimators=20)
    plot_roc(X_train, y_train, LogisticRegression)
    plt.show()

    # Plot the feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(10), importances[indices], yerr=std[indices],
            color="r", align="center")
    plt.xticks(range(10), indices)
    plt.xlim([-1, 10])
    plt.savefig('13_Feature_ranking.png')
    plt.close()
    print('\nPlotted 13) feature importances')

    plt.plot(num_features, accuracies)
    plt.savefig('15_accuracy_vs_num_features.png')
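# The variables printed above (acc_score_oob, importances, indices, std) are
# computed elsewhere in the script. A minimal sketch of how they are typically
# obtained with scikit-learn (an assumption, not the original code):
import numpy as np
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=20, oob_score=True)
forest.fit(X_train, y_train)
acc_score_oob = forest.oob_score_          # out-of-bag accuracy estimate
importances = forest.feature_importances_  # mean decrease in impurity
# spread of importances across the individual trees, for the error bars
std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
indices = np.argsort(importances)[::-1][:10]  # top-10 features, descending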
# The loop opener was truncated in the original; restored here on the
# assumption that it sweeps max_features values via fit_forest.
accuracies_features = []
for num in num_features:
    res = fit_forest(X_train, y_train, X_test, y_test, max_features=num)
    accuracies_features.append(res[2])
plt.plot(num_features, accuracies_features, "o")
plt.show()

model_names = [
    LogisticRegression, KNeighborsClassifier, DecisionTreeClassifier
]
models = [
    fit_model(X_train, y_train, X_test, y_test, model) for model in model_names
]
forest = fit_forest(X_train, y_train, X_test, y_test, num_trees=300,
                    oob_score=False, max_features=7)

for model in model_names:
    plot_roc(X, y, model)
plot_roc(X, y, RandomForestClassifier, n_estimators=300, oob_score=False,
         max_features=7)
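# fit_model and fit_forest are helpers defined elsewhere in this script. A
# minimal sketch consistent with how they are called above, where res[2] is
# read as the test accuracy (the bodies are assumptions):
from sklearn.ensemble import RandomForestClassifier

def fit_model(X_train, y_train, X_test, y_test, model_class, **kwargs):
    model = model_class(**kwargs)
    model.fit(X_train, y_train)
    return model

def fit_forest(X_train, y_train, X_test, y_test, num_trees=100, **kwargs):
    forest = RandomForestClassifier(n_estimators=num_trees, **kwargs)
    forest.fit(X_train, y_train)
    # (model, oob score or None, test accuracy); res[2] indexes the accuracy
    return forest, getattr(forest, 'oob_score_', None), forest.score(X_test, y_test)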
# The apply_along_axis opener was truncated in the original; restored here on
# the assumption that, as elsewhere in the script, it draws multinomial
# samples from the composition pvals.
samp_table = np.apply_along_axis(
    lambda p: np.random.multinomial(n=2000, pvals=p), axis=1, arr=pvals)
mrsamp_table = multiplicative_replacement(samp_table)
lrsamp_table = coverage_replacement(samp_table)
rrsamp_table = coverage_replacement(samp_table, uncovered_estimator=robbins)

pearson_corr_mat = abs(np.corrcoef(samp_table.T))
spearman_corr_mat = abs(spearmanr(samp_table)[0])
zheng_corr_mat = get_corr_matrix(samp_table, zheng)
rrzheng_corr_mat = get_corr_matrix(rrsamp_table, zheng)

metric_df = confusion_evaluate(
    corr_mat,
    [pearson_corr_mat, spearman_corr_mat, zheng_corr_mat, rrzheng_corr_mat],
    ['Pearson', 'Spearman', 'Lovell', 'Robbins Corrected Lovell'])

roc_fig = plot_roc(metric_df, ['-ob', '-og', '-or', '-om'])
prec_fig = plot_recall(metric_df, ['-ob', '-og', '-or', '-om'])
roc_fig.savefig('../results/zeros/uniform_rare_eco_roc_curve.png')
prec_fig.savefig('../results/zeros/uniform_rare_eco_pre_recall_curve.png')

#######################################################################
#                  Random rarefaction correlation                     #
#######################################################################
font = {'family': 'normal', 'weight': 'normal', 'size': 13}
matplotlib.rc('font', **font)

# The trailing arr= argument was cut off in the original; arr=pvals matches
# the identical calls above.
samp_table = np.apply_along_axis(
    lambda p: np.random.multinomial(n=np.random.geometric(1 / 2000) + 2000,
                                    pvals=p),
    axis=1, arr=pvals)
# (function header restored; the snippet was truncated at the top and matches
# the get_scores helper defined in the earlier variant of this exercise)
def get_scores(classifier, X_train, X_test, y_train, y_test, **kwargs):
    model = classifier(**kwargs)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    return model.score(X_test, y_test), \
        precision_score(y_test, y_predict), \
        recall_score(y_test, y_predict)

print("16. Model, Accuracy, Precision, Recall")
print("    Random Forest:", get_scores(RandomForestClassifier, X_train, X_test, y_train, y_test, n_estimators=25, max_features=5))
print("    Logistic Regression:", get_scores(LogisticRegression, X_train, X_test, y_train, y_test))
print("    Decision Tree:", get_scores(DecisionTreeClassifier, X_train, X_test, y_train, y_test))
print("    SVM:", get_scores(SVC, X_train, X_test, y_train, y_test))
print("    Naive Bayes:", get_scores(MultinomialNB, X_train, X_test, y_train, y_test))

## MODEL                 ACCURACY   PRECISION   RECALL
## Random Forest         0.9508     0.8817      0.7321
## Logistic Regression   0.8741     0.6129      0.1696
## Decision Tree         0.9209     0.6949      0.7321

print("17. Use the included `plot_roc` function to visualize the roc curve of each model")
# plot_roc refers to the helper imported in the first lines of the file
plot_roc(X, y, RandomForestClassifier, n_estimators=25, max_features=5)
plot_roc(X, y, LogisticRegression)
plot_roc(X, y, DecisionTreeClassifier)
plot_roc(X, y, SVC, probability=True)
# plot_roc(X, y, MultinomialNB)
plot_roc(X, y, GaussianNB)
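# Note on the two changes relative to the earlier variant of this exercise:
# SVC only exposes predict_proba when constructed with probability=True, which
# plot_roc needs to score the curve; and GaussianNB replaces the commented-out
# MultinomialNB, presumably because MultinomialNB requires non-negative
# feature values.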
pearson_corr_mat = abs(np.corrcoef(table.T))
spearman_corr_mat = abs(spearmanr(table)[0])
zheng_corr_mat = get_corr_matrix(table, zheng)
# Can insert sparcc_corr_mat right here. Just need to
# 1. read the text file of correlations into a pandas DataFrame
# 2. extract the matrix via DataFrame.to_numpy() (formerly as_matrix())
# 3. take the absolute value via the pandas abs() method

metric_df = confusion_evaluate(
    corr_mat,
    [pearson_corr_mat, spearman_corr_mat, zheng_corr_mat],
    ['Pearson', 'Spearman', 'Lovell'])

roc_fig = plot_roc(metric_df, ['-ob', '-og', '-or'])
prec_fig = plot_recall(metric_df, ['-ob', '-og', '-or'])
roc_fig.savefig('%s/simple_nonzero_eco_roc_curve.png' % res_folder)
prec_fig.savefig('%s/simple_nonzero_eco_pre_recall_curve.png' % res_folder)

#######################################################################
#                         Uniform rarefaction                         #
#######################################################################
pvals = np.apply_along_axis(lambda x: x / x.sum(), axis=1, arr=table)
samp_table = np.apply_along_axis(
    lambda p: np.random.multinomial(n=10**9, pvals=p), axis=1, arr=pvals)
pearson_corr_mat = abs(np.corrcoef(samp_table.T))
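# A minimal sketch of the SparCC insertion described in the comment above,
# assuming the correlations live in a tab-delimited text file (the path and
# delimiter are illustrative assumptions):
import pandas as pd

sparcc_df = pd.read_csv('sparcc_correlations.txt', sep='\t', index_col=0)  # step 1
sparcc_corr_mat = sparcc_df.abs().to_numpy()                               # steps 2-3
# then append sparcc_corr_mat and a 'SparCC' label to the confusion_evaluate
# arguments to include it in the ROC and precision-recall comparisons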