def run_bs_adaboost():
    """Bootstrap-estimate AdaBoost with decision stumps on the training descriptors.

    Loads the training CSV, maps class labels P/N to 1/0, removes outliers
    (IQR-iterative, rows with <20 non-NA values dropped), standardizes the
    features, then hands a class-weighted stump booster to ``main`` with
    bootstrap estimation enabled and prints the result.
    """
    frame = pd.read_csv('Files/csv_result-Descriptors_Training.csv', sep=',')
    # Drop the synthetic id column and encode labels: P -> 1 (positive), N -> 0.
    frame = frame.drop(['id'], axis=1).replace(['P', 'N'], [1, 0])
    frame = prc.handle_outlier(
        prc.detect_outlier_iterative_IQR(frame).dropna(thresh=20))
    frame = prc.standarize(frame)  # or normalize
    # Heavy weight on the positive class (20:1) to counter class imbalance.
    stump = DecisionTreeClassifier(max_depth=1, class_weight={1: 20, 0: 1})
    booster = AdaBoostClassifier(stump, n_estimators=20)
    print(main(frame, "AdaBoost", booster, bs_estimate=True, verbose=True))
# NOTE(review): this is a *duplicate* definition of run_bs_adaboost — an
# identical copy appears earlier in this file; at import time this later
# definition silently shadows the earlier one. Kept intact so callers are
# unaffected, but one of the two copies should be removed.
def run_bs_adaboost():
    """Bootstrap-estimate AdaBoost with decision stumps on the training descriptors.

    Loads the training CSV, maps class labels P/N to 1/0, removes outliers
    (IQR-iterative, rows with <20 non-NA values dropped), standardizes the
    features, then hands a class-weighted stump booster to ``main`` with
    bootstrap estimation enabled and prints the result.
    """
    df = pd.read_csv('Files/csv_result-Descriptors_Training.csv', sep=',')
    # Drop the synthetic id column and encode labels: P -> 1 (positive), N -> 0.
    df = df.drop(['id'], axis=1).replace(['P', 'N'], [1, 0])
    df = prc.handle_outlier(
        prc.detect_outlier_iterative_IQR(df).dropna(thresh=20))
    df = prc.standarize(df)  # or normalize
    # Heavy weight on the positive class (20:1) to counter class imbalance.
    dt = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=1, class_weight={1: 20, 0: 1}),
        n_estimators=20)
    print(main(df, "AdaBoost", dt, bs_estimate=True, verbose=True))


# Driver calls — enable one as needed:
# run_depth_test()
# run_bs_dt()
# run_bs_adaboost()
def run_depth_test():
    """Sweep decision-tree depth on the training descriptors and plot results.

    Loads the training CSV, maps class labels P/N to 1/0, removes outliers
    (IQR-iterative, rows with <20 non-NA values dropped), standardizes the
    features, then runs ``test_tree_depth`` and shows two figures: the PR
    curves accumulated during the sweep, and the Pr@Re>50 metric per depth.
    """
    df = pd.read_csv('Files/csv_result-Descriptors_Training.csv', sep=',')
    # Drop the synthetic id column and encode labels: P -> 1 (positive), N -> 0.
    df = df.drop(['id'], axis=1).replace(['P', 'N'], [1, 0])
    df = prc.handle_outlier(
        prc.detect_outlier_iterative_IQR(df).dropna(thresh=20))
    df = prc.standarize(df)  # or normalize
    rslt = test_tree_depth(df)
    print("Run Time: " + str(datetime.now() - startTime))
    # Print PR Curves from test (curves were added to the active figure by
    # test_tree_depth).
    plt.legend(loc=1)
    plt.title("Precision Recall Curve")
    plt.show()
    # Metric per depth. x is the tree depth (range(2, len(rslt))) and y is the
    # Pr@Re>50 score (rslt[2:]) — the original code had the axis labels swapped.
    plt.plot(list(range(2, len(rslt))), rslt[2:])
    plt.xlabel("Depth of Tree")
    plt.ylabel("Pr@Re>50")
    plt.title("Testing Decision Tree Depth")
    plt.xticks(list(range(2, len(rslt))))
    plt.show()
import numpy as np
import pandas as pd
# NOTE(review): fetch_mldata was removed from modern scikit-learn (use
# fetch_openml instead); it is unused below — confirm before relying on it.
from sklearn.datasets import fetch_mldata
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

import preprocessing as prc
import feature_selection as fs

# Load the training descriptors. Forward-slash path (was 'Files\...') so the
# script also runs on non-Windows systems.
df = pd.read_csv('Files/csv_result-Descriptors_Training.csv', sep=',')
# Drop the synthetic id column and encode labels: P -> 1 (positive), N -> 0.
df = df.drop(['id'], axis=1).replace(['P', 'N'], [1, 0])
df = prc.handle_outlier(prc.detect_outlier_iterative_IQR(df).dropna(thresh=20))
df_norm = prc.normalize(df)  # normalize

features = df_norm.iloc[:, :-1]
target = df_norm.iloc[:, -1]

# Keep the 2 features scoring highest on the ANOVA F-test, plus the label.
f_anova = fs.select_k_best(features, target, f_classif, 2)
# pd.Series.append was removed in pandas 2.0 — build the column list with
# pd.concat instead (behavior-identical to the old .append call).
selected_cols = pd.concat([f_anova.iloc[:, 0], pd.Series(['class'])],
                          ignore_index=True)
df = df_norm[selected_cols]

# Example distribution plot (disabled):
# sns.set()
# plt.title("Distribution of Feature 15")
# sns.distplot(df['Pb_NO_sideR35_S'])
# plt.show()
# plt.figure(figsize=(16,7))
df = fs.select_k_best_ANOVA(data, k=n_features) out = old_main.test_tree_depth(df, class_weight="balanced") summary_balance.append([data_str_name + '-ANOVA', i, out.index(max(out)), max(out)]) df = fs.RFECV_DT(data, min_features_to_select=n_features, max_depth=max_dapth) out = old_main.test_tree_depth(df, class_weight="balanced") summary_balance.append([data_str_name + '-RFECV', i, out.index(max(out)), max(out)]) return summary_balance summary_balance = [] df = pd.read_csv('Files\csv_result-Descriptors_Training.csv', sep=',') df = df.drop(['id'], axis=1).replace(['P', 'N'], [1, 0]) df = prc.handle_outlier(prc.detect_outlier_iterative_IQR(df)) df = prc.standarize(df) # or normalize # ============================================================================= # Unsupervised optimal feature selection | optimal tree depth # ============================================================================= vt = fs.variance_threshold(df, threshold=1) rslt_vt = main.test_tree_depth(vt, class_weight="balanced") summary_balance.append(['variance-threshold', rslt_vt.index(max(rslt_vt)), max(rslt_vt)]) pca_2 = fs.pca_linear(df, n=2) # n_c9 is 9, based VarianceThreshold results, axis to gain most information rslt_pca = main.test_tree_depth(pca_2, class_weight="balanced") summary_balance.append(['pca-2', rslt_pca.index(max(rslt_pca)), max(rslt_pca)]) pca_7 = fs.pca_linear(df, n=7) # n_c9 is 9, based VarianceThreshold results, axis to gain most information