def get_best_c45_classifier(train, label_col, skf_tune):
    """Tune the C4.5 confidence factor (cf) by cross-validation.

    For every cf candidate in {0.05, 0.10, ..., 1.0} a tree is grown on each
    tuning fold and scored on the matching validation fold; the constructor
    is returned with its cf set to the candidate whose mean validation error
    across folds is lowest.

    :param train: DataFrame containing the features plus the label column
    :param label_col: name of the label column inside ``train``
    :param skf_tune: iterable of (train_idx, val_idx) index-array pairs
    :return: a C45Constructor whose ``cf`` attribute is the best candidate
    """
    c45 = C45Constructor()
    candidate_cfs = np.arange(0.05, 1.05, 0.05)
    fold_errors = {cf: [] for cf in candidate_cfs}
    for tune_idx, val_idx in skf_tune:
        tune_fold = train.iloc[tune_idx, :]
        val_fold = train.iloc[val_idx, :]
        X_tune = tune_fold.drop(label_col, axis=1)
        y_tune = tune_fold[label_col]
        X_val = val_fold.drop(label_col, axis=1)
        y_val = val_fold[label_col]
        # Reuse one constructor object, mutating its cf per candidate.
        for cf in candidate_cfs:
            c45.cf = cf
            tree = c45.construct_tree(X_tune, y_tune)
            predictions = tree.evaluate_multiple(X_val).astype(int)
            # Error rate = 1 - accuracy on this validation fold.
            fold_errors[cf].append(
                1 - accuracy_score(predictions, y_val, normalize=True))
    mean_errors = {cf: np.mean(errs) for cf, errs in fold_errors.items()}
    # Keep the cf with the smallest mean error (first one wins ties).
    c45.cf = min(mean_errors, key=mean_errors.get)
    return c45
feature_column_names[best_features_rf[k]]] for k in range(num_features_boruta): new_features_boruta[feature_column_names[best_features_boruta[ k]]] = features_df[feature_column_names[best_features_boruta[k]]] confusion_matrices = {} confusion_matrices['RF'] = [] confusion_matrices['Boruta'] = [] features_df_rf = new_features_rf features_df_boruta = new_features_boruta feature_column_names_rf = list(set(features_df_rf.columns) - set(['cat'])) feature_column_names_boruta = list( set(features_df_boruta.columns) - set(['cat'])) c45 = C45Constructor(cf=0.15) skf = sklearn.cross_validation.StratifiedKFold(labels_df['cat'], n_folds=N_FOLDS, shuffle=True, random_state=SEED) for train_index, test_index in skf: train_features_df_rf, test_features_df_rf = features_df_rf.iloc[ train_index, :].copy(), features_df_rf.iloc[test_index, :].copy() train_features_df_boruta, test_features_df_boruta = features_df_boruta.iloc[ train_index, :].copy(), features_df_boruta.iloc[test_index, :].copy() train_labels_df, test_labels_df = labels_df.iloc[ train_index, :].copy(), labels_df.iloc[test_index, :].copy() train_features_df_rf = train_features_df_rf.reset_index(drop=True)
feature_maxs = {} feature_column_names = list(set(df.columns) - set(['disease'])) for feature in feature_column_names: feature_mins[feature] = np.min(df[feature]) feature_maxs[feature] = np.max(df[feature]) df = df.reset_index(drop=True) labels_df = DataFrame() labels_df['cat'] = df['disease'].copy() features_df = df.copy() features_df = features_df.drop('disease', axis=1) features_df = features_df / features_df.max() train_labels_df = labels_df train_features_df = features_df c45 = C45Constructor(cf=0.01) cart = CARTConstructor(min_samples_leaf=10, max_depth=6) quest = QuestConstructor(default=1, max_nr_nodes=1, discrete_thresh=25, alpha=0.05) tree_constructors = [c45, cart, quest] tree_confusion_matrices = {} titles = ["C4.5", "Boosted C4.5", "Genetic"] for title in titles: tree_confusion_matrices[title] = [] skf = sklearn.cross_validation.StratifiedKFold(labels_df['cat'], n_folds=N_FOLDS, shuffle=True,
# --- Dataset preparation: feature ranges, label/feature split, constructors ---
# NOTE(review): assumes `df` (with a 'class' column) was loaded earlier in the
# file; the CV loop that consumes these objects follows after this chunk.
feature_mins = {}
feature_maxs = {}
# Every column except the label 'class' is treated as a feature.
feature_column_names = list(set(df.columns) - set(['class']))
for feature in feature_column_names:
    # Record observed min/max per feature column.
    feature_mins[feature] = np.min(df[feature])
    feature_maxs[feature] = np.max(df[feature])
df = df.reset_index(drop=True)
labels_df = DataFrame()
labels_df['cat'] = df['class'].copy()
features_df = df.copy()
features_df = features_df.drop('class', axis=1)
train_labels_df = labels_df
train_features_df = features_df
# Tree constructors under comparison, with hand-tuned hyper-parameters.
c45 = C45Constructor(cf=0.95)
cart = CARTConstructor(max_depth=12, min_samples_leaf=2)
quest = QuestConstructor(default=1, max_nr_nodes=1, discrete_thresh=10, alpha=0.99)
# Alternative hyper-parameter settings (commented out):
# c45 = C45Constructor(cf=0.75)
# cart = CARTConstructor(max_depth=10, min_samples_leaf=2)
# quest = QuestConstructor(default=1, max_nr_nodes=2, discrete_thresh=10, alpha=0.9)
tree_constructors = [c45, cart, quest]
# One confusion-matrix list per constructor, plus one for "Genetic".
tree_confusion_matrices = {}
for tree_constructor in tree_constructors:
    tree_confusion_matrices[tree_constructor.get_name()] = []
tree_confusion_matrices["Genetic"] = []
# --- Dataset preparation ('Name' is the label column) ---
# NOTE(review): assumes `df`, `N_FOLDS` and `SEED` are defined earlier in the file.
feature_mins = {}
feature_maxs = {}
feature_column_names = list(set(df.columns) - set(['Name']))
for feature in feature_column_names:
    # Record observed min/max per feature column.
    feature_mins[feature] = np.min(df[feature])
    feature_maxs[feature] = np.max(df[feature])
df=df.reset_index(drop=True)
labels_df = DataFrame()
labels_df['cat'] = df['Name'].copy()
features_df = df.copy()
features_df = features_df.drop('Name', axis=1)
train_labels_df = labels_df
train_features_df = features_df
# Constructors with hand-tuned hyper-parameters for this dataset.
c45 = C45Constructor(cf=1.0)
cart = CARTConstructor(max_depth=5, min_samples_leaf=2)
quest = QuestConstructor(default=1, max_nr_nodes=2, discrete_thresh=1, alpha=0.0000001)
tree_constructors = [c45, cart, quest]
# One confusion-matrix list per constructor, plus one for "Genetic".
tree_confusion_matrices = {}
for tree_constructor in tree_constructors:
    tree_confusion_matrices[tree_constructor.get_name()] = []
tree_confusion_matrices["Genetic"] = []
# Stratified CV so every fold keeps the class distribution.
skf = sklearn.cross_validation.StratifiedKFold(labels_df['cat'], n_folds=N_FOLDS, shuffle=True, random_state=SEED)
for train_index, test_index in skf:
    # Per-fold train/test split (copies, re-indexed from 0).
    train_features_df, test_features_df = features_df.iloc[train_index,:].copy(), features_df.iloc[test_index,:].copy()
    train_labels_df, test_labels_df = labels_df.iloc[train_index,:].copy(), labels_df.iloc[test_index,:].copy()
    train_features_df = train_features_df.reset_index(drop=True)
    # NOTE(review): the loop body continues beyond this chunk.
# --- Dataset preparation with max-scaling; C4.5 augmentation experiment ---
# NOTE(review): `feature_mins`, `df`, and `SEED` are initialised before this chunk.
feature_maxs = {}
feature_column_names = list(set(df.columns) - set(['Name']))
for feature in feature_column_names:
    # Record observed min/max per feature column.
    feature_mins[feature] = np.min(df[feature])
    feature_maxs[feature] = np.max(df[feature])
df=df.reset_index(drop=True)
labels_df = DataFrame()
labels_df['cat'] = df['Name'].copy()
features_df = df.copy()
features_df = features_df.drop('Name', axis=1)
# Scale each feature column by its column maximum (max maps to 1).
features_df = features_df/features_df.max()
train_labels_df = labels_df
train_features_df = features_df
c45 = C45Constructor(cf=0.75)
# Two experiment arms compared via confusion matrices.
tree_confusion_matrices = {}
titles = ["Unaugmented C4.5", "Augmented C4.5"]
skf = StratifiedKFold(labels_df['cat'], n_folds=5, shuffle=True, random_state=SEED)
for train_index, test_index in skf:
    # Per-fold train/test split (copies, re-indexed from 0).
    train_features_df, test_features_df = features_df.iloc[train_index,:].copy(), features_df.iloc[test_index,:].copy()
    train_labels_df, test_labels_df = labels_df.iloc[train_index,:].copy(), labels_df.iloc[test_index,:].copy()
    train_features_df = train_features_df.reset_index(drop=True)
    test_features_df = test_features_df.reset_index(drop=True)
    train_labels_df = train_labels_df.reset_index(drop=True)
    test_labels_df = test_labels_df.reset_index(drop=True)
    # Recombine features with the label for training.
    train_df = train_features_df.copy()
    train_df['cat'] = train_labels_df['cat'].copy()
    # NOTE(review): the loop body continues beyond this chunk.
df[feature] += np.min(df[feature]) * (-1) feature_mins[feature] = 0 else: feature_mins[feature] = np.min(df[feature]) feature_maxs[feature] = np.max(df[feature]) df=df.reset_index(drop=True) labels_df = DataFrame() labels_df['cat'] = df['class'].copy() features_df = df.copy() features_df = features_df.drop('class', axis=1) train_labels_df = labels_df train_features_df = features_df c45 = C45Constructor(cf=0.65) cart = CARTConstructor(min_samples_leaf=5, max_depth=6) quest = QuestConstructor(default=1, max_nr_nodes=3, discrete_thresh=1, alpha=0.25) tree_constructors = [c45, cart, quest] # tree_constructors = [quest] tree_confusion_matrices = {} for tree_constructor in tree_constructors: tree_confusion_matrices[tree_constructor.get_name()] = [] tree_confusion_matrices["Genetic"] = [] skf = sklearn.cross_validation.StratifiedKFold(labels_df['cat'], n_folds=N_FOLDS, shuffle=False, random_state=SEED) for train_index, test_index in skf: train_features_df, test_features_df = features_df.iloc[train_index,:].copy(), features_df.iloc[test_index,:].copy() train_labels_df, test_labels_df = labels_df.iloc[train_index,:].copy(), labels_df.iloc[test_index,:].copy()
# --- LED7 experiment: single stratified 50/50 shuffle split ---
import time
from sklearn.cross_validation import StratifiedShuffleSplit
from constructors.c45orangeconstructor import C45Constructor
from constructors.treemerger import DecisionTreeMerger
from constructors.treemerger_clean import DecisionTreeMergerClean
from data.load_datasets import load_led7
import numpy as np
merger = DecisionTreeMergerClean()
merger2 = DecisionTreeMerger()
df, features, label, name = load_led7()
c45 = C45Constructor()
# One stratified split, half the data held out for testing.
skf = StratifiedShuffleSplit(df[label], 1, test_size=0.5, random_state=1337)
feature_mins = {}
feature_maxs = {}
for feature in features:
    # Record observed min/max per feature column.
    feature_mins[feature] = np.min(df[feature])
    feature_maxs[feature] = np.max(df[feature])
for fold, (train_idx, test_idx) in enumerate(skf):
    # print 'Fold', fold+1, '/', NR_FOLDS
    # Split into train/test frames, re-indexed from 0.
    train = df.iloc[train_idx, :].reset_index(drop=True)
    X_train = train.drop(label, axis=1)
    y_train = train[label]
    test = df.iloc[test_idx, :].reset_index(drop=True)
    X_test = test.drop(label, axis=1)
    y_test = test[label]
    # NOTE(review): the loop body continues beyond this chunk.
# --- Australian-credit experiment: single stratified 75/25 shuffle split ---
# NOTE(review): `StratifiedShuffleSplit` is used below but not imported in this
# chunk — presumably imported earlier in the file; confirm.
from constructors.c45orangeconstructor import C45Constructor
from constructors.treemerger import DecisionTreeMerger
from constructors.treemerger_clean import DecisionTreeMergerClean
from data.load_datasets import load_austra
from sklearn import preprocessing
import numpy as np
import pandas as pd
import collections
import operator
merger = DecisionTreeMergerClean()
merger2 = DecisionTreeMerger()
df, features, label, name = load_austra()
c45 = C45Constructor(cf=0.0)
# One stratified split, a quarter of the data held out for testing.
skf = StratifiedShuffleSplit(df[label], 1, test_size=0.25, random_state=1337)
# Fixed [0, 1] bounds here (features are min-max scaled below), unlike the
# other chunks that record observed per-column min/max.
feature_mins = {}
feature_maxs = {}
for feature in features:
    feature_mins[feature] = 0.0
    feature_maxs[feature] = 1.0
for fold, (train_idx, test_idx) in enumerate(skf):
    # print 'Fold', fold+1, '/', NR_FOLDS
    train = df.iloc[train_idx, :].reset_index(drop=True)
    X_train = train.drop(label, axis=1)
    x = X_train.values #returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    # NOTE(review): the loop body continues beyond this chunk.
# --- 2-D decision-region visualisation on two heart-disease features ---
# NOTE(review): assumes `df` (with a 'disease' column) was loaded earlier.
df = df.reset_index(drop=True)
labels_df = DataFrame()
labels_df['cat'] = df['disease'].copy()
features_df = df.copy()
features_df = features_df.drop('disease', axis=1)
train_labels_df = labels_df
# Keep only two features so the tree's decision regions can be drawn in 2-D.
train_features_df = features_df[['max heartrate', 'resting blood pressure']]
feature_mins = {}
feature_maxs = {}
feature_column_names = ['max heartrate', 'resting blood pressure']
for feature in feature_column_names:
    # Axis bounds for the region plot below.
    feature_mins[feature] = np.min(df[feature])
    feature_maxs[feature] = np.max(df[feature])
merger = DecisionTreeMerger()
# cart = CARTConstructor(min_samples_leaf=10, max_depth=2)
# NOTE(review): despite the name, `cart` holds a C4.5 constructor here (the
# CART line above is commented out).
cart = C45Constructor(cf=1.0)
# NOTE(review): the label DataFrame (not the 'cat' series) is passed to
# construct_tree, unlike other call sites in this file — verify intended.
tree = cart.construct_tree(train_features_df, train_labels_df)
tree.populate_samples(train_features_df, train_labels_df['cat'])
tree.visualise("2d_tree")
# Flatten the tree into axis-aligned rectangular regions, then plot them
# over the two feature axes using the observed min/max as plot bounds.
regions = merger.decision_tree_to_decision_table(tree, train_features_df)
print regions
merger.plot_regions("2d_regions", regions, ['1', '2'], "max heartrate", "resting blood pressure", y_max=feature_maxs["resting blood pressure"], x_max=feature_maxs["max heartrate"], y_min=feature_mins["resting blood pressure"], x_min=feature_mins["max heartrate"])