# Collect one prediction vector per ensemble member.
for tree in trees:
    predicted_labels_set.append(tree.evaluate_multiple(test_features_df))

# Majority vote per test sample: np.bincount tallies the votes and np.argmax
# returns the most frequent label (the smallest label wins a tie).
# np.bincount only accepts non-negative integers, so the member predictions
# have to be integer class labels.
predicted_labels = []
for i in range(len(predicted_labels_set[0])):
    votes = [member_predictions[i]
             for member_predictions in predicted_labels_set]
    predicted_labels.append(np.argmax(np.bincount(votes)))
predicted_labels = np.asarray(predicted_labels)

# NOTE: `tree` is still bound to the last element of `trees` at this point;
# its plot_confusion_matrix method is what scores the voted predictions.
tree_confusion_matrices["Boosted C4.5"].append(
    tree.plot_confusion_matrix(
        test_labels_df['cat'].values.astype(str),
        predicted_labels.astype(str),
    )
)

# Build a single tree with the genetic merger and score it on the same fold.
merger = DecisionTreeMerger()
best_tree = merger.genetic_algorithm(
    train_df, 'cat', tree_constructors,
    seed=SEED,
    num_iterations=10,
    num_mutations=5,
    population_size=10,
    max_samples=8,
    val_fraction=0.2,
    num_boosts=3,
)
predicted_labels = best_tree.evaluate_multiple(test_features_df)
tree_confusion_matrices["Genetic"].append(
    best_tree.plot_confusion_matrix(
        test_labels_df['cat'].values.astype(str),
        predicted_labels.astype(str),
    )
)
# Training frame = features plus the class column under the name 'cat'.
train_df = train_features_df.copy()
train_df['cat'] = train_labels_df['cat'].copy()

# Base learners handed to the genetic merger.
c45 = C45Constructor(cf=0.15)
cart = CARTConstructor(min_samples_leaf=10)
#c45_2 = C45Constructor(cf=0.15)
#c45_3 = C45Constructor(cf=0.75)
quest = QuestConstructor(default=1, max_nr_nodes=5, discrete_thresh=10,
                         alpha=0.1)
tree_constructors = [c45, cart, quest]

# Every visualisation and the test CSV live under ../data.
data_dir = os.path.join('..', 'data')

# Fit one tree per constructor and write its visualisation under the
# constructor's name.
trees = []
for tree_constructor in tree_constructors:
    tree = tree_constructor.construct_tree(train_features_df, train_labels_df)
    tree.visualise(os.path.join(data_dir, tree_constructor.get_name()))
    trees.append(tree)

# In this script genetic_algorithm returns a (best tree, constructed trees)
# pair; only the best tree joins `trees`.
merger = DecisionTreeMerger()
best_tree, constructed_trees = merger.genetic_algorithm(
    train_df, 'cat', tree_constructors, seed=SEED, num_iterations=2)
best_tree.visualise(os.path.join(data_dir, 'best_tree'))
trees.append(best_tree)
# trees.extend(constructed_trees)

# Load the Titanic test set and overwrite its header with these names.
columns = [
    'PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
    'Ticket', 'Fare', 'Cabin', 'Embarked',
]
df = read_csv(os.path.join(data_dir, 'titanic_test.csv'), sep=',')
df.columns = columns

# Keep the feature columns ('Cabin' is left out) and encode the categorical
# ones through the mapping tables defined elsewhere in the script.
kept_columns = ['PassengerId', 'Pclass', 'Sex', 'Parch', 'Age', 'Name',
                'Ticket', 'SibSp', 'Fare', 'Embarked']
test_features_df = df[kept_columns].copy()
test_features_df['Sex'] = test_features_df['Sex'].map(mapping_sex)
test_features_df['Embarked'] = test_features_df['Embarked'].map(mapping_embarked)

# 'Name' is only needed to derive 'Title'; it is dropped afterwards and the
# title is then encoded.
test_features_df['Title'] = test_features_df['Name'].map(get_title)
test_features_df = test_features_df.drop('Name', axis=1)
test_features_df['Title'] = test_features_df['Title'].map(mapping_title)
for train_index, test_index in skf: train_features_df, test_features_df = features_df.iloc[ train_index, :].copy(), features_df.iloc[test_index, :].copy() train_labels_df, test_labels_df = labels_df.iloc[ train_index, :].copy(), labels_df.iloc[test_index, :].copy() train_features_df = train_features_df.reset_index(drop=True) test_features_df = test_features_df.reset_index(drop=True) train_labels_df = train_labels_df.reset_index(drop=True) test_labels_df = test_labels_df.reset_index(drop=True) train_df = train_features_df.copy() train_df['cat'] = train_labels_df['cat'].copy() tree = c45.construct_tree(train_features_df, train_labels_df) tree.populate_samples(train_features_df, train_labels_df['cat']) # tree.visualise('c45_unaugmented') merger = DecisionTreeMerger() regions = merger.decision_tree_to_decision_table(tree, train_features_df) for region in regions: for feature in feature_column_names: if region[feature][0] == float("-inf"): region[feature][0] = feature_mins[feature] if region[feature][1] == float("inf"): region[feature][1] = feature_maxs[feature] new_df = merger.generate_samples(regions, features_df.columns, descriptors) sample_labels_df = new_df[['cat']].copy() augmented_labels_df = sample_labels_df.append(train_labels_df, ignore_index=True) new_df = new_df.drop('cat', axis=1) augmented_features_df = new_df.append(train_features_df, ignore_index=True) augmented_features_df = augmented_features_df.astype(float) augmented_tree = c45.construct_tree(augmented_features_df,
train_df['cat'] = train_labels_df['cat'].copy() trees = [] for tree_constructor in tree_constructors: tree = tree_constructor.construct_tree(train_features_df, train_labels_df) #tree.visualise(os.path.join(os.path.join('..', 'data'), tree_constructor.get_name())) trees.append(tree) predicted_labels = tree.evaluate_multiple(test_features_df) tree_confusion_matrices[tree_constructor.get_name()].append(tree.plot_confusion_matrix(test_labels_df['cat'] .values.astype(str), predicted_labels.astype(str))) print tree.plot_confusion_matrix(test_labels_df['cat'].values.astype(str), predicted_labels.astype(str)) merger = DecisionTreeMerger() #important params: max_samples, val_fraction and num_boosts best_tree = merger.genetic_algorithm(train_df, 'cat', tree_constructors, seed=SEED, num_iterations=5, num_mutations=3, population_size=6, max_samples=2, val_fraction=0.15, num_boosts=2) #best_tree.visualise(os.path.join(os.path.join('..', 'data'), 'best_tree')) predicted_labels = best_tree.evaluate_multiple(test_features_df) tree_confusion_matrices["Genetic"].append(best_tree.plot_confusion_matrix(test_labels_df['cat'].values.astype(str), predicted_labels.astype(str))) print best_tree.plot_confusion_matrix(test_labels_df['cat'].values.astype(str), predicted_labels.astype(str)) #raw_input("Press Enter to continue...") tree_confusion_matrices_mean = {} for key in tree_confusion_matrices: print key for matrix in tree_confusion_matrices[key]:
# Two stratified, shuffled folds over the class column (legacy
# sklearn.cross_validation API: the labels go to the constructor and the
# object is iterated directly).
skf = sklearn.cross_validation.StratifiedKFold(
    labels_df['cat'], n_folds=2, shuffle=True, random_state=SEED)

NEG_INF = float("-inf")
POS_INF = float("inf")

# Per fold: fit C4.5, turn it into a decision table, generate samples from
# the table's regions and refit C4.5 on generated + real training data.
for train_index, test_index in skf:
    # Slice out the fold and renumber the rows from 0.
    train_features_df = features_df.iloc[train_index, :].copy().reset_index(drop=True)
    test_features_df = features_df.iloc[test_index, :].copy().reset_index(drop=True)
    train_labels_df = labels_df.iloc[train_index, :].copy().reset_index(drop=True)
    test_labels_df = labels_df.iloc[test_index, :].copy().reset_index(drop=True)

    train_df = train_features_df.copy()
    train_df['cat'] = train_labels_df['cat'].copy()

    tree = c45.construct_tree(train_features_df, train_labels_df)
    tree.populate_samples(train_features_df, train_labels_df['cat'])
    # tree.visualise('c45_unaugmented')

    merger = DecisionTreeMerger()
    regions = merger.decision_tree_to_decision_table(tree, train_features_df)

    # Replace open-ended region bounds with the observed feature min/max.
    for region in regions:
        for feature in feature_column_names:
            bounds = region[feature]
            if bounds[0] == NEG_INF:
                bounds[0] = feature_mins[feature]
            if bounds[1] == POS_INF:
                bounds[1] = feature_maxs[feature]

    # Generated rows come first, the real training rows are appended after.
    new_df = merger.generate_samples(regions, features_df.columns, descriptors)
    sample_labels_df = new_df[['cat']].copy()
    augmented_labels_df = sample_labels_df.append(train_labels_df,
                                                  ignore_index=True)
    new_df = new_df.drop('cat', axis=1)
    augmented_features_df = new_df.append(train_features_df,
                                          ignore_index=True)
    augmented_features_df = augmented_features_df.astype(float)

    # Refit on the augmented data, but populate the tree with the real
    # training samples only.
    augmented_tree = c45.construct_tree(augmented_features_df,
                                        augmented_labels_df)
    augmented_tree.populate_samples(train_features_df, train_labels_df['cat'])
# Continue with the reduced feature frame built earlier in the script.
features_df = new_features
feature_column_names = list(set(features_df.columns) - set(['Survived']))

# Observed value range of every feature. Note the bounds are read from `df`,
# not from `features_df`.
feature_mins = {}
feature_maxs = {}
for feature in feature_column_names:
    feature_mins[feature] = np.min(df[feature])
    feature_maxs[feature] = np.max(df[feature])

# Base learners handed to the genetic merger.
c45 = C45Constructor(cf=0.15)
cart = CARTConstructor(max_depth=3, min_samples_split=3)
quest = QuestConstructor(default=1, max_nr_nodes=3, discrete_thresh=10,
                         alpha=0.1)
tree_constructors = [c45, cart, quest]

merger = DecisionTreeMerger()

# The genetic algorithm takes features and the class column in one frame.
train_df = features_df.copy()
train_df['cat'] = labels_df['cat'].copy()

genetic_params = dict(
    seed=SEED,
    num_iterations=10,
    num_mutations=5,
    population_size=10,
    max_samples=1,
    val_fraction=0.25,
    num_boosts=7,
)
best_tree = merger.genetic_algorithm(train_df, 'cat', tree_constructors,
                                     **genetic_params)
# best_tree = merger.genetic_algorithm(train_df, 'cat', tree_constructors, seed=SEED, num_iterations=1,
#                                      num_mutations=1, population_size=1, max_samples=1, val_fraction=0.05)

# Reference trees fitted on the full data set; the construction order of the
# original script is kept.
c45_tree = c45.construct_tree(features_df, labels_df)
quest_tree = quest.construct_tree(features_df, labels_df)
c45_tree.populate_samples(features_df, labels_df['cat'])
quest_tree.populate_samples(features_df, labels_df['cat'])
cart_tree = cart.construct_tree(features_df, labels_df)

# Only the genetic, C4.5 and CART trees are visualised in the visible part.
best_tree.visualise('best_tree')
c45_tree.visualise('c45')
cart_tree.visualise('cart')
'resting electrocardio', 'max heartrate', 'exercise induced angina', 'oldpeak', 'slope peak', \ 'number of vessels', 'thal', 'disease'] df = read_csv(os.path.join(os.path.join('..', 'data'), 'heart.dat'), sep=' ') df.columns=columns df=df.reset_index(drop=True) labels_df = DataFrame() labels_df['cat'] = df['disease'].copy() features_df = df.copy() features_df = features_df.drop('disease', axis=1) train_labels_df = labels_df train_features_df = features_df[['max heartrate', 'resting blood pressure']] feature_mins = {} feature_maxs = {} feature_column_names = ['max heartrate', 'resting blood pressure'] for feature in feature_column_names: feature_mins[feature] = np.min(df[feature]) feature_maxs[feature] = np.max(df[feature]) merger = DecisionTreeMerger() # cart = CARTConstructor(min_samples_leaf=10, max_depth=2) cart = C45Constructor(cf=1.0) tree = cart.construct_tree(train_features_df, train_labels_df) tree.populate_samples(train_features_df, train_labels_df['cat']) tree.visualise("2d_tree") regions = merger.decision_tree_to_decision_table(tree, train_features_df) print regions merger.plot_regions("2d_regions", regions, ['1', '2'], "max heartrate", "resting blood pressure", y_max=feature_maxs["resting blood pressure"], x_max=feature_maxs["max heartrate"], y_min=feature_mins["resting blood pressure"], x_min=feature_mins["max heartrate"])
import time

from sklearn.cross_validation import StratifiedShuffleSplit

from constructors.c45orangeconstructor import C45Constructor
from constructors.treemerger import DecisionTreeMerger
from constructors.treemerger_clean import DecisionTreeMergerClean
from data.load_datasets import load_led7

import numpy as np

# Two merger implementations are instantiated side by side.
merger = DecisionTreeMergerClean()
merger2 = DecisionTreeMerger()

# load_led7 returns the data frame, the feature column names, the label
# column name and a data set name.
df, features, label, name = load_led7()
c45 = C45Constructor()

# Legacy splitter API: labels first, then the positional argument 1
# (presumably the number of splits -- confirm against the installed
# scikit-learn version); half of the rows are held out for testing.
skf = StratifiedShuffleSplit(df[label], 1, test_size=0.5, random_state=1337)

# Observed value range of every feature.
feature_mins = {}
feature_maxs = {}
for feature in features:
    feature_mins[feature] = np.min(df[feature])
    feature_maxs[feature] = np.max(df[feature])

for fold, (train_idx, test_idx) in enumerate(skf):
    # print 'Fold', fold+1, '/', NR_FOLDS
    # Renumber the rows of each partition from 0, then separate the label
    # column from the feature columns.
    train = df.iloc[train_idx, :].reset_index(drop=True)
    test = df.iloc[test_idx, :].reset_index(drop=True)
    X_train, y_train = train.drop(label, axis=1), train[label]
    X_test, y_test = test.drop(label, axis=1), test[label]
# NOTE(review): this first statement presumably belongs to a feature-selection
# loop over `k` whose header is outside the visible chunk -- confirm.
new_features[feature_column_names[best_features[k]]] = features_df[feature_column_names[best_features[k]]]

# Continue with the selected features only. The feature names are then
# re-derived from `df` (every column except the class column), not from the
# reduced frame.
features_df = new_features
feature_column_names = list(set(df.columns) - set(['disease']))

# Base learners handed to the genetic merger.
c45 = C45Constructor(cf=0.01)
cart = CARTConstructor(min_samples_leaf=10, max_depth=6)
quest = QuestConstructor(default=1, max_nr_nodes=1, discrete_thresh=25,
                         alpha=0.05)
tree_constructors = [c45, cart, quest]

# NOTE(review): in the collapsed source it is ambiguous whether the next line
# was commented-out code or a bare `#` separator followed by the assignment.
# `tree_confusion_matrices` must already exist before the loop below runs.
# tree_confusion_matrices = {}

# One result list per model and per representation (tree / regions).
for tree_constructor in tree_constructors:
    for suffix in (" Tree", " Regions"):
        tree_confusion_matrices[tree_constructor.get_name() + suffix] = []
for genetic_key in ("Genetic Regions", "Genetic Tree"):
    tree_confusion_matrices[genetic_key] = []

merger = DecisionTreeMerger()

# Stratified, shuffled folds over the class column (legacy
# sklearn.cross_validation API).
skf = sklearn.cross_validation.StratifiedKFold(
    labels_df['cat'], n_folds=N_FOLDS, shuffle=True, random_state=SEED)

for train_index, test_index in skf:
    # Slice out the fold and renumber the rows from 0.
    train_features_df = features_df.iloc[train_index, :].copy().reset_index(drop=True)
    test_features_df = features_df.iloc[test_index, :].copy().reset_index(drop=True)
    train_labels_df = labels_df.iloc[train_index, :].copy().reset_index(drop=True)
    test_labels_df = labels_df.iloc[test_index, :].copy().reset_index(drop=True)

    # Features plus class column in one frame.
    train_df = train_features_df.copy()
    train_df['cat'] = train_labels_df['cat'].copy()

    trees = []
# Remaining base learners for the genetic merger (`c45` is created outside
# the visible chunk).
cart = CARTConstructor(min_samples_leaf=10)
#c45_2 = C45Constructor(cf=0.15)
#c45_3 = C45Constructor(cf=0.75)
quest = QuestConstructor(default=1, max_nr_nodes=5, discrete_thresh=10,
                         alpha=0.1)
tree_constructors = [c45, cart, quest]

# Every visualisation and the test CSV live under ../data.
data_dir = os.path.join('..', 'data')

# Fit one tree per constructor and write its visualisation under the
# constructor's name.
trees = []
for tree_constructor in tree_constructors:
    tree = tree_constructor.construct_tree(train_features_df, train_labels_df)
    tree.visualise(os.path.join(data_dir, tree_constructor.get_name()))
    trees.append(tree)

# In this script genetic_algorithm returns a (best tree, constructed trees)
# pair; only the best tree joins `trees`.
merger = DecisionTreeMerger()
best_tree, constructed_trees = merger.genetic_algorithm(
    train_df, 'cat', tree_constructors, seed=SEED, num_iterations=2)
best_tree.visualise(os.path.join(data_dir, 'best_tree'))
trees.append(best_tree)
# trees.extend(constructed_trees)

# Column names for the Titanic test set, followed by the load itself.
columns = [
    'PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
    'Ticket', 'Fare', 'Cabin', 'Embarked',
]
df = read_csv(os.path.join(data_dir, 'titanic_test.csv'), sep=',')
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix

from constructors.c45orangeconstructor import C45Constructor
from constructors.treemerger import DecisionTreeMerger
from constructors.treemerger_clean import DecisionTreeMergerClean
from data.load_datasets import load_austra

from sklearn import preprocessing
import numpy as np
import pandas as pd
import collections
import operator

# Two merger implementations are instantiated side by side.
merger = DecisionTreeMergerClean()
merger2 = DecisionTreeMerger()

# load_austra returns the data frame, the feature column names, the label
# column name and a data set name.
df, features, label, name = load_austra()
c45 = C45Constructor(cf=0.0)

# Legacy splitter API: labels first, then the positional argument 1
# (presumably the number of splits -- confirm against the installed
# scikit-learn version); a quarter of the rows is held out for testing.
skf = StratifiedShuffleSplit(df[label], 1, test_size=0.25, random_state=1337)

# Every feature gets the fixed range [0.0, 1.0] instead of its observed
# minimum and maximum.
feature_mins = {}
feature_maxs = {}
for feature in features:
    feature_mins[feature], feature_maxs[feature] = 0.0, 1.0

for fold, (train_idx, test_idx) in enumerate(skf):
    # print 'Fold', fold+1, '/', NR_FOLDS
    # Renumber the training rows from 0 and split off the label column.
    train = df.iloc[train_idx, :].reset_index(drop=True)
    X_train = train.drop(label, axis=1)
df.columns = columns df = df.reset_index(drop=True) labels_df = DataFrame() labels_df['cat'] = df['disease'].copy() features_df = df.copy() features_df = features_df.drop('disease', axis=1) train_labels_df = labels_df train_features_df = features_df[['max heartrate', 'resting blood pressure']] feature_mins = {} feature_maxs = {} feature_column_names = ['max heartrate', 'resting blood pressure'] for feature in feature_column_names: feature_mins[feature] = np.min(df[feature]) feature_maxs[feature] = np.max(df[feature]) merger = DecisionTreeMerger() # cart = CARTConstructor(min_samples_leaf=10, max_depth=2) cart = C45Constructor(cf=1.0) tree = cart.construct_tree(train_features_df, train_labels_df) tree.populate_samples(train_features_df, train_labels_df['cat']) tree.visualise("2d_tree") regions = merger.decision_tree_to_decision_table(tree, train_features_df) print regions merger.plot_regions("2d_regions", regions, ['1', '2'], "max heartrate", "resting blood pressure", y_max=feature_maxs["resting blood pressure"], x_max=feature_maxs["max heartrate"], y_min=feature_mins["resting blood pressure"],