def get_best_quest_classifier(train, label_col, skf_tune, alphas=None):
    """Pick the QUEST ``alpha`` with the lowest cross-validated error.

    For every ``(train_indices, validation_indices)`` pair produced by
    ``skf_tune`` a tree is built for each candidate alpha on the training rows
    and its misclassification rate (``1 - accuracy``) is measured on the
    validation rows.  The candidate with the lowest mean error over all folds
    is stored on the constructor that is returned.

    Parameters
    ----------
    train
        Table holding both the features and the label column; it is sliced
        positionally with ``.iloc`` and must support ``.drop(..., axis=1)``.
    label_col
        Name of the label column inside ``train``.
    skf_tune
        Iterable of ``(train_indices, validation_indices)`` pairs.  It is
        consumed exactly once.
    alphas
        Optional iterable of candidate alpha values.  When omitted, the grid
        ``[10e-5, 10e-4, 10e-3, 10e-2, 0.25, 0.5, 0.9, 0.99]`` is used.

    Returns
    -------
    QuestConstructor
        A constructor (not a fitted tree) whose ``alpha`` attribute is set to
        the best candidate.  On ties the candidate that appears first in
        ``alphas`` wins.

    Raises
    ------
    ValueError
        If ``alphas`` is given but contains no values.
    """
    if alphas is None:
        # NOTE: 10e-5 == 1e-4, so this grid starts at 1e-4 (not 1e-5).
        alphas = [10e-5, 10e-4, 10e-3, 10e-2, 0.25, 0.5, 0.9, 0.99]
    else:
        # Materialise once: the candidates are iterated for every fold.
        alphas = list(alphas)
    if not alphas:
        raise ValueError("alphas must contain at least one candidate value")

    quest = QuestConstructor()
    errors = {alpha: [] for alpha in alphas}

    for train_tune_idx, val_tune_idx in skf_tune:
        train_tune = train.iloc[train_tune_idx, :]
        val_tune = train.iloc[val_tune_idx, :]

        X_train_tune = train_tune.drop(label_col, axis=1)
        y_train_tune = train_tune[label_col]
        X_val_tune = val_tune.drop(label_col, axis=1)
        y_val_tune = val_tune[label_col]

        for alpha in alphas:
            # The same constructor is reused; only its alpha changes.
            quest.alpha = alpha
            tree = quest.construct_tree(X_train_tune, y_train_tune)
            predictions = tree.evaluate_multiple(X_val_tune).astype(int)
            # accuracy_score expects (y_true, y_pred).
            fold_error = 1 - accuracy_score(y_val_tune, predictions,
                                            normalize=True)
            errors[alpha].append(fold_error)

    mean_errors = {alpha: np.mean(fold_errors)
                   for alpha, fold_errors in errors.items()}

    # Scan the candidates in their given order so that ties are resolved
    # deterministically instead of by dict iteration order.
    quest.alpha = min(alphas, key=mean_errors.get)
    return quest
# Excerpt of an evaluation script for a dataset labelled by the 'disease'
# column.  `feature_column_names`, `feature_mins`, `feature_maxs`, `df`,
# `N_FOLDS` and `SEED` are defined outside this excerpt, and the body of the
# final fold loop continues beyond it.

# Record the minimum and maximum of every feature column (taken from `df`,
# i.e. before the scaling applied to `features_df` below).
for feature in feature_column_names:
    feature_mins[feature] = np.min(df[feature])
    feature_maxs[feature] = np.max(df[feature])

# Renumber the rows 0..n-1, discarding the previous index.
df = df.reset_index(drop=True)

# Labels live in a one-column frame named 'cat'.
labels_df = DataFrame()
labels_df['cat'] = df['disease'].copy()

# Features are every column except the label, each divided by its own maximum.
features_df = df.copy()
features_df = features_df.drop('disease', axis=1)
features_df = features_df / features_df.max()

# These are aliases of the full data, not copies.
train_labels_df = labels_df
train_features_df = features_df

# Tree constructors used by the evaluation.
c45 = C45Constructor(cf=0.01)
cart = CARTConstructor(min_samples_leaf=10, max_depth=6)
quest = QuestConstructor(default=1, max_nr_nodes=1, discrete_thresh=25, alpha=0.05)
tree_constructors = [c45, cart, quest]

# One (initially empty) list of confusion matrices per reported title.
# NOTE(review): the titles do not correspond one-to-one to the constructors
# above (no CART/QUEST entry) - confirm this is intended.
tree_confusion_matrices = {}
titles = ["C4.5", "Boosted C4.5", "Genetic"]
for title in titles:
    tree_confusion_matrices[title] = []

# Legacy `sklearn.cross_validation` API: the object is built from the labels
# and iterated directly to obtain (train_index, test_index) pairs.
skf = sklearn.cross_validation.StratifiedKFold(labels_df['cat'], n_folds=N_FOLDS, shuffle=True, random_state=SEED)

for train_index, test_index in skf:
    # Start each fold with an empty list of trees (rest of the loop body is
    # outside this excerpt).
    trees = []
# Excerpt of an evaluation script for a dataset labelled by the 'Name' column.
# `df`, `feature_mins`, `feature_maxs`, `N_FOLDS` and `SEED` are defined
# outside this excerpt, and the fold loop body continues beyond it.

# Every column except the label column 'Name' is a feature.  Going through a
# set means the resulting order is not the column order of `df`.
feature_column_names = list(set(df.columns) - set(['Name']))

# Record the minimum and maximum of every feature column.
for feature in feature_column_names:
    feature_mins[feature] = np.min(df[feature])
    feature_maxs[feature] = np.max(df[feature])

# Renumber the rows 0..n-1, discarding the previous index.
df=df.reset_index(drop=True)

# Labels live in a one-column frame named 'cat'.
labels_df = DataFrame()
labels_df['cat'] = df['Name'].copy()

# Features are every column except the label.
features_df = df.copy()
features_df = features_df.drop('Name', axis=1)

# Aliases of the full data; both names are rebound inside the fold loop below.
train_labels_df = labels_df
train_features_df = features_df

# Tree constructors used by the evaluation.
c45 = C45Constructor(cf=1.0)
cart = CARTConstructor(max_depth=5, min_samples_leaf=2)
quest = QuestConstructor(default=1, max_nr_nodes=2, discrete_thresh=1, alpha=0.0000001)
tree_constructors = [c45, cart, quest]

# One (initially empty) list of confusion matrices per constructor name, plus
# one for the "Genetic" result.
tree_confusion_matrices = {}
for tree_constructor in tree_constructors:
    tree_confusion_matrices[tree_constructor.get_name()] = []
tree_confusion_matrices["Genetic"] = []

# Legacy `sklearn.cross_validation` API: the object is built from the labels
# and iterated directly to obtain (train_index, test_index) pairs.
skf = sklearn.cross_validation.StratifiedKFold(labels_df['cat'], n_folds=N_FOLDS, shuffle=True, random_state=SEED)

for train_index, test_index in skf:
    # Positional split of features and labels into independent copies.
    train_features_df, test_features_df = features_df.iloc[train_index,:].copy(), features_df.iloc[test_index,:].copy()
    train_labels_df, test_labels_df = labels_df.iloc[train_index,:].copy(), labels_df.iloc[test_index,:].copy()
    # Renumber the rows of each subset from 0 (the excerpt ends before the
    # matching reset of `test_labels_df`, if any).
    train_features_df = train_features_df.reset_index(drop=True)
    test_features_df = test_features_df.reset_index(drop=True)
    train_labels_df = train_labels_df.reset_index(drop=True)
# Excerpt of a script for a dataset whose label column is 'Survived'.
# `num_features`, `best_features`, `feature_column_names`, `features_df`,
# `labels_df`, `df` and `SEED` are defined outside this excerpt.

# Keep only the first `num_features` entries of `best_features`; each entry is
# an index into `feature_column_names`.
new_features = DataFrame()
for k in range(num_features):
    new_features[feature_column_names[best_features[k]]] = features_df[feature_column_names[best_features[k]]]
features_df = new_features

# Recompute the feature names from the reduced frame ('Survived' is excluded
# if present).  Going through a set does not preserve column order.
feature_column_names = list(set(features_df.columns) - set(['Survived']))

# Minimum and maximum of every kept feature, read from `df` rather than from
# `features_df`.
feature_mins = {}
feature_maxs = {}
for feature in feature_column_names:
    feature_mins[feature] = np.min(df[feature])
    feature_maxs[feature] = np.max(df[feature])

# Tree constructors handed to the merger.
c45 = C45Constructor(cf=0.15)
cart = CARTConstructor(max_depth=3, min_samples_split=3)
quest = QuestConstructor(default=1, max_nr_nodes=3, discrete_thresh=10, alpha=0.1)
tree_constructors = [c45, cart, quest]

merger = DecisionTreeMerger()

# The merger receives a single frame: the features plus the label column 'cat'.
train_df = features_df.copy()
train_df['cat'] = labels_df['cat'].copy()
best_tree = merger.genetic_algorithm(train_df, 'cat', tree_constructors, seed=SEED, num_iterations=10, num_mutations=5, population_size=10, max_samples=1, val_fraction=0.25, num_boosts=7)
# Disabled alternative call with minimal settings:
# best_tree = merger.genetic_algorithm(train_df, 'cat', tree_constructors, seed=SEED, num_iterations=1,
#                                      num_mutations=1, population_size=1, max_samples=1, val_fraction=0.05)

# Build the individual trees on the same (full) feature/label frames.
c45_tree = c45.construct_tree(features_df, labels_df)
quest_tree = quest.construct_tree(features_df, labels_df)
# NOTE(review): presumably attaches the training samples to the tree nodes -
# confirm against `populate_samples`.  No such call for `cart_tree` is visible
# in this excerpt.
c45_tree.populate_samples(features_df, labels_df['cat'])
quest_tree.populate_samples(features_df, labels_df['cat'])
cart_tree = cart.construct_tree(features_df, labels_df)
# test_labels_df = test_labels_df.reset_index(drop=True) # train_features_df = features_df.head(int(0.8*len(features_df.index))) # test_features_df = features_df.tail(int(0.2*len(features_df.index))) # train_labels_df = labels_df.head(int(0.8*len(labels_df.index))) # test_labels_df = labels_df.tail(int(0.2*len(labels_df.index))) train_df = train_features_df.copy() train_df['cat'] = train_labels_df['cat'].copy() c45 = C45Constructor(cf=0.15) cart = CARTConstructor(min_samples_leaf=10) #c45_2 = C45Constructor(cf=0.15) #c45_3 = C45Constructor(cf=0.75) quest = QuestConstructor(default=1, max_nr_nodes=5, discrete_thresh=10, alpha=0.1) tree_constructors = [c45, cart, quest] trees = [] for tree_constructor in tree_constructors: tree = tree_constructor.construct_tree(train_features_df, train_labels_df) tree.visualise( os.path.join(os.path.join('..', 'data'), tree_constructor.get_name())) trees.append(tree) merger = DecisionTreeMerger() best_tree, constructed_trees = merger.genetic_algorithm(train_df, 'cat', tree_constructors, seed=SEED, num_iterations=2)
# Excerpt whose line structure has been lost; the code line below is kept
# verbatim.  In order, it contains:
#   1. The tail of a report-writing routine whose header and loop headers are
#      outside this excerpt: it copies measurements[dataset][algorithm] into
#      measurements_temp, appends measurements_temp to measurements_list, then
#      writes the preamble, one section per entry of measurements_list, the
#      figures and the footing to `target`, and closes `target`.
#   2. Benchmark set-up: loads all datasets, instantiates the QUEST-bench,
#      GUIDE and QUEST constructors, the inTrees classifier and the merger,
#      and sets NR_FOLDS = 3.
#   3. The start of the per-dataset loop: prints the dataset name and the
#      length of its dataframe (Python 2 `print` statement), creates
#      `conf_matrices` with one empty list per algorithm name, reads the label
#      and feature column names, and builds a shuffled StratifiedKFold over
#      the label column with random_state=None (not seeded here).
# NOTE(review): `#` markers appear inside the line before `avg_nodes`, `times`
# and `df`; from this excerpt it cannot be determined which of those
# assignments are commented out.  `df` is read by the StratifiedKFold call, so
# confirm against the original file before re-laying out this code.
measurements_temp[dataset][algorithm] = measurements[dataset][algorithm] measurements_list.append(measurements_temp) target.write(write_preamble()) for measurements_ in measurements_list: target.write(write_measurements(measurements_)) target.write(write_figures(figure)) target.write(write_footing()) target.close() datasets = load_all_datasets() quest_bench = QUESTBenchConstructor() guide = GUIDEConstructor() quest = QuestConstructor() inTrees = inTreesClassifier() merger = DecisionTreeMergerClean() NR_FOLDS = 3 for dataset in datasets: print dataset['name'], len(dataset['dataframe']) conf_matrices = {'QUESTGilles': [], 'GUIDE': [], 'C4.5': [], 'CART': [], 'ISM': [], 'ISM_pruned': [], 'Genetic': [], 'CN2': [], 'QUESTLoh': [], 'inTrees': [], 'XGBoost': [], 'RF': []} # avg_nodes = {'QUESTGilles': [], 'GUIDE': [], 'C4.5': [], 'CART': [], 'ISM': [], 'ISM_pruned': [], 'Genetic': [], 'CN2': [], 'QUESTLoh': [], 'inTrees': [], 'XGBoost': [], 'RF': []} # times = {'QUESTGilles': [], 'GUIDE': [], 'C4.5': [], 'CART': [], 'ISM': [], 'ISM_pruned': [], 'Genetic': [], 'CN2': [], 'QUESTLoh': [], 'inTrees': [], 'XGBoost': [], 'RF': []} # df = dataset['dataframe'] label_col = dataset['label_col'] feature_cols = dataset['feature_cols'] skf = StratifiedKFold(df[label_col], n_folds=NR_FOLDS, shuffle=True, random_state=None)