def get_best_cart_classifier(train, label_col, skf_tune):
    """Grid-search CART hyper-parameters by cross-validation.

    Evaluates every (max_depth, min_samples_split) combination on the folds
    in ``skf_tune`` and returns a ``CARTConstructor`` configured with the
    combination that achieved the lowest mean validation error.

    Parameters
    ----------
    train : pandas.DataFrame
        Training data that still contains the label column.
    label_col : str
        Name of the label column inside ``train``.
    skf_tune : iterable of (train_idx, val_idx)
        Cross-validation index splits over ``train`` (old-style sklearn
        StratifiedKFold iterates this way).

    Returns
    -------
    CARTConstructor
        Constructor with ``max_depth`` and ``min_samples_split`` set to the
        best-performing values.
    """
    cart = CARTConstructor()
    # None means "no depth limit"; appending it turns the array into
    # dtype=object, which is fine since values are only used as settings.
    max_depths = np.append(np.arange(1, 21, 2), None)
    min_samples_splits = np.arange(1, 20, 1)

    # One per-fold error list for each hyper-parameter combination
    # (replaces the original redundant init loop with a comprehension).
    errors = {(d, s): [] for d in max_depths for s in min_samples_splits}

    for train_tune_idx, val_tune_idx in skf_tune:
        train_tune = train.iloc[train_tune_idx, :]
        X_train_tune = train_tune.drop(label_col, axis=1)
        y_train_tune = train_tune[label_col]
        val_tune = train.iloc[val_tune_idx, :]
        X_val_tune = val_tune.drop(label_col, axis=1)
        y_val_tune = val_tune[label_col]
        for max_depth in max_depths:
            for min_samples_split in min_samples_splits:
                cart.max_depth = max_depth
                cart.min_samples_split = min_samples_split
                tree = cart.construct_tree(X_train_tune, y_train_tune)
                predictions = tree.evaluate_multiple(X_val_tune).astype(int)
                # Record validation error (1 - accuracy) for this fold.
                errors[(max_depth, min_samples_split)].append(
                    1 - accuracy_score(predictions, y_val_tune, normalize=True))

    # Average the per-fold errors, then pick the combination with the
    # smallest mean error (clearer than operator.itemgetter on .items()).
    mean_errors = {params: np.mean(fold_errors)
                   for params, fold_errors in errors.items()}
    best_params = min(mean_errors, key=mean_errors.get)
    cart.max_depth, cart.min_samples_split = best_params
    return cart
# Every column except the target ('disease') is a feature.
feature_column_names = list(set(df.columns) - {'disease'})

# Record the observed value range of each feature.
for feature in feature_column_names:
    column = df[feature]
    feature_mins[feature] = np.min(column)
    feature_maxs[feature] = np.max(column)

df = df.reset_index(drop=True)

# Split the frame into a one-column label frame ('cat') and a feature frame.
labels_df = DataFrame()
labels_df['cat'] = df['disease'].copy()
features_df = df.copy().drop('disease', axis=1)
# Scale every feature column by its maximum.
features_df = features_df / features_df.max()

train_labels_df = labels_df
train_features_df = features_df

# Base tree constructors and their hyper-parameters.
c45 = C45Constructor(cf=0.01)
cart = CARTConstructor(min_samples_leaf=10, max_depth=6)
quest = QuestConstructor(default=1, max_nr_nodes=1, discrete_thresh=25,
                         alpha=0.05)
tree_constructors = [c45, cart, quest]

# One list of confusion matrices per reported model.
titles = ["C4.5", "Boosted C4.5", "Genetic"]
tree_confusion_matrices = {title: [] for title in titles}

skf = sklearn.cross_validation.StratifiedKFold(labels_df['cat'],
                                               n_folds=N_FOLDS,
                                               shuffle=True,
                                               random_state=SEED)
# Per-feature value ranges, used later in the pipeline.
feature_maxs = {}
# NOTE(review): feature_mins is filled below but never initialised in this
# chunk -- presumably created earlier in the file; confirm.
feature_column_names = list(set(df.columns) - set(['class']))
for feature in feature_column_names:
    feature_mins[feature] = np.min(df[feature])
    feature_maxs[feature] = np.max(df[feature])
df = df.reset_index(drop=True)
# Split the frame into a one-column label frame ('cat') and a feature frame.
labels_df = DataFrame()
labels_df['cat'] = df['class'].copy()
features_df = df.copy()
features_df = features_df.drop('class', axis=1)
train_labels_df = labels_df
train_features_df = features_df
# Base tree constructors and their hyper-parameters.
c45 = C45Constructor(cf=0.95)
cart = CARTConstructor(max_depth=12, min_samples_leaf=2)
quest = QuestConstructor(default=1, max_nr_nodes=1, discrete_thresh=10, alpha=0.99)
# c45 = C45Constructor(cf=0.75)
# cart = CARTConstructor(max_depth=10, min_samples_leaf=2)
# quest = QuestConstructor(default=1, max_nr_nodes=2, discrete_thresh=10, alpha=0.9)
tree_constructors = [c45, cart, quest]
# One list of confusion matrices per constructor, plus the merged "Genetic" tree.
tree_confusion_matrices = {}
for tree_constructor in tree_constructors:
    tree_confusion_matrices[tree_constructor.get_name()] = []
tree_confusion_matrices["Genetic"] = []
# NOTE(review): the StratifiedKFold call is truncated in this chunk; its
# remaining arguments continue past the visible source.
skf = sklearn.cross_validation.StratifiedKFold(labels_df['cat'],
# # clusters = fclusterdata(data_df[["meanIntensity", "meanDuration"]], 0.1, criterion="distance")
# print clusters

# Pull the target out of the feature frame, then drop non-feature columns.
label_df = DataFrame()
label_df["cat"] = features_df["diagnosis"]
features_df = features_df.drop("diagnosis", axis=1)
features_df = features_df.drop("id", axis=1)

# Boruta feature selection: yields indices (into `columns`) of kept features.
best_features_boruta = boruta_py_feature_selection(
    features_df.values, label_df['cat'].tolist(), columns,
    verbose=True, percentile=80, alpha=0.1)
num_features_boruta = len(best_features_boruta)

new_features_rf = DataFrame()
new_features_boruta = DataFrame()
# Copy each selected feature column into the reduced frame.
for k in range(num_features_boruta):
    name = columns[best_features_boruta[k]]
    new_features_boruta[name] = features_df[name]
features_df_boruta = new_features_boruta

# Fit a CART tree on the selected features and render it to PDF.
cart = CARTConstructor(min_samples_split=1)
tree = cart.construct_tree(new_features_boruta, labels=label_df)
tree.visualise("./test.pdf")
# data_df = data_df.dropna()
# # clusters = fclusterdata(data_df[["meanIntensity", "meanDuration"]], 0.1, criterion="distance")
# print clusters

# Separate the 'diagnosis' target from the features and drop the id column.
label_df = DataFrame()
label_df["cat"] = features_df["diagnosis"]
features_df = features_df.drop("diagnosis", axis=1)
features_df = features_df.drop("id", axis=1)

# Run Boruta selection; the result is a list of indices into `columns`.
best_features_boruta = boruta_py_feature_selection(
    features_df.values,
    label_df['cat'].tolist(),
    columns,
    verbose=True,
    percentile=80,
    alpha=0.1)
num_features_boruta = len(best_features_boruta)

new_features_rf = DataFrame()
new_features_boruta = DataFrame()
# Build the reduced feature frame from the selected columns, in order.
for k in range(num_features_boruta):
    selected = columns[best_features_boruta[k]]
    new_features_boruta[selected] = features_df[selected]
features_df_boruta = new_features_boruta

# Train a CART tree on the Boruta-selected features and write it to PDF.
cart = CARTConstructor(min_samples_split=1)
tree = cart.construct_tree(new_features_boruta, labels=label_df)
tree.visualise("./test.pdf")
# Rank features with a random forest; keep the top `num_features` of them.
best_features = RF_feature_selection(features_df.values,
                                     labels_df['cat'].tolist(),
                                     feature_column_names, verbose=True)
new_features = DataFrame()
for k in range(num_features):
    column = feature_column_names[best_features[k]]
    new_features[column] = features_df[column]
features_df = new_features

# Recompute feature names and per-feature ranges on the reduced set.
feature_column_names = list(set(features_df.columns) - {'Survived'})
feature_mins = {feature: np.min(df[feature])
                for feature in feature_column_names}
feature_maxs = {feature: np.max(df[feature])
                for feature in feature_column_names}

# Base constructors fed into the genetic merger.
c45 = C45Constructor(cf=0.15)
cart = CARTConstructor(max_depth=3, min_samples_split=3)
quest = QuestConstructor(default=1, max_nr_nodes=3, discrete_thresh=10,
                         alpha=0.1)
tree_constructors = [c45, cart, quest]

merger = DecisionTreeMerger()
train_df = features_df.copy()
train_df['cat'] = labels_df['cat'].copy()
# Evolve a merged tree over the base constructors' outputs.
best_tree = merger.genetic_algorithm(train_df, 'cat', tree_constructors,
                                     seed=SEED, num_iterations=10,
                                     num_mutations=5, population_size=10,
                                     max_samples=1, val_fraction=0.25,
                                     num_boosts=7)
# best_tree = merger.genetic_algorithm(train_df, 'cat', tree_constructors, seed=SEED, num_iterations=1,
#                                      num_mutations=1, population_size=1, max_samples=1, val_fraction=0.05)

# Also build the individual C4.5 and QUEST trees and attach sample counts.
c45_tree = c45.construct_tree(features_df, labels_df)
quest_tree = quest.construct_tree(features_df, labels_df)
c45_tree.populate_samples(features_df, labels_df['cat'])
quest_tree.populate_samples(features_df, labels_df['cat'])
# train_labels_df, test_labels_df = labels_df.iloc[train_index,:].copy(), labels_df.iloc[test_index,:].copy()
# train_features_df = train_features_df.reset_index(drop=True)
# test_features_df = test_features_df.reset_index(drop=True)
# train_labels_df = train_labels_df.reset_index(drop=True)
# test_labels_df = test_labels_df.reset_index(drop=True)
# train_features_df = features_df.head(int(0.8*len(features_df.index)))
# test_features_df = features_df.tail(int(0.2*len(features_df.index)))
# train_labels_df = labels_df.head(int(0.8*len(labels_df.index)))
# test_labels_df = labels_df.tail(int(0.2*len(labels_df.index)))

# Combine features and labels into one training frame with a 'cat' column.
train_df = train_features_df.copy()
train_df['cat'] = train_labels_df['cat'].copy()

# Base tree constructors under comparison.
c45 = C45Constructor(cf=0.15)
cart = CARTConstructor(min_samples_leaf=10)
#c45_2 = C45Constructor(cf=0.15)
#c45_3 = C45Constructor(cf=0.75)
quest = QuestConstructor(default=1, max_nr_nodes=5, discrete_thresh=10,
                         alpha=0.1)
tree_constructors = [c45, cart, quest]

# Build one tree per constructor, render it under ../data/<name>, collect it.
output_dir = os.path.join('..', 'data')
trees = []
for tree_constructor in tree_constructors:
    tree = tree_constructor.construct_tree(train_features_df, train_labels_df)
    tree.visualise(os.path.join(output_dir, tree_constructor.get_name()))
    trees.append(tree)

merger = DecisionTreeMerger()