    for tree in trees:
        predicted_labels_set.append(tree.evaluate_multiple(test_features_df))

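    # Combine the per-tree predictions by majority vote: for each test sample,
    # np.bincount tallies the labels predicted across trees and np.argmax picks
    # the most frequent one.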
    predicted_labels = []
    for i in range(len(predicted_labels_set[0])):
        labels = []
        for j in range(len(predicted_labels_set)):
            labels.append(predicted_labels_set[j][i])
        predicted_labels.append(np.argmax(np.bincount(labels)))

    predicted_labels = np.asarray(predicted_labels)
    tree_confusion_matrices["Boosted C4.5"].append(
        tree.plot_confusion_matrix(test_labels_df['cat'].values.astype(str),
                                   predicted_labels.astype(str)))

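    # Merge the individual trees with DecisionTreeMerger's genetic algorithm; the
    # keyword arguments (population_size, num_mutations, num_boosts, ...) set the
    # search budget, and their exact semantics depend on the treemerger implementation.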
    merger = DecisionTreeMerger()
    best_tree = merger.genetic_algorithm(train_df,
                                         'cat',
                                         tree_constructors,
                                         seed=SEED,
                                         num_iterations=10,
                                         num_mutations=5,
                                         population_size=10,
                                         max_samples=8,
                                         val_fraction=0.2,
                                         num_boosts=3)
    predicted_labels = best_tree.evaluate_multiple(test_features_df)
    tree_confusion_matrices["Genetic"].append(
        best_tree.plot_confusion_matrix(
            test_labels_df['cat'].values.astype(str),
            predicted_labels.astype(str)))
train_df = train_features_df.copy()
train_df['cat'] = train_labels_df['cat'].copy()

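# Instantiate the individual tree inducers (C4.5, CART and QUEST) that are
# combined by the merger further down.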
c45 = C45Constructor(cf=0.15)
cart = CARTConstructor(min_samples_leaf=10)
#c45_2 = C45Constructor(cf=0.15)
#c45_3 = C45Constructor(cf=0.75)
quest = QuestConstructor(default=1, max_nr_nodes=5, discrete_thresh=10, alpha=0.1)
tree_constructors = [c45, cart, quest]
trees = []
for tree_constructor in tree_constructors:
    tree = tree_constructor.construct_tree(train_features_df, train_labels_df)
    tree.visualise(os.path.join('..', 'data', tree_constructor.get_name()))
    trees.append(tree)

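# Merge the constructed trees into a single tree with the genetic algorithm
# (only 2 iterations here) and add the result to the tree list.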
merger = DecisionTreeMerger()
best_tree, constructed_trees = merger.genetic_algorithm(train_df, 'cat', tree_constructors, seed=SEED, num_iterations=2)
best_tree.visualise(os.path.join('..', 'data', 'best_tree'))
trees.append(best_tree)
# trees.extend(constructed_trees)

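# Load the Titanic test set and preprocess it: encode Sex and Embarked with the
# shared mapping dicts and derive a Title feature from the Name column.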
columns = ['PassengerId','Pclass','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked']
df = read_csv(os.path.join('..', 'data', 'titanic_test.csv'), sep=',')
df.columns = columns

test_features_df = df[['PassengerId', 'Pclass', 'Sex', 'Parch', 'Age', 'Name', 'Ticket', 'SibSp', 'Fare', 'Embarked']].copy()
test_features_df['Sex'] = test_features_df['Sex'].map(mapping_sex)
test_features_df['Embarked'] = test_features_df['Embarked'].map(mapping_embarked)
test_features_df['Title'] = test_features_df['Name'].map(lambda x: get_title(x))
test_features_df = test_features_df.drop('Name', axis=1)
test_features_df['Title'] = test_features_df['Title'].map(mapping_title)
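# Per-fold data-augmentation experiment: build a C4.5 tree, turn it into regions,
# sample extra points from those regions and retrain on the augmented data.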
for train_index, test_index in skf:
    train_features_df, test_features_df = features_df.iloc[
        train_index, :].copy(), features_df.iloc[test_index, :].copy()
    train_labels_df, test_labels_df = labels_df.iloc[
        train_index, :].copy(), labels_df.iloc[test_index, :].copy()
    train_features_df = train_features_df.reset_index(drop=True)
    test_features_df = test_features_df.reset_index(drop=True)
    train_labels_df = train_labels_df.reset_index(drop=True)
    test_labels_df = test_labels_df.reset_index(drop=True)
    train_df = train_features_df.copy()
    train_df['cat'] = train_labels_df['cat'].copy()
    tree = c45.construct_tree(train_features_df, train_labels_df)
    tree.populate_samples(train_features_df, train_labels_df['cat'])
    # tree.visualise('c45_unaugmented')
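    # Convert the tree into a decision table of hyper-rectangular regions and
    # replace unbounded region edges with the observed feature minima/maxima.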
    merger = DecisionTreeMerger()
    regions = merger.decision_tree_to_decision_table(tree, train_features_df)
    for region in regions:
        for feature in feature_column_names:
            if region[feature][0] == float("-inf"):
                region[feature][0] = feature_mins[feature]
            if region[feature][1] == float("inf"):
                region[feature][1] = feature_maxs[feature]
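    # Generate synthetic samples inside the regions, append them to the original
    # training data and fit a new ("augmented") C4.5 tree on the result.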
    new_df = merger.generate_samples(regions, features_df.columns, descriptors)
    sample_labels_df = new_df[['cat']].copy()
    augmented_labels_df = sample_labels_df.append(train_labels_df,
                                                  ignore_index=True)
    new_df = new_df.drop('cat', axis=1)
    augmented_features_df = new_df.append(train_features_df, ignore_index=True)
    augmented_features_df = augmented_features_df.astype(float)
    augmented_tree = c45.construct_tree(augmented_features_df,
                                        augmented_labels_df)
    train_df['cat'] = train_labels_df['cat'].copy()

    trees = []

    for tree_constructor in tree_constructors:
        tree = tree_constructor.construct_tree(train_features_df, train_labels_df)
        #tree.visualise(os.path.join(os.path.join('..', 'data'), tree_constructor.get_name()))
        trees.append(tree)
        predicted_labels = tree.evaluate_multiple(test_features_df)
        tree_confusion_matrices[tree_constructor.get_name()].append(tree.plot_confusion_matrix(test_labels_df['cat']
                                                                                               .values.astype(str),
                                                                    predicted_labels.astype(str)))
        print tree.plot_confusion_matrix(test_labels_df['cat'].values.astype(str), predicted_labels.astype(str))


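    # Run the genetic merger on this fold and record its confusion matrix
    # alongside those of the individual constructors.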
    merger = DecisionTreeMerger()
    # Important params: max_samples, val_fraction and num_boosts
    best_tree = merger.genetic_algorithm(train_df, 'cat', tree_constructors, seed=SEED, num_iterations=5,
                                         num_mutations=3, population_size=6, max_samples=2, val_fraction=0.15,
                                         num_boosts=2)
    #best_tree.visualise(os.path.join(os.path.join('..', 'data'), 'best_tree'))
    predicted_labels = best_tree.evaluate_multiple(test_features_df)
    tree_confusion_matrices["Genetic"].append(best_tree.plot_confusion_matrix(test_labels_df['cat'].values.astype(str),
                                              predicted_labels.astype(str)))
    print best_tree.plot_confusion_matrix(test_labels_df['cat'].values.astype(str), predicted_labels.astype(str))
    #raw_input("Press Enter to continue...")

tree_confusion_matrices_mean = {}
for key in tree_confusion_matrices:
    print key
    for matrix in tree_confusion_matrices[key]:
skf = sklearn.cross_validation.StratifiedKFold(labels_df['cat'], n_folds=2, shuffle=True, random_state=SEED)

for train_index, test_index in skf:
    train_features_df, test_features_df = features_df.iloc[train_index,:].copy(), features_df.iloc[test_index,:].copy()
    train_labels_df, test_labels_df = labels_df.iloc[train_index,:].copy(), labels_df.iloc[test_index,:].copy()
    train_features_df = train_features_df.reset_index(drop=True)
    test_features_df = test_features_df.reset_index(drop=True)
    train_labels_df = train_labels_df.reset_index(drop=True)
    test_labels_df = test_labels_df.reset_index(drop=True)
    train_df = train_features_df.copy()
    train_df['cat'] = train_labels_df['cat'].copy()
    tree = c45.construct_tree(train_features_df, train_labels_df)
    tree.populate_samples(train_features_df, train_labels_df['cat'])
    # tree.visualise('c45_unaugmented')
    merger = DecisionTreeMerger()
    regions = merger.decision_tree_to_decision_table(tree, train_features_df)
    for region in regions:
        for feature in feature_column_names:
            if region[feature][0] == float("-inf"):
                region[feature][0] = feature_mins[feature]
            if region[feature][1] == float("inf"):
                region[feature][1] = feature_maxs[feature]
    new_df = merger.generate_samples(regions, features_df.columns, descriptors)
    sample_labels_df = new_df[['cat']].copy()
    augmented_labels_df = sample_labels_df.append(train_labels_df, ignore_index=True)
    new_df = new_df.drop('cat', axis=1)
    augmented_features_df = new_df.append(train_features_df, ignore_index=True)
    augmented_features_df = augmented_features_df.astype(float)
    augmented_tree = c45.construct_tree(augmented_features_df, augmented_labels_df)
    augmented_tree.populate_samples(train_features_df, train_labels_df['cat'])
features_df = new_features
feature_column_names = list(set(features_df.columns) - set(['Survived']))

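# Record each feature's observed minimum and maximum (used elsewhere in these
# examples to bound the open-ended regions derived from trees).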
feature_mins = {}
feature_maxs = {}

for feature in feature_column_names:
    feature_mins[feature] = np.min(df[feature])
    feature_maxs[feature] = np.max(df[feature])

c45 = C45Constructor(cf=0.15)
cart = CARTConstructor(max_depth=3, min_samples_split=3)
quest = QuestConstructor(default=1, max_nr_nodes=3, discrete_thresh=10, alpha=0.1)
tree_constructors = [c45, cart, quest]

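# Run the genetic merger on the full dataset and build plain C4.5, CART and
# QUEST trees alongside it; each tree is visualised below.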
merger = DecisionTreeMerger()
train_df = features_df.copy()
train_df['cat'] = labels_df['cat'].copy()
best_tree = merger.genetic_algorithm(train_df, 'cat', tree_constructors, seed=SEED, num_iterations=10,
                                     num_mutations=5, population_size=10, max_samples=1, val_fraction=0.25,
                                     num_boosts=7)
# best_tree = merger.genetic_algorithm(train_df, 'cat', tree_constructors, seed=SEED, num_iterations=1,
#                                      num_mutations=1, population_size=1, max_samples=1, val_fraction=0.05)
c45_tree = c45.construct_tree(features_df, labels_df)
quest_tree = quest.construct_tree(features_df, labels_df)
c45_tree.populate_samples(features_df, labels_df['cat'])
quest_tree.populate_samples(features_df, labels_df['cat'])
cart_tree = cart.construct_tree(features_df, labels_df)
best_tree.visualise('best_tree')
c45_tree.visualise('c45')
cart_tree.visualise('cart')
           'resting electrocardio', 'max heartrate', 'exercise induced angina', 'oldpeak', 'slope peak', \
           'number of vessels', 'thal', 'disease']
df = read_csv(os.path.join('..', 'data', 'heart.dat'), sep=' ')
df.columns = columns
df = df.reset_index(drop=True)
labels_df = DataFrame()
labels_df['cat'] = df['disease'].copy()
features_df = df.copy()
features_df = features_df.drop('disease', axis=1)
train_labels_df = labels_df
train_features_df = features_df[['max heartrate', 'resting blood pressure']]
feature_mins = {}
feature_maxs = {}
feature_column_names = ['max heartrate', 'resting blood pressure']
for feature in feature_column_names:
    feature_mins[feature] = np.min(df[feature])
    feature_maxs[feature] = np.max(df[feature])

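# Two-feature demo: fit a single tree on 'max heartrate' and 'resting blood
# pressure', convert it to regions and plot those regions in 2D.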
merger = DecisionTreeMerger()
# cart = CARTConstructor(min_samples_leaf=10, max_depth=2)
cart = C45Constructor(cf=1.0)
tree = cart.construct_tree(train_features_df, train_labels_df)
tree.populate_samples(train_features_df, train_labels_df['cat'])
tree.visualise("2d_tree")

regions = merger.decision_tree_to_decision_table(tree, train_features_df)
print regions
merger.plot_regions("2d_regions", regions, ['1', '2'],
                    "max heartrate", "resting blood pressure", y_max=feature_maxs["resting blood pressure"],
                    x_max=feature_maxs["max heartrate"], y_min=feature_mins["resting blood pressure"],
                    x_min=feature_mins["max heartrate"])
Example #8
import time
from sklearn.cross_validation import StratifiedShuffleSplit

from constructors.c45orangeconstructor import C45Constructor
from constructors.treemerger import DecisionTreeMerger
from constructors.treemerger_clean import DecisionTreeMergerClean
from data.load_datasets import load_led7

import numpy as np

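# Instantiate both merger variants (the cleaned-up DecisionTreeMergerClean and
# the original DecisionTreeMerger), load the LED-7 dataset and prepare a single
# stratified 50/50 train/test split.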
merger = DecisionTreeMergerClean()
merger2 = DecisionTreeMerger()
df, features, label, name = load_led7()
c45 = C45Constructor()

skf = StratifiedShuffleSplit(df[label], 1, test_size=0.5, random_state=1337)

feature_mins = {}
feature_maxs = {}
for feature in features:
    feature_mins[feature] = np.min(df[feature])
    feature_maxs[feature] = np.max(df[feature])

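# In the old sklearn.cross_validation API the second positional argument is
# n_iter, so this shuffle split yields exactly one train/test partition.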
for fold, (train_idx, test_idx) in enumerate(skf):
    # print 'Fold', fold+1, '/', NR_FOLDS
    train = df.iloc[train_idx, :].reset_index(drop=True)
    X_train = train.drop(label, axis=1)
    y_train = train[label]
    test = df.iloc[test_idx, :].reset_index(drop=True)
    X_test = test.drop(label, axis=1)
    y_test = test[label]
    new_features[feature_column_names[best_features[k]]] = features_df[feature_column_names[best_features[k]]]
features_df = new_features
feature_column_names = list(set(df.columns) - set(['disease']))

c45 = C45Constructor(cf=0.01)
cart = CARTConstructor(min_samples_leaf=10, max_depth=6)
quest = QuestConstructor(default=1, max_nr_nodes=1, discrete_thresh=25, alpha=0.05)
tree_constructors = [c45, cart, quest]
#
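# Keep confusion matrices separately for each constructor's plain tree and its
# region-based variant, plus the genetic tree and genetic regions.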
tree_confusion_matrices = {}
for tree_constructor in tree_constructors:
    tree_confusion_matrices[tree_constructor.get_name() + " Tree"] = []
    tree_confusion_matrices[tree_constructor.get_name() + " Regions"] = []
tree_confusion_matrices["Genetic Regions"] = []
tree_confusion_matrices["Genetic Tree"] = []
merger = DecisionTreeMerger()

skf = sklearn.cross_validation.StratifiedKFold(labels_df['cat'], n_folds=N_FOLDS, shuffle=True, random_state=SEED)

for train_index, test_index in skf:
    train_features_df, test_features_df = features_df.iloc[train_index,:].copy(), features_df.iloc[test_index,:].copy()
    train_labels_df, test_labels_df = labels_df.iloc[train_index,:].copy(), labels_df.iloc[test_index,:].copy()
    train_features_df = train_features_df.reset_index(drop=True)
    test_features_df = test_features_df.reset_index(drop=True)
    train_labels_df = train_labels_df.reset_index(drop=True)
    test_labels_df = test_labels_df.reset_index(drop=True)
    train_df = train_features_df.copy()
    train_df['cat'] = train_labels_df['cat'].copy()

    trees = []
Example #10
cart = CARTConstructor(min_samples_leaf=10)
#c45_2 = C45Constructor(cf=0.15)
#c45_3 = C45Constructor(cf=0.75)
quest = QuestConstructor(default=1,
                         max_nr_nodes=5,
                         discrete_thresh=10,
                         alpha=0.1)
tree_constructors = [c45, cart, quest]
trees = []
for tree_constructor in tree_constructors:
    tree = tree_constructor.construct_tree(train_features_df, train_labels_df)
    tree.visualise(os.path.join('..', 'data', tree_constructor.get_name()))
    trees.append(tree)

merger = DecisionTreeMerger()
best_tree, constructed_trees = merger.genetic_algorithm(train_df,
                                                        'cat',
                                                        tree_constructors,
                                                        seed=SEED,
                                                        num_iterations=2)
best_tree.visualise(os.path.join('..', 'data', 'best_tree'))
trees.append(best_tree)
# trees.extend(constructed_trees)

columns = [
    'PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
    'Fare', 'Cabin', 'Embarked'
]
df = read_csv(os.path.join('..', 'data', 'titanic_test.csv'), sep=',')
Example #11
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix

from constructors.c45orangeconstructor import C45Constructor
from constructors.treemerger import DecisionTreeMerger
from constructors.treemerger_clean import DecisionTreeMergerClean
from data.load_datasets import load_austra
from sklearn import preprocessing

import numpy as np
import pandas as pd
import collections
import operator

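# Same setup on the dataset returned by load_austra (Australian credit); the
# fixed 0.0/1.0 bounds below assume the features are scaled to [0, 1].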
merger = DecisionTreeMergerClean()
merger2 = DecisionTreeMerger()
df, features, label, name = load_austra()
c45 = C45Constructor(cf=0.0)

skf = StratifiedShuffleSplit(df[label], 1, test_size=0.25, random_state=1337)

feature_mins = {}
feature_maxs = {}
for feature in features:
    feature_mins[feature] = 0.0
    feature_maxs[feature] = 1.0

for fold, (train_idx, test_idx) in enumerate(skf):
    # print 'Fold', fold+1, '/', NR_FOLDS
    train = df.iloc[train_idx, :].reset_index(drop=True)
    X_train = train.drop(label, axis=1)
Example #12
df.columns = columns
df = df.reset_index(drop=True)
labels_df = DataFrame()
labels_df['cat'] = df['disease'].copy()
features_df = df.copy()
features_df = features_df.drop('disease', axis=1)
train_labels_df = labels_df
train_features_df = features_df[['max heartrate', 'resting blood pressure']]
feature_mins = {}
feature_maxs = {}
feature_column_names = ['max heartrate', 'resting blood pressure']
for feature in feature_column_names:
    feature_mins[feature] = np.min(df[feature])
    feature_maxs[feature] = np.max(df[feature])

merger = DecisionTreeMerger()
# cart = CARTConstructor(min_samples_leaf=10, max_depth=2)
cart = C45Constructor(cf=1.0)
tree = cart.construct_tree(train_features_df, train_labels_df)
tree.populate_samples(train_features_df, train_labels_df['cat'])
tree.visualise("2d_tree")

regions = merger.decision_tree_to_decision_table(tree, train_features_df)
print regions
merger.plot_regions("2d_regions",
                    regions, ['1', '2'],
                    "max heartrate",
                    "resting blood pressure",
                    y_max=feature_maxs["resting blood pressure"],
                    x_max=feature_maxs["max heartrate"],
                    y_min=feature_mins["resting blood pressure"],