def fit(X, y, max_depth=np.inf, n_bootstrap=50):
    """Fit a bagged ensemble of decision trees (a random forest without
    feature subsampling).

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features) -- training examples.
    y : ndarray, shape (n_samples,) -- training labels.
    max_depth : maximum depth passed to each decision tree (default: unbounded).
    n_bootstrap : number of bootstrapped trees to grow.

    Returns
    -------
    dict with keys 'trees' (list of fitted trees) and 'predict'
    (the module-level predict function, stored for later dispatch).
    """
    trees = []
    for m in range(n_bootstrap):
        # Draw a bootstrap sample (with replacement) from the ORIGINAL data.
        # Bug fix: the previous version rebound X and y each iteration, so
        # every later tree was bootstrapped from already-bootstrapped data,
        # progressively discarding original samples.
        idx = np.random.choice(X.shape[0], X.shape[0])
        trees.append(decision_tree.fit(X[idx], y[idx], max_depth))
    model = dict()
    model['trees'] = trees
    model['predict'] = predict
    return model
def fit(x, y, t, num_trees, max_tree_nodes, min_samples_leaf, min_samples_split, class_majority, measure, separate_max):
    """Grow a forest of `num_trees` randomized decision trees.

    Each tree is trained on its own bootstrap sample of (x, y); the
    out-of-bag portion returned by `bootstrap` is not used here.
    All remaining arguments are forwarded unchanged to
    `decision_tree.fit`. Returns the list of fitted trees.
    """
    grown_trees = []
    for _ in range(num_trees):
        sample_x, sample_y, oob_x, oob_y = bootstrap(x, y)
        fitted = decision_tree.fit(
            x=sample_x,
            y=sample_y,
            t=t,
            randomized=True,
            max_tree_nodes=max_tree_nodes,
            min_samples_leaf=min_samples_leaf,
            min_samples_split=min_samples_split,
            class_majority=class_majority,
            measure=measure,
            separate_max=separate_max)
        grown_trees.append(fitted)
    return grown_trees
def fit(self, X, y, dist, max_depth):
    """Train a single decision tree via `dt.fit` and store it on the instance."""
    self.tree = dt.fit(X, y, dist, max_depth)
def map_fit(interface, state, label, inp):
    """Map task: parse one chunk of delimited rows, impute missing values,
    fit a single (non-randomized) decision tree, and emit the tree with its
    integer codes mapped back to the original attribute/label strings.

    Emits two keys on output 0:
      "tree"           -- the fitted tree with original labels restored
      "fill_in_values" -- per-attribute imputation values (mean / mode)
    """
    import numpy as np
    import decision_tree, measures
    # attr_mapping / y_mapping: string value -> integer code (inverted later).
    attr_mapping, y_mapping = {}, {}
    x, y, fill_in_values = [], [], []
    out = interface.output(0)
    missing_vals_attr = set()  # indices of attributes that contained missing values
    for row in inp:
        row = row.strip().split(state["delimiter"])
        if len(row) > 1:  # skip blank/degenerate lines
            new_row = []
            for i, j in enumerate(state["X_indices"]):
                if row[j] in state["missing_vals"]:
                    # Keep the missing-value token for now; imputed below.
                    new_row.append(row[j])
                    missing_vals_attr.add(i)
                elif state["X_meta"][i] == "c":
                    # "c" marks a continuous attribute.
                    new_row.append(float(row[j]))
                else:
                    # Discrete attribute: encode the string as an int code.
                    if row[j] not in attr_mapping:
                        attr_mapping[row[j]] = len(attr_mapping)
                    new_row.append(attr_mapping[row[j]])
            x.append(new_row)
            if row[state["y_index"]] not in y_mapping:
                y_mapping[row[state["y_index"]]] = len(y_mapping)
            y.append(y_mapping[row[state["y_index"]]])
    if len(y_mapping) == 1:
        # A single-class chunk cannot produce a useful tree; emit nothing.
        print "Warning: Only one class in the subset!"
        return
    # Invert the mappings: integer code -> original string.
    attr_mapping = {v: k for k, v in attr_mapping.iteritems()}
    y_mapping = {v: k for k, v in y_mapping.iteritems()}
    if len(missing_vals_attr) > 0:
        for i in range(len(state["X_indices"])):
            if state["X_meta"][i] == "c":
                # Continuous: impute with the mean of the observed floats.
                value = np.average(
                    [sample[i] for sample in x if type(sample[i]) == float])
                fill_in_values.append(value)
            else:
                # Discrete: impute with the most frequent code; record the
                # original string form for downstream use.
                value = np.bincount([
                    sample[i] for sample in x if type(sample[i]) == int
                ]).argmax()
                fill_in_values.append(attr_mapping[value])
            if i in missing_vals_attr:
                # Replace remaining missing-value tokens in this column.
                for j in range(len(x)):
                    if x[j][i] in state["missing_vals"]:
                        x[j][i] = value
    tree = decision_tree.fit(x=np.array(x, dtype=np.float32),
                             y=np.array(y, dtype=np.uint16),
                             t=state["X_meta"],
                             randomized=False,
                             max_tree_nodes=state["max_tree_nodes"],
                             min_samples_leaf=state["min_samples_leaf"],
                             min_samples_split=state["min_samples_split"],
                             class_majority=state["class_majority"],
                             measure=measures.info_gain
                             if state["measure"] == "info_gain" else measures.mdl,
                             accuracy=state["accuracy"],
                             separate_max=state["separate_max"])
    # Map the fitted tree's integer codes back to original strings.
    # NOTE(review): each node appears to be a 6-tuple where node[2] is the
    # split, node[3] the class distribution and node[5] the "c"/"d" type
    # flag -- confirm against decision_tree's node layout.
    tree_mapped = {}
    for k, v in tree.iteritems():
        tree_mapped[k] = [None for i in range(2)]
        for i, node in enumerate(v):
            dist_map = dict([(y_mapping[label], freq)
                             for label, freq in node[3].iteritems()])
            split_map = set([attr_mapping[int(s)] for s in list(node[2])
                             ]) if node[5] == "d" else node[2]
            tree_mapped[k][i] = (node[0], node[1], split_map, dist_map,
                                 node[4], node[5])
    out.add("tree", tree_mapped)
    out.add("fill_in_values", fill_in_values)
def map_fit(interface, state, label, inp):
    """Map task: fit up to `trees_per_chunk` bootstrapped trees on this chunk,
    measure out-of-bag margins, build a leaf-co-occurrence similarity matrix,
    cluster out-of-bag samples with k-medoids, and emit the forest together
    with the medoids, per-cluster margin medians, and Gower normalization
    ranges.

    Emits on output 0:
      "model"          -- (forest, medoids, stats, gower_range)
      "fill_in_values" -- per-attribute imputation values
    """
    import numpy as np
    from itertools import permutations
    import decision_tree, measures, k_medoids
    out = interface.output(0)
    x, y, margins, forest = [], [], [], []
    # attr_mapping / y_mapping: string value -> integer code (inverted later).
    # similarity_mat: sample_id -> {sample_id: negative co-occurrence count}.
    attr_mapping, y_mapping, similarity_mat = {}, {}, {}
    missing_vals_attr = set()  # indices of attributes that contained missing values
    for row in inp:
        row = row.strip().split(state["delimiter"])
        if len(row) > 1:  # skip blank/degenerate lines
            new_row = []
            for i, j in enumerate(state["X_indices"]):
                if row[j] in state["missing_vals"]:
                    # Keep the missing-value token for now; imputed below.
                    new_row.append(row[j])
                    missing_vals_attr.add(i)
                elif state["X_meta"][i] == "c":
                    # "c" marks a continuous attribute.
                    new_row.append(float(row[j]))
                else:
                    # Discrete attribute: encode the string as an int code.
                    if row[j] not in attr_mapping:
                        attr_mapping[row[j]] = len(attr_mapping)
                    new_row.append(attr_mapping[row[j]])
            x.append(new_row)
            if row[state["y_index"]] not in y_mapping:
                y_mapping[row[state["y_index"]]] = len(y_mapping)
            y.append(y_mapping[row[state["y_index"]]])
    if len(y_mapping) == 1:
        # A single-class chunk cannot produce a useful forest; emit nothing.
        print "Warning: Only one class in the subset!"
        return
    fill_in_values = []
    # Invert the mappings: integer code -> original string.
    attr_mapping = {v: k for k, v in attr_mapping.iteritems()}
    y_mapping = {v: k for k, v in y_mapping.iteritems()}
    if len(missing_vals_attr) > 0:
        for i in range(len(state["X_indices"])):
            if state["X_meta"][i] == "c":
                # Continuous: impute with the mean of the observed floats.
                value = np.average(
                    [sample[i] for sample in x if type(sample[i]) == float])
                fill_in_values.append(value)
            else:
                # Discrete: impute with the most frequent code.
                value = np.bincount([
                    sample[i] for sample in x if type(sample[i]) == int
                ]).argmax()
                fill_in_values.append(attr_mapping[value])
            if i in missing_vals_attr:
                # Replace remaining missing-value tokens in this column.
                for j in range(len(x)):
                    if x[j][i] in state["missing_vals"]:
                        x[j][i] = value
    x, y = np.array(x), np.array(y)
    iteration = 0
    while len(forest) < state["trees_per_chunk"]:
        # Give up after twice the requested number of attempts (degenerate
        # bootstrap samples are skipped below).
        if iteration == state["trees_per_chunk"] * 2:
            return
        bag_indices = np.random.randint(len(x), size=(len(x)))
        unique = set(bag_indices)
        # Out-of-bag samples (capped at 500) are used for margins/similarity.
        out_of_bag_indices = [i for i in range(len(x)) if i not in unique][:500]
        iteration += 1
        if len(np.unique(y[bag_indices])) == 1:
            # Single-class bootstrap sample; retry.
            continue
        tree = decision_tree.fit(x=x[bag_indices],
                                 y=y[bag_indices],
                                 t=state["X_meta"],
                                 randomized=True,
                                 max_tree_nodes=state["max_tree_nodes"],
                                 min_samples_leaf=state["min_samples_leaf"],
                                 min_samples_split=state["min_samples_split"],
                                 class_majority=state["class_majority"],
                                 measure=measures.info_gain
                                 if state["measure"] == "info_gain" else measures.mdl,
                                 accuracy=state["accuracy"],
                                 separate_max=state["separate_max"])
        if len(tree) < 2:
            # A root-only tree carries no split; discard and retry.
            continue
        # calculate margins of out-of-bag samples and group them by leaf
        tree_margins, leafs_grouping = {}, {}
        for j in out_of_bag_indices:
            leaf, margin = decision_tree.predict(tree, x[j], y[j])
            tree_margins[j] = margin
            if leaf in leafs_grouping:
                leafs_grouping[leaf].append(j)
            else:
                leafs_grouping[leaf] = [j]
        margins.append(tree_margins)
        # Samples landing in the same leaf are considered similar; similarity
        # is accumulated as a NEGATIVE count so that "most similar" == minimum.
        for k, v in leafs_grouping.iteritems():
            for cx, cy in permutations(v, 2):
                if cx in similarity_mat:
                    similarity_mat[cx][cy] = similarity_mat[cx].get(cy, 0) - 1
                else:
                    similarity_mat[cx] = {cy: -1}
        # Map the fitted tree's integer codes back to original strings.
        tree_mapped = {}
        for k, v in tree.iteritems():
            tree_mapped[k] = [None for i in range(2)]
            for i, node in enumerate(v):
                dist_map = dict([(y_mapping[label], freq)
                                 for label, freq in node[3].iteritems()])
                split_map = set([attr_mapping[int(s)] for s in list(node[2])
                                 ]) if node[5] == "d" else node[2]
                tree_mapped[k][i] = (node[0], node[1], split_map, dist_map,
                                     node[4], node[5])
        forest.append(tree_mapped)
    # Seed k-medoids with the samples having the strongest (most negative)
    # similarity to some other sample.
    min_elements = []
    for k, v in similarity_mat.iteritems():
        min_id = min(similarity_mat[k], key=similarity_mat[k].get)
        min_elements.append((similarity_mat[k][min_id], min_id))
    min_elements = sorted(min_elements)
    # NOTE(review): if state["k"] is neither "sqrt" nor "square", `k` is
    # left unbound and the line below raises NameError -- confirm callers
    # only ever pass these two values.
    if state["k"] == "sqrt":
        k = int(np.sqrt(len(x[0]))) + 1
    elif state["k"] == "square":
        k = len(np.unique(y)) * len(np.unique(y))
    cidx = set()
    counter = 0
    while counter < len(min_elements) and len(cidx) < k:
        cidx.add(min_elements[counter][1])
        counter += 1
    inds, medoids_i = k_medoids.fit(similarity_mat, len(x), list(cidx))
    sample_ids = np.array(similarity_mat.keys())
    medoids_i = [sample_ids[i] for i in medoids_i]
    clusters = [sample_ids[np.where(inds == i)[0]] for i in np.unique(inds)]
    medoids = x[medoids_i].tolist()  # set medoids without sample identifier
    # Split each medoid into its continuous and discrete parts; discrete
    # codes are translated back to their original string values.
    cont, disc = [], []
    for i in range(len(medoids)):
        cont.append([
            medoids[i][j] for j in range(len(medoids[i]))
            if state["X_meta"][j] == "c"
        ])
        disc.append([
            attr_mapping[int(medoids[i][j])] for j in range(len(medoids[i]))
            if state["X_meta"][j] == "d"
        ])
    medoids = [np.array(cont), np.array(disc)]
    stats = [[] for i in range(len(medoids_i))]
    for i in range(len(forest)):  # for every tree in forest
        for num, cluster in enumerate(clusters):
            # calculate average margin for cluster
            values = [
                margins[i][sample_id] for sample_id in cluster
                if int(sample_id) in margins[i]
            ]
            if values != []:
                avg = np.average(values)
                forest[i]["margin" + str(num)] = avg
                stats[num].append(avg)
    stats = [np.median(value) for value in stats]
    # Per-attribute value ranges of continuous attributes, used for Gower
    # distance normalization; zero ranges are clamped to avoid div-by-zero.
    gower_range = np.array([
        np.ptp(x[:, i]) for i in range(len(state["X_meta"]))
        if state["X_meta"][i] == "c"
    ])
    gower_range[gower_range == 0] = 1e-9
    out.add("model", (forest, medoids, stats, gower_range))
    out.add("fill_in_values", fill_in_values)
def map_fit(interface, state, label, inp): import numpy as np import decision_tree, measures from collections import Counter out = interface.output(0) num_samples = sum([1 for row in inp if len(row.strip().split(state["delimiter"])) > 1]) missing_vals_attr = set() for counter in range(state["trees_per_chunk"]): bag_indices = Counter(np.random.randint(num_samples, size=(num_samples))) attr_mapping, y_mapping = {}, {} x, y, fill_in_values = [], [], [] row_num = 0 for row in inp: row = row.strip().split(state["delimiter"]) if len(row) > 1: while bag_indices[row_num] > 0: new_row = [] for i, j in enumerate(state["X_indices"]): if row[j] in state["missing_vals"]: new_row.append(row[j]) missing_vals_attr.add(i) elif state["X_meta"][i] == "c": new_row.append(row[j]) else: if row[j] not in attr_mapping: attr_mapping[row[j]] = len(attr_mapping) new_row.append(attr_mapping[row[j]]) x.append(new_row) if row[state["y_index"]] not in y_mapping: y_mapping[row[state["y_index"]]] = len(y_mapping) y.append(y_mapping[row[state["y_index"]]]) bag_indices[row_num] -= 1 row_num += 1 attr_mapping = {v: k for k, v in attr_mapping.iteritems()} y_mapping = {v: k for k, v in y_mapping.iteritems()} if len(y_mapping) == 1: print "Warning: Only one class in the subset!" 
return if len(missing_vals_attr) > 0: for i in range(len(state["X_indices"])): if state["X_meta"][i] == "c": value = np.average([sample[i] for sample in x if type(sample[i]) == float]) fill_in_values.append(value) else: value = np.bincount([sample[i] for sample in x if type(sample[i]) == int]).argmax() fill_in_values.append(attr_mapping[value]) if i in missing_vals_attr: for j in range(len(x)): if x[j][i] in state["missing_vals"]: x[j][i] = value x = np.array(x, dtype=np.float32) y = np.array(y, dtype=np.uint16) tree = decision_tree.fit( x=x, y=y, t=state["X_meta"], randomized=True, max_tree_nodes=state["max_tree_nodes"], min_samples_leaf=state["min_samples_leaf"], min_samples_split=state["min_samples_split"], class_majority=state["class_majority"], measure=measures.info_gain if state["measure"] == "info_gain" else measures.mdl, accuracy=state["accuracy"], separate_max=state["separate_max"]) if len(tree) < 2: continue print "tree was build" tree_mapped = {} for k, v in tree.iteritems(): tree_mapped[k] = [None for i in range(2)] for i, node in enumerate(v): dist_map = dict([(y_mapping[label], freq) for label, freq in node[3].iteritems()]) split_map = set([attr_mapping[int(s)] for s in list(node[2])]) if node[5] == "d" else node[2] tree_mapped[k][i] = (node[0], node[1], split_map, dist_map, node[4], node[5]) out.add("tree", tree_mapped) out.add("fill_in_values", fill_in_values)
# print("Error: %.3f" % error) # # 3. Evaluate decision tree that uses information gain # tree = DecisionTreeClassifier(max_depth=3) # tree.fit(X, y) # y_pred = tree.predict(X) # error = np.mean(y_pred != y) # print("Error: %.3f" % error) for maxDepth in range(2,15): print "******* MAX DEPTH =", maxDepth, "***********" # 2. Evaluate decision tree model = decision_tree.fit(X, y, maxDepth=maxDepth) # print model y_pred = decision_tree.predict(model, X) error = np.mean(y_pred != y) # print model print("Error: %.3f" % error) # 3. Evaluate decision tree that uses information gain tree = DecisionTreeClassifier(max_depth=maxDepth+1) tree.fit(X, y) y_pred = tree.predict(X) error = np.mean(y_pred != y) print("Error: %.3f" % error)
# part 2: print training/test errors as well as number of examples for k=1 # part 3: plot classification boundaries for k=1 if question == '2.1': dataset = utils.load_dataset('vowel') X = dataset['X'] y = dataset['y'] Xtest = dataset['Xtest'] ytest = dataset['ytest'] # # part 1: plot decision_tree as depth varies from 1 to 15 train_errors = np.zeros(15) test_errors = np.zeros(15) for i in range(1, 16): model = decision_tree.fit(X, y, i) y_pred = decision_tree.predict(model, X) training_error = np.sum(y_pred != y) / float(X.shape[0]) # print "Training error:", training_error, "at depth", i y_pred = decision_tree.predict(model, Xtest) test_error = np.sum(y_pred != ytest) / float(Xtest.shape[0]) # print "Test error:", test_error, "at depth", i train_errors[i - 1] = training_error test_errors[i - 1] = test_error x_vals = np.arange(1, 16) plt.title("Tree depth vs. training and test error") plt.plot(x_vals, train_errors, label="Training error") plt.plot(x_vals, test_errors, label="Testing error")
def map_fit(interface, state, label, inp):
    """Map task: fit up to `trees_per_chunk` bootstrapped trees on this chunk,
    measure out-of-bag margins, build a leaf-co-occurrence similarity matrix,
    cluster out-of-bag samples with k-medoids, and emit the forest together
    with the medoids, per-cluster margin medians, and Gower normalization
    ranges.

    Emits on output 0:
      "model"          -- (forest, medoids, stats, gower_range)
      "fill_in_values" -- per-attribute imputation values
    """
    import numpy as np
    from itertools import permutations
    import decision_tree, measures, k_medoids
    out = interface.output(0)
    x, y, margins, forest = [], [], [], []
    # attr_mapping / y_mapping: string value -> integer code (inverted later).
    # similarity_mat: sample_id -> {sample_id: negative co-occurrence count}.
    attr_mapping, y_mapping, similarity_mat = {}, {}, {}
    missing_vals_attr = set()  # indices of attributes that contained missing values
    for row in inp:
        row = row.strip().split(state["delimiter"])
        if len(row) > 1:  # skip blank/degenerate lines
            new_row = []
            for i, j in enumerate(state["X_indices"]):
                if row[j] in state["missing_vals"]:
                    # Keep the missing-value token for now; imputed below.
                    new_row.append(row[j])
                    missing_vals_attr.add(i)
                elif state["X_meta"][i] == "c":
                    # "c" marks a continuous attribute.
                    new_row.append(float(row[j]))
                else:
                    # Discrete attribute: encode the string as an int code.
                    if row[j] not in attr_mapping:
                        attr_mapping[row[j]] = len(attr_mapping)
                    new_row.append(attr_mapping[row[j]])
            x.append(new_row)
            if row[state["y_index"]] not in y_mapping:
                y_mapping[row[state["y_index"]]] = len(y_mapping)
            y.append(y_mapping[row[state["y_index"]]])
    if len(y_mapping) == 1:
        # A single-class chunk cannot produce a useful forest; emit nothing.
        print "Warning: Only one class in the subset!"
        return
    fill_in_values = []
    # Invert the mappings: integer code -> original string.
    attr_mapping = {v: k for k, v in attr_mapping.iteritems()}
    y_mapping = {v: k for k, v in y_mapping.iteritems()}
    if len(missing_vals_attr) > 0:
        for i in range(len(state["X_indices"])):
            if state["X_meta"][i] == "c":
                # Continuous: impute with the mean of the observed floats.
                value = np.average(
                    [sample[i] for sample in x if type(sample[i]) == float])
                fill_in_values.append(value)
            else:
                # Discrete: impute with the most frequent code.
                value = np.bincount(
                    [sample[i] for sample in x
                     if type(sample[i]) == int]).argmax()
                fill_in_values.append(attr_mapping[value])
            if i in missing_vals_attr:
                # Replace remaining missing-value tokens in this column.
                for j in range(len(x)):
                    if x[j][i] in state["missing_vals"]:
                        x[j][i] = value
    x, y = np.array(x), np.array(y)
    iteration = 0
    while len(forest) < state["trees_per_chunk"]:
        # Give up after twice the requested number of attempts (degenerate
        # bootstrap samples are skipped below).
        if iteration == state["trees_per_chunk"] * 2:
            return
        bag_indices = np.random.randint(len(x), size=(len(x)))
        unique = set(bag_indices)
        # Out-of-bag samples (capped at 500) are used for margins/similarity.
        out_of_bag_indices = [i for i in range(len(x)) if i not in unique][:500]
        iteration += 1
        if len(np.unique(y[bag_indices])) == 1:
            # Single-class bootstrap sample; retry.
            continue
        tree = decision_tree.fit(
            x=x[bag_indices],
            y=y[bag_indices],
            t=state["X_meta"],
            randomized=True,
            max_tree_nodes=state["max_tree_nodes"],
            min_samples_leaf=state["min_samples_leaf"],
            min_samples_split=state["min_samples_split"],
            class_majority=state["class_majority"],
            measure=measures.info_gain
            if state["measure"] == "info_gain" else measures.mdl,
            accuracy=state["accuracy"],
            separate_max=state["separate_max"],
        )
        if len(tree) < 2:
            # A root-only tree carries no split; discard and retry.
            continue
        # calculate margins of out-of-bag samples and group them by leaf
        tree_margins, leafs_grouping = {}, {}
        for j in out_of_bag_indices:
            leaf, margin = decision_tree.predict(tree, x[j], y[j])
            tree_margins[j] = margin
            if leaf in leafs_grouping:
                leafs_grouping[leaf].append(j)
            else:
                leafs_grouping[leaf] = [j]
        margins.append(tree_margins)
        # Samples landing in the same leaf are considered similar; similarity
        # is accumulated as a NEGATIVE count so that "most similar" == minimum.
        for k, v in leafs_grouping.iteritems():
            for cx, cy in permutations(v, 2):
                if cx in similarity_mat:
                    similarity_mat[cx][cy] = similarity_mat[cx].get(cy, 0) - 1
                else:
                    similarity_mat[cx] = {cy: -1}
        # Map the fitted tree's integer codes back to original strings.
        tree_mapped = {}
        for k, v in tree.iteritems():
            tree_mapped[k] = [None for i in range(2)]
            for i, node in enumerate(v):
                dist_map = dict([(y_mapping[label], freq)
                                 for label, freq in node[3].iteritems()])
                split_map = set([attr_mapping[int(s)] for s in list(node[2])
                                 ]) if node[5] == "d" else node[2]
                tree_mapped[k][i] = (node[0], node[1], split_map, dist_map,
                                     node[4], node[5])
        forest.append(tree_mapped)
    # Seed k-medoids with the samples having the strongest (most negative)
    # similarity to some other sample.
    min_elements = []
    for k, v in similarity_mat.iteritems():
        min_id = min(similarity_mat[k], key=similarity_mat[k].get)
        min_elements.append((similarity_mat[k][min_id], min_id))
    min_elements = sorted(min_elements)
    # NOTE(review): if state["k"] is neither "sqrt" nor "square", `k` is
    # left unbound and the line below raises NameError -- confirm callers
    # only ever pass these two values.
    if state["k"] == "sqrt":
        k = int(np.sqrt(len(x[0]))) + 1
    elif state["k"] == "square":
        k = len(np.unique(y)) * len(np.unique(y))
    cidx = set()
    counter = 0
    while counter < len(min_elements) and len(cidx) < k:
        cidx.add(min_elements[counter][1])
        counter += 1
    inds, medoids_i = k_medoids.fit(similarity_mat, len(x), list(cidx))
    sample_ids = np.array(similarity_mat.keys())
    medoids_i = [sample_ids[i] for i in medoids_i]
    clusters = [sample_ids[np.where(inds == i)[0]] for i in np.unique(inds)]
    medoids = x[medoids_i].tolist()  # set medoids without sample identifier
    # Split each medoid into its continuous and discrete parts; discrete
    # codes are translated back to their original string values.
    cont, disc = [], []
    for i in range(len(medoids)):
        cont.append([medoids[i][j] for j in range(len(medoids[i]))
                     if state["X_meta"][j] == "c"])
        disc.append([attr_mapping[int(medoids[i][j])]
                     for j in range(len(medoids[i]))
                     if state["X_meta"][j] == "d"])
    medoids = [np.array(cont), np.array(disc)]
    stats = [[] for i in range(len(medoids_i))]
    for i in range(len(forest)):  # for every tree in forest
        for num, cluster in enumerate(clusters):
            # calculate average margin for cluster
            values = [margins[i][sample_id] for sample_id in cluster
                      if int(sample_id) in margins[i]]
            if values != []:
                avg = np.average(values)
                forest[i]["margin" + str(num)] = avg
                stats[num].append(avg)
    stats = [np.median(value) for value in stats]
    # Per-attribute value ranges of continuous attributes, used for Gower
    # distance normalization; zero ranges are clamped to avoid div-by-zero.
    gower_range = np.array([np.ptp(x[:, i])
                            for i in range(len(state["X_meta"]))
                            if state["X_meta"][i] == "c"])
    gower_range[gower_range == 0] = 1e-9
    out.add("model", (forest, medoids, stats, gower_range))
    out.add("fill_in_values", fill_in_values)
def map_fit(interface, state, label, inp):
    """Map task: hold out `num_medoids` randomly chosen rows as fixed test
    points ("medoids"), grow `trees_per_chunk` bootstrapped trees on the
    remaining rows (re-reading the chunk once per tree), and record each
    tree's prediction margin on every held-out point.

    Emits on output 0:
      "model"          -- (forest, margins, medoids, gower_range)
      "fill_in_values" -- per-attribute imputation values
    """
    import numpy as np
    import decision_tree, measures, random
    from collections import Counter
    out = interface.output(0)
    margins, forest, medoids, medoids_y = [], [], [], []
    missing_vals_attr = set()  # indices of attributes that contained missing values
    num_test_samples = state["num_medoids"]
    num_samples = sum(
        [1 for row in inp if len(row.strip().split(state["delimiter"])) > 1])
    # Row indices reserved as held-out test points; never bagged.
    test_indices = set(
        random.sample([i for i in range(num_samples)], num_test_samples))
    for counter in range(state["trees_per_chunk"]):
        # Multiset of bootstrap draws: row index -> number of copies in bag.
        bag_indices = Counter(
            np.random.randint(num_samples, size=(num_samples)))
        # Drop any draw that landed on a held-out row.
        _ = [
            bag_indices.pop(test_id) for test_id in test_indices
            if test_id in bag_indices
        ]
        x, y, fill_in_values = [], [], []
        attr_mapping, y_mapping = {}, {}
        row_num = -1
        for row in inp:
            row_num += 1
            row = row.strip().split(state["delimiter"])
            if len(row) > 1:  # skip blank/degenerate lines
                if row_num in test_indices:
                    # Collect the held-out rows once, on the first pass only.
                    if counter == 0:
                        new_row = []
                        for i, j in enumerate(state["X_indices"]):
                            if state["X_meta"][i] == "c":
                                new_row.append(float(row[j]))
                            else:
                                # Medoids keep the raw string for discrete attrs.
                                new_row.append(row[j])
                        medoids.append(new_row)
                        medoids_y.append(row[state["y_index"]])
                    else:
                        continue
                else:
                    # Append the row once per time it was drawn into the bag.
                    while bag_indices[row_num] > 0:
                        new_row = []
                        for i, j in enumerate(state["X_indices"]):
                            if row[j] in state["missing_vals"]:
                                # Keep the missing-value token; imputed below.
                                new_row.append(row[j])
                                missing_vals_attr.add(i)
                            elif state["X_meta"][i] == "c":
                                new_row.append(float(row[j]))
                            else:
                                # Discrete attribute: encode string as int code.
                                if row[j] not in attr_mapping:
                                    attr_mapping[row[j]] = len(attr_mapping)
                                new_row.append(attr_mapping[row[j]])
                        x.append(new_row)
                        if row[state["y_index"]] not in y_mapping:
                            y_mapping[row[state["y_index"]]] = len(y_mapping)
                        y.append(y_mapping[row[state["y_index"]]])
                        bag_indices[row_num] -= 1
        # Invert the mappings: integer code -> original string.
        attr_mapping = {v: k for k, v in attr_mapping.iteritems()}
        y_mapping = {v: k for k, v in y_mapping.iteritems()}
        if len(y_mapping) == 1:
            # A single-class bag cannot produce a useful tree; stop entirely.
            print "Warning: Only one class in the subset!"
            return
        if len(missing_vals_attr) > 0:
            for i in range(len(state["X_indices"])):
                if state["X_meta"][i] == "c":
                    # Continuous: impute with the mean of the observed floats.
                    value = np.average([
                        sample[i] for sample in x if type(sample[i]) == float
                    ])
                    fill_in_values.append(value)
                else:
                    # Discrete: impute with the most frequent code.
                    value = np.bincount([
                        sample[i] for sample in x if type(sample[i]) == int
                    ]).argmax()
                    fill_in_values.append(attr_mapping[value])
                if i in missing_vals_attr:
                    # Replace remaining missing-value tokens in this column.
                    for j in range(len(x)):
                        if x[j][i] in state["missing_vals"]:
                            x[j][i] = value
        x = np.array(x, dtype=np.float32)
        y = np.array(y, dtype=np.uint16)
        tree = decision_tree.fit(x=x,
                                 y=y,
                                 t=state["X_meta"],
                                 randomized=True,
                                 max_tree_nodes=state["max_tree_nodes"],
                                 min_samples_leaf=state["min_samples_leaf"],
                                 min_samples_split=state["min_samples_split"],
                                 class_majority=state["class_majority"],
                                 measure=measures.info_gain
                                 if state["measure"] == "info_gain" else measures.mdl,
                                 accuracy=state["accuracy"],
                                 separate_max=state["separate_max"])
        print "Tree was built"
        if len(tree) < 2:
            # A root-only tree carries no split; discard it and move on.
            print "tree was removed"
            continue
        # Map the fitted tree's integer codes back to original strings.
        tree_mapped = {}
        for k, v in tree.iteritems():
            tree_mapped[k] = [None for i in range(2)]
            for i, node in enumerate(v):
                dist_map = dict([(y_mapping[label], freq)
                                 for label, freq in node[3].iteritems()])
                split_map = set([attr_mapping[int(s)] for s in list(node[2])
                                 ]) if node[5] == "d" else node[2]
                tree_mapped[k][i] = (node[0], node[1], split_map, dist_map,
                                     node[4], node[5])
        forest.append(tree_mapped)
        # Record this tree's margin on every held-out point.
        tree_margins = []
        for ti in range(num_test_samples):
            leaf, margin = decision_tree.predict(tree_mapped, medoids[ti],
                                                 medoids_y[ti])
            tree_margins.append(margin)
        margins.append(tree_margins)
        print "tree was build"
    # Split each held-out point into continuous and discrete parts.
    cont, disc = [], []
    for i in range(len(medoids)):
        cont.append([
            medoids[i][j] for j in range(len(medoids[i]))
            if state["X_meta"][j] == "c"
        ])
        disc.append([
            medoids[i][j] for j in range(len(medoids[i]))
            if state["X_meta"][j] == "d"
        ])
    medoids = [np.array(cont), np.array(disc)]
    # Per-attribute value ranges of continuous attributes, used for Gower
    # distance normalization; zero ranges are clamped to avoid div-by-zero.
    # NOTE(review): x and fill_in_values here are from the LAST bag only --
    # confirm that is intended by the reducer.
    gower_range = np.array([
        np.ptp(x[:, i]) for i in range(len(state["X_meta"]))
        if state["X_meta"][i] == "c"
    ])
    gower_range[gower_range == 0] = 1e-9
    out.add("model", (forest, margins, medoids, gower_range))
    out.add("fill_in_values", fill_in_values)
def map_fit(interface, state, label, inp):
    """Map task: hold out `num_medoids` randomly chosen rows as fixed test
    points ("medoids"), grow `trees_per_chunk` bootstrapped trees on the
    remaining rows (re-reading the chunk once per tree), and record each
    tree's prediction margin on every held-out point.

    Emits on output 0:
      "model"          -- (forest, margins, medoids, gower_range)
      "fill_in_values" -- per-attribute imputation values
    """
    import numpy as np
    import decision_tree, measures, random
    from collections import Counter
    out = interface.output(0)
    margins, forest, medoids, medoids_y = [], [], [], []
    missing_vals_attr = set()  # indices of attributes that contained missing values
    num_test_samples = state["num_medoids"]
    num_samples = sum(
        [1 for row in inp if len(row.strip().split(state["delimiter"])) > 1])
    # Row indices reserved as held-out test points; never bagged.
    test_indices = set(
        random.sample([i for i in range(num_samples)], num_test_samples))
    for counter in range(state["trees_per_chunk"]):
        # Multiset of bootstrap draws: row index -> number of copies in bag.
        bag_indices = Counter(
            np.random.randint(num_samples, size=(num_samples)))
        # Drop any draw that landed on a held-out row.
        _ = [bag_indices.pop(test_id) for test_id in test_indices
             if test_id in bag_indices]
        x, y, fill_in_values = [], [], []
        attr_mapping, y_mapping = {}, {}
        row_num = -1
        for row in inp:
            row_num += 1
            row = row.strip().split(state["delimiter"])
            if len(row) > 1:  # skip blank/degenerate lines
                if row_num in test_indices:
                    # Collect the held-out rows once, on the first pass only.
                    if counter == 0:
                        new_row = []
                        for i, j in enumerate(state["X_indices"]):
                            if state["X_meta"][i] == "c":
                                new_row.append(float(row[j]))
                            else:
                                # Medoids keep the raw string for discrete attrs.
                                new_row.append(row[j])
                        medoids.append(new_row)
                        medoids_y.append(row[state["y_index"]])
                    else:
                        continue
                else:
                    # Append the row once per time it was drawn into the bag.
                    while bag_indices[row_num] > 0:
                        new_row = []
                        for i, j in enumerate(state["X_indices"]):
                            if row[j] in state["missing_vals"]:
                                # Keep the missing-value token; imputed below.
                                new_row.append(row[j])
                                missing_vals_attr.add(i)
                            elif state["X_meta"][i] == "c":
                                new_row.append(float(row[j]))
                            else:
                                # Discrete attribute: encode string as int code.
                                if row[j] not in attr_mapping:
                                    attr_mapping[row[j]] = len(attr_mapping)
                                new_row.append(attr_mapping[row[j]])
                        x.append(new_row)
                        if row[state["y_index"]] not in y_mapping:
                            y_mapping[row[state["y_index"]]] = len(y_mapping)
                        y.append(y_mapping[row[state["y_index"]]])
                        bag_indices[row_num] -= 1
        # Invert the mappings: integer code -> original string.
        attr_mapping = {v: k for k, v in attr_mapping.iteritems()}
        y_mapping = {v: k for k, v in y_mapping.iteritems()}
        if len(y_mapping) == 1:
            # A single-class bag cannot produce a useful tree; stop entirely.
            print "Warning: Only one class in the subset!"
            return
        if len(missing_vals_attr) > 0:
            for i in range(len(state["X_indices"])):
                if state["X_meta"][i] == "c":
                    # Continuous: impute with the mean of the observed floats.
                    value = np.average([sample[i] for sample in x
                                        if type(sample[i]) == float])
                    fill_in_values.append(value)
                else:
                    # Discrete: impute with the most frequent code.
                    value = np.bincount([sample[i] for sample in x
                                         if type(sample[i]) == int]).argmax()
                    fill_in_values.append(attr_mapping[value])
                if i in missing_vals_attr:
                    # Replace remaining missing-value tokens in this column.
                    for j in range(len(x)):
                        if x[j][i] in state["missing_vals"]:
                            x[j][i] = value
        x = np.array(x, dtype=np.float32)
        y = np.array(y, dtype=np.uint16)
        tree = decision_tree.fit(
            x=x,
            y=y,
            t=state["X_meta"],
            randomized=True,
            max_tree_nodes=state["max_tree_nodes"],
            min_samples_leaf=state["min_samples_leaf"],
            min_samples_split=state["min_samples_split"],
            class_majority=state["class_majority"],
            measure=measures.info_gain
            if state["measure"] == "info_gain" else measures.mdl,
            accuracy=state["accuracy"],
            separate_max=state["separate_max"])
        print "Tree was built"
        if len(tree) < 2:
            # A root-only tree carries no split; discard it and move on.
            print "tree was removed"
            continue
        # Map the fitted tree's integer codes back to original strings.
        tree_mapped = {}
        for k, v in tree.iteritems():
            tree_mapped[k] = [None for i in range(2)]
            for i, node in enumerate(v):
                dist_map = dict([(y_mapping[label], freq)
                                 for label, freq in node[3].iteritems()])
                split_map = set([attr_mapping[int(s)] for s in list(node[2])
                                 ]) if node[5] == "d" else node[2]
                tree_mapped[k][i] = (node[0], node[1], split_map, dist_map,
                                     node[4], node[5])
        forest.append(tree_mapped)
        # Record this tree's margin on every held-out point.
        tree_margins = []
        for ti in range(num_test_samples):
            leaf, margin = decision_tree.predict(tree_mapped, medoids[ti],
                                                 medoids_y[ti])
            tree_margins.append(margin)
        margins.append(tree_margins)
        print "tree was build"
    # Split each held-out point into continuous and discrete parts.
    cont, disc = [], []
    for i in range(len(medoids)):
        cont.append([medoids[i][j] for j in range(len(medoids[i]))
                     if state["X_meta"][j] == "c"])
        disc.append([medoids[i][j] for j in range(len(medoids[i]))
                     if state["X_meta"][j] == "d"])
    medoids = [np.array(cont), np.array(disc)]
    # Per-attribute value ranges of continuous attributes, used for Gower
    # distance normalization; zero ranges are clamped to avoid div-by-zero.
    # NOTE(review): x and fill_in_values here are from the LAST bag only --
    # confirm that is intended by the reducer.
    gower_range = np.array([np.ptp(x[:, i])
                            for i in range(len(state["X_meta"]))
                            if state["X_meta"][i] == "c"])
    gower_range[gower_range == 0] = 1e-9
    out.add("model", (forest, margins, medoids, gower_range))
    out.add("fill_in_values", fill_in_values)