def train(self, D, X, Y):
    # Train num_trees trees, each on a random subset of the feature names
    # and a random fraction of the rows.
    for i in range(self.num_trees):
        x = random.sample(X, int(len(X) * self.max_X))
        d = D.sample(frac=self.max_samples)
        tree = DecisionTree()
        tree.train(d, x, Y)
        self.trees.append(tree)
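# A hedged usage sketch for the trainer above. RandomForest and the attribute
# names (num_trees, max_X, max_samples, trees) are assumptions inferred from
# the method body, not confirmed by the source.
import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [4, 3, 2, 1],
                   'label': [0, 1, 0, 1]})
forest = RandomForest()          # assumed class name
forest.num_trees = 5
forest.max_X = 0.5               # fraction of feature names sampled per tree
forest.max_samples = 0.8         # fraction of rows sampled per tree
forest.trees = []
forest.train(D=df, X=['a', 'b'], Y='label')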
def computeDecisionTreeCrossValidation(args, dict_algorithms):
    if args.debug:
        print("Running decision tree...", end='')
    model = DecisionTree(args)
    dict_algorithms["decision_tree"] = model.computeCrossValidation()
    if args.debug:
        print("ok!")
def regions_to_tree_improved(self, features_df, labels_df, regions, features,
                             feature_mins, feature_maxs, max_samples=1):
    lines = self.find_lines(regions, features, feature_mins, feature_maxs)
    # Guard before indexing into lines: if no candidate split lines were
    # found, return a leaf labelled with the majority class.
    if not lines:
        return DecisionTree(
            label=str(np.argmax(np.bincount(labels_df['cat'].values.astype(int)))),
            value=None, data=features_df)
    lines_keys = [key for key in lines.keys() if len(lines[key]) > 0]
    if len(lines_keys) == 0:
        return DecisionTree(
            label=str(np.argmax(np.bincount(labels_df['cat'].values.astype(int)))),
            value=None, data=features_df)

    # Pick a random feature and a random split value on that feature.
    random_label = np.random.choice(lines_keys)
    random_value = np.random.choice(lines[random_label])
    data = DataFrame(features_df)
    data['cat'] = labels_df
    best_split_node = DecisionTree(
        data=data, label=random_label, value=random_value,
        left=DecisionTree(data=data[data[random_label] <= random_value]),
        right=DecisionTree(data=data[data[random_label] > random_value]))
    node = DecisionTree(label=best_split_node.label,
                        value=best_split_node.value,
                        data=best_split_node.data)

    # Tighten the bounding box on each side of the split.
    feature_mins_right = feature_mins.copy()
    feature_mins_right[node.label] = node.value
    feature_maxs_left = feature_maxs.copy()
    feature_maxs_left[node.label] = node.value
    regions_left = []
    regions_right = []
    for region in regions:
        if region[best_split_node.label][0] < best_split_node.value:
            regions_left.append(region)
        else:
            regions_right.append(region)

    if (len(best_split_node.left.data) >= max_samples
            and len(best_split_node.right.data) >= max_samples):
        node.left = self.regions_to_tree_improved(
            best_split_node.left.data.drop('cat', axis=1),
            best_split_node.left.data[['cat']], regions_left, features,
            feature_mins, feature_maxs_left)
        node.right = self.regions_to_tree_improved(
            best_split_node.right.data.drop('cat', axis=1),
            best_split_node.right.data[['cat']], regions_right, features,
            feature_mins_right, feature_maxs)
    else:
        # Too few samples on one side: collapse to a majority-class leaf.
        node.label = str(np.argmax(np.bincount(labels_df['cat'].values.astype(int))))
        node.value = None
    return node
def fit(self, X, y):
    self.trees = []  # was `self.tree = []`, which left self.trees undefined below
    for _ in range(self.n_trees):
        tree = DecisionTree(min_samples_split=self.min_samples_split,
                            max_depth=self.max_depth,
                            n_features=self.n_feature)
        x_sample, y_sample = bootstrap_sample(X, y)
        tree.fit(x_sample, y_sample)
        self.trees.append(tree)
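# bootstrap_sample is called above but not defined in this snippet; a minimal
# sketch, assuming X and y are NumPy arrays with matching first dimensions:
# draw n rows with replacement.
import numpy as np

def bootstrap_sample(X, y):
    n_samples = X.shape[0]
    idxs = np.random.choice(n_samples, size=n_samples, replace=True)
    return X[idxs], y[idxs]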
def predict(depth, x, y, x_test, y_test):
    dt = DecisionTree()
    dt.max_depth = depth  # set the maximum depth of the tree
    dt.min_samples_split = 2  # minimum number of samples required to split a node
    dt.fit(x, y)
    dt.predict(x_test[:5, :])  # smoke-test prediction on a few rows (result unused)
    score = dt.score(x_test, y_test)
    print(score)
    return score
def main():
    word_list = create_words()
    data, labels = readfile(word_list)
    tree = DecisionTree()
    data_train, data_test, labels_train, labels_test = \
        train_test_split(data, labels, test_size=test_size, random_state=42)
    ## calls our tree algorithm and prediction method ##
    tree.train(data_train, labels_train)
    labels_pred = tree.predict(data_test)
    compute_accuracy(labels_test, labels_pred)
def fit(self, datanum, ans):
    for _ in range(self.num_tree):
        # Keep a random fraction of the data for each tree; the held-out
        # split halves are discarded.
        x_train, _, y_train, _ = train_test_split(
            datanum, ans, test_size=1.0 - self.sample_data_rate)
        tree = DecisionTree(x_train, y_train,
                            rand_features=self.sample_features)
        tree.fit()
        self.trees.append(tree)
def erreurs(taux_app, data, prof_max):
    erreurs_train = []
    erreurs_test = []
    x = [i for i in range(2, prof_max)]
    dataTrain, dataTest = partition(taux_app, data)
    train_x, train_y = x_y(dataTrain)
    test_x, test_y = x_y(dataTest)
    for i in range(2, prof_max):
        dt = DecisionTree()
        dt.max_depth = i
        dt.min_samples_split = 2
        dt.fit(train_x, train_y)
        erreurs_train.append(1 - dt.score(train_x, train_y))
        erreurs_test.append(1 - dt.score(test_x, test_y))
    import matplotlib.pyplot as plt
    plt.figure()
    plt.plot(x, erreurs_train)
    plt.plot(x, erreurs_test)
    plt.ylabel('erreur en fonction de la profondeur, taux app : ' + str(taux_app))
    plt.legend(['app', 'test'], loc='upper left')
    plt.savefig(str(taux_app) + "erreurs.png")
    plt.show()
def divide_data(self, data, feature, value):
    """
    Divide the data into two subsets, thanks pandas
    :param data: the dataframe to divide
    :param feature: on which column of the dataframe are we splitting?
    :param value: what threshold do we use to split?
    :return: node: initialised decision tree object
    """
    return DecisionTree(left=DecisionTree(data=data[data[feature] <= value]),
                        right=DecisionTree(data=data[data[feature] > value]),
                        label=feature,
                        data=data,
                        value=value)
def scoreTrain():
    scores = []
    for depth in profondeurs:
        dt = DecisionTree(depth)
        dt.fit(datax, datay)
        # dt.predict(datax[:5, :])
        scores.append(dt.score(datax, datay))
    # Draws the tree to a PDF file if pydot is installed:
    # dt.to_pdf("/tmp/test_tree.pdf", fields)
    # otherwise use http://www.webgraphviz.com/
    # print(dt.to_dot(fields))
    # or in the console:
    # print(dt.print_tree(fields))
    return scores
def part5():
    """We take 2 features with high class correlation to show decision
    boundaries for breast cancer: Uniformity_of_Cell_Shape and
    Uniformity_of_Cell_Size."""
    # Dataset 1
    # KNN
    df = reduce_df(cancer_df, 3)
    cancer_features, cancer_labels = Preprocessing.get_labels_features(df)
    KNN.plot_decision_bound(
        cancer_features,
        cancer_labels,
        df.keys()[1],
        df.keys()[2],
        KNN,
        k=cancer_k,
    )
    # Decision Tree
    DecisionTree.plot_decision_bound(
        cancer_features,
        cancer_labels,
        df.keys()[1],
        df.keys()[2],
        DecisionTree,
        max_depth=cancer_d,
    )
    # Dataset 2
    # KNN
    df = reduce_df(hepatitis_df, 3)
    hepatitis_features, hepatitis_labels = Preprocessing.get_labels_features(df)
    KNN.plot_decision_bound(
        hepatitis_features,
        hepatitis_labels,
        df.keys()[1],
        df.keys()[2],
        KNN,
        k=hepatitis_k,
    )
    # Decision Tree
    DecisionTree.plot_decision_bound(
        hepatitis_features,
        hepatitis_labels,
        df.keys()[1],
        df.keys()[2],
        DecisionTree,
        max_depth=hepatitis_d,
    )
def decision_tree_learning(examples, attributes, parent_examples=()):
    if len(examples) == 0:
        return plurality_value(parent_examples)
    elif same_classification(examples):
        return DecisionLeaf(examples[0][target])
    elif len(attributes) == 0:
        return plurality_value(examples)
    elif percent_error(examples) < error_threshold:
        return plurality_value(examples)
    else:
        a = importance(attributes, examples)
        tree = DecisionTree(a, dataset.attrnames[a])
        for (val_i, exs_i) in split_by(a, examples):
            subtree = decision_tree_learning(exs_i, removeall(a, attributes),
                                             examples)
            tree.add(val_i, subtree)
        return tree
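# plurality_value is used above but not shown; a minimal sketch, assuming it
# wraps the most common target value of the examples in a DecisionLeaf
# (`target` is the class column index, as in the function above).
from collections import Counter

def plurality_value(examples):
    counts = Counter(example[target] for example in examples)
    most_common_value, _ = counts.most_common(1)[0]
    return DecisionLeaf(most_common_value)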
def handle_predict(argv):
    # WARNING: loading a hypothesis from an arbitrary file and evaluating it
    # is insecure; only use model files from a trusted source.
    with open(argv[3], "r") as f:
        model = f.readline().strip('\n')
        hypothesis = f.readline()
    hypothesis = literal_eval(hypothesis)
    tree = DecisionTree()
    tree.define_positive_class(lambda x: x.classification == 'en')
    tree.define_classes(processing.classes)
    tree.define_attributes(processing.attr_definitions)
    examples = process_file(argv[4], training=False)
    examples = tree.create_examples(examples)
    return tree.classify(examples, hypothesis)
def train(self, data, labels):
    self.trees = []
    for i in range(self.ITERATIONS):
        # Bootstrap: sample row indices with replacement.
        inds = np.random.choice(np.arange(len(data)), len(data))
        self.trees.append(
            DecisionTree(min_leaf=self.MIN_LEAF, m=self.M,
                         max_depth=self.MAX_DEPTH))
        self.trees[-1].train(data[inds], labels[inds])
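# No predict is shown for this ensemble; a hedged companion sketch that
# majority-votes across self.trees, assuming each DecisionTree's predict
# returns a 1-D array of non-negative integer labels.
import numpy as np

def predict(self, data):
    votes = np.array([tree.predict(data) for tree in self.trees])
    # one column per sample; pick the most frequent label in each column
    return np.apply_along_axis(lambda col: np.bincount(col).argmax(),
                               axis=0, arr=votes)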
def decision_tree_learning(examples, attributes, m, parent_examples=()):
    if len(examples) == 0:
        return majority_value(parent_examples)
    elif same_classification(examples):
        return DecisionLeaf(examples[0][target])
    elif len(attributes) == 0:
        return majority_value(examples)
    elif misclass_error(examples) < m:
        return majority_value(examples)
    else:
        A = pick_attribute(attributes, examples)
        tree = DecisionTree(A, dataset.attrnames[A])
        nonlocal internal_nodes
        internal_nodes += 1
        for (val_i, exs_i) in split(A, examples):
            subtree = decision_tree_learning(exs_i, removeall(A, attributes),
                                             m, examples)
            tree.add(val_i, subtree)
        return tree
def part4():
    hepatitis_p = KNN.tune_knn_p(hepatitis_df)
    cancer_p = KNN.tune_knn_p(cancer_df)
    print("\nThe ideal P for hepatitis minkowski distance function:", hepatitis_p)
    print("The ideal P for breast cancer minkowski distance function:", cancer_p)
    hepatitis_cf = DecisionTree.tune_costfn(X_train_h, X_test_h, y_train_h,
                                            y_test_h, hepatitis_d)
    print(
        "\nThe most accurate cost function for hepatitis dataset:",
        hepatitis_cf.__name__,
    )
    cancer_cf = DecisionTree.tune_costfn(X_train_c, X_test_c, y_train_c,
                                         y_test_c, cancer_d)
    print(
        "\nThe most accurate cost function for breast cancer dataset:",
        cancer_cf.__name__,
    )
def fit(self, X, y): """Build multiple trees based on training data. Args: X (numpy array): sample in shape [n x d], where n is number of samples and d is number of features. y (numpy array): sample labels in shape [n]. """ n, d = X.shape for i in range(self.tree_num): # draws random subset of features features = np.random.choice(d, self.fc, replace=False) tree = DecisionTree(self.max_depth, self.min_improv, self.eval_func) samples = np.random.choice(n, n, replace=True) X_train = X[:, features][samples, ] y_train = y[samples] tree.fit(X_train, y_train) self.features[i] = features self.trees[i] = tree
def partitionnement_test(datax, datay, rp, rdm):
    # rp: the proportion of the data used for training (the rest is the test
    # set); rdm: if True, draw rp uniformly at random.
    dt = DecisionTree()
    dt.min_samples_split = 2
    if rdm:
        rp = random.uniform(0, 1)
    indiceap = np.random.choice(np.arange(len(datax)), int(rp * len(datax)),
                                replace=False)
    indicet = []
    for i in range(0, len(datax)):
        if i not in indiceap:
            indicet.append(i)
    testy = np.zeros((len(indicet)), int)
    apprentissagey = np.zeros((len(indiceap)), int)
    testx = np.delete(datax, indiceap, axis=0)
    apprentissagex = np.delete(datax, indicet, axis=0)
    for i in range(0, len(indiceap)):
        apprentissagey[i] = datay[indiceap[i]]
    for i in range(0, len(indicet)):
        testy[i] = datay[indicet[i]]
    l_scoretest = []
    l_scoreapprentissage = []
    for i in range(2, 20, 3):
        dt.max_depth = i
        dt.fit(apprentissagex, apprentissagey)
        dt.predict(apprentissagex[:5, :])
        l_scoretest.append(1 - dt.score(testx, testy))
        l_scoreapprentissage.append(1 - dt.score(apprentissagex, apprentissagey))
    plt.plot(range(2, 20, 3), l_scoretest, 'r--',
             range(2, 20, 3), l_scoreapprentissage, 'b--')
    plt.show()
    plt.close()
def partitionnement_test(datax, datay, rp, rdm, couleur):
    # rp: the proportion of the data used for training.
    # rdm: a boolean that decides whether the split proportion is drawn at random.
    dt = DecisionTree()
    dt.min_samples_split = 2
    if rdm:
        rp = random.uniform(0, 1)
    # indiceap holds the indices of datax used for training, indicet those
    # used for testing. indiceap is drawn without replacement, covering a
    # proportion rp of datax.
    indiceap = np.random.choice(np.arange(len(datax)), int(rp * len(datax)),
                                replace=False)
    indicet = []
    for i in range(0, len(datax)):
        if i not in indiceap:
            indicet.append(i)
    testy = np.zeros((len(indicet)), int)
    apprentissagey = np.zeros((len(indiceap)), int)
    testx = np.delete(datax, indiceap, axis=0)
    apprentissagex = np.delete(datax, indicet, axis=0)
    for i in range(0, len(indiceap)):
        apprentissagey[i] = datay[indiceap[i]]
    for i in range(0, len(indicet)):
        testy[i] = datay[indicet[i]]
    l_scoretest = []
    l_scoreapprentissage = []
    # Try several tree depths with a step of 3 to keep the runtime down.
    for i in range(2, 20, 3):
        dt.max_depth = i
        dt.fit(apprentissagex, apprentissagey)
        dt.predict(apprentissagex[:5, :])
        l_scoretest.append(1 - dt.score(testx, testy))
        l_scoreapprentissage.append(1 - dt.score(apprentissagex, apprentissagey))
    plt.plot(range(2, 20, 3), l_scoretest, couleur + '--',
             range(2, 20, 3), l_scoreapprentissage, couleur)
    plt.show()
def part3():
    hepatitis_d = DecisionTree.tune_tree_depth(X_train_h, X_test_h, y_train_h,
                                               y_test_h, training=True)
    cancer_d = DecisionTree.tune_tree_depth(X_train_c, X_test_c, y_train_c,
                                            y_test_c, training=True)
    print("\nThe ideal depth for hepatitis is (based on train accuracy):",
          hepatitis_d)
    print("The ideal depth for breast cancer is (based on train accuracy):",
          cancer_d)
    hepatitis_d = DecisionTree.tune_tree_depth(X_train_h, X_test_h, y_train_h,
                                               y_test_h)
    cancer_d = DecisionTree.tune_tree_depth(X_train_c, X_test_c, y_train_c,
                                            y_test_c)
    print("\nThe ideal depth for hepatitis is (based on test accuracy):",
          hepatitis_d)
    print("The ideal depth for breast cancer is (based on test accuracy):",
          cancer_d)
def _convert_to_tree(dt, features):
    """Convert a sklearn object to a `decisiontree.decisiontree` object."""
    n_nodes = dt.tree_.node_count
    children_left = dt.tree_.children_left
    children_right = dt.tree_.children_right
    feature = dt.tree_.feature
    threshold = dt.tree_.threshold
    classes = dt.classes_

    # The tree structure can be traversed to compute various properties, such
    # as the depth of each node and whether or not it is a leaf.
    node_depth = np.zeros(shape=n_nodes)
    decision_trees = [None] * n_nodes
    for i in range(n_nodes):
        decision_trees[i] = DecisionTree()
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    stack = [(0, -1)]  # seed is the root node id and its parent depth
    while len(stack) > 0:
        node_id, parent_depth = stack.pop()
        node_depth[node_id] = parent_depth + 1
        # If we have a test (internal) node, push its children
        if children_left[node_id] != children_right[node_id]:
            stack.append((children_left[node_id], parent_depth + 1))
            stack.append((children_right[node_id], parent_depth + 1))
        else:
            is_leaves[node_id] = True

    for i in range(n_nodes):
        if children_left[i] > 0:
            decision_trees[i].left = decision_trees[children_left[i]]
        if children_right[i] > 0:
            decision_trees[i].right = decision_trees[children_right[i]]
        if is_leaves[i]:
            decision_trees[i].label = classes[np.argmax(dt.tree_.value[i][0])]
            decision_trees[i].value = None
        else:
            decision_trees[i].label = features[feature[i]]
            decision_trees[i].value = threshold[i]
    return decision_trees[0]
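# Hypothetical usage of _convert_to_tree: fit an sklearn classifier and
# convert it. Only the sklearn calls are real API; DecisionTree and the
# .label/.value attributes come from the surrounding project.
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
clf = DecisionTreeClassifier(max_depth=3).fit(iris.data, iris.target)
custom_root = _convert_to_tree(clf, list(iris.feature_names))
print(custom_root.label, custom_root.value)  # root split feature and threshold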
def scoreTrainTest(f: float):
    assert 0 < f <= 1
    l = int(tot * f)
    scoresTrain = []
    scoresTest = []
    for depth in profondeurs:
        dt = DecisionTree(depth)
        dt.fit(datax[:l], datay[:l])
        scoresTrain.append(dt.score(datax[:l], datay[:l]))
        scoresTest.append(dt.score(datax[l:], datay[l:]))
    return scoresTrain, scoresTest
def construct_tree(self, training_feature_vectors, labels, current_depth=0):
    # First find the best split feature (renamed `type` to `feature_type` to
    # avoid shadowing the builtin)
    feature, feature_type = self.find_split_feature(
        training_feature_vectors.copy(), labels.copy())

    # Can be removed later
    if len(labels) == 0:
        return DecisionTree(label=self.default, value=None, data=None)

    data = DataFrame(training_feature_vectors.copy())
    data['cat'] = labels

    # Only pre-pruning is enabled at this moment (QUEST already produces very
    # nice trees)
    if feature is None or len(data) == 0 \
            or len(training_feature_vectors.index) <= self.max_nr_nodes \
            or len(np.unique(data['cat'])) == 1 \
            or self.all_feature_vectors_equal(training_feature_vectors) \
            or current_depth >= self.max_depth:
        # Create a leaf labelled with the most frequent class
        label = np.argmax(np.bincount(data['cat'].values.astype(int)))
        return DecisionTree(label=label.astype(str), value=None, data=data)

    # If we don't need to pre-prune, compute the best possible splitting
    # point for the best split feature
    split_point = self.find_best_split_point(data.copy(), feature, feature_type)
    if split_point is None or math.isnan(split_point):
        label = np.argmax(np.bincount(data['cat'].values.astype(int)))
        return DecisionTree(label=label.astype(str), value=None, data=data)

    # Divide the data using this best split feature and value, then recurse
    split_node = self.divide_data(data.copy(), feature, split_point)
    if len(split_node.left.data) == 0 or len(split_node.right.data) == 0:
        label = np.argmax(np.bincount(data['cat'].values.astype(int)))
        return DecisionTree(label=label.astype(str), value=None, data=data)

    node = DecisionTree(label=split_node.label, value=split_node.value,
                        data=split_node.data)
    node.left = self.construct_tree(split_node.left.data.drop('cat', axis=1),
                                    split_node.left.data[['cat']],
                                    current_depth + 1)
    node.right = self.construct_tree(split_node.right.data.drop('cat', axis=1),
                                     split_node.right.data[['cat']],
                                     current_depth + 1)
    return node
def scores_selon_prof(taux_app, data, prof_max):
    scores = []
    x = [i for i in range(2, prof_max)]
    dataTrain, dataTest = partition(taux_app, data)
    train_x, train_y = x_y(dataTrain)
    test_x, test_y = x_y(dataTest)
    for i in range(2, prof_max):
        dt = DecisionTree()
        dt.max_depth = i
        dt.min_samples_split = 2
        dt.fit(train_x, train_y)
        scores.append(dt.score(test_x, test_y))
    import matplotlib.pyplot as plt
    plt.plot(x, scores)
    plt.ylabel('score en fonction de la profondeur, taux app : ' + str(taux_app))
    plt.savefig(str(taux_app) + "scores.png")
    plt.show()
def decision_tree_from_text(self, lines):
    dt = DecisionTree()
    if '<=' in lines[0] or '>' in lines[0]:
        # Intermediate node
        node_name = lines[0].split(':')[0].lstrip()
        label, value = lines[0].split(':')[1].split('<=')
        label = ' '.join(label.lstrip().rstrip().split('.'))
        value = value.lstrip().split()[0]
        dt.label = label
        dt.value = float(value)
        dt.left = self.decision_tree_from_text(lines[1:])
        # The right subtree starts after the line that repeats the node name
        # (the '>' branch of the same split).
        counter = 1
        while lines[counter].split(':')[0].lstrip() != node_name:
            counter += 1
        dt.right = self.decision_tree_from_text(lines[counter + 1:])
    else:
        # Terminal node
        dt.label = int(eval(lines[0].split(':')[1].lstrip()))
    return dt
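# A hypothetical input illustrating the text format the parser above seems to
# assume: each internal node appears twice (its '<=' and '>' branches, matched
# by node name), and a leaf line carries only the class label.
example_lines = [
    " 1: petal.width <= 0.8",
    "  2: 0",
    " 1: petal.width > 0.8",
    "  3: 1",
]
# `parser` stands in for an instance of the class defining the method above.
tree = parser.decision_tree_from_text(example_lines)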
def validation_croisee(n, taux_app, data, prof_max):
    data_app, _ = partition(taux_app, data)
    erreurs_moy_app = []
    borders = np.linspace(0, len(data_app), n + 1, dtype=int)
    for depth in range(1, prof_max):
        print(depth)  # progress output
        erreurs_test = []
        for i in range(n):
            # Fold i is the validation set; the remainder is the training set.
            data_test = data_app[borders[i]:borders[i + 1]]
            if len(data_app[0:borders[i]]) > 0:
                data_train = np.concatenate(
                    (data_app[0:borders[i]],
                     data_app[borders[i + 1]:len(data_app)]))
            else:
                data_train = data_app[borders[i + 1]:len(data_app)]
            train_x, train_y = x_y(data_train)
            test_x, test_y = x_y(data_test)
            dt = DecisionTree()
            dt.max_depth = depth
            dt.min_samples_split = 2
            dt.fit(train_x, train_y)
            erreurs_test.append(1 - dt.score(test_x, test_y))
        print(erreurs_moy_app)
        erreurs_moy_app.append((1 / n) * np.array(erreurs_test).sum())
    x = [i for i in range(1, prof_max)]
    plt.figure()
    plt.plot(x, erreurs_moy_app)
    plt.xlabel(
        'Erreur moyenne en fonction de la prof avec VC avec taux app de : '
        + str(taux_app))
    plt.legend(['app'], loc='upper left')
    plt.savefig(str(taux_app) + "erreursVC.png")
    # plt.show()
def apprentissage(datax, datay, prop):
    ax = datax[:int(np.floor(prop * len(datax)))]  # training data
    ay = datay[:int(np.floor(prop * len(datax)))]
    tx = datax[int(np.floor(prop * len(datax))):]  # test data
    ty = datay[int(np.floor(prop * len(datax))):]
    ascore = np.zeros(9)
    tscore = np.zeros(9)
    for d in range(1, 28, 3):
        print("apprentissage : prop = " + str(prop) + " depth = " + str(d))
        dt = DecisionTree()
        dt.max_depth = d  # set the maximum depth of the tree
        dt.min_samples_split = 2  # minimum number of samples required to split a node
        dt.fit(ax, ay)
        ascore[int(np.floor(d / 3))] = 1 - dt.score(ax, ay)
        tscore[int(np.floor(d / 3))] = 1 - dt.score(tx, ty)
    plt.plot(range(1, 28, 3), ascore)
    plt.plot(range(1, 28, 3), tscore)
    plt.legend(["Apprentissage", "Test"])
    plt.title("Proportion : " + str(prop))
    plt.show()
def scoreCross(n=5):
    """Average the score over n folds; each test fold has size tot / n."""
    assert isinstance(n, int)
    scoresTrain = []
    scoresTest = []
    for depth in profondeurs:
        sTrain = 0
        sTest = 0
        for i in range(n):
            start = tot * i // n
            end = tot * (i + 1) // n
            dt = DecisionTree(depth)
            xtrain = np.vstack((datax[:start], datax[end:]))
            ytrain = np.hstack((datay[:start], datay[end:]))
            dt.fit(xtrain, ytrain)
            sTrain += dt.score(xtrain, ytrain)
            sTest += dt.score(datax[start:end], datay[start:end])
        scoresTrain.append(sTrain / n)
        scoresTest.append(sTest / n)
    return scoresTrain, scoresTest
class Decision(object):
    def __init__(self):
        self.forest_tree = {}
        self.test_list = []
        self.tree = DecisionTree()
        self.sample = BalanceSample()
        self.file_name = open("/Users/homelink/storein/rent.txt", "r")
        self.datas = []

    def generateSample(self):
        positive = []
        negative = []
        count = 0
        for data in self.file_name.readlines():
            rent_dic = json.loads(data)
            self.datas.append([
                rent_dic["business_area"], rent_dic["area"],
                rent_dic["width"], rent_dic["face"], rent_dic["structure"],
                rent_dic["height"], rent_dic["day_rent_per_centare"],
                rent_dic["tenancy"], rent_dic["transfer_fee"],
                rent_dic["licence"], rent_dic["water"], rent_dic["power"],
                rent_dic["fire"], rent_dic["wind"], rent_dic["gas"],
                rent_dic["industry"], rent_dic["is_rent"]
            ])
            if rent_dic["is_rent"] == "True":
                positive.append(count)
            else:
                negative.append(count)
            count = count + 1
        # Over-sample the minority class to balance the training data.
        data = self.sample.over_sample(positive, negative, self.datas)
        self.datas = self.datas + data

    def run(self):
        self.generateSample()
        tree = self.tree.buildtree(self.datas, self.tree.giniimpurity_2)
        # prune(tree, 0.1)
        self.tree.printtree(tree)

    def frequence(self, trade):
        # Return the most frequent value(s) in trade.
        trade_dic = {}
        trade_area = []
        for i in trade:
            if i in trade_dic.keys():
                trade_dic[i] = trade_dic[i] + 1
            else:
                trade_dic[i] = 1
        max_index = max(list(trade_dic.values()))
        for key, value in trade_dic.items():
            if value == max_index:
                trade_area.append(key)
        return trade_area

    def accuracy(self):
        true_index = 0
        false_index = 0
        test_list = []
        test = open("/Users/homelink/dianping/test.txt", "r")
        for line in test.readlines():
            test_list = test_list + list(eval(line))
        for i in range(len(test_list)):
            # Renamed from `test` to avoid shadowing the file handle above.
            test_item = test_list[i]
            result_true = test_item[5]
            trade_store = []
            for key, tree in self.forest_tree.items():
                result = classify([
                    test_item[0],
                    int(test_item[1]),
                    float(test_item[2]),
                    float(test_item[3]),
                    float(test_item[4])
                ], tree)
                max_value = 0
                # Inner loop variable renamed so it no longer shadows `key`.
                for label, value in result.items():
                    if value > max_value:
                        max_value = value
                        trade = label
                trade_store.append(trade)
            option_result = self.frequence(trade_store)
            if result_true in option_result:
                true_index = true_index + 1
            else:
                false_index = false_index + 1
        return true_index / len(test_list)
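# A hypothetical driver for the class above; it assumes the hard-coded
# rent.txt file opened in __init__ exists and is readable.
if __name__ == "__main__":
    decision = Decision()
    decision.run()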