def _orange_dt_to_my_dt(self, orange_dt_root):
    # Check if leaf
    if orange_dt_root.node_type == Orange.classification.tree.C45Node.Leaf:
        return decisiontree.DecisionTree(left=None, right=None,
                                         label=str(int(orange_dt_root.leaf)),
                                         data=None, value=None)
    else:
        # Internal node: split on the tested attribute at the cut point and recurse
        dt = decisiontree.DecisionTree(label=orange_dt_root.tested.name,
                                       data=None, value=orange_dt_root.cut)
        dt.left = self._orange_dt_to_my_dt(orange_dt_root.branch[0])
        dt.right = self._orange_dt_to_my_dt(orange_dt_root.branch[1])
        return dt
def trainEnsemble(data, numTrees, predictionIndex):
    ensemble = []
    for tree_index in range(numTrees):
        train, test = bootstrap(data)
        # print("train set: " + str(len(train)) + "\n" + str(train))
        root = dt.DecisionTree(predictedIndex=predictionIndex)
        root.makeRootNode(train, data)
        root.induce(root.data, root.listOfAttr)
        ensemble.append(root)
    return ensemble
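# The `bootstrap` helper used above is not shown here. A minimal sketch of what it
# presumably does (sampling rows with replacement for training and keeping the
# out-of-bag rows as a test split); the real implementation may differ:
import random

def bootstrap(data):
    n = len(data)
    picked = [random.randrange(n) for _ in range(n)]      # indices drawn with replacement
    train = [data[i] for i in picked]
    test = [data[i] for i in range(n) if i not in set(picked)]  # out-of-bag samples
    return train, test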
def convert_to_tree(classifier, features):
    """
    Converts the DecisionTreeClassifier from sklearn (adapted CART) to DecisionTree from decisiontree.py

    :param classifier: the trained classifier
    :param features: the features used in the classifier
    :return: a DecisionTree from decisiontree.py
    """
    n_nodes = classifier.tree_.node_count
    children_left = classifier.tree_.children_left
    children_right = classifier.tree_.children_right
    feature = classifier.tree_.feature
    threshold = classifier.tree_.threshold
    classes = classifier.classes_

    # The tree structure can be traversed to compute various properties such
    # as the depth of each node and whether or not it is a leaf.
    node_depth = np.zeros(shape=n_nodes)
    decision_trees = [None] * n_nodes
    for i in range(n_nodes):
        decision_trees[i] = decisiontree.DecisionTree()
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    stack = [(0, -1)]  # seed is the root node id and its parent depth
    while len(stack) > 0:
        node_id, parent_depth = stack.pop()
        node_depth[node_id] = parent_depth + 1

        # If we have a test node
        if children_left[node_id] != children_right[node_id]:
            stack.append((children_left[node_id], parent_depth + 1))
            stack.append((children_right[node_id], parent_depth + 1))
        else:
            is_leaves[node_id] = True

    for i in range(n_nodes):
        if children_left[i] > 0:
            decision_trees[i].left = decision_trees[children_left[i]]
        if children_right[i] > 0:
            decision_trees[i].right = decision_trees[children_right[i]]
        if is_leaves[i]:
            decision_trees[i].label = classes[np.argmax(classifier.tree_.value[i][0])]
            decision_trees[i].value = None
        else:
            decision_trees[i].label = features[feature[i]]
            decision_trees[i].value = threshold[i]

    return decision_trees[0]
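# A hedged usage sketch of the conversion above. The toy data and feature names are
# illustrative assumptions; only `convert_to_tree` and the sklearn API come from the code.
from sklearn.tree import DecisionTreeClassifier
import numpy as np

X = np.array([[1.0, 5.0], [2.0, 3.0], [3.0, 8.0], [4.0, 1.0]])  # hypothetical toy data
y = np.array([0, 0, 1, 1])
feature_names = ['feat_a', 'feat_b']

clf = DecisionTreeClassifier(max_depth=2).fit(X, y)
root = convert_to_tree(clf, feature_names)  # root of the converted DecisionTree
print(root.label, root.value)               # split feature and threshold at the root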
def trainDecisionTree(pkgDict, settings=None):
    totalCorrectness, trainBatch, count = 0, [], 0
    for key in pkgDict.keys():
        trainBatch.append(pkgDict[key][TRAIN])
    trainBatch = [data for sublist in trainBatch for data in sublist]
    discretizedDataset, bins = discretize(trainBatch, settings)
    decisionTree = d.DecisionTree(settings, bins)
    root = decisionTree.train(discretizedDataset)
    decisionTree.generateGraphviz(root)
    for key in pkgDict.keys():
        for sample in pkgDict[key][TEST]:
            if key == decisionTree.classify(root, sample):
                totalCorrectness += 1
            count += 1
    return totalCorrectness / count
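# `discretize` is not shown in this snippet. A minimal sketch of an equal-width
# binning variant it could correspond to; the column layout (class label last),
# the bin count and the ignored `settings` argument are assumptions:
import numpy as np

def discretize(dataset, settings=None, n_bins=5):
    """Sketch of equal-width binning: replace every numeric feature value with a bin index."""
    features = np.array([row[:-1] for row in dataset], dtype=float)
    labels = [row[-1] for row in dataset]
    # One array of interior bin edges per feature column.
    bins = [np.linspace(col.min(), col.max(), n_bins + 1)[1:-1] for col in features.T]
    discretized = []
    for sample, label in zip(features, labels):
        binned = [int(np.digitize(value, edges)) for value, edges in zip(sample, bins)]
        discretized.append(binned + [label])
    return discretized, bins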
def _convert_to_tree(self):
    """Convert a sklearn object to a `decisiontree.decisiontree` object"""
    n_nodes = self.dt.tree_.node_count
    children_left = self.dt.tree_.children_left
    children_right = self.dt.tree_.children_right
    feature = self.dt.tree_.feature
    threshold = self.dt.tree_.threshold
    classes = self.dt.classes_

    # The tree structure can be traversed to compute various properties such
    # as the depth of each node and whether or not it is a leaf.
    node_depth = np.zeros(shape=n_nodes)
    decision_trees = [None] * n_nodes
    for i in range(n_nodes):
        decision_trees[i] = decisiontree.DecisionTree()
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    stack = [(0, -1)]  # seed is the root node id and its parent depth
    while len(stack) > 0:
        node_id, parent_depth = stack.pop()
        node_depth[node_id] = parent_depth + 1

        # If we have a test node
        if children_left[node_id] != children_right[node_id]:
            stack.append((children_left[node_id], parent_depth + 1))
            stack.append((children_right[node_id], parent_depth + 1))
        else:
            is_leaves[node_id] = True

    for i in range(n_nodes):
        if children_left[i] > 0:
            decision_trees[i].left = decision_trees[children_left[i]]
        if children_right[i] > 0:
            decision_trees[i].right = decision_trees[children_right[i]]
        if is_leaves[i]:
            decision_trees[i].label = self.dt.classes_[np.argmax(self.dt.tree_.value[i][0])]
            decision_trees[i].value = None
        else:
            decision_trees[i].label = self.features[feature[i]]
            decision_trees[i].value = threshold[i]

    return decision_trees[0]
def _decision_tree_from_text(self, lines):
    dt = decisiontree.DecisionTree()

    if '<=' in lines[0] or '>' in lines[0]:
        # Intermediate node
        node_name = lines[0].split(':')[0].lstrip()
        label, value = lines[0].split(':')[1].split('<=')
        label = ' '.join(label.lstrip().rstrip().split('.'))
        value = value.lstrip().split()[0]
        dt.label = label
        dt.value = float(value)
        dt.left = self._decision_tree_from_text(lines[1:])
        counter = 1
        while lines[counter].split(':')[0].lstrip() != node_name:
            counter += 1
        dt.right = self._decision_tree_from_text(lines[counter + 1:])
    else:
        # Terminal node
        dt.label = int(eval(lines[0].split(':')[1].lstrip()))

    return dt
def build_dt_from_ensemble(decision_trees, data, class_label, tests,
                           prior_entropy, prior_tests={}, min_nr_samples=1,
                           calc_fracs_from_ensemble=False):
    """
    Given an ensemble of decision trees, build a single decision tree using
    estimates from the ensemble

    :param decision_trees: the ensemble of decision trees
    :param data: the training data frame
    :param class_label: the column with the class labels
    :param tests: all possible tests (calculated from the ensemble)
    :param prior_entropy: recursive parameter to calculate information gain
    :param prior_tests: the tests that are already picked for our final decision tree
    :param min_nr_samples: pre-prune condition, the data must be larger than this parameter
    :param calc_fracs_from_ensemble: if True, branch fractions are estimated from the ensemble instead of the data
    :return: a single decision tree, calculated using information from the ensemble
    """
    # Pre-pruning conditions:
    #   - the number of samples is <= min_nr_samples
    #   - there are no tests left
    #   - only one unique class remains in the data
    if len(data) > min_nr_samples and len(tests) > 0 and len(np.unique(data[class_label].values)) > 1:
        max_ig = 0
        best_pos_data, best_neg_data, best_pos_entropy, best_neg_entropy = [None] * 4
        best_dt = decisiontree.DecisionTree()

        # Find the test that results in the maximum information gain
        for test in tests:
            pos_avg_probs, neg_avg_probs, pos_fraction, neg_fraction = {}, {}, 0.0, 0.0
            for dt in decision_trees:
                pos_prob_dict = calculate_prob_dict(dt, test[0], test[1], prior_tests, False)
                neg_prob_dict = calculate_prob_dict(dt, test[0], test[1], prior_tests, True)
                if (not any(math.isnan(x) for x in pos_prob_dict.values())
                        and not any(math.isnan(x) for x in neg_prob_dict.values())):
                    pos_avg_probs = add_reduce_by_key(pos_avg_probs, pos_prob_dict)
                    neg_avg_probs = add_reduce_by_key(neg_avg_probs, neg_prob_dict)
                if calc_fracs_from_ensemble and len(data) > 0:
                    pos_fraction += float(len(dt.data[dt.data[test[0]] <= test[1]])) / len(dt.data)
                    neg_fraction += float(len(dt.data[dt.data[test[0]] > test[1]])) / len(dt.data)

            # Average the accumulated probabilities and fractions over the ensemble
            for key in pos_avg_probs:
                pos_avg_probs[key] /= len(decision_trees)
            for key in neg_avg_probs:
                neg_avg_probs[key] /= len(decision_trees)
            if calc_fracs_from_ensemble:
                pos_fraction /= float(len(decision_trees))
                neg_fraction /= float(len(decision_trees))

            pos_entropy = calculate_entropy(np.divide(list(pos_avg_probs.values()), len(decision_trees)))
            neg_entropy = calculate_entropy(np.divide(list(neg_avg_probs.values()), len(decision_trees)))

            pos_data = data[data[test[0]] <= test[1]].copy()
            neg_data = data[data[test[0]] > test[1]].copy()
            if not calc_fracs_from_ensemble:
                pos_fraction = float(len(pos_data)) / float(len(data))
                neg_fraction = float(len(neg_data)) / float(len(data))

            weighted_entropy = pos_fraction * pos_entropy + neg_fraction * neg_entropy
            information_gain = prior_entropy - weighted_entropy

            if information_gain > max_ig and len(pos_data) > 0 and len(neg_data) > 0:
                max_ig, best_dt.label, best_dt.value = information_gain, test[0], test[1]
                best_pos_data, best_neg_data, best_pos_entropy, best_neg_entropy = \
                    pos_data, neg_data, pos_entropy, neg_entropy

        if max_ig == 0:
            # If we can't find a test that results in an information gain, we can pre-prune
            return decisiontree.DecisionTree(value=None,
                                             label=get_most_occurring_class(data, class_label))

        # Update some variables and do recursive calls
        left_prior_tests = prior_tests.copy()
        left_prior_tests.update({(best_dt.label, best_dt.value): True})
        new_tests = tests.copy()
        new_tests.remove((best_dt.label, best_dt.value))
        best_dt.left = build_dt_from_ensemble(decision_trees, best_pos_data, class_label,
                                              new_tests, best_pos_entropy, left_prior_tests,
                                              min_nr_samples)
        right_prior_tests = prior_tests.copy()
        right_prior_tests.update({(best_dt.label, best_dt.value): False})
        best_dt.right = build_dt_from_ensemble(decision_trees, best_neg_data, class_label,
                                               new_tests, best_neg_entropy, right_prior_tests,
                                               min_nr_samples)
        return best_dt
    else:
        return decisiontree.DecisionTree(value=None,
                                         label=get_most_occurring_class(data, class_label))
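# A hedged sketch of how the merging function above might be driven. The entropy
# helper, the toy data frame, the hard-coded candidate tests and the `ensemble`
# variable are assumptions; only the build_dt_from_ensemble signature comes from the code.
import numpy as np
import pandas as pd

def class_entropy(frame, class_label):
    """Shannon entropy of the class distribution in a data frame (assumed helper)."""
    probs = frame[class_label].value_counts(normalize=True).values
    return -np.sum(probs * np.log2(probs))

# Hypothetical training frame with one feature and a binary target.
train_df = pd.DataFrame({'age': [22, 35, 47, 52, 61, 30],
                         'target': [0, 0, 1, 1, 1, 0]})

# Candidate tests are (feature, threshold) tuples, normally harvested from the
# internal nodes of the ensemble; hard-coded here for illustration.
tests = {('age', 40.0), ('age', 55.0)}

# `ensemble` would be a list of decisiontree.DecisionTree objects built elsewhere
# (e.g. by bagging); it is assumed to exist in this sketch.
merged = build_dt_from_ensemble(ensemble, train_df, 'target', tests,
                                prior_entropy=class_entropy(train_df, 'target'),
                                min_nr_samples=2)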
def testInduce(data):
    root = dt.DecisionTree()
    root.makeRootNode(data, data)
    root.induce(root.data, root.listOfAttr)
    root._print(0)
    return root
inst_oneR.begin_oneR()
list_rulesOneR = inst_oneR.get_final_rules()
for rule in list_rulesOneR:
    print "set of rules: "
    print rule

inst_logR = log_regression.LogisticRegression(data_reader.copy(), class_values)
print """
----------------
"""
print("Logistic Regression algorithm...")
inst_logR.begin_alg()
param_classif = inst_logR.get_param()
print param_classif

inst_dtree = decisiontree.DecisionTree(data_reader.copy(), class_values)
print """
----------------
"""
print("Decision Tree Classifier algorithm...")
inst_dtree.begin_alg()
param_classif = inst_dtree.get_param()
print param_classif

inst_svm = svm.SVM(data_reader.copy(), class_values)
print """
----------------
"""
print("Support Vector Machines(SVM) algorithm...")
inst_svm.begin_alg()
param_weight = inst_svm.get_param()
def build_dt_from_ensemble(decision_trees, data, class_label, tests,
                           prior_entropy, prior_tests={}, min_nr_samples=1,
                           calc_fracs_from_ensemble=False):
    """
    Given an ensemble of decision trees, build a single decision tree using
    estimates from the ensemble

    **Params**
    ----------
    - `decision_trees` (list of `decisiontree.DecisionTree` objects): the ensemble
      of decision trees to be merged
    - `data` (pandas DataFrame): the data frame with training data
    - `class_label` (string): the column identifier for the column with class labels in the data
    - `tests` (set of tuples): all possible tests (extracted from the ensemble)
    - `prior_entropy` (float): recursive parameter to calculate information gain
    - `prior_tests` (set of tuples): the tests that are already picked for our final decision tree
    - `min_nr_samples` (int): pre-prune condition, stop searching if the number of samples
      is smaller than or equal to this threshold
    - `calc_fracs_from_ensemble` (boolean): if `True`, the different probabilities are
      calculated using the ensemble; else, the data is used

    **Returns**
    -----------
    a single decision tree, calculated using information from the ensemble
    """
    # Pre-pruning conditions:
    #   - the number of samples is <= min_nr_samples
    #   - there are no tests left
    #   - only one unique class remains in the data
    if len(data) > min_nr_samples and len(tests) > 0 and len(np.unique(data[class_label].values)) > 1:
        max_ig = 0
        best_pos_data, best_neg_data, best_pos_entropy, best_neg_entropy = [None] * 4
        best_dt = decisiontree.DecisionTree()

        # Find the test that results in the maximum information gain
        for test in tests:
            pos_avg_probs, neg_avg_probs, pos_fraction, neg_fraction = {}, {}, 0.0, 0.0
            for dt in decision_trees:
                pos_prob_dict = _calculate_prob_dict(dt, test[0], test[1], prior_tests, False)
                neg_prob_dict = _calculate_prob_dict(dt, test[0], test[1], prior_tests, True)
                if (not any(math.isnan(x) for x in pos_prob_dict.values())
                        and not any(math.isnan(x) for x in neg_prob_dict.values())):
                    pos_avg_probs = _add_reduce_by_key(pos_avg_probs, pos_prob_dict)
                    neg_avg_probs = _add_reduce_by_key(neg_avg_probs, neg_prob_dict)
                if calc_fracs_from_ensemble and len(data) > 0:
                    pos_fraction += float(len(dt.data[dt.data[test[0]] <= test[1]])) / len(dt.data)
                    neg_fraction += float(len(dt.data[dt.data[test[0]] > test[1]])) / len(dt.data)

            # Average the accumulated probabilities and fractions over the ensemble
            for key in pos_avg_probs:
                pos_avg_probs[key] /= len(decision_trees)
            for key in neg_avg_probs:
                neg_avg_probs[key] /= len(decision_trees)
            if calc_fracs_from_ensemble:
                pos_fraction /= float(len(decision_trees))
                neg_fraction /= float(len(decision_trees))

            pos_entropy = _calculate_entropy(np.divide(list(pos_avg_probs.values()), len(decision_trees)))
            neg_entropy = _calculate_entropy(np.divide(list(neg_avg_probs.values()), len(decision_trees)))

            pos_data = data[data[test[0]] <= test[1]].copy()
            neg_data = data[data[test[0]] > test[1]].copy()
            if not calc_fracs_from_ensemble:
                pos_fraction = float(len(pos_data)) / float(len(data))
                neg_fraction = float(len(neg_data)) / float(len(data))

            weighted_entropy = pos_fraction * pos_entropy + neg_fraction * neg_entropy
            information_gain = prior_entropy - weighted_entropy

            if information_gain > max_ig and len(pos_data) > 0 and len(neg_data) > 0:
                max_ig, best_dt.label, best_dt.value = information_gain, test[0], test[1]
                best_pos_data, best_neg_data, best_pos_entropy, best_neg_entropy = \
                    pos_data, neg_data, pos_entropy, neg_entropy

        if max_ig == 0:
            # If we can't find a test that results in an information gain, we can pre-prune
            return decisiontree.DecisionTree(value=None,
                                             label=_get_most_occurring_class(data, class_label))

        # Update some variables and do recursive calls
        left_prior_tests = prior_tests.copy()
        left_prior_tests.update({(best_dt.label, best_dt.value): True})
        new_tests = tests.copy()
        new_tests.remove((best_dt.label, best_dt.value))
        best_dt.left = build_dt_from_ensemble(decision_trees, best_pos_data, class_label,
                                              new_tests, best_pos_entropy, left_prior_tests,
                                              min_nr_samples)
        right_prior_tests = prior_tests.copy()
        right_prior_tests.update({(best_dt.label, best_dt.value): False})
        best_dt.right = build_dt_from_ensemble(decision_trees, best_neg_data, class_label,
                                               new_tests, best_neg_entropy, right_prior_tests,
                                               min_nr_samples)
        return best_dt
    else:
        return decisiontree.DecisionTree(value=None,
                                         label=_get_most_occurring_class(data, class_label))
with open(join(tDir, 'testData.csv'), 'r') as dataFile:
    with open(join(tDir, 'testLabels.csv'), 'r') as labelFile:
        print tDir + ':', tree.testFile(dataFile, labelFile)

if 'load' in args:
    filename = args['load'][0]
    try:
        with open(filename, 'rb') as treeFile:
            tree = decisiontree.load(treeFile)
    except:
        print filename, 'could not be loaded!'
        quit()
else:
    print '\nTRAINING\n'
    tree = decisiontree.DecisionTree()
    if 'train' in args:
        for tDir in args['train']:
            trainTree(tDir)
    else:
        tree = decisiontree.DecisionTree()
        trainTree()
    saveTree()

if 'test' not in args:
    print '\nTESTING\n'
    testTree()

if 'save_tree' in args:
    filename = args['save_tree'][0]
    if filename == 'def':
        saveTree()
list_rulesOneR = inst_oneR.get_final_rules()
for rule in list_rulesOneR:
    print "set of rules: "
    print rule
OneRvalidation_acc = inst_oneR.get_average_acc()

inst_logR = log_regression.LogisticRegression(data_reader.copy(), class_values)
print """
----------------
"""
print("Logistic Regression algorithm...")
inst_logR.begin_alg()
param_classif = inst_logR.get_param()
# print param_classif

inst_dtree = decisiontree.DecisionTree(data_reader.copy(), class_values,
                                       feature_selection_function, KBest)
print """
----------------
"""
print("Decision Tree Classifier algorithm...")
inst_dtree.begin_alg()
param_classif = inst_dtree.get_param()
# print param_classif

inst_svm = svm.SVM(data_reader.copy(), class_values)
print """
----------------
"""
print("Support Vector Machines(SVM) algorithm...")
inst_svm.begin_alg()
param_weight = inst_svm.get_param()
def convertToTree(self, verbose=False):
    # Using those arrays, we can parse the tree structure:
    #   label = name of the feature that is split on
    #   value = the value of that feature at which the split is made
    n_nodes = self.dt.tree_.node_count
    children_left = self.dt.tree_.children_left
    children_right = self.dt.tree_.children_right
    feature = self.dt.tree_.feature
    threshold = self.dt.tree_.threshold
    classes = self.dt.classes_

    # The tree structure can be traversed to compute various properties such
    # as the depth of each node and whether or not it is a leaf.
    node_depth = np.zeros(shape=n_nodes)
    decision_trees = [None] * n_nodes
    for i in range(n_nodes):
        decision_trees[i] = decisiontree.DecisionTree()
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    stack = [(0, -1)]  # seed is the root node id and its parent depth
    while len(stack) > 0:
        node_id, parent_depth = stack.pop()
        node_depth[node_id] = parent_depth + 1

        # If we have a test node
        if children_left[node_id] != children_right[node_id]:
            stack.append((children_left[node_id], parent_depth + 1))
            stack.append((children_right[node_id], parent_depth + 1))
        else:
            is_leaves[node_id] = True

    if verbose:
        print("The binary tree structure has %s nodes and has "
              "the following tree structure:" % n_nodes)

    for i in range(n_nodes):
        if children_left[i] > 0:
            decision_trees[i].left = decision_trees[children_left[i]]
        if children_right[i] > 0:
            decision_trees[i].right = decision_trees[children_right[i]]
        if is_leaves[i]:
            # decision_trees[i].label = self.dt.classes_[self.dt.tree_.value[i][0][1]]
            decision_trees[i].label = self.dt.classes_[np.argmax(self.dt.tree_.value[i][0])]
            decision_trees[i].value = None
            # if verbose:
            #     print(bcolors.OKBLUE + "%snode=%s leaf node." % (node_depth[i] * "\t", i) + bcolors.ENDC)
        else:
            decision_trees[i].label = self.features[feature[i]]
            decision_trees[i].value = threshold[i]
            # if verbose:
            #     print("%snode=%s test node: go to node %s if %s %s <= %s %s else to node %s."
            #           % (node_depth[i] * "\t", i, children_left[i], bcolors.BOLD,
            #              self.features[feature[i]], threshold[i], bcolors.ENDC, children_right[i]))

    return decision_trees[0]
def tree():
    x = decisiontree.DecisionTree()
    mytree = x.createDecisionTreeModel()
    return str(mytree)
for i in range(28):
    # print("entropy vect ", i, dt.entropy(datax[i]))
    entropy_cond = dtree.entropy_cond([datay[datax[:, i] == 1], datay[datax[:, i] == 0]])
    print("Conditional entropy ", i, ", ", fields[i], " : ", entropy_cond)
    print("Difference : ", entropy - entropy_cond)

# If the difference between the entropy and the conditional entropy is 0,
# the attribute does not reduce the entropy at all, so the information gain is zero.
# For the first partition, the best attribute is "drama", because it has the
# largest difference between the entropy and the conditional entropy;
# in other words, it is the attribute that reduces the entropy the most.

for i in range(0, depth):
    dt = dtree.DecisionTree()
    dt.max_depth = i  # fix the depth of the tree to i (with i between 1 and 10)
    dt.min_samples_split = 2  # minimum number of examples needed to split a node
    dt.fit(datax, datay)
    dt.predict(datax[:5, :])
    print("Score for depth " + str(i) + " : " + str(dt.score(datax, datay)))
    # Draw the tree to a pdf file if pydot is installed.
    # dt.to_pdf("./tmp/testtree_deep_" + str(i) + ".pdf", fields)
    # Otherwise use http://www.webgraphviz.com/
    # dt.to_dot(fields)
    # or print it in the console
    # print(dt.print_tree(fields))

###### Question 1.4
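# To make the information-gain reasoning above concrete, a small self-contained sketch
# of what `dtree.entropy` and `dtree.entropy_cond` presumably compute (Shannon entropy
# of the label vector and its weighted average over the partitions induced by a binary
# attribute); the real module may differ in signature and naming.
import numpy as np

def entropy(y):
    """Shannon entropy of a label vector."""
    _, counts = np.unique(y, return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs))

def entropy_cond(partitions):
    """Weighted average entropy over the partitions of the label vector."""
    total = sum(len(p) for p in partitions)
    return sum(len(p) / total * entropy(p) for p in partitions)

# Toy example: labels y split on a binary attribute x.
y = np.array([1, 1, 0, 0, 1, 0])
x = np.array([1, 1, 1, 0, 0, 0])
gain = entropy(y) - entropy_cond([y[x == 1], y[x == 0]])
print(gain)  # information gain of the attribute; 0 means the split is useless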