# Smoke test for the decision tree code: runs both Gini split routines on a
# small hand-made feature column, then trains DecisionTree_optimized on the
# iris data set and prints the hold-out accuracy in percent.
__author__ = 'fabian'

import split_gini_cython
import numpy as np
import DecisionTree_optimized

# Toy problem: one feature column with three distinct values and two classes.
feat = np.array([1, 1, 1, 2, 2, 2, 3, 3, 3, 3], dtype=np.float32)
labels = np.array([0, 0, 0, 1, 1, 1, 1, 0, 0, 1], dtype=np.int32)
classes = np.array([0, 1], dtype=np.int32)
# Number of samples per class in ``labels`` (five zeros, five ones).
class_distrib = np.array([5, 5], dtype=np.int32)

# Print the result of both implementations so they can be compared by eye.
# The call form of print gives the same output on Python 2 for a single
# argument and is also valid on Python 3.
print(split_gini_cython.split_gini(feat, labels, classes, class_distrib))
print(DecisionTree_optimized.split_gini_new(feat, labels, class_distrib))

# scikit-learn is imported here rather than at the top, so the comparison
# above still runs when scikit-learn is not installed.
from sklearn import datasets

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only provide the legacy module, which was
    # removed again in 0.20.
    from sklearn.cross_validation import train_test_split

iris = datasets.load_iris()
# NOTE: no random_state is passed, so the split (and the accuracy) varies
# between runs.
x_tr, x_te, y_tr, y_te = train_test_split(iris.data, iris.target)

dt = DecisionTree_optimized.DecisionTree()
dt.train(x_tr, y_tr)
pred_y = dt.predict(x_te)

# Share of correctly predicted test samples, in percent.
accur = np.sum(pred_y == y_te) / float(len(y_te)) * 100.
print(accur)
# Smoke test for the decision tree code: runs both Gini split routines on a
# small hand-made feature column, then trains DecisionTree_optimized on the
# iris data set and prints the hold-out accuracy in percent.
__author__ = 'fabian'

import split_gini_cython
import numpy as np
import DecisionTree_optimized

# Toy problem: one feature column with three distinct values and two classes.
feat = np.array([1, 1, 1, 2, 2, 2, 3, 3, 3, 3], dtype=np.float32)
labels = np.array([0, 0, 0, 1, 1, 1, 1, 0, 0, 1], dtype=np.int32)
classes = np.array([0, 1], dtype=np.int32)
# Number of samples per class in ``labels`` (five zeros, five ones).
class_distrib = np.array([5, 5], dtype=np.int32)

# Print the result of both implementations so they can be compared by eye.
# The call form of print gives the same output on Python 2 for a single
# argument and is also valid on Python 3.
print(split_gini_cython.split_gini(feat, labels, classes, class_distrib))
print(DecisionTree_optimized.split_gini_new(feat, labels, class_distrib))

# scikit-learn is imported here rather than at the top, so the comparison
# above still runs when scikit-learn is not installed.
from sklearn import datasets

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only provide the legacy module, which was
    # removed again in 0.20.
    from sklearn.cross_validation import train_test_split

iris = datasets.load_iris()
# NOTE: no random_state is passed, so the split (and the accuracy) varies
# between runs.
x_tr, x_te, y_tr, y_te = train_test_split(
    iris.data, iris.target)

dt = DecisionTree_optimized.DecisionTree()
dt.train(x_tr, y_tr)
pred_y = dt.predict(x_te)

# Share of correctly predicted test samples, in percent.
accur = np.sum(pred_y == y_te) / float(len(y_te)) * 100.
print(accur)
def split_node(self, max_depth=10, min_instances=10):
    """Split this node on the feature/threshold pair with the lowest score.

    Each feature returned by ``_get_features_for_split`` is scored with
    ``split_gini_cython.split_gini``; the candidate with the smallest score
    is used. The node is turned into a leaf via ``_make_leaf`` when no
    candidate scores below infinity or when the chosen threshold would
    leave one side without samples.

    :param max_depth: passed on to each child's ``check_constraints_violated``.
    :param min_instances: passed on to each child's
        ``check_constraints_violated``.
    :return: list of the new children for which
        ``check_constraints_violated`` returned a false value; an empty
        list when the node became a leaf.
    """
    # The Gini score is minimized, so start from the worst possible value.
    lowest_score = float("inf")
    chosen_feature = None
    chosen_threshold = None

    for feature_id in self._get_features_for_split():
        score, threshold = split_gini_cython.split_gini(
            self.data[:, feature_id], self.labels, self._classes,
            self.class_distrib)
        # Strict comparison: on equal scores the earlier feature is kept.
        if lowest_score > score:
            lowest_score = score
            chosen_threshold = threshold
            chosen_feature = feature_id

    if chosen_feature is None:
        self._make_leaf()
        return []

    column = self.data[:, chosen_feature]
    left_rows = np.where(column <= chosen_threshold)[0]
    right_rows = np.where(column > chosen_threshold)[0]

    # A split that sends every sample to one side is useless.
    if min(len(right_rows), len(left_rows)) == 0:
        self._make_leaf()
        return []

    self._threshold = chosen_threshold
    self._splitFeatureID = chosen_feature

    self._child_left = DecisionTreeNode(
        self.data[left_rows, :], self.labels[left_rows], self._classes,
        use_features=self._use_features, depth=self._depth + 1)
    self._child_right = DecisionTreeNode(
        self.data[right_rows, :], self.labels[right_rows], self._classes,
        use_features=self._use_features, depth=self._depth + 1)

    # Only children that do not violate the constraints are handed back.
    return [
        child for child in (self._child_left, self._child_right)
        if not child.check_constraints_violated(max_depth, min_instances)
    ]
def split_node(self, max_depth=10, min_instances=10):
    """Split this node on the feature/threshold pair with the lowest score.

    Each feature returned by ``_get_features_for_split`` is scored with
    ``split_gini_cython.split_gini``; the candidate with the smallest score
    is used. The node is turned into a leaf via ``_make_leaf`` when no
    candidate scores below infinity or when the chosen threshold would
    leave one side without samples.

    :param max_depth: passed on to each child's ``check_constraints_violated``.
    :param min_instances: passed on to each child's
        ``check_constraints_violated``.
    :return: list of the new children for which
        ``check_constraints_violated`` returned a false value; an empty
        list when the node became a leaf.
    """
    candidate_features = self._get_features_for_split()

    # The Gini score is minimized, so start from the worst possible value.
    winner_score = float("inf")
    winner_feature = None
    winner_threshold = None

    for feature_id in candidate_features:
        gini, cut = split_gini_cython.split_gini(
            self.data[:, feature_id], self.labels, self._classes,
            self.class_distrib)
        # Strict comparison: on equal scores the earlier feature is kept.
        if winner_score > gini:
            winner_score, winner_threshold, winner_feature = \
                gini, cut, feature_id

    if winner_feature is None:
        self._make_leaf()
        return []

    feature_values = self.data[:, winner_feature]
    rows_left = np.where(feature_values <= winner_threshold)[0]
    rows_right = np.where(feature_values > winner_threshold)[0]

    # A split that sends every sample to one side is useless.
    if 0 in (len(rows_right), len(rows_left)):
        self._make_leaf()
        return []

    self._threshold = winner_threshold
    self._splitFeatureID = winner_feature

    self._child_left = DecisionTreeNode(
        self.data[rows_left, :],
        self.labels[rows_left],
        self._classes,
        use_features=self._use_features,
        depth=self._depth + 1)
    self._child_right = DecisionTreeNode(
        self.data[rows_right, :],
        self.labels[rows_right],
        self._classes,
        use_features=self._use_features,
        depth=self._depth + 1)

    # Only children that do not violate the constraints are handed back.
    kept_children = []
    for node in (self._child_left, self._child_right):
        if child_is_ok(node, max_depth, min_instances):
            kept_children.append(node)
    return kept_children


def child_is_ok(node, max_depth, min_instances):
    # True when the child passes its own depth / size constraint check.
    return not node.check_constraints_violated(max_depth, min_instances)