import math
import operator

import numpy as np
from pandas import DataFrame


def regions_to_tree_improved(self, features_df, labels_df, regions, features,
                             feature_mins, feature_maxs, max_samples=1):
    # Build a tree by picking a random candidate split line; if no candidate
    # lines remain, return a leaf labelled with the majority class.
    lines = self.find_lines(regions, features, feature_mins, feature_maxs)
    if lines is None or len(lines) == 0:
        return DecisionTree(label=str(np.argmax(np.bincount(labels_df['cat'].values.astype(int)))),
                            value=None, data=features_df)
    lines_keys = [key for key in lines if len(lines[key]) > 0]
    if len(lines_keys) == 0:
        return DecisionTree(label=str(np.argmax(np.bincount(labels_df['cat'].values.astype(int)))),
                            value=None, data=features_df)

    # Pick a random split feature and a random candidate value for that feature
    random_label = np.random.choice(lines_keys)
    random_value = np.random.choice(lines[random_label])
    data = DataFrame(features_df)
    data['cat'] = labels_df
    best_split_node = DecisionTree(
        data=data, label=random_label, value=random_value,
        left=DecisionTree(data=data[data[random_label] <= random_value]),
        right=DecisionTree(data=data[data[random_label] > random_value]))
    node = DecisionTree(label=best_split_node.label, value=best_split_node.value,
                        data=best_split_node.data)

    # Tighten the bounds of the chosen feature for the two children and divide
    # the regions over them
    feature_mins_right = feature_mins.copy()
    feature_mins_right[node.label] = node.value
    feature_maxs_left = feature_maxs.copy()
    feature_maxs_left[node.label] = node.value
    regions_left = []
    regions_right = []
    for region in regions:
        if region[best_split_node.label][0] < best_split_node.value:
            regions_left.append(region)
        else:
            regions_right.append(region)

    # Recurse only when both children contain enough samples; otherwise turn
    # this node into a leaf labelled with the majority class
    if len(best_split_node.left.data) >= max_samples and len(best_split_node.right.data) >= max_samples:
        node.left = self.regions_to_tree_improved(best_split_node.left.data.drop('cat', axis=1),
                                                  best_split_node.left.data[['cat']],
                                                  regions_left, features, feature_mins, feature_maxs_left)
        node.right = self.regions_to_tree_improved(best_split_node.right.data.drop('cat', axis=1),
                                                   best_split_node.right.data[['cat']],
                                                   regions_right, features, feature_mins_right, feature_maxs)
    else:
        node.label = str(np.argmax(np.bincount(labels_df['cat'].values.astype(int))))
        node.value = None

    return node
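# The methods in this section also rely on a small DecisionTree container exposing
# label, value, data, left and right. The class below is only a minimal sketch of that
# assumed interface, not the project's actual decisiontree implementation: internal
# nodes store the split feature in `label` and the threshold in `value`, while leaves
# store the predicted class in `label` and None in `value`.
class DecisionTree(object):
    def __init__(self, label=None, value=None, data=None, left=None, right=None):
        self.label = label    # split feature name (internal node) or class label (leaf)
        self.value = value    # split threshold, or None for a leaf
        self.data = data      # DataFrame of the samples that reached this node
        self.left = left      # subtree with feature <= value
        self.right = right    # subtree with feature > value

    def predict(self, sample):
        """Route one sample (a mapping of feature name -> value) to a leaf label."""
        if self.value is None:
            return self.label
        if sample[self.label] <= self.value:
            return self.left.predict(sample)
        return self.right.predict(sample)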
def _convert_to_tree(dt, features):
    """Convert a fitted sklearn decision tree to a `decisiontree.decisiontree` object."""
    n_nodes = dt.tree_.node_count
    children_left = dt.tree_.children_left
    children_right = dt.tree_.children_right
    feature = dt.tree_.feature
    threshold = dt.tree_.threshold
    classes = dt.classes_

    # The tree structure can be traversed to compute various properties such
    # as the depth of each node and whether or not it is a leaf.
    node_depth = np.zeros(shape=n_nodes)
    decision_trees = [None] * n_nodes
    for i in range(n_nodes):
        decision_trees[i] = DecisionTree()
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    stack = [(0, -1)]  # seed is the root node id and its parent depth
    while len(stack) > 0:
        node_id, parent_depth = stack.pop()
        node_depth[node_id] = parent_depth + 1

        # If we have a test node, push both children on the stack
        if children_left[node_id] != children_right[node_id]:
            stack.append((children_left[node_id], parent_depth + 1))
            stack.append((children_right[node_id], parent_depth + 1))
        else:
            is_leaves[node_id] = True

    # Wire up the DecisionTree nodes: leaves get the majority class of the sklearn
    # node, internal nodes get the split feature name and threshold
    for i in range(n_nodes):
        if children_left[i] > 0:
            decision_trees[i].left = decision_trees[children_left[i]]
        if children_right[i] > 0:
            decision_trees[i].right = decision_trees[children_right[i]]
        if is_leaves[i]:
            decision_trees[i].label = classes[np.argmax(dt.tree_.value[i][0])]
            decision_trees[i].value = None
        else:
            decision_trees[i].label = features[feature[i]]
            decision_trees[i].value = threshold[i]

    return decision_trees[0]
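# A minimal usage sketch for _convert_to_tree, assuming a scikit-learn classifier is
# available; the iris data and max_depth=3 below are purely illustrative.
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
clf = DecisionTreeClassifier(max_depth=3, random_state=42)
clf.fit(iris.data, iris.target)

# The returned root is a DecisionTree whose internal nodes carry a feature name and
# threshold, and whose leaves carry the majority class of the corresponding sklearn node.
root = _convert_to_tree(clf, list(iris.feature_names))
print(root.label, root.value)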
def construct_tree(self, training_feature_vectors, labels, current_depth=0):
    # Guard against an empty sample set before looking for a split
    if len(labels) == 0:
        return DecisionTree(label=self.default, value=None, data=None)

    # First find the best split feature
    feature, feature_type = self.find_split_feature(training_feature_vectors.copy(), labels.copy())

    data = DataFrame(training_feature_vectors.copy())
    data['cat'] = labels

    # Only pre-pruning enabled at this moment (QUEST already has very nice trees)
    if feature is None or len(data) == 0 or len(training_feature_vectors.index) <= self.max_nr_nodes \
            or len(np.unique(data['cat'])) == 1 or self.all_feature_vectors_equal(training_feature_vectors) \
            or current_depth >= self.max_depth:
        # Create a leaf labelled with the most frequent class
        label = np.argmax(np.bincount(data['cat'].values.astype(int)))
        return DecisionTree(label=label.astype(str), value=None, data=data)

    # If we don't need to pre-prune, calculate the best possible splitting point
    # for the chosen split feature
    split_point = self.find_best_split_point(data.copy(), feature, feature_type)
    if split_point is None or math.isnan(split_point):
        label = np.argmax(np.bincount(data['cat'].values.astype(int)))
        return DecisionTree(label=label.astype(str), value=None, data=data)

    # Divide the data on the chosen feature and split point and recurse on both halves
    split_node = self.divide_data(data.copy(), feature, split_point)
    if len(split_node.left.data) == 0 or len(split_node.right.data) == 0:
        label = np.argmax(np.bincount(data['cat'].values.astype(int)))
        return DecisionTree(label=label.astype(str), value=None, data=data)

    node = DecisionTree(label=split_node.label, value=split_node.value, data=split_node.data)
    node.left = self.construct_tree(split_node.left.data.drop('cat', axis=1),
                                    split_node.left.data[['cat']], current_depth + 1)
    node.right = self.construct_tree(split_node.right.data.drop('cat', axis=1),
                                     split_node.right.data[['cat']], current_depth + 1)
    return node
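# Every pre-pruning branch above collapses the remaining samples into a leaf labelled
# with the most frequent class. A small self-contained illustration of that step, on a
# made-up 'cat' column:
example_data = DataFrame({'cat': [1, 0, 1, 1, 2]})
majority = np.argmax(np.bincount(example_data['cat'].values.astype(int)))
example_leaf = DecisionTree(label=majority.astype(str), value=None, data=example_data)  # label '1'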
def decision_tree_from_text(self, lines):
    dt = DecisionTree()

    if '<=' in lines[0] or '>' in lines[0]:
        # Intermediate node: "<node id>: <feature> <= <value> ..."
        node_name = lines[0].split(':')[0].lstrip()
        label, value = lines[0].split(':')[1].split('<=')
        label = ' '.join(label.strip().split('.'))
        value = value.lstrip().split()[0]
        dt.label = label
        dt.value = float(value)
        # The left subtree starts on the next line; the right subtree starts after the
        # next line that repeats this node id (its "> value" counterpart)
        dt.left = self.decision_tree_from_text(lines[1:])
        counter = 1
        while lines[counter].split(':')[0].lstrip() != node_name:
            counter += 1
        dt.right = self.decision_tree_from_text(lines[counter + 1:])
    else:
        # Terminal node: the class label follows the colon
        dt.label = int(float(lines[0].split(':')[1].lstrip()))

    return dt
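# A hedged illustration of the textual format the parser above appears to expect,
# derived from its splitting logic rather than from a documented grammar: every
# internal node occurs twice (once with its "<=" condition, later with its ">"
# counterpart), each child subtree starts on the line after its parent's condition,
# dots in feature names are turned back into spaces, and a terminal line carries only
# a class label after the colon. The feature names and thresholds are hypothetical.
example_lines = [
    ' 1: petal.width <= 0.80',
    '  2: 0',
    ' 1: petal.width > 0.80',
    '  3: 1',
]
# tree = self.decision_tree_from_text(example_lines)
# tree.label == 'petal width', tree.value == 0.8, tree.left.label == 0, tree.right.label == 1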
def regions_to_tree_improved(self, features_df, labels_df, regions, features,
                             feature_mins, feature_maxs, max_samples=5):
    lines = self.find_lines(regions, features, feature_mins, feature_maxs)
    if lines is None or len(lines) == 0:
        return DecisionTree(label=str(np.argmax(np.bincount(labels_df['cat'].values.astype(int)))),
                            value=None, data=features_df)

    info_gains = self.calculate_info_gains(lines, features_df, labels_df)

    if len(info_gains) > 0:
        # Split on the candidate with the highest information gain
        best_split_node = max(info_gains.items(), key=operator.itemgetter(1))[0]
        node = DecisionTree(label=best_split_node.label, value=best_split_node.value,
                            data=best_split_node.data)
    else:
        # No candidate split left: return a leaf labelled with the majority class
        return DecisionTree(label=str(np.argmax(np.bincount(labels_df['cat'].values.astype(int)))),
                            value=None, data=features_df)

    # We recurse with the split data and tighten the bounds of the chosen feature:
    # for the left child the upper bound becomes the split value,
    # for the right child the lower bound becomes the split value.
    feature_mins_right = feature_mins.copy()
    feature_mins_right[node.label] = node.value
    feature_maxs_left = feature_maxs.copy()
    feature_maxs_left[node.label] = node.value

    regions_left = []
    regions_right = []
    for region in regions:
        if region[best_split_node.label][0] < best_split_node.value:
            regions_left.append(region)
        else:
            regions_right.append(region)

    if len(regions_left) >= max_samples or len(regions_right) >= max_samples:
        node.left = self.regions_to_tree_improved(best_split_node.left.data.drop('cat', axis=1),
                                                  best_split_node.left.data[['cat']],
                                                  regions_left, features, feature_mins, feature_maxs_left)
        node.right = self.regions_to_tree_improved(best_split_node.right.data.drop('cat', axis=1),
                                                   best_split_node.right.data[['cat']],
                                                   regions_right, features, feature_mins_right, feature_maxs)
    else:
        node.label = str(np.argmax(np.bincount(labels_df['cat'].values.astype(int))))
        node.value = None

    return node
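# A hedged usage sketch for regions_to_tree_improved. It assumes that each region is a
# dict mapping a feature name to its [lower, upper] bound pair (which is how
# region[feature][0] is indexed above); how regions are actually extracted from an
# ensemble lives elsewhere in the surrounding class and is not shown here. All names
# and values below are hypothetical.
example_features = ['x1', 'x2']
example_features_df = DataFrame({'x1': [0.2, 0.7, 0.9], 'x2': [1.0, 2.5, 3.0]})
example_labels_df = DataFrame({'cat': [0, 1, 1]})
example_regions = [
    {'x1': [0.0, 0.5], 'x2': [0.0, 2.0]},
    {'x1': [0.5, 1.0], 'x2': [2.0, 4.0]},
]
example_feature_mins = {'x1': 0.0, 'x2': 0.0}
example_feature_maxs = {'x1': 1.0, 'x2': 4.0}

# tree = self.regions_to_tree_improved(example_features_df, example_labels_df,
#                                      example_regions, example_features,
#                                      example_feature_mins, example_feature_maxs,
#                                      max_samples=5)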