def regions_to_tree_improved(self,
                                 features_df,
                                 labels_df,
                                 regions,
                                 features,
                                 feature_mins,
                                 feature_maxs,
                                 max_samples=1):

        lines = self.find_lines(regions, features, feature_mins, feature_maxs)
        lines_keys = [key for key in lines.keys() if len(lines[key]) > 0]
        if lines is None or len(lines) <= 0 or len(lines_keys) <= 0:
            return DecisionTree(label=str(
                np.argmax(np.bincount(labels_df['cat'].values.astype(int)))),
                                value=None,
                                data=features_df)

        random_label = np.random.choice(lines_keys)
        random_value = np.random.choice(lines[random_label])
        data = DataFrame(features_df)
        data['cat'] = labels_df
        best_split_node = DecisionTree(
            data=data,
            label=random_label,
            value=random_value,
            left=DecisionTree(data=data[data[random_label] <= random_value]),
            right=DecisionTree(data=data[data[random_label] > random_value]))
        node = DecisionTree(label=best_split_node.label,
                            value=best_split_node.value,
                            data=best_split_node.data)

        feature_mins_right = feature_mins.copy()
        feature_mins_right[node.label] = node.value
        feature_maxs_left = feature_maxs.copy()
        feature_maxs_left[node.label] = node.value
        regions_left = []
        regions_right = []
        for region in regions:
            if region[best_split_node.label][0] < best_split_node.value:
                regions_left.append(region)
            else:
                regions_right.append(region)
        if len(best_split_node.left.data) >= max_samples and len(
                best_split_node.right.data) >= max_samples:
            node.left = self.regions_to_tree_improved(
                best_split_node.left.data.drop('cat', axis=1),
                best_split_node.left.data[['cat']], regions_left, features,
                feature_mins, feature_maxs_left)
            node.right = self.regions_to_tree_improved(
                best_split_node.right.data.drop('cat', axis=1),
                best_split_node.right.data[['cat']], regions_right, features,
                feature_mins_right, feature_maxs)

        else:
            node.label = str(
                np.argmax(np.bincount(labels_df['cat'].values.astype(int))))
            node.value = None

        return node
# --- Example #2 (snippet-listing separator; vote count: 0) ---
def _convert_to_tree(dt, features):
    """Convert a fitted sklearn decision tree into a `DecisionTree`.

    Args:
        dt: fitted sklearn estimator exposing ``tree_`` and ``classes_``.
        features: sequence mapping sklearn feature indices to feature names.

    Returns:
        DecisionTree: root node of the converted tree; intermediate nodes
        carry (feature name, threshold), leaves carry the majority class
        as ``label`` with ``value=None``.
    """
    n_nodes = dt.tree_.node_count
    children_left = dt.tree_.children_left
    children_right = dt.tree_.children_right
    feature = dt.tree_.feature
    threshold = dt.tree_.threshold
    classes = dt.classes_
    values = dt.tree_.value

    # One DecisionTree shell per sklearn node; wired up below.
    decision_trees = [DecisionTree() for _ in range(n_nodes)]

    for i in range(n_nodes):
        # sklearn marks "no child" with -1; a strict `> 0` also works only
        # because node 0 is always the root, so `!= -1` states the intent.
        # (The original additionally walked a DFS stack to precompute a
        # node_depth array and an is_leaves mask; the depth was never used
        # and "is leaf" is exactly the direct test below.)
        if children_left[i] != -1:
            decision_trees[i].left = decision_trees[children_left[i]]

        if children_right[i] != -1:
            decision_trees[i].right = decision_trees[children_right[i]]

        if children_left[i] == children_right[i]:
            # Leaf: label with the majority class, no split threshold.
            decision_trees[i].label = classes[np.argmax(values[i][0])]
            decision_trees[i].value = None
        else:
            # Test node: feature name plus split threshold.
            decision_trees[i].label = features[feature[i]]
            decision_trees[i].value = threshold[i]

    return decision_trees[0]
    def decision_tree_from_text(self, lines):
        """Parse a textual decision-tree dump (one node per line) into a
        DecisionTree.

        ``lines[0]`` describes the current node.  A line containing '<=' or
        '>' is an intermediate split node of the form
        ``<prefix> : <feature> <= <value> ...``; anything else is treated as
        a terminal node of the form ``<prefix> : <class>``.
        """

        dt = DecisionTree()

        if '<=' in lines[0] or '>' in lines[0]:
            # Intermediate node
            # The text before ':' identifies this node (including any
            # depth-marker prefix); used below to find the matching '>' line.
            node_name = lines[0].split(':')[0].lstrip()
            label, value = lines[0].split(':')[1].split('<=')
            # Feature names are dot-separated in the dump; re-join with
            # spaces to recover the original name.
            label = ' '.join(label.lstrip().rstrip().split('.'))
            # Keep only the numeric threshold, dropping any trailing tokens.
            value = value.lstrip().split()[0]
            dt.label = label
            dt.value = float(value)
            # Left ('<=') subtree starts on the very next line.
            dt.left = self.decision_tree_from_text(lines[1:])
            counter = 1
            # Scan forward for the sibling '>' line, which repeats this
            # node's name; the right subtree starts just after it.
            # NOTE(review): assumes the node name does not reappear inside
            # the left subtree — confirm against the dump producer.
            while lines[counter].split(':')[0].lstrip() != node_name:
                counter += 1
            dt.right = self.decision_tree_from_text(lines[counter + 1:])
        else:
            # Terminal node
            # NOTE(review): eval() on dump text is only safe for trusted
            # input; int(float(...)) would cover the numeric case.
            dt.label = int(eval(lines[0].split(':')[1].lstrip()))

        return dt
# --- Example #4 (snippet-listing separator; vote count: 0) ---
    def regions_to_tree_improved(self,
                                 features_df,
                                 labels_df,
                                 regions,
                                 features,
                                 feature_mins,
                                 feature_maxs,
                                 max_samples=5):
        """Recursively build a DecisionTree, choosing at each step the
        candidate split line with the highest information gain.

        Args:
            features_df: DataFrame of feature columns for the current node.
            labels_df: DataFrame with a 'cat' column of integer class labels.
            regions: iterable of region dicts mapping feature -> (low, high).
            features: feature names considered for splitting.
            feature_mins: per-feature lower bounds (copied, not mutated).
            feature_maxs: per-feature upper bounds (copied, not mutated).
            max_samples: minimum region count required on a side for the
                recursion to continue (default 5).

        Returns:
            DecisionTree: root of the (sub)tree for this region set; a
            majority-class leaf when no usable split exists.
        """
        lines = self.find_lines(regions, features, feature_mins, feature_maxs)

        # No candidate split lines at all: emit a majority-class leaf.
        if lines is None or len(lines) <= 0:
            return DecisionTree(label=str(
                np.argmax(np.bincount(labels_df['cat'].values.astype(int)))),
                                value=None,
                                data=features_df)

        # Map candidate split node -> information gain.
        info_gains = self.calculate_info_gains(lines, features_df, labels_df)

        if len(info_gains) > 0:
            # Dict key with the maximum gain; NOTE(review): the key is
            # assumed to be a DecisionTree whose left/right children are
            # already populated with the split data (used below) — confirm
            # against calculate_info_gains.
            best_split_node = max(info_gains.items(),
                                  key=operator.itemgetter(1))[0]
            node = DecisionTree(label=best_split_node.label,
                                value=best_split_node.value,
                                data=best_split_node.data)
        else:
            # No split produced any gain: majority-class leaf.
            node = DecisionTree(label=str(
                np.argmax(np.bincount(labels_df['cat'].values.astype(int)))),
                                data=features_df,
                                value=None)
            return node

        ##########################################################################

        # We call recursively with the splitted data and set the bounds of feature f
        # for left child: set upper bounds to the value of the chosen line
        # for right child: set lower bounds to value of chosen line
        feature_mins_right = feature_mins.copy()
        feature_mins_right[node.label] = node.value
        feature_maxs_left = feature_maxs.copy()
        feature_maxs_left[node.label] = node.value
        # Partition regions by where they start relative to the threshold.
        regions_left = []
        regions_right = []
        for region in regions:
            if region[best_split_node.label][0] < best_split_node.value:
                regions_left.append(region)
            else:
                regions_right.append(region)
        # NOTE(review): `or` recurses as long as EITHER side has enough
        # regions (the random-split variant requires both via `and` on row
        # counts) — confirm this asymmetry is intentional.
        if len(regions_left) >= max_samples or len(
                regions_right) >= max_samples:
            node.left = self.regions_to_tree_improved(
                best_split_node.left.data.drop('cat', axis=1),
                best_split_node.left.data[['cat']], regions_left, features,
                feature_mins, feature_maxs_left)
            node.right = self.regions_to_tree_improved(
                best_split_node.right.data.drop('cat', axis=1),
                best_split_node.right.data[['cat']], regions_right, features,
                feature_mins_right, feature_maxs)
        else:
            # Too few regions: collapse this node into a majority leaf.
            node.label = str(
                np.argmax(np.bincount(labels_df['cat'].values.astype(int))))
            node.value = None

        return node