Пример #1
0
    def create_tree(self,
                    parent_subset=None,
                    parent=None,
                    parent_value=None,
                    remaining=None):
        """Recursively build the decision tree, one node per call.

        A leaf is created when only one value of ``self.dependent`` is
        counted in the subset, when no candidate attributes remain, or when
        nothing is counted for the filtered subset (the parent's subset is
        used for labelling instead). Otherwise the node splits on the
        attribute with the highest information gain and the method recurses
        once for every entry of ``self.values[node.label]``.

        Args:
            parent_subset: the data the parent node was built from. ``None``
                (the default) marks the root call, in which case
                ``self.data`` is used unfiltered; otherwise the data is
                narrowed with ``self.filter_subset`` using the parent's
                label and ``parent_value``.
            parent: the node the new node is attached to through
                ``add_child``. ``None`` (the default) makes the new node
                ``self.root``.
            parent_value: the value on the branch connecting the parent to
                this node (default ``None``).
            remaining: attributes still available for splitting. ``None``
                (the default) means all of ``self.attributes``.

        Returns:
            None. The result is stored via ``self.root`` / ``add_child``.
        """
        if parent_subset is None:
            subset = self.data
        else:
            subset = self.filter_subset(parent_subset, parent.label,
                                        parent_value)

        if remaining is None:
            remaining = self.attributes

        counts = self.attr_counts(subset, self.dependent)
        use_parent = not counts
        if use_parent:
            # Nothing was counted for this branch: label the node from the
            # parent's subset instead, which forces a leaf below.
            subset = parent_subset
            counts = self.attr_counts(subset, self.dependent)

        if len(counts) == 1:
            # Exactly one dependent value: use that value itself as the
            # label. ``counts.keys()`` would pass the whole keys collection,
            # and indexing it is not possible on Python 3.
            node = dtree.DTreeNode(label=next(iter(counts)),
                                   leaf=True,
                                   parent_value=parent_value)
        elif not remaining or use_parent:
            # Cannot split any further: fall back to the most frequent
            # dependent value and flag the leaf as an estimate.
            most_common = max(counts, key=counts.get)
            node = dtree.DTreeNode(label=most_common,
                                   leaf=True,
                                   parent_value=parent_value,
                                   properties={'estimated': True})
        else:
            igains = [(attr, self.information_gain(subset, attr))
                      for attr in remaining]
            best_attr, best_gain = max(igains, key=lambda pair: pair[1])

            node = dtree.DTreeNode(
                best_attr,
                properties={'information_gain': best_gain},
                parent_value=parent_value)

        if parent is None:
            self.set_attributes(self.attributes)
            self.root = node
        else:
            parent.add_child(node)

        if not node.leaf:
            # Work on a copy so sibling branches still see the full list.
            new_remaining = list(remaining)
            new_remaining.remove(node.label)
            for value in self.values[node.label]:
                self.create_tree(parent_subset=subset,
                                 parent=node,
                                 parent_value=value,
                                 remaining=new_remaining)
Пример #2
0
    def create_tree(self, parent_subset=None, parent=None, parent_value=None,
                    remaining=None):
        """
        Recursively create the decision tree with the specified subset
        and node positions. The root of the created tree is stored in
        self.root; every other node is attached to its parent.

        Args:
            parent_subset: the subset of the data of the parent
                to create decision nodes on (default None, which is
                interpreted as using the entire data set, self.data). This
                is further filtered down in the body of the function.
            parent: the parent of the node to be created (default None, which
                sets the root of the tree).
            parent_value: the name of the value connecting the parent node and
                the current node (default None).
            remaining: the attributes that may still be used for a split
                (default None, which is interpreted as self.attributes).

        Returns:
            None.
        """

        # Identify the subset of the data used in the igain calculation
        if parent_subset is None:
            subset = self.data
        else:
            subset = self.filter_subset(parent_subset,
                                        parent.label,
                                        parent_value)

        if remaining is None:
            remaining = self.attributes

        counts = self.attr_counts(subset, self.dependent)
        use_parent = not counts
        if use_parent:
            # Nothing has been found for the given subset. We label the node
            # based on the parent subset instead. This triggers the elif block
            # below (unless the parent subset holds a single group).
            subset = parent_subset
            counts = self.attr_counts(subset, self.dependent)

        # If every element in the subset belongs to one dependent group, label
        # with that group.
        if len(counts) == 1:  # Only one value of self.dependent detected
            # next(iter(...)) works on Python 2 and 3; ``counts.keys()[0]``
            # raises TypeError on Python 3 because dict views cannot be
            # indexed.
            node = dtree.DTreeNode(
                label=next(iter(counts)),
                leaf=True,
                parent_value=parent_value
            )
        elif not remaining or use_parent:
            # If there are no remaining attributes, label with the most
            # common dependent value in the subset.
            most_common = max(counts, key=counts.get)
            node = dtree.DTreeNode(
                label=most_common,
                leaf=True,
                parent_value=parent_value,
                properties={'estimated': True}
            )
        else:
            # Calculate max information gain
            igains = [(attr, self.information_gain(subset, attr))
                      for attr in remaining]
            max_attr = max(igains, key=lambda a: a[1])

            # Test the gain (index 1), not the attribute name (index 0).
            if max_attr[1] <= 0:
                # No positive information gain. Select the attribute with
                # the most distinct values left in the subset instead.
                distinct_values = [
                    (attr, self.remaining_distinct_values(subset, attr))
                    for attr in remaining
                ]
                max_attr = max(distinct_values, key=lambda a: a[1])
                properties = {'max_groups': max_attr[1]}
            else:
                # Use max_attr info gain
                properties = {'information_gain': max_attr[1]}

            # Create the decision tree node
            node = dtree.DTreeNode(
                max_attr[0],
                properties=properties,
                parent_value=parent_value
            )

        if parent is None:
            # Set known order of attributes for dtree decisions
            self.set_attributes(self.attributes)
            self.root = node
        else:
            parent.add_child(node)

        if not node.leaf:  # Continue recursing
            # Remove the just used attribute from a copy of the remaining
            # list so sibling branches are unaffected.
            new_remaining = list(remaining)
            new_remaining.remove(node.label)
            for value in self.values[node.label]:
                self.create_tree(
                    parent_subset=subset,
                    parent=node,
                    parent_value=value,
                    remaining=new_remaining
                )