def create_tree(self, parent_subset=None, parent=None, parent_value=None,
                remaining=None):
    """
    Recursively create the decision tree for one subset of the data.

    The node built by this call becomes ``self.root`` when ``parent`` is
    None; otherwise it is attached to ``parent`` with ``add_child``.

    Args:
        parent_subset: the parent's subset of the data (default None,
            which means ``self.data`` is used unfiltered). When given, it
            is narrowed with ``self.filter_subset`` using ``parent.label``
            and ``parent_value``.
        parent: the parent of the node to be created (default None, which
            makes the new node the root).
        parent_value: the value of the parent's attribute that leads to
            the node being created (default None).
        remaining: the attributes still available for splitting (default
            None, which means ``self.attributes``).
    """
    # Identify the subset of the data this node is built from.
    if parent_subset is None:
        subset = self.data
    else:
        subset = self.filter_subset(parent_subset, parent.label, parent_value)

    if remaining is None:
        remaining = self.attributes

    use_parent = False
    counts = self.attr_counts(subset, self.dependent)
    if not counts:
        # Nothing matched this branch: fall back to the parent's subset so
        # the node can still be labelled (forces the "estimated" leaf below).
        subset = parent_subset
        counts = self.attr_counts(subset, self.dependent)
        use_parent = True

    if len(counts) == 1:
        # Only one dependent value is present: label the leaf with that
        # single value itself, not with the collection of keys.
        node = dtree.DTreeNode(
            label=next(iter(counts)),
            leaf=True,
            parent_value=parent_value
        )
    elif not remaining or use_parent:
        # No attribute left to split on (or no data for this branch):
        # estimate with the most common dependent value.
        most_common = max(counts, key=lambda k: counts[k])
        node = dtree.DTreeNode(
            label=most_common,
            leaf=True,
            parent_value=parent_value,
            properties={'estimated': True}
        )
    else:
        # Split on the attribute with the highest information gain.
        igains = [
            (attr, self.information_gain(subset, attr)) for attr in remaining
        ]
        max_attr = max(igains, key=lambda a: a[1])
        node = dtree.DTreeNode(
            max_attr[0],
            properties={'information_gain': max_attr[1]},
            parent_value=parent_value
        )

    if parent is None:
        # Root call: record the attribute order before storing the root.
        self.set_attributes(self.attributes)
        self.root = node
    else:
        parent.add_child(node)

    if not node.leaf:
        # Recurse once per value of the chosen attribute, with that
        # attribute removed from a copy of the remaining list.
        new_remaining = remaining[:]
        new_remaining.remove(node.label)
        for value in self.values[node.label]:
            self.create_tree(
                parent_subset=subset,
                parent=node,
                parent_value=value,
                remaining=new_remaining
            )
def create_tree(self, parent_subset=None, parent=None, parent_value=None,
                remaining=None):
    """
    Recursively create the decision tree with the specified subset and node
    positions. The node created by the top-level call is stored in
    self.root; nodes created by recursive calls are added to their parent.

    Args:
        parent_subset: the subset of the data of the parent to create
            decision nodes on (default None, which is interpreted as using
            the entire data set, self.data). This is further filtered down
            in the body of the function.
        parent: the parent of the node to be created (default None, which
            sets the root of the tree).
        parent_value: the name of the value connecting the parent node and
            the current node (default None).
        remaining: the attributes that may still be used for a decision
            (default None, which is interpreted as self.attributes).
    """
    # Identify the subset of the data used in the igain calculation
    if parent_subset is None:
        subset = self.data
    else:
        subset = self.filter_subset(parent_subset, parent.label, parent_value)

    if remaining is None:
        remaining = self.attributes

    use_parent = False
    counts = self.attr_counts(subset, self.dependent)
    if not counts:
        # Nothing has been found for the given subset. We label the node
        # based on the parent subset instead. This triggers the elif block
        # below
        subset = parent_subset
        counts = self.attr_counts(subset, self.dependent)
        use_parent = True

    # If every element in the subset belongs to one dependent group, label
    # with that group.
    if len(counts) == 1:
        # Only one value of self.dependent detected. next(iter(...)) is
        # used because dict views cannot be indexed on Python 3.
        node = dtree.DTreeNode(
            label=next(iter(counts)),
            leaf=True,
            parent_value=parent_value
        )
    elif not remaining or use_parent:
        # If there are no remaining attributes, label with the most
        # common attribute in the subset.
        most_common = max(counts, key=lambda k: counts[k])
        node = dtree.DTreeNode(
            label=most_common,
            leaf=True,
            parent_value=parent_value,
            properties={'estimated': True}
        )
    else:
        # Calculate max information gain
        igains = [
            (attr, self.information_gain(subset, attr)) for attr in remaining
        ]
        max_attr = max(igains, key=lambda a: a[1])

        # The gain is the second tuple element; the first is the attribute
        # name and must not be compared against zero.
        if max_attr[1] <= 0:
            # No positive information gain. Select the attribute with the
            # most distinct values instead
            distinct_values = [
                (attr, self.remaining_distinct_values(subset, attr))
                for attr in remaining
            ]
            max_attr = max(distinct_values, key=lambda a: a[1])
            properties = {'max_groups': max_attr[1]}
        else:
            # Use max_attr info gain
            properties = {'information_gain': max_attr[1]}

        # Create the decision tree node
        node = dtree.DTreeNode(
            max_attr[0],
            properties=properties,
            parent_value=parent_value
        )

    if parent is None:
        # Set known order of attributes for dtree decisions
        self.set_attributes(self.attributes)
        self.root = node
    else:
        parent.add_child(node)

    if not node.leaf:
        # Continue recursing. Remove the just used attribute from a copy
        # of the remaining list so sibling branches are unaffected.
        new_remaining = remaining[:]
        new_remaining.remove(node.label)
        for value in self.values[node.label]:
            self.create_tree(
                parent_subset=subset,
                parent=node,
                parent_value=value,
                remaining=new_remaining
            )