def _create_tree(self, instances, candidate_attribute_indexes, target_attribute_index=0, default_class=None):
    """Recursively build an ID3-style decision tree.

    Splits `instances` on the attribute with the highest information gain
    (as judged by simple_ml.choose_best_attribute_index) and recurses into
    each partition.

    Args:
        instances: list of attribute-value rows.
        candidate_attribute_indexes: attribute indexes still eligible for splitting.
        target_attribute_index: index of the class label within each row.
        default_class: label to return for an empty branch (majority label
            of the parent partition).

    Returns:
        Either a class-label leaf, or a nested dict of the form
        {attribute_index: {attribute_value: subtree}}.
    """
    label_counts = Counter(row[target_attribute_index] for row in instances)

    # Nothing to split (no data, or no attributes left): fall back to the default.
    if not instances or not candidate_attribute_indexes:
        return default_class

    # Pure partition: every instance carries the same label, so return it.
    if len(label_counts) == 1:
        return label_counts.most_common(1)[0][0]

    # Mixed partition: pick the most informative attribute and recurse.
    default_class = simple_ml.majority_value(instances, target_attribute_index)
    best_index = simple_ml.choose_best_attribute_index(
        instances, candidate_attribute_indexes, target_attribute_index)
    node = {best_index: {}}
    partitions = simple_ml.split_instances(instances, best_index)
    # The chosen attribute may not be reused further down this branch.
    remaining_indexes = [idx for idx in candidate_attribute_indexes if idx != best_index]
    for value, subset in partitions.items():
        node[best_index][value] = self._create_tree(
            subset, remaining_indexes, target_attribute_index, default_class)
    return node
def _create_tree(self, instances, candidate_attribute_indexes, target_attribute_index=0, default_class=None, trace=0):
    """Recursively build an ID3-style decision tree.

    Args:
        instances: list of attribute-value rows.
        candidate_attribute_indexes: attribute indexes still eligible for splitting.
        target_attribute_index: index of the class label within each row.
        default_class: label returned for an empty branch (majority label
            of the parent partition).
        trace: indentation depth for debug output; 0 disables tracing.

    Returns:
        Either a class-label leaf, or a nested dict of the form
        {attribute_index: {attribute_value: subtree}}.
    """
    class_labels_and_counts = Counter(
        [instance[target_attribute_index] for instance in instances])
    # If the dataset is empty or the candidate attributes list is empty,
    # return the default class label
    if not instances or not candidate_attribute_indexes:
        if trace:
            print('{}Using default class {}'.format(
                '< ' * trace, default_class))
        return default_class
    # If all the instances have the same class label, return that class label
    elif len(class_labels_and_counts) == 1:
        class_label = class_labels_and_counts.most_common(1)[0][0]
        if trace:
            print('{}All {} instances have label {}'.format(
                '< ' * trace, len(instances), class_label))
        return class_label
    # Otherwise, create a new subtree and add it to the tree
    else:
        default_class = majority_value(instances, target_attribute_index)
        # Choose the next best attribute index to best classify the instances
        best_index = choose_best_attribute_index(
            instances, candidate_attribute_indexes, target_attribute_index)
        if trace:
            print('{}Creating tree node for attribute index {}'.format(
                '> ' * trace, best_index))
        # Create a new decision tree node with the best attribute index
        # and an empty dictionary object (for now)
        tree = {best_index: {}}
        # Create a new decision tree sub-node (branch)
        # for each of the values in the best attribute field
        partitions = split_instances(instances, best_index)
        # Remove that attribute from the set of candidates for further splits
        remaining_candidate_attribute_indexes = [
            i for i in candidate_attribute_indexes if i != best_index
        ]
        for attribute_value in partitions:
            if trace:
                print('{}Creating subtree for value {} ({}, {}, {}, {})'.format(
                    '> ' * trace, attribute_value,
                    len(partitions[attribute_value]),
                    len(remaining_candidate_attribute_indexes),
                    target_attribute_index, default_class))
            # Create a subtree for each value of the best attribute.
            # BUG FIX: propagate trace (incremented when active) into the
            # recursive call; the original dropped it, so tracing silently
            # stopped after the first level, unlike the sibling _create
            # implementation which passes `trace + 1 if trace else 0`.
            subtree = self._create_tree(
                partitions[attribute_value],
                remaining_candidate_attribute_indexes,
                target_attribute_index,
                default_class,
                trace + 1 if trace else 0)
            # Add the new subtree to the empty dictionary object
            # in the new tree/node created above
            tree[best_index][attribute_value] = subtree
        return tree
def _create_tree(self, instances, candidate_attribute_indexes, target_attribute_index=0, default_class=None, trace=0):
    """Recursively build an ID3-style decision tree.

    Args:
        instances: list of attribute-value rows.
        candidate_attribute_indexes: attribute indexes still eligible for splitting.
        target_attribute_index: index of the class label within each row.
        default_class: label returned for an empty branch (majority label
            of the parent partition).
        trace: indentation depth for debug output; 0 disables tracing.

    Returns:
        Either a class-label leaf, or a nested dict of the form
        {attribute_index: {attribute_value: subtree}}.
    """
    class_labels_and_counts = Counter([instance[target_attribute_index]
                                       for instance in instances])
    # If the dataset is empty or the candidate attributes list is empty,
    # return the default class label
    if not instances or not candidate_attribute_indexes:
        if trace:
            print('{}Using default class {}'.format('< ' * trace, default_class))
        return default_class
    # If all the instances have the same class label, return that class label
    elif len(class_labels_and_counts) == 1:
        class_label = class_labels_and_counts.most_common(1)[0][0]
        if trace:
            print('{}All {} instances have label {}'.format(
                '< ' * trace, len(instances), class_label))
        return class_label
    # Otherwise, create a new subtree and add it to the tree
    else:
        default_class = majority_value(instances, target_attribute_index)
        # Choose the next best attribute index to best classify the instances
        best_index = choose_best_attribute_index(instances,
                                                 candidate_attribute_indexes,
                                                 target_attribute_index)
        if trace:
            print('{}Creating tree node for attribute index {}'.format(
                '> ' * trace, best_index))
        # Create a new decision tree node with the best attribute index
        # and an empty dictionary object (for now)
        tree = {best_index: {}}
        # Create a new decision tree sub-node (branch)
        # for each of the values in the best attribute field
        partitions = split_instances(instances, best_index)
        # Remove that attribute from the set of candidates for further splits
        remaining_candidate_attribute_indexes = [i for i in candidate_attribute_indexes
                                                 if i != best_index]
        for attribute_value in partitions:
            if trace:
                print('{}Creating subtree for value {} ({}, {}, {}, {})'.format(
                    '> ' * trace, attribute_value,
                    len(partitions[attribute_value]),
                    len(remaining_candidate_attribute_indexes),
                    target_attribute_index, default_class))
            # Create a subtree for each value of the best attribute.
            # BUG FIX: pass trace (incremented when active) into the
            # recursive call; the original omitted it, so trace output
            # stopped after one level, unlike the sibling _create
            # implementation which forwards `trace + 1 if trace else 0`.
            subtree = self._create_tree(
                partitions[attribute_value],
                remaining_candidate_attribute_indexes,
                target_attribute_index,
                default_class,
                trace + 1 if trace else 0)
            # Add the new subtree to the empty dictionary object
            # in the new tree/node created above
            tree[best_index][attribute_value] = subtree
        return tree
def _create(self, instances, candidate_attribute_indexes, target_attribute_index=0, default_class=None, trace=0):
    """Build a decision tree recursively.

    Chooses the attribute with the highest information gain among
    candidate_attribute_indexes, partitions the instances on its values,
    and recurses into each partition. The class label is read from
    target_attribute_index; default_class is the majority label of the
    parent branch and is returned for empty partitions. A positive trace
    value prints progress with increasing indentation.

    Derived from the simplified ID3 algorithm presented in
    Building Decision Trees in Python by Christopher Roach,
    http://www.onlamp.com/pub/a/python/2006/02/09/ai_decision_trees.html?page=3
    """
    instances = instances[:]  # defensive copy so the caller's list is untouched
    label_tally = Counter(row[target_attribute_index] for row in instances)

    # No data or no attributes left to split on: use the inherited default.
    if not instances or not candidate_attribute_indexes:
        if trace:
            print('{}Using default class {}'.format('< ' * trace, default_class))
        return default_class

    # Pure partition: every instance shares a single label, so return it.
    if len(label_tally) == 1:
        only_label = label_tally.most_common(1)[0][0]
        if trace:
            print('{}All {} instances have label {}'.format(
                '< ' * trace, len(instances), only_label))
        return only_label

    # Mixed partition: split on the most informative attribute and recurse.
    default_class = simple_ml.majority_value(instances, target_attribute_index)
    best_index = simple_ml.choose_best_attribute_index(
        instances, candidate_attribute_indexes, target_attribute_index)
    if trace:
        print('{}Creating tree node for attribute index {}'.format(
            '> ' * trace, best_index))

    # New node: best attribute index mapped to a branch dict, filled below.
    node = {best_index: {}}
    partitions = simple_ml.split_instances(instances, best_index)
    # The chosen attribute may not be reused further down this branch.
    leftover_indexes = [idx for idx in candidate_attribute_indexes if idx != best_index]

    child_trace = trace + 1 if trace else 0
    for value, subset in partitions.items():
        if trace:
            print('{}Creating subtree for value {} ({}, {}, {}, {})'.format(
                '> ' * trace, value, len(subset), len(leftover_indexes),
                target_attribute_index, default_class))
        # One subtree per observed value of the best attribute.
        node[best_index][value] = self._create(
            subset, leftover_indexes, target_attribute_index,
            default_class, child_trace)
    return node
def _create(self, instances, candidate_attribute_indexes, target_attribute_index=0, default_class=None, trace=0):
    '''
    Returns a new decision tree by recursively selecting and splitting instances
    based on the highest information_gain of the candidate_attribute_indexes.
    The class label is found in target_attribute_index.
    The default class is the majority value for that branch of the tree.
    A positive trace value will generate trace information
    with increasing levels of indentation.

    Derived from the simplified ID3 algorithm presented in
    Building Decision Trees in Python by Christopher Roach,
    http://www.onlamp.com/pub/a/python/2006/02/09/ai_decision_trees.html?page=3
    '''
    # BUG FIX: this variant used Python 2 `print` *statements*, which are a
    # SyntaxError under Python 3 and inconsistent with the other
    # implementations in this file; they are now print() function calls.
    instances = instances[:]  # defensive copy; the caller's list stays intact
    class_labels_and_counts = Counter(
        [instance[target_attribute_index] for instance in instances])
    # If the dataset is empty or the candidate attributes list is empty, return the default value.
    if not instances or not candidate_attribute_indexes:
        if trace:
            print('{}Using default class {}'.format(
                '< ' * trace, default_class))
        return default_class
    # If all the instances have the same class label, return that class label
    elif len(class_labels_and_counts) == 1:
        class_label = class_labels_and_counts.most_common(1)[0][0]
        if trace:
            print('{}All {} instances have label {}'.format(
                '< ' * trace, len(instances), class_label))
        return class_label
    else:
        default_class = simple_ml.majority_value(instances, target_attribute_index)
        # Choose the next best attribute index to best classify the instances
        best_index = simple_ml.choose_best_attribute_index(
            instances, candidate_attribute_indexes, target_attribute_index)
        if trace:
            print('{}Creating tree node for attribute index {}'.format(
                '> ' * trace, best_index))
        # Create a new decision tree node with the best attribute index and an empty dictionary object (for now)
        tree = {best_index: {}}
        # Create a new decision tree sub-node (branch) for each of the values in the best attribute field
        partitions = simple_ml.split_instances(instances, best_index)
        # Remove that attribute from the set of candidates for further splits
        remaining_candidate_attribute_indexes = [
            i for i in candidate_attribute_indexes if i != best_index
        ]
        for attribute_value in partitions:
            if trace:
                print('{}Creating subtree for value {} ({}, {}, {}, {})'.format(
                    '> ' * trace, attribute_value,
                    len(partitions[attribute_value]),
                    len(remaining_candidate_attribute_indexes),
                    target_attribute_index, default_class))
            # Create a subtree for each value of the best attribute
            subtree = self._create(partitions[attribute_value],
                                   remaining_candidate_attribute_indexes,
                                   target_attribute_index,
                                   default_class,
                                   trace + 1 if trace else 0)
            # Add the new subtree to the empty dictionary object in the new tree/node we just created
            tree[best_index][attribute_value] = subtree
        return tree