def _create_tree(self, instances, candidate_attribute_indexes, target_attribute_index=0, default_class=None): class_labels_and_counts = Counter( [instance[target_attribute_index] for instance in instances]) if not instances or not candidate_attribute_indexes: return default_class elif len(class_labels_and_counts) == 1: class_label = class_labels_and_counts.most_common(1)[0][0] return class_label else: default_class = simple_ml.majority_value(instances, target_attribute_index) best_index = simple_ml.choose_best_attribute_index( instances, candidate_attribute_indexes, target_attribute_index) tree = {best_index: {}} partitions = simple_ml.split_instances(instances, best_index) remaining_candidate_attribute_indexes = [ i for i in candidate_attribute_indexes if i != best_index ] for attribute_value in partitions: subtree = self._create_tree( partitions[attribute_value], remaining_candidate_attribute_indexes, target_attribute_index, default_class) tree[best_index][attribute_value] = subtree return tree
def _create_tree(self, instances, candidate_attribute_indexes, target_attribute_index=0, default_class=None, trace=0): class_labels_and_counts = Counter( [instance[target_attribute_index] for instance in instances]) # If the dataset is empty or the candidate attributes list is empty, # return the default class label if not instances or not candidate_attribute_indexes: if trace: print('{}Using default class {}'.format( '< ' * trace, default_class)) return default_class # If all the instances have the same class label, return that class label elif len(class_labels_and_counts) == 1: class_label = class_labels_and_counts.most_common(1)[0][0] if trace: print('{}All {} instances have label {}'.format( '< ' * trace, len(instances), class_label)) return class_label # Otherwise, create a new subtree and add it to the tree else: default_class = majority_value(instances, target_attribute_index) # Choose the next best attribute index to best classify the instances best_index = choose_best_attribute_index( instances, candidate_attribute_indexes, target_attribute_index) if trace: print('{}Creating tree node for attribute index {}'.format( '> ' * trace, best_index)) # Create a new decision tree node with the best attribute index # and an empty dictionary object (for now) tree = {best_index: {}} # Create a new decision tree sub-node (branch) # for each of the values in the best attribute field partitions = split_instances(instances, best_index) # Remove that attribute from the set of candidates for further splits remaining_candidate_attribute_indexes = [ i for i in candidate_attribute_indexes if i != best_index ] for attribute_value in partitions: if trace: print('{}Creating subtree for value {} ({}, {}, {}, {})'. 
format('> ' * trace, attribute_value, len(partitions[attribute_value]), len(remaining_candidate_attribute_indexes), target_attribute_index, default_class)) # Create a subtree for each value of the the best attribute subtree = self._create_tree( partitions[attribute_value], remaining_candidate_attribute_indexes, target_attribute_index, default_class) # Add the new subtree to the empty dictionary object # in the new tree/node created above tree[best_index][attribute_value] = subtree return tree
def _create(self, instances, candidate_attribute_indexes, target_attribute_index=0, default_class=None, trace=0): ''' Returns a new decision tree by recursively selecting and splitting instances based on the highest information_gain of the candidate_attribute_indexes. The class label is found in target_attribute_index. The default class is the majority value for that branch of the tree. A positive trace value will generate trace information with increasing levels of indentation. Derived from the simplified ID3 algorithm presented in Building Decision Trees in Python by Christopher Roach, http://www.onlamp.com/pub/a/python/2006/02/09/ai_decision_trees.html?page=3 ''' instances = instances[:] class_labels_and_counts = Counter([instance[target_attribute_index] for instance in instances]) # If the dataset is empty or the candidate attributes list is empty, return the default value. if not instances or not candidate_attribute_indexes: if trace: print('{}Using default class {}'.format('< ' * trace, default_class)) return default_class # If all the instances have the same class label, return that class label elif len(class_labels_and_counts) == 1: class_label = class_labels_and_counts.most_common(1)[0][0] if trace: print('{}All {} instances have label {}'.format('< ' * trace, len(instances), class_label)) return class_label else: default_class = simple_ml.majority_value(instances, target_attribute_index) # Choose the next best attribute index to best classify the instances best_index = simple_ml.choose_best_attribute_index(instances, candidate_attribute_indexes, target_attribute_index) if trace: print('{}Creating tree node for attribute index {}'.format('> ' * trace, best_index)) # Create a new decision tree node with the best attribute index and an empty dictionary object (for now) tree = {best_index:{}} # Create a new decision tree sub-node (branch) for each of the values in the best attribute field partitions = simple_ml.split_instances(instances, best_index) # Remove 
that attribute from the set of candidates for further splits remaining_candidate_attribute_indexes = [i for i in candidate_attribute_indexes if i != best_index] for attribute_value in partitions: if trace: print('{}Creating subtree for value {} ({}, {}, {}, {})'.format( '> ' * trace, attribute_value, len(partitions[attribute_value]), len(remaining_candidate_attribute_indexes), target_attribute_index, default_class)) # Create a subtree for each value of the the best attribute subtree = self._create( partitions[attribute_value], remaining_candidate_attribute_indexes, target_attribute_index, default_class, trace + 1 if trace else 0) # Add the new subtree to the empty dictionary object in the new tree/node we just created tree[best_index][attribute_value] = subtree return tree
def _create_tree(self, instances, candidate_attribute_indexes, target_attribute_index=0, default_class=None, trace=0): class_labels_and_counts = Counter([instance[target_attribute_index] for instance in instances]) # If the dataset is empty or the candidate attributes list is empty, # return the default class label if not instances or not candidate_attribute_indexes: if trace: print('{}Using default class {}'.format('< ' * trace, default_class)) return default_class # If all the instances have the same class label, return that class label elif len(class_labels_and_counts) == 1: class_label = class_labels_and_counts.most_common(1)[0][0] if trace: print('{}All {} instances have label {}'.format( '< ' * trace, len(instances), class_label)) return class_label # Otherwise, create a new subtree and add it to the tree else: default_class = majority_value(instances, target_attribute_index) # Choose the next best attribute index to best classify the instances best_index = choose_best_attribute_index(instances, candidate_attribute_indexes, target_attribute_index) if trace: print('{}Creating tree node for attribute index {}'.format( '> ' * trace, best_index)) # Create a new decision tree node with the best attribute index # and an empty dictionary object (for now) tree = {best_index:{}} # Create a new decision tree sub-node (branch) # for each of the values in the best attribute field partitions = split_instances(instances, best_index) # Remove that attribute from the set of candidates for further splits remaining_candidate_attribute_indexes = [i for i in candidate_attribute_indexes if i != best_index] for attribute_value in partitions: if trace: print('{}Creating subtree for value {} ({}, {}, {}, {})'.format( '> ' * trace, attribute_value, len(partitions[attribute_value]), len(remaining_candidate_attribute_indexes), target_attribute_index, default_class)) # Create a subtree for each value of the the best attribute subtree = self._create_tree( partitions[attribute_value], 
remaining_candidate_attribute_indexes, target_attribute_index, default_class) # Add the new subtree to the empty dictionary object # in the new tree/node created above tree[best_index][attribute_value] = subtree return tree
def _create(self, instances, candidate_attribute_indexes, target_attribute_index=0, default_class=None, trace=0, max_height=None, min_support=0, epsilon=1.0, parent_id=-1, parent_value=None, parent_node=None):
    '''
    Returns a new decision tree by recursively selecting and splitting
    instances based on the highest information_gain of the
    candidate_attribute_indexes, with differentially-private (Laplace) noise
    folded into both the per-label counts and the attribute-selection scores.

    The class label is found in target_attribute_index.
    The default class is the majority value for that branch of the tree.
    A positive trace value generates trace information with increasing levels
    of indentation and doubles as the current tree depth (assume trace is
    non-zero).
    max_height is the maximum number of levels the tree can have.
    min_support is the minimum number of (noisy) records needed to make a
    split; otherwise the node becomes a leaf.
    epsilon is the per-query privacy budget; None disables noise entirely.
    parent_id, parent_value and parent_node link the node being created back
    to the branch of its parent.

    Returns a node.Node (leaf or internal). Every child node created below
    this one is also appended to self._node_list.

    Derived from the simplified ID3 algorithm presented in Building Decision
    Trees in Python by Christopher Roach,
    http://www.onlamp.com/pub/a/python/2006/02/09/ai_decision_trees.html?page=3
    '''
    # Work on a shallow copy so the caller's list is not mutated.
    instances = instances[:]
    # Every node (leaf or internal) gets a unique, monotonically rising id.
    self._id_tracker += 1

    class_labels_and_counts = dict(
        Counter(
            [instance[target_attribute_index] for instance in instances]))
    if epsilon is not None:
        # Perturb the per-label counts with Laplace noise; a counting query
        # has sensitivity 1.
        class_labels_and_counts = simple_ml.add_laplace_noise(
            class_labels_and_counts, 1., epsilon)  # sensitivity = 1

    # Derive the partition size from the (noisy) counts rather than
    # len(instances), so every stopping decision below stays private.
    partitionSize = 0
    for k, v in class_labels_and_counts.items():
        partitionSize += v
    # Majority label according to the (noisy) counts.
    class_label = Counter(class_labels_and_counts).most_common(1)[0][0]

    # Leaf: the (noisy) partition is empty -- fall back to the default class.
    if partitionSize == 0:
        return node.Node(self._id_tracker, trace, None, "_Leaf", parent_id,
                         parent_value, parent_node, {default_class: 1},
                         children=None)
    # Leaf: no candidate attributes left to split on.
    elif not candidate_attribute_indexes:
        return node.Node(self._id_tracker, trace, None, "_Leaf", parent_id,
                         parent_value, parent_node, class_labels_and_counts,
                         children=None)
    # Leaf: all the records have the same class label.
    elif len(Counter(class_labels_and_counts)) == 1:
        return node.Node(self._id_tracker, trace, None, "_Leaf", parent_id,
                         parent_value, parent_node, class_labels_and_counts,
                         children=None)
    # Leaf: not enough (noisy) records to justify another split.
    elif partitionSize < min_support:
        return node.Node(self._id_tracker, trace, None, "_Leaf", parent_id,
                         parent_value, parent_node, class_labels_and_counts,
                         children=None)
    # Leaf: the maximum tree depth has been reached (trace is the depth).
    # NOTE(review): `trace >= max_height` raises TypeError when max_height is
    # left at its default of None -- callers apparently always supply
    # max_height; confirm.
    elif trace >= max_height:
        return node.Node(self._id_tracker, trace, None, "_Leaf", parent_id,
                         parent_value, parent_node, class_labels_and_counts,
                         children=None)

    # MAKE MORE SPLITS
    default_class = class_label

    # Worst-case sensitivity of the splitting criterion for this partition,
    # used to scale the noise inside choose_best_attribute_index.
    worst_case_sens = 1. - ((partitionSize / (partitionSize + 1))**2 +
                            (1 / (partitionSize + 1))**2)
    if trace == 1:  # if root node
        # Exclude attributes already used as the root of a previous tree
        # (presumably when growing an ensemble of trees -- verify).
        candi = [
            i for i in candidate_attribute_indexes
            if i not in self._previous_roots
        ]
        best_index = simple_ml.choose_best_attribute_index(
            instances,
            candi,
            target_attribute_index,
            epsilon=epsilon,
            sensitivity=worst_case_sens)
    else:
        best_index = simple_ml.choose_best_attribute_index(
            instances,
            candidate_attribute_indexes,
            target_attribute_index,
            epsilon=epsilon,
            sensitivity=worst_case_sens)

    # Internal node for the chosen attribute; its children list is filled in
    # by the loop below.
    current_node = node.Node(self._id_tracker, trace, best_index,
                             self._attribute_names[best_index], parent_id,
                             parent_value, parent_node,
                             class_labels_and_counts, children=[])
    # One branch per observed value of the chosen attribute.
    partitions = simple_ml.split_instances(instances, best_index)
    # Remove that attribute from the set of candidates for further splits.
    remaining_candidate_attribute_indexes = [
        i for i in candidate_attribute_indexes if i != best_index
    ]
    ''' For every value in the chosen attribute, make a subtree '''
    # Capture depth+1 and this node's id before recursing (self._id_tracker
    # changes as children are created).
    tracecopy = trace + 1
    curr_id = self._id_tracker
    for attribute_value in partitions:
        # Create a subtree for each value of the best attribute.
        subtree = self._create(partitions[attribute_value],
                               remaining_candidate_attribute_indexes,
                               target_attribute_index, default_class,
                               tracecopy if trace else 0, max_height,
                               min_support, epsilon, curr_id, attribute_value,
                               current_node)
        # Link the subtree under this node and register it globally.
        current_node.add_child(subtree)
        self._node_list.append(subtree)
    return current_node
def _create(self, instances, candidate_attribute_indexes, target_attribute_index=0, default_class=None, trace=0): ''' Returns a new decision tree by recursively selecting and splitting instances based on the highest information_gain of the candidate_attribute_indexes. The class label is found in target_attribute_index. The default class is the majority value for that branch of the tree. A positive trace value will generate trace information with increasing levels of indentation. Derived from the simplified ID3 algorithm presented in Building Decision Trees in Python by Christopher Roach, http://www.onlamp.com/pub/a/python/2006/02/09/ai_decision_trees.html?page=3 ''' instances = instances[:] class_labels_and_counts = Counter( [instance[target_attribute_index] for instance in instances]) # If the dataset is empty or the candidate attributes list is empty, return the default value. if not instances or not candidate_attribute_indexes: if trace: print '{}Using default class {}'.format( '< ' * trace, default_class) return default_class # If all the instances have the same class label, return that class label elif len(class_labels_and_counts) == 1: class_label = class_labels_and_counts.most_common(1)[0][0] if trace: print '{}All {} instances have label {}'.format( '< ' * trace, len(instances), class_label) return class_label else: default_class = simple_ml.majority_value(instances, target_attribute_index) # Choose the next best attribute index to best classify the instances best_index = simple_ml.choose_best_attribute_index( instances, candidate_attribute_indexes, target_attribute_index) if trace: print '{}Creating tree node for attribute index {}'.format( '> ' * trace, best_index) # Create a new decision tree node with the best attribute index and an empty dictionary object (for now) tree = {best_index: {}} # Create a new decision tree sub-node (branch) for each of the values in the best attribute field partitions = simple_ml.split_instances(instances, best_index) # 
Remove that attribute from the set of candidates for further splits remaining_candidate_attribute_indexes = [ i for i in candidate_attribute_indexes if i != best_index ] for attribute_value in partitions: if trace: print '{}Creating subtree for value {} ({}, {}, {}, {})'.format( '> ' * trace, attribute_value, len(partitions[attribute_value]), len(remaining_candidate_attribute_indexes), target_attribute_index, default_class) # Create a subtree for each value of the the best attribute subtree = self._create(partitions[attribute_value], remaining_candidate_attribute_indexes, target_attribute_index, default_class, trace + 1 if trace else 0) # Add the new subtree to the empty dictionary object in the new tree/node we just created tree[best_index][attribute_value] = subtree return tree