def _rec_build_random_tree(training_data_cut, rec_count):
    # increase recursion count by 1
    rec_count += 1

    # find the feature to split the data on that provides the greatest information
    # gain from a random sample; returns a tuple
    # ((feature_name, feature_index), (fc_has_vote, sc_has_vote), (fc_has_not_vote, sc_has_not_vote))
    feature_and_votes = _find_best_sampled_feature(training_data_cut)

    # if the training data cut falls below a preset threshold or the vote is
    # unanimous, build a Leaf node; otherwise split the data on the feature and
    # build a Tree node; also enforce a recursion limit
    fc_has_vote, sc_has_vote = feature_and_votes[1]
    fc_has_not_vote, sc_has_not_vote = feature_and_votes[2]

    # length of the training data cut
    cut_length = len(training_data_cut)

    # add 2 to the feature index to skip the RECORD and CLASS columns
    feature_index = feature_and_votes[0][1] + 2

    # build the left (has feature) branch
    if cut_length < _leaf_threshold or fc_has_vote == 0 or sc_has_vote == 0 or rec_count > _rec_limit:
        # build a Leaf based on the votes
        left_branch = DecisionTree.Leaf((fc_has_vote, sc_has_vote))
    else:
        # split out the rows that have the feature and recurse into the left branch
        has_feature_data = [row for row in training_data_cut if row[feature_index]]
        left_branch = _rec_build_random_tree(has_feature_data, rec_count)

    # build the right (has not feature) branch
    if cut_length < _leaf_threshold or fc_has_not_vote == 0 or sc_has_not_vote == 0 or rec_count > _rec_limit:
        # build a Leaf based on the votes
        right_branch = DecisionTree.Leaf((fc_has_not_vote, sc_has_not_vote))
    else:
        # split out the rows that lack the feature and recurse into the right branch
        has_not_feature_data = [row for row in training_data_cut if not row[feature_index]]
        right_branch = _rec_build_random_tree(has_not_feature_data, rec_count)

    # build a tree from the splitting feature's name and index and the two branches
    feature_name_index = feature_and_votes[0]
    random_tree = DecisionTree.Tree(feature_name_index, left_branch, right_branch)
    return random_tree
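# A minimal, self-contained sketch of what a helper like _find_best_sampled_feature
# might compute: score a random sample of boolean features by information gain and
# return the winner together with its class-vote counts. The column layout (RECORD,
# CLASS, then features), the helper names, and the two-class vote encoding are
# assumptions for illustration, not the module's actual code.
import math
import random

def _binary_entropy(first_count, second_count):
    # entropy of a two-class vote count; 0.0 when the vote is empty or unanimous
    total = first_count + second_count
    if total == 0 or first_count == 0 or second_count == 0:
        return 0.0
    p = first_count / total
    return -p * math.log2(p) - (1 - p) * math.log2(1 - p)

def find_best_sampled_feature(rows, feature_names, sample_size=3):
    # rows look like [record_id, class_label, feature_0, feature_1, ...]
    sampled = random.sample(range(len(feature_names)), min(sample_size, len(feature_names)))
    best = None
    for idx in sampled:
        col = idx + 2  # skip the RECORD and CLASS columns
        has_votes, has_not_votes = [0, 0], [0, 0]
        for row in rows:
            side = has_votes if row[col] else has_not_votes
            side[0 if row[1] == 0 else 1] += 1
        # lower weighted child entropy means higher information gain
        n = len(rows)
        weighted = (sum(has_votes) / n) * _binary_entropy(*has_votes) \
                 + (sum(has_not_votes) / n) * _binary_entropy(*has_not_votes)
        if best is None or weighted < best[0]:
            best = (weighted, ((feature_names[idx], idx), tuple(has_votes), tuple(has_not_votes)))
    return best[1]

# two boolean features; feature f1 separates the classes perfectly
rows = [[0, 0, True, False], [1, 1, True, True], [2, 1, False, True]]
print(find_best_sampled_feature(rows, ["f0", "f1"], sample_size=2))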
def dt_learn(dataset, attrs, parent_dist=None):
    # no examples left: fall back to the most common class of the parent's split
    if not dataset:
        return Dt.Leaf(parent_dist.get_most_common())
    dist = Distribution(dataset)
    # all examples share one class, or no attributes remain: emit a leaf
    if dist.is_leaf() or not attrs:
        return Dt.Leaf(dist.get_most_common())
    # otherwise split on the attribute with the greatest information gain
    attr = max_gain(dataset, dist, attrs)
    tree = Dt.Node(attr)
    for v in attr.domain:
        # examples whose value for attr is v; attr itself is removed from the
        # candidate set before recursing
        dv = [d for d in dataset if d.x[attr.index] == v]
        child_attrs = [a for a in attrs if a != attr]
        subtree = dt_learn(dv, child_attrs, dist)
        tree.add_child(subtree, v)
    return tree
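# dt_learn builds on node types from a Dt module that is not shown; the classes
# below are a hypothetical reconstruction of the minimum interface it uses
# (Dt.Leaf(label), Dt.Node(attr), node.add_child(subtree, value)), plus a
# classify() walk. Attributes are assumed to expose .index and .domain, as the
# body of dt_learn implies.
from collections import namedtuple

class Leaf:
    # terminal node holding a predicted class label
    def __init__(self, label):
        self.label = label

class Node:
    # internal node that branches on one attribute, one child per domain value
    def __init__(self, attr):
        self.attr = attr
        self.children = {}  # attribute value -> subtree

    def add_child(self, subtree, value):
        self.children[value] = subtree

def classify(tree, x):
    # follow the branch matching x's value for each node's attribute until a Leaf
    while isinstance(tree, Node):
        tree = tree.children[x[tree.attr.index]]
    return tree.label

Attr = namedtuple("Attr", ["name", "index", "domain"])
outlook = Attr("outlook", 0, ["sunny", "rain"])
tree = Node(outlook)
tree.add_child(Leaf("yes"), "sunny")
tree.add_child(Leaf("no"), "rain")
print(classify(tree, ["rain"]))  # -> no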
def decisionTreeLearning(examples, attributes, parents_examples=()):
    if len(examples) == 0:
        # return the most frequent classification among the parent examples
        return pluralityValue(parents_examples)
    elif allSameClass(examples):
        # all examples share the same class, so return the class of the first one
        return DecisionTree.Leaf(examples[0][dataset.target])
    elif len(attributes) == 0:
        # no attributes left to split on: return the most frequent classification
        return pluralityValue(examples)
    else:
        # ce selects which attribute-selection heuristic to use
        if ce == 0:
            mostImpAtt, threshold = chooseAttribute(attributes, examples)
        else:
            mostImpAtt, threshold = chooseAttribute2(attributes, examples)
        tree = DecisionTree.DecisionTree(mostImpAtt, threshold, dataset.attrnames[mostImpAtt])
        # partition the examples on the chosen threshold
        ExampleMinor, ExampleMajor = splittingOnThreshold(mostImpAtt, threshold, examples)
        # recurse on each partition and attach the resulting branches
        branchesLeft = decisionTreeLearning(ExampleMinor, removeAttr(mostImpAtt, attributes), examples)
        branchesRight = decisionTreeLearning(ExampleMajor, removeAttr(mostImpAtt, attributes), examples)
        tree.addLeft(threshold, branchesLeft)
        tree.addRight(threshold, branchesRight)
        return tree
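# chooseAttribute and splittingOnThreshold are not shown; the sketch below
# illustrates the standard way a numeric threshold is picked: try the midpoint
# between each pair of adjacent distinct values and keep the split with the
# lowest weighted entropy. best_threshold and label_entropy are hypothetical
# names for illustration only.
import math
from collections import Counter

def label_entropy(labels):
    # Shannon entropy of a list of class labels
    n = len(labels)
    if n == 0:
        return 0.0
    return -sum((c / n) * math.log2(c / n) for c in Counter(labels).values())

def best_threshold(values, labels):
    pairs = sorted(zip(values, labels))
    best_score, best_t = float("inf"), None
    for (v1, _), (v2, _) in zip(pairs, pairs[1:]):
        if v1 == v2:
            continue  # no threshold can fall between equal values
        t = (v1 + v2) / 2
        below = [l for v, l in pairs if v < t]
        above = [l for v, l in pairs if v >= t]
        n = len(pairs)
        score = len(below) / n * label_entropy(below) + len(above) / n * label_entropy(above)
        if score < best_score:
            best_score, best_t = score, t
    return best_t

print(best_threshold([1.0, 2.0, 3.0, 4.0], ["a", "a", "b", "b"]))  # -> 2.5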
def pluralityValue(examples):
    # for each classification count the occurrences, then choose the most popular
    best_count = 0
    popular = None
    for v in dataset.values:
        count = counting(dataset.target, v, examples)
        if count > best_count:
            best_count = count
            popular = v
    return DecisionTree.Leaf(popular)
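# For reference, the same plurality vote can be written with collections.Counter.
# This standalone sketch assumes the class labels have already been pulled out of
# the examples (the step counting() performs against dataset.target above);
# plurality_label is a hypothetical name for illustration.
from collections import Counter

def plurality_label(labels):
    # most_common(1) returns the single (label, count) pair with the highest count
    return Counter(labels).most_common(1)[0][0]

print(plurality_label(["yes", "no", "yes", "yes", "no"]))  # -> yes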