def normalizer(D, A):
    """Return the split information of attribute A over dataset D.

    This is the entropy of A's value distribution, used to normalize
    information gain when computing gain ratio.

    Args:
        D: Dataset being split.
        A: Attribute whose value distribution is measured.

    Returns:
        float: -sum(p_v * log2(p_v)) over the values v of A in D.
    """
    total = 0.0  # renamed from `sum` to avoid shadowing the builtin
    for v in D.get_attributeValues(A):
        # Subset of D where attribute A takes value v.
        D_j = Dataset(None, None, None, D, A, v)
        pr_A = D_j.get_dataSize() / D.get_dataSize()
        # NOTE(review): assumes every listed value occurs in D (pr_A > 0);
        # math.log(0, 2) would raise — confirm upstream guarantees this.
        total += pr_A * math.log(pr_A, 2)
    return -total
def entropy(D, A=None):
    """Return the entropy of dataset D, in bits.

    With A=None, computes the class entropy H(D) = -sum_c p_c * log2(p_c).
    With an attribute A, computes the expected (conditional) entropy of D
    after splitting on A: sum over values v of |D_v|/|D| * H(D_v).

    Args:
        D: Dataset.
        A: Optional attribute name to condition on.

    Returns:
        float: non-negative entropy value.
    """
    total = 0.0  # renamed from `sum` to avoid shadowing the builtin
    if A:
        for v_j in D.get_attributeValues(A):
            # Subset of D where attribute A takes value v_j.
            D_j = Dataset(None, None, None, D, A, v_j)
            total += D_j.get_dataSize() / D.get_dataSize() * entropy(D_j)
        return total
    for c_j in D.get_classes():
        p = D.pr_c(c_j)  # hoisted: original called pr_c three times per class
        if p:  # skip absent classes: 0 * log2(0) is defined as 0
            total += p * math.log(p, 2)
    return -total
def decision_tree_rec(self, D, A, T, threshold):
    """Recursively grow the decision tree under XML element T.

    Args:
        D: Dataset at this node.
        A: Set of attributes still available for splitting.
        T: Parent XML element to attach this node's subtree to.
        threshold: Minimum gain threshold passed to the splitting selector.
    """
    assert D.get_numClasses() > 0
    if D.get_numClasses() == 1:
        # Pure node: emit a leaf labeled with the single remaining class.
        classification = D.get_classes().pop()
        num, choice = D.get_num_choice_tuple(D.get_classAttribute(), classification)
        decision = ElementTree.SubElement(T, 'decision')
        decision.set('end', '1')
        decision.set('num', num)
        decision.set('choice', choice)
    elif len(A) == 0:
        # Attributes exhausted: leaf labeled with the most frequent class.
        classification = D.get_mostPluralClass()
        num, choice = D.get_num_choice_tuple(D.get_classAttribute(), classification)
        decision = ElementTree.SubElement(T, 'decision')
        decision.set('end', '1')
        decision.set('num', num)
        decision.set('choice', choice)
    else:
        # Mixed classes: pick the best attribute and recurse on each value.
        A_split = self.select_splitting_attribute(D, A, threshold)
        if A_split is None:
            # No attribute clears the threshold: plurality-class leaf.
            decision = ElementTree.SubElement(T, 'decision')
            decision.set('end', '1')
            decision.set('choice', D.get_mostPluralClass())
        else:
            node = ElementTree.SubElement(T, 'node')
            node.set('var', A_split)
            # BUGFIX: original compared `a != A` (element vs. the whole set,
            # always true), so the split attribute was never removed and
            # could be re-selected in descendants.
            AminusA_split = {a for a in A if a != A_split}
            for v in D.get_attributeValues(A_split):
                D_v = Dataset(None, None, None, D, A_split, v)
                if D_v.get_dataSize() > 0:
                    edge = ElementTree.SubElement(node, 'edge')
                    num, var = D_v.get_num_choice_tuple(A_split, v)
                    edge.set('var', var)
                    edge.set('num', num)
                    self.decision_tree_rec(D_v, AminusA_split, edge, threshold)
                # Empty partitions produce no edge.
def __init__(self, domain_filename, csv_filename, restrictions_filename=None, ratio=False):
    """Initialize the builder and load the training set.

    Args:
        domain_filename: Path to the domain/schema file.
        csv_filename: Path to the training-data CSV; its stem names the tree.
        restrictions_filename: Optional attribute-restrictions file; its
            presence marks the tree as restricted.
        ratio: If True, select splits by gain ratio instead of plain
            information gain.
    """
    self.tree = None
    # Tree name is the CSV file name without its extension. (The original
    # also did `csv_filename[:-4]`, immediately overwritten — removed.)
    self.tree_name = os.path.splitext(csv_filename)[0]
    self.restricted = bool(restrictions_filename)
    # Gain-ratio selection is opt-in; default is plain information gain.
    self.select_splitting_attribute = (
        select_splitting_attribute_ratio if ratio
        else select_splitting_attribute_default
    )
    self.trainingSet = Dataset(domain_filename, csv_filename, restrictions_filename)
class DecisionTreeBuilder(object):
    """Builds an ID3-style decision tree from a training Dataset and
    renders it as an XML ElementTree document."""

    def __init__(self, domain_filename, csv_filename, restrictions_filename=None, ratio=False):
        """Initialize the builder and load the training set.

        Args:
            domain_filename: Path to the domain/schema file.
            csv_filename: Path to the training-data CSV; its stem names the tree.
            restrictions_filename: Optional attribute-restrictions file; its
                presence marks the tree as restricted.
            ratio: If True, select splits by gain ratio instead of plain
                information gain.
        """
        self.tree = None
        # Tree name is the CSV file name without its extension. (The original
        # also did `csv_filename[:-4]`, immediately overwritten — removed.)
        self.tree_name = os.path.splitext(csv_filename)[0]
        self.restricted = bool(restrictions_filename)
        # Gain-ratio selection is opt-in; default is plain information gain.
        self.select_splitting_attribute = (
            select_splitting_attribute_ratio if ratio
            else select_splitting_attribute_default
        )
        self.trainingSet = Dataset(domain_filename, csv_filename, restrictions_filename)

    def decision_tree_rec(self, D, A, T, threshold):
        """Recursively grow the decision tree under XML element T.

        Args:
            D: Dataset at this node.
            A: Set of attributes still available for splitting.
            T: Parent XML element to attach this node's subtree to.
            threshold: Minimum gain threshold passed to the splitting selector.
        """
        assert D.get_numClasses() > 0
        if D.get_numClasses() == 1:
            # Pure node: emit a leaf labeled with the single remaining class.
            classification = D.get_classes().pop()
            num, choice = D.get_num_choice_tuple(D.get_classAttribute(), classification)
            decision = ElementTree.SubElement(T, 'decision')
            decision.set('end', '1')
            decision.set('num', num)
            decision.set('choice', choice)
        elif len(A) == 0:
            # Attributes exhausted: leaf labeled with the most frequent class.
            classification = D.get_mostPluralClass()
            num, choice = D.get_num_choice_tuple(D.get_classAttribute(), classification)
            decision = ElementTree.SubElement(T, 'decision')
            decision.set('end', '1')
            decision.set('num', num)
            decision.set('choice', choice)
        else:
            # Mixed classes: pick the best attribute and recurse on each value.
            A_split = self.select_splitting_attribute(D, A, threshold)
            if A_split is None:
                # No attribute clears the threshold: plurality-class leaf.
                decision = ElementTree.SubElement(T, 'decision')
                decision.set('end', '1')
                decision.set('choice', D.get_mostPluralClass())
            else:
                node = ElementTree.SubElement(T, 'node')
                node.set('var', A_split)
                # BUGFIX: original compared `a != A` (element vs. the whole
                # set, always true), so the split attribute was never removed
                # and could be re-selected in descendants.
                AminusA_split = {a for a in A if a != A_split}
                for v in D.get_attributeValues(A_split):
                    D_v = Dataset(None, None, None, D, A_split, v)
                    if D_v.get_dataSize() > 0:
                        edge = ElementTree.SubElement(node, 'edge')
                        num, var = D_v.get_num_choice_tuple(A_split, v)
                        edge.set('var', var)
                        edge.set('num', num)
                        self.decision_tree_rec(D_v, AminusA_split, edge, threshold)
                    # Empty partitions produce no edge.

    def build_tree(self, threshold):
        """Build the tree from the training set.

        Args:
            threshold: Minimum gain threshold for splitting.

        Returns:
            The root XML Element of the built tree.
        """
        self.tree = Element('Tree')
        self.tree.set('name', self.tree_name)
        allAttributes = self.trainingSet.get_attributes()
        self.decision_tree_rec(self.trainingSet, allAttributes, self.tree, threshold)
        return self.tree

    def get_tree(self):
        """Return the root Element of the built tree (None before build_tree)."""
        return self.tree

    def get_xml(self, indent=' '):
        """Return the tree serialized as a pretty-printed XML string."""
        return minidom.parseString(
            ElementTree.tostring(self.tree)).toprettyxml(indent=indent)

    def print_tree(self, file=sys.stdout, indent=' '):
        """Print the pretty-printed XML to `file` and return the XML string."""
        xml_str = self.get_xml(indent)
        print(xml_str, file=file)
        return xml_str

    def save_tree(self, file=None, indent=' '):
        """Write the tree's XML to `file`, or to '<tree_name>[_restricted].xml'.

        Returns:
            The XML string that was written.
        """
        if file:
            return self.print_tree(file, indent)
        xml_filename = self.tree_name
        if self.restricted:
            xml_filename += '_restricted'
        xml_filename += '.xml'
        # BUGFIX: original opened `self.tree_name + '.xml'`, discarding the
        # computed xml_filename and losing the '_restricted' suffix.
        with open(xml_filename, 'w') as save_file:
            xml_str = self.print_tree(save_file, indent)
        return xml_str