def entropy(table, field=-1, num_categories=2):
    """Total entropy of the categorizations assigned in ``table``.

    Computes ``-sum(p(x) * log(p(x), num_categories))`` over the unique
    values of ``field``, which measures how mixed the categorizations are
    (0.0 when every row has the same category).

    Args:
        table: sequence of rows, or a queryset-like object exposing
            ``.count()``.
        field: column index / dict key holding the category label
            (default: last column).
        num_categories: base of the logarithm (2 yields bits).

    Returns:
        Non-negative float entropy; 0.0 for an empty table.
    """
    from math import log

    counts = count_unique(table, field)
    # Prefer .count() (queryset-like tables); sequences raise
    # AttributeError, so fall back to len().  The original bare
    # ``except:`` would have masked unrelated errors.
    try:
        N = table.count()
    except (AttributeError, TypeError):
        N = len(table)
    # Named ``total`` rather than shadowing the function name ``entropy``.
    total = 0.0
    for k in counts:
        p = float(counts[k]) / N
        total -= p * log(p, num_categories)
    return total
def gini_impurity(table, field=-1):
    """Gini impurity of the categorizations assigned in ``table``.

    Probability in [0, 1] that a randomly chosen row would be labeled
    incorrectly if it were labeled at random according to the observed
    category distribution at ``field``.

    Args:
        table: sequence of rows, or a queryset-like object exposing
            ``.count()``.
        field: column index / dict key holding the category label
            (default: last column).

    Returns:
        float impurity in [0, 1]; 0.0 for an empty or single-category
        table.
    """
    # Prefer .count() (queryset-like tables); sequences raise
    # AttributeError, so fall back to len().  The original bare
    # ``except:`` would have masked unrelated errors.
    try:
        N = table.count()
    except (AttributeError, TypeError):
        N = len(table)
    counts = count_unique(table, field)
    # Sum p1 * p2 over all ordered pairs of *distinct* categories.
    impurity = 0.0
    for k1 in counts:
        p1 = float(counts[k1]) / N
        for k2 in counts:
            if k1 == k2:
                continue
            p2 = float(counts[k2]) / N
            impurity += p1 * p2
    return impurity
def entropy_and_impurity(table, field=-1, num_categories=2):
    """Compute entropy and Gini impurity of ``table`` in a single pass.

    Combines :func:`entropy` and :func:`gini_impurity` so the category
    counts and probabilities are computed only once.  (The original
    docstring was a copy-paste from ``gini_impurity`` and described only
    the impurity.)

    Args:
        table: sequence of rows, or a queryset-like object exposing
            ``.count()``.
        field: column index / dict key holding the category label
            (default: last column).
        num_categories: base of the logarithm used for the entropy term.

    Returns:
        ``(entropy, impurity)`` tuple of floats.

    >>> entropy_and_impurity(tobes_data, -1)  # doctest: +ELLIPSIS
    (1.50524..., 0.6328125)
    """
    from math import log

    # Prefer .count() (queryset-like tables); sequences raise
    # AttributeError, so fall back to len().
    try:
        N = table.count()
    except (AttributeError, TypeError):
        N = len(table)
    counts = count_unique(table, field)
    impurity = 0.0
    entropy = 0.0
    for k1 in counts:
        p1 = float(counts[k1]) / N
        entropy -= p1 * log(p1, num_categories)
        for k2 in counts:
            if k1 == k2:
                continue
            p2 = float(counts[k2]) / N
            impurity += p1 * p2
    return entropy, impurity
def build_tree(table, field=-1, scoref=entropy, ignore_fields=('pk', 'id')):
    """Build a classification decision tree (CART-style greedy splits).

    Args:
        table: sequence of rows (dicts or sequences), or a queryset-like
            object exposing ``.count()``.
        field: column index / dict key holding the category label
            (default: last column).
        scoref: scoring function used to evaluate splits, e.g.
            :func:`entropy` or :func:`gini_impurity`.
        ignore_fields: dict keys excluded from candidate split columns.

    Returns:
        A ``DecisionNode`` root; a leaf node carrying ``results`` when no
        split improves the score (or the table is empty).

    >>> print_tree(build_tree(tobes_data))  # doctest: +NORMALIZE_WHITESPACE
    0:google?
    T-> 3:21?
        T-> {'Premium': 3}
        F-> 2:yes?
            T-> {'Basic': 1}
            F-> {'None': 1}
    F-> 0:slashdot?
        T-> {'None': 3}
        F-> 2:yes?
            T-> {'Basic': 4}
            F-> 3:21?
                T-> {'Basic': 1}
                F-> {'None': 3}
    """
    # Row count: len() for sequences, .count() for queryset-like tables.
    # Narrowed from the original bare ``except:`` clauses.
    try:
        N = len(table)
    except TypeError:
        try:
            N = table.count()
        except (AttributeError, TypeError):
            N = 0
    if not N:
        return DecisionNode()

    current_score = scoref(table)

    # Track the best split found so far.
    best_gain = 0.0
    best_criteria = None
    best_sets = None

    # Dict rows: split candidates are the sorted keys minus ignored ones;
    # sequence rows: plain positional indices.
    if isinstance(table[0], dict) and isinstance(field, int):
        keys = sorted(tuple(k for k in table[0] if k not in ignore_fields))
        M = len(keys)
    else:
        M = len(table[0])
        keys = range(M)
    keyed_field = keys[field]

    for col in range(M):
        keyed_col = keys[col]
        if keyed_col == keyed_field:
            continue  # never split on the label column itself
        # Candidate split values: every distinct value in this column.
        column_values = set()
        for row in table:
            column_values.add(get(row, keyed_col))
        # Try dividing the table on each value of this column.
        for value in column_values:
            # NOTE(review): divide() receives the positional index ``col``
            # while get() above used ``keyed_col`` -- confirm divide()
            # resolves dict-keyed columns the same way.
            (set1, set2) = divide(table, field=col, target=value)
            set1, set2 = tuple(set1), tuple(set2)
            # Information gain relative to the unsplit score.
            p = float(len(set1)) / N
            gain = current_score - p * scoref(set1) - (1 - p) * scoref(set2)
            if gain > best_gain and len(set1) > 0 and len(set2) > 0:
                best_gain = gain
                best_criteria = (col, value)
                best_sets = (set1, set2)

    if best_gain > 0:
        # Bug fix: propagate field/scoref/ignore_fields into the recursion.
        # The original called build_tree(best_sets[i]) with no arguments,
        # silently reverting subtrees to the defaults (e.g. entropy even
        # when the caller asked for gini_impurity).
        trueBranch = build_tree(best_sets[0], field=field, scoref=scoref,
                                ignore_fields=ignore_fields)
        falseBranch = build_tree(best_sets[1], field=field, scoref=scoref,
                                 ignore_fields=ignore_fields)
        return DecisionNode(col=best_criteria[0], value=best_criteria[1],
                            tb=trueBranch, fb=falseBranch)
    return DecisionNode(results=count_unique(table, field=keyed_field))
# NOTE(review): this is a byte-for-byte duplicate of the build_tree defined
# immediately above; Python silently keeps whichever definition comes last.
# One of the two copies should be deleted.
def build_tree(table, field=-1, scoref=entropy, ignore_fields=('pk', 'id')):
    """Build a classification decision tree (CART-style greedy splits).

    Args:
        table: sequence of rows (dicts or sequences), or a queryset-like
            object exposing ``.count()``.
        field: column index / dict key holding the category label
            (default: last column).
        scoref: scoring function used to evaluate splits, e.g.
            :func:`entropy` or :func:`gini_impurity`.
        ignore_fields: dict keys excluded from candidate split columns.

    Returns:
        A ``DecisionNode`` root; a leaf node carrying ``results`` when no
        split improves the score (or the table is empty).

    >>> print_tree(build_tree(tobes_data))  # doctest: +NORMALIZE_WHITESPACE
    0:google?
    T-> 3:21?
        T-> {'Premium': 3}
        F-> 2:yes?
            T-> {'Basic': 1}
            F-> {'None': 1}
    F-> 0:slashdot?
        T-> {'None': 3}
        F-> 2:yes?
            T-> {'Basic': 4}
            F-> 3:21?
                T-> {'Basic': 1}
                F-> {'None': 3}
    """
    # Row count: len() for sequences, .count() for queryset-like tables.
    # Narrowed from the original bare ``except:`` clauses.
    try:
        N = len(table)
    except TypeError:
        try:
            N = table.count()
        except (AttributeError, TypeError):
            N = 0
    if not N:
        return DecisionNode()

    current_score = scoref(table)

    # Track the best split found so far.
    best_gain = 0.0
    best_criteria = None
    best_sets = None

    # Dict rows: split candidates are the sorted keys minus ignored ones;
    # sequence rows: plain positional indices.
    if isinstance(table[0], dict) and isinstance(field, int):
        keys = sorted(tuple(k for k in table[0] if k not in ignore_fields))
        M = len(keys)
    else:
        M = len(table[0])
        keys = range(M)
    keyed_field = keys[field]

    for col in range(M):
        keyed_col = keys[col]
        if keyed_col == keyed_field:
            continue  # never split on the label column itself
        # Candidate split values: every distinct value in this column.
        column_values = set()
        for row in table:
            column_values.add(get(row, keyed_col))
        # Try dividing the table on each value of this column.
        for value in column_values:
            # NOTE(review): divide() receives the positional index ``col``
            # while get() above used ``keyed_col`` -- confirm divide()
            # resolves dict-keyed columns the same way.
            (set1, set2) = divide(table, field=col, target=value)
            set1, set2 = tuple(set1), tuple(set2)
            # Information gain relative to the unsplit score.
            p = float(len(set1)) / N
            gain = current_score - p * scoref(set1) - (1 - p) * scoref(set2)
            if gain > best_gain and len(set1) > 0 and len(set2) > 0:
                best_gain = gain
                best_criteria = (col, value)
                best_sets = (set1, set2)

    if best_gain > 0:
        # Bug fix: propagate field/scoref/ignore_fields into the recursion.
        # The original called build_tree(best_sets[i]) with no arguments,
        # silently reverting subtrees to the defaults (e.g. entropy even
        # when the caller asked for gini_impurity).
        trueBranch = build_tree(best_sets[0], field=field, scoref=scoref,
                                ignore_fields=ignore_fields)
        falseBranch = build_tree(best_sets[1], field=field, scoref=scoref,
                                 ignore_fields=ignore_fields)
        return DecisionNode(col=best_criteria[0], value=best_criteria[1],
                            tb=trueBranch, fb=falseBranch)
    return DecisionNode(results=count_unique(table, field=keyed_field))