def build_tree(qs, field, scoref=entropy, ignore_fields=None, include_fields=None):
    """Build a classification decision tree

    >>> from pug.nlp.data.tobes_example import tobes_data
    >>> print_tree(build_tree(tobes_data, -1))  # doctest: +NORMALIZE_WHITESPACE
    0:google?
    T-> 3:21?
        T-> {'Premium': 3}
        F-> 2:yes?
            T-> {'Basic': 1}
            F-> {'None': 1}
    F-> 0:slashdot?
        T-> {'None': 3}
        F-> 2:yes?
            T-> {'Basic': 4}
            F-> 3:21?
                T-> {'Basic': 1}
                F-> {'None': 3}
    """
    if ignore_fields is None:
        ignore_fields = ('pk', 'id')
    N = qs.count()
    if not N:
        return DecisionNode()
    if include_fields is None:
        include_fields = qs[0]._meta.get_all_field_names()
    current_score = scoref(qs, field)

    # Track the best split criterion found so far
    best_gain = 0.0
    best_criteria = None
    best_sets = None

    for col in include_fields:
        if col in ignore_fields or col == field:
            continue
        # Set of unique values in this column
        # TODO: should do this once for all columns and cache it somewhere
        column_values = count_unique(qs, col)
        # Try dividing the table up for each value in this column
        for value in column_values:
            set1, set2 = divide(qs, field=col, target=value,
                                ignore_fields=ignore_fields, include_fields=include_fields)
            # Information gain: parent score minus the size-weighted child scores
            p = float(set1.count()) / N
            gain = current_score - p * scoref(set1, field) - (1 - p) * scoref(set2, field)
            if gain > best_gain and set1.count() > 0 and set2.count() > 0:
                best_gain = gain
                best_criteria = (col, value)
                best_sets = (set1, set2)

    # Create the sub-branches, or a leaf node if no split improves the score
    if best_gain > 0:
        true_branch = build_tree(best_sets[0], field,
                                 ignore_fields=ignore_fields, include_fields=include_fields)
        false_branch = build_tree(best_sets[1], field,
                                  ignore_fields=ignore_fields, include_fields=include_fields)
        return DecisionNode(col=best_criteria[0], value=best_criteria[1],
                            tb=true_branch, fb=false_branch)
    return DecisionNode(results=count_unique(qs, field=field))
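# The split criterion in build_tree above is plain information gain: the
# parent's score minus the size-weighted scores of the two subsets. Below is
# a minimal, self-contained sketch of the same arithmetic on plain
# {category: count} dicts; the helper name is an illustrative assumption,
# not part of the original module.
def _gain_from_counts(set1_counts, set2_counts, num_categories=2):
    """Information gain of a binary split, given {category: count} dicts

    Mirrors the gain formula inside build_tree:
        gain = H(parent) - p * H(set1) - (1 - p) * H(set2)
    """
    from math import log

    def h(counts):
        n = float(sum(counts.values()))
        return -sum((c / n) * log(c / n, num_categories)
                    for c in counts.values() if c)

    # Recombine the two subsets to recover the parent distribution
    parent = {}
    for counts in (set1_counts, set2_counts):
        for k, c in counts.items():
            parent[k] = parent.get(k, 0) + c
    p = float(sum(set1_counts.values())) / sum(parent.values())
    return h(parent) - p * h(set1_counts) - (1 - p) * h(set2_counts)

# A perfect split recovers the full parent entropy (1 bit here):
# _gain_from_counts({'A': 4}, {'B': 4})  # -> 1.0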
def entropy_and_impurity(qs, field, num_categories=2):
    """Entropy and Gini impurity of a predicted segmentation/categorization

    Returns a tuple of the entropy (in units set by the log base
    `num_categories`, so bits when it is 2) and the impurity (a probability
    between 0 and 1 inclusive).

    Impurity is the probability or frequency with which the *wrong* category
    or prediction is assigned to an element.

    >>> from pug.nlp.data.tobes_example import tobes_data
    >>> entropy_and_impurity(tobes_data, -1)  # doctest: +ELLIPSIS
    (1.50524..., 0.6328125)
    """
    from math import log
    N = qs.count()
    counts = count_unique(qs, field)
    impurity = 0.0
    entropy = 0.0
    for k1 in counts:
        p1 = float(counts[k1]) / N
        if p1:
            entropy -= p1 * log(p1, num_categories)
        for k2 in counts:
            if k1 != k2:
                p2 = float(counts[k2]) / N
                impurity += p1 * p2
    return entropy, impurity
def gini_impurity(qs, field):
    """Gini impurity evaluation of a set of values

    Returns the probability [0, 1] that the wrong category/prediction
    has been assigned.
    """
    N = qs.count()
    counts = count_unique(qs, field)
    impurity = 0.0
    for k1 in counts:
        p1 = float(counts[k1]) / N
        for k2 in counts:
            if k1 != k2:
                p2 = float(counts[k2]) / N
                impurity += p1 * p2
    return impurity
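# For intuition: the pairwise double loop above is algebraically identical to
# the closed form 1 - sum(p_k ** 2), since the sum over k1 != k2 of
# p_k1 * p_k2 equals (sum p_k)**2 - sum(p_k**2) = 1 - sum(p_k**2). A
# self-contained check on a plain count dict (helper name is illustrative
# only, not part of the original module):
def _gini_from_counts(counts):
    """Gini impurity of a {category: count} dict via 1 - sum(p_k ** 2)"""
    n = float(sum(counts.values()))
    return 1.0 - sum((c / n) ** 2 for c in counts.values())

# Example: _gini_from_counts({'Basic': 5, 'None': 3, 'Premium': 8})
# matches the pairwise formulation in gini_impurity above.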
def entropy(qs, field, num_categories=2):
    """Total entropy of the categorizations assigned, in units set by the log base

    With the default ``num_categories=2`` the result is in bits:

        -sum(p(x) * log(p(x), num_categories) for x in count_unique(qs, field))

    which measures how mixed (uncertain) the assigned categories are.
    """
    from math import log
    counts = count_unique(qs, field)
    ans = 0.0
    N = qs.count()
    for k in counts:
        p = float(counts[k]) / N
        if p:
            ans -= p * log(p, num_categories)
    return ans
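# Worked example (illustrative counts, not from the source data): with counts
# {'Basic': 4, 'None': 4} the probabilities are 0.5 each, so the entropy is
# -(0.5 * log2(0.5) + 0.5 * log2(0.5)) = 1.0 bit; a lopsided
# {'Basic': 7, 'None': 1} gives -(7/8 * log2(7/8) + 1/8 * log2(1/8)),
# about 0.544 bits.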
def prune(tree, mingain):
    """Merge pairs of leaves whose split reduces entropy by less than `mingain`"""
    # If the branches aren't leaves, then prune them
    if tree.tb.results is None:
        prune(tree.tb, mingain)
    if tree.fb.results is None:
        prune(tree.fb, mingain)

    # If both the sub-branches are now leaves, see if they should be merged
    if tree.tb.results is not None and tree.fb.results is not None:
        # Build a combined dataset, expanding each {value: count} leaf back
        # into one single-column row per counted occurrence
        tb, fb = [], []
        for v, c in tree.tb.results.items():
            tb += [[v]] * c
        for v, c in tree.fb.results.items():
            fb += [[v]] * c

        # Reduction in entropy: merged entropy minus the mean of the two leaves
        delta = dt.entropy(tb + fb) - (dt.entropy(tb) + dt.entropy(fb)) / 2

        if delta < mingain:
            # Merge the branches
            tree.tb, tree.fb = None, None
            tree.results = count_unique(tb + fb)
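# A hedged usage sketch: build a tree, then collapse splits whose entropy
# reduction falls below a threshold. The helper name and the 0.1 default
# threshold are illustrative assumptions, not part of the original API.
def build_and_prune(qs, field, mingain=0.1, **kwargs):
    """Build a decision tree with build_tree, then prune it in place"""
    tree = build_tree(qs, field, **kwargs)
    prune(tree, mingain)
    return tree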
# test_decider.py
from call_center.models import CaseExchange, CaseHDTVHeader, CaseMaster
from pug.db.explore import count_unique
from pug.nlp.db_decision_tree import build_tree, print_tree

N = CaseMaster.objects.count()
UN = CaseMaster.objects.values('case_number').distinct().count()
N_ce = CaseExchange.objects.count()
UN_ce = CaseExchange.objects.values('case_number').distinct().count()
N_hdtv = CaseHDTVHeader.objects.count()
UN_hdtv = CaseHDTVHeader.objects.values('case_number').distinct().count()

# count_unique should agree with the queryset's own distinct and total counts
un = count_unique(CaseExchange.objects.values('case_number'), 'case_number')
assert len(un.values()) == UN_ce
assert sum(un.values()) == N_ce

qs = CaseHDTVHeader.objects.filter(case_number__lt=4000000)
ex = qs.all()[0]
ex.service_calls  # peek at the target field on one example record

print_tree(build_tree(qs, field='service_calls', ignore_fields=('id', 'case_number')))
# dispatch_status:Completed?
# T-> date_time:2008-09-15 12:25:34.270000?
#     T-> {1: 1}
#     F-> date_time:2008-07-09 08:49:36.437000?
#         T-> {0: 1}
#         F-> {None: 0}
# F-> {None: 0}

qs = CaseHDTVHeader.objects.filter(case_number__lt=2000000)