# Queryset filters to try; each dict is expanded into WikiItem.objects.filter().
qs_kwargs_list = [
    # quick test (small portion of database)
    {'id__gt': 0, 'id__lt': 1000},
]

# each row is a different tree in the forest
include_fields_list = [
    ['wikiitem__modified', 'wikiitem__title'],
    ['wikiitem__modified', 'wikiitem__title'],
]

# Horizontal rules used to frame each report on the console.
heavy_rule = '=' * 80
light_rule = '-' * 80

# Fit one tree for every (target field, queryset filter, indicator set)
# combination.  Each tree is appended to tree_list, printed, drawn to a .jpg,
# pickled, and written out in its text representation.
# NOTE: single-argument print(...) emits exactly what the Python 2 print
# statement would, so the console output is unchanged.
for i, field in enumerate(fields):
    for j, qs_kwargs in enumerate(qs_kwargs_list):
        for k, include_fields in enumerate(include_fields_list):
            # Loop indices identify this combination in the output file names.
            ijk = (i, j, k)

            print('')
            print(heavy_rule)
            print("Attempt to predict: %s" % field)
            print("Limit database to: %s" % qs_kwargs)
            print("Indicator variables: %s" % include_fields)

            qs = WikiItem.objects.filter(**qs_kwargs)
            print("Fitting to %s records." % qs.count())
            print(light_rule)

            # The target field is passed along with the indicator fields.
            tree = build_tree(
                qs,
                field=field,
                include_fields=include_fields + [field],
            )
            tree_list.append(tree)

            print_tree(tree)
            print(light_rule)

            draw_tree(tree, 'tree_%s_%s_%s.jpg' % ijk)

            with open('tree_%s_%s_%s.pickle' % ijk, 'wb') as pickle_file:
                pickle.dump(tree, pickle_file)

            with open('tree_%s_%s_%s.txt' % ijk, 'wb') as text_file:
                text_file.write(represent_tree(tree))
# Table sizes: N_* is a row count, UN* is the number of distinct case_number
# values in that table.
UN = CaseMaster.objects.values('case_number').distinct().count()

N_ce = CaseExchange.objects.count()
UN_ce = CaseExchange.objects.values('case_number').distinct().count()

N_hdtv = CaseHDTVHeader.objects.count()
UN_hdtv = CaseHDTVHeader.objects.values('case_number').distinct().count()

# Cross-check count_unique() against the database's own counts: it should
# yield one entry per distinct case_number, and the entries should sum to the
# total number of CaseExchange rows.
exchange_case_numbers = CaseExchange.objects.values('case_number')
un = count_unique(exchange_case_numbers, 'case_number')
assert len(un.values()) == UN_ce
assert sum(un.values()) == N_ce

# Experiment 1: predict service_calls for case numbers below 4,000,000.
qs = CaseHDTVHeader.objects.filter(case_number__lt=4000000)
ex = qs.all()[0]
# Bare attribute access on the first record; the value is not used here
# (presumably left over from inspecting it in an interactive session).
ex.service_calls
service_calls_tree = build_tree(
    qs,
    field='service_calls',
    ignore_fields=('id', 'case_number'),
)
print_tree(service_calls_tree)
# Apparently the tree printed by an earlier run, one node per line:
# dispatch_status:Completed ?
# T-> date_time:2008-09-15 12:25:34.270000?
# T-> {1: 1}
# F-> date_time:2008-07-09 08:49:36.437000?
# T-> {0: 1}
# F-> {None: 0}
# F-> {None: 0}

# Experiment 2: predict dispatch_status for case numbers below 2,000,000,
# additionally ignoring service_calls.
qs = CaseHDTVHeader.objects.filter(case_number__lt=2000000)
ex = qs.all()[0]
ex.service_calls
dispatch_status_tree = build_tree(
    qs,
    field='dispatch_status',
    ignore_fields=('id', 'case_number', 'service_calls'),
)
print_tree(dispatch_status_tree)
# dispatch_status:Completed ?
# T-> date_time:2008-09-15 12:25:34.270000?
# T-> {1: 1}
# F-> date_time:2008-07-09 08:49:36.437000?
# Table sizes: N* is a row count, UN* is the number of distinct case_number
# values in that table.
N = CaseMaster.objects.count()
UN = CaseMaster.objects.values('case_number').distinct().count()

N_ce = CaseExchange.objects.count()
UN_ce = CaseExchange.objects.values('case_number').distinct().count()

N_hdtv = CaseHDTVHeader.objects.count()
UN_hdtv = CaseHDTVHeader.objects.values('case_number').distinct().count()

# Cross-check count_unique() against the database's own counts: it should
# yield one entry per distinct case_number, and the entries should sum to the
# total number of CaseExchange rows.
exchange_case_numbers = CaseExchange.objects.values('case_number')
un = count_unique(exchange_case_numbers, 'case_number')
assert len(un.values()) == UN_ce
assert sum(un.values()) == N_ce

# Experiment 1: predict service_calls for case numbers below 4,000,000.
qs = CaseHDTVHeader.objects.filter(case_number__lt=4000000)
ex = qs.all()[0]
# Bare attribute access on the first record; the value is not used here
# (presumably left over from inspecting it in an interactive session).
ex.service_calls
service_calls_tree = build_tree(
    qs,
    field='service_calls',
    ignore_fields=('id', 'case_number'),
)
print_tree(service_calls_tree)
# Apparently the tree printed by an earlier run, one node per line:
# dispatch_status:Completed ?
# T-> date_time:2008-09-15 12:25:34.270000?
# T-> {1: 1}
# F-> date_time:2008-07-09 08:49:36.437000?
# T-> {0: 1}
# F-> {None: 0}
# F-> {None: 0}

# Experiment 2: predict dispatch_status for case numbers below 2,000,000,
# additionally ignoring service_calls.
qs = CaseHDTVHeader.objects.filter(case_number__lt=2000000)
ex = qs.all()[0]
ex.service_calls
dispatch_status_tree = build_tree(
    qs,
    field='dispatch_status',
    ignore_fields=('id', 'case_number', 'service_calls'),
)
print_tree(dispatch_status_tree)
# dispatch_status:Completed ?