def __test__():
    """Print what every splitter returns on a small hand-made sample.

    Six records with a numeric column ``'a'``, a date column ``'b'``, a
    letter column ``'c'`` and a class label ``'cls'`` are built, the
    impurity of the whole sample is computed once per measure
    (``measure.entropy``, ``measure.giniidx``, ``measure.cls_err``), and
    then ``ratio``, ``interval``, ``ordinal`` and ``nominal`` are each
    called twice per measure: once with their default last argument
    (printed under ``split``) and once with ``False`` (printed under
    ``nosplit``).  Nothing is returned; the results are only printed.

    ``ratio``, ``interval``, ``ordinal`` and ``nominal`` are not defined
    here and are resolved from the enclosing module when called.
    """
    import measure
    import datetime

    # (a, year of b, c, cls) -- month and day of 'b' are always 12 and 5.
    samples = [
        (2, 2003, 'A', 1),
        (7, 2004, 'A', 1),
        (1, 2007, 'A', 2),
        (9, 2008, 'D', 3),
        (3, 2009, 'B', 1),
        (2, 2010, 'C', 3),
    ]
    rows = [
        {'a': a, 'b': datetime.date(year, 12, 5), 'c': c, 'cls': label}
        for a, year, c, label in samples
    ]

    # Each measure paired with the impurity of the unsplit sample.
    scored = [
        (measure.entropy, measure.entropy(rows, 'cls')),
        (measure.giniidx, measure.giniidx(rows, 'cls')),
        (measure.cls_err, measure.cls_err(rows, 'cls')),
    ]

    def report(splitter, attr):
        # One section: default-mode results, then results with False,
        # followed by an empty line.
        print('split')
        for fn, impurity in scored:
            print(splitter(rows, attr, 'cls', fn, impurity))
        print('nosplit')
        for fn, impurity in scored:
            print(splitter(rows, attr, 'cls', fn, impurity, False))
        print('')

    report(ratio, 'a')
    report(interval, 'b')
    report(ordinal, 'c')
    report(nominal, 'c')
# Print a clustering summary for every combination of k and sd_away.
#
# For each pair, dbscan.k_distance() supplies the distance passed to
# dbscan.dbscan().  When that call fails the pair is reported as skipped;
# otherwise the number of clusters is printed and, if there is at least one
# cluster, the size, classification error and majority class of each
# cluster, followed by the mean error and the total number of points.
#
# Relies on `dbscan`, `measure` and `dataset` being bound by earlier code.
# Every line is printed as its fields joined by single spaces, which is the
# text the comma-separated print statement produces for these values.
for k in range(2, 5):
    for sd_away in range(0, 5):
        try:
            kdist = dbscan.k_distance(dataset, k, sd_away)
        except Exception:
            # Deliberately not a bare `except:` -- Ctrl-C and SystemExit
            # must stop the sweep instead of being reported as a skip.
            print('kdist anchor out of range, skipped')
            print('')
            continue

        cluster = dbscan.dbscan(dataset, kdist, k)
        header = ' '.join(str(part) for part in (
            'k:', k, 'sd:', sd_away, 'kdist:', kdist,
            'no. of cluster:', len(cluster)))
        if len(cluster) == 0:
            print(header)
            print('')
            continue

        # measure.cls_err() is unpacked as an (error, majority class) pair.
        errs = []
        majorities = []
        for members in cluster:
            err, majority = measure.cls_err(members)
            errs.append(err)
            majorities.append(majority)
        sizes = [len(members) for members in cluster]

        print(header)
        for i, (size, err, majority) in enumerate(
                zip(sizes, errs, majorities)):
            print(' '.join(str(part) for part in (
                'cluster:', i, 'no. of pt. in cluster:', size,
                'impurity (classification error):', err,
                'majority:', majority)))
        print(' '.join(str(part) for part in (
            'mean impurity:', float(sum(errs)) / len(errs),
            'sum of pt.:', sum(sizes))))
        print('')
# Sweep sd_away for the current k and print a summary of each clustering.
#
# `k` is not bound in this fragment; it has to come from the surrounding
# code, as do `dbscan`, `measure` and `dataset`.  Every line is printed as
# its fields joined by single spaces, which is the text the comma-separated
# print statement produces for these values.
for sd_away in range(0, 5):
    try:
        kdist = dbscan.k_distance(dataset, k, sd_away)
    except Exception:
        # Narrower than a bare `except:` so that Ctrl-C and SystemExit
        # end the run instead of being logged as a skipped setting.
        print('kdist anchor out of range, skipped')
        print('')
        continue

    cluster = dbscan.dbscan(dataset, kdist, k)
    summary = ' '.join(str(field) for field in (
        'k:', k, 'sd:', sd_away, 'kdist:', kdist,
        'no. of cluster:', len(cluster)))
    if len(cluster) == 0:
        print(summary)
        print('')
        continue

    # Each cluster yields an (error, majority class) pair from cls_err().
    scored = [measure.cls_err(group) for group in cluster]
    errs = [err for err, _ in scored]
    counts = [len(group) for group in cluster]

    print(summary)
    for i, ((err, majority), count) in enumerate(zip(scored, counts)):
        print(' '.join(str(field) for field in (
            'cluster:', i, 'no. of pt. in cluster:', count,
            'impurity (classification error):', err,
            'majority:', majority)))
    print(' '.join(str(field) for field in (
        'mean impurity:', float(sum(errs)) / len(errs),
        'sum of pt.:', sum(counts))))
def __test__():
    """Exercise the four splitters on a fixed six-record sample and print.

    The sample has columns ``'a'`` (int), ``'b'`` (``datetime.date``),
    ``'c'`` (letter) and the class label ``'cls'``.  The impurity of the
    whole sample is computed first with ``measure.entropy``,
    ``measure.giniidx`` and ``measure.cls_err``.  Then, for ``ratio`` on
    ``'a'``, ``interval`` on ``'b'``, ``ordinal`` on ``'c'`` and
    ``nominal`` on ``'c'``, the function prints ``split`` followed by the
    three results obtained with the default last argument, ``nosplit``
    followed by the three results obtained with ``False``, and an empty
    line.  There is no return value.

    The four splitter functions are looked up in the enclosing module at
    call time; they are not defined in this function.
    """
    import measure
    import datetime

    day = datetime.date
    table = [
        dict([('a', 2), ('b', day(2003, 12, 5)), ('c', 'A'), ('cls', 1)]),
        dict([('a', 7), ('b', day(2004, 12, 5)), ('c', 'A'), ('cls', 1)]),
        dict([('a', 1), ('b', day(2007, 12, 5)), ('c', 'A'), ('cls', 2)]),
        dict([('a', 9), ('b', day(2008, 12, 5)), ('c', 'D'), ('cls', 3)]),
        dict([('a', 3), ('b', day(2009, 12, 5)), ('c', 'B'), ('cls', 1)]),
        dict([('a', 2), ('b', day(2010, 12, 5)), ('c', 'C'), ('cls', 3)]),
    ]

    # Baseline impurity of the unsplit sample, one entry per measure.
    baselines = []
    baselines.append((measure.entropy, measure.entropy(table, 'cls')))
    baselines.append((measure.giniidx, measure.giniidx(table, 'cls')))
    baselines.append((measure.cls_err, measure.cls_err(table, 'cls')))

    def show(splitter, column, *extra):
        # Print one result per measure; `extra` is either empty (default
        # mode) or (False,).
        for fn, base in baselines:
            print(splitter(table, column, 'cls', fn, base, *extra))

    def section(splitter, column):
        print('split')
        show(splitter, column)
        print('nosplit')
        show(splitter, column, False)
        print('')

    section(ratio, 'a')
    section(interval, 'b')
    section(ordinal, 'c')
    section(nominal, 'c')