# generate_scipy_comparison: print scipy- and sort-based quantiles of one CSV
# column next to the values H2O reported, so they can be compared by eye.
#
# Parameters (as used in the text below):
#   csvPathname - path of the CSV file; it is opened for reading and parsed
#                 with np.genfromtxt(delimiter=',', dtype=None).
#   col         - index taken from each row (x[col]) when the parsed array has
#                 more than one dimension; otherwise the whole array is used.
#   h2oMedian   - value printed under "from h2o summary2:".
#   h2oMedian2  - NOTE(review): presumably printed under
#                 "from h2o quantile multipass:", but that call is cut short by
#                 the "******" gap below -- confirm against the original file.
#
# Steps visible in the text:
#   1. Load the CSV, pick the column, and coerce it to a float array (targetFP).
#   2. Compute stats.mstats.mquantiles(targetFP) at the eleven `thresholds`
#      and print them formatted with "%.2f".
#   3. Sort targetFP in place and compute
#      h2o_summ.percentileOnSortedList(..., interpolate='linear') at 0.50 when
#      DO_MEDIAN is truthy, else at 0.999.
#   4. Pick the matching scipy value: thresholds[5] is 0.5 and thresholds[10]
#      is 0.999, hence a[5 if DO_MEDIAN else 10].
#   5. Print the sort, scipy and H2O values, then print quantiles again under
#      an "after sort" heading.
# No return statement is visible. The "if 1==0:" debug branch never runs.
#
# Relies on names defined outside this block: h2p, h2o_summ, DO_MEDIAN.
#
# NOTE(review): the whole function is collapsed onto one physical line, so the
#   interpreter treats everything after the first '#' as a comment; the
#   original line breaks must be restored before this can run.
# NOTE(review): the text between "from h2o quantile multipass:" and
#   '"%.2f" % v for v in a]' has been replaced by "******"; the elided
#   statements are unknown here -- restore them from version control before
#   changing any code.
# NOTE(review): open(csvPathname, 'r') is never closed in the visible text;
#   np.float and sp.histogram are not available in current NumPy/SciPy
#   releases -- revisit once the body is restored.
def generate_scipy_comparison(csvPathname, col=0, h2oMedian=None, h2oMedian2=None): # this is some hack code for reading the csv and doing some percentile stuff in scipy # from numpy import loadtxt, genfromtxt, savetxt import numpy as np import scipy as sp dataset = np.genfromtxt( open(csvPathname, 'r'), delimiter=',', # skip_header=1, dtype=None); # guess! print "csv read for training, done" # we're going to strip just the last column for percentile work # used below NUMCLASSES = 10 print "csv read for training, done" # data is last column # drop the output print dataset.shape if len(dataset.shape) > 1: target = [x[col] for x in dataset] else: target = dataset # we may have read it in as a string. coerce to number targetFP = np.array(target, np.float) if 1==0: n_features = len(dataset[0]) - 1; print "n_features:", n_features # get the end # target = [x[-1] for x in dataset] # get the 2nd col print "histogram of target" print target print sp.histogram(target, bins=NUMCLASSES) print target[0] print target[1] thresholds = [0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999] print "scipy per:", thresholds from scipy import stats # a = stats.scoreatpercentile(target, per=per) a = stats.mstats.mquantiles(targetFP, prob=thresholds) a2 = ["%.2f" % v for v in a] h2p.red_print("scipy stats.mstats.mquantiles:", a2) # also get the median with a painful sort (h2o_summ.percentileOnSortedlist() # inplace sort targetFP.sort() b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='linear') label = '50%' if DO_MEDIAN else '99.9%' h2p.blue_print(label, "from sort:", b) s = a[5 if DO_MEDIAN else 10] h2p.blue_print(label, "from scipy:", s) h2p.blue_print(label, "from h2o summary2:", h2oMedian) h2p.blue_print(label, "from h2o quantile multipass:"******"%.2f" % v for v in a] h2p.red_print("after sort") h2p.red_print("scipy stats.mstats.mquantiles:", a2)
# this is type 7 alphap = 1 betap = 1 from scipy import stats a1 = stats.scoreatpercentile(target, per=100 * OTHER_T, interpolation_method='fraction') h2p.red_print("stats.scoreatpercentile:", a1) a2 = stats.mstats.mquantiles(targetFP, prob=[OTHER_T], alphap=alphap, betap=betap) h2p.red_print("scipy stats.mstats.mquantiles:", a2) targetFP.sort() b = h2o_summ.percentileOnSortedList(targetFP, OTHER_T, interpolate='linear') h2p.red_print("sort algo:", b) h2p.red_print("from h2o (multi):", quantiles[0]) print "Now looking at the sorted list..same thing" h2p.blue_print("stats.scoreatpercentile:", a1) a2 = stats.mstats.mquantiles(targetFP, prob=[OTHER_T], alphap=alphap, betap=betap) h2p.blue_print("scipy stats.mstats.mquantiles:", a2) b = h2o_summ.percentileOnSortedList(targetFP, OTHER_T, interpolate='linear') h2p.blue_print("sort algo:", b) h2p.blue_print("from h2o (multi):", quantiles[0])
# Module-level fixture for the quantile stress test announced by the
# "stress the 1000 fixed binning based on (max-min)/1000" print below.
#   QUANTILE  - probability (0.25) used both for the locally computed expected
#               value and inside the quantile(...) expression string.
#   a         - literal values ranging from -1.0000002e10 to 1.0000002e10,
#               mostly in groups of three that differ only in the last
#               written digit, plus -1.0, 0 and 1.0. It is sorted in place
#               AFTER initList has been built from it.
#   initList  - one expression string, "ddd = c(<values of a, comma-joined>)",
#               built from `a` in its original (unsorted) order.
#   expectedP - h2o_summ.percentileOnSortedList(a, QUANTILE,
#               interpolate='linear') on the sorted list; printed once with
#               print and once with h2p.blue_print.
#   exprList  - one (expression, int) tuple asking for quantile(ddd[,1]) at
#               QUANTILE. NOTE(review): the meaning of the trailing 1 is not
#               visible here -- confirm against the code that consumes exprList.
# Relies on names defined outside this text: h2o_summ, h2p, unittest, h2o.
#
# NOTE(review): these statements are collapsed onto one physical line and
#   share it with the start of class Basic(unittest.TestCase), which is cut
#   off after "global SEED"; the class is therefore not documented here.
QUANTILE = 0.25 print "stress the 1000 fixed binning based on (max-min)/1000" a = [ -1.0000002e10, -1.0000001e10, -1.0000000e10, -1.0000002e9, -1.0000001e9, -1.0000000e9, -1.0000002e6, -1.0000001e6, -1.0000000e6, -1.0000002e3, -1.0000001e3, -1.0000000e3, -1.0, 0.0000000, 1.0, 1.0000002e3, 1.0000001e3, 1.0000000e3, 1.0000002e6, 1.0000001e6, 1.0000000e6, 1.0000002e9, 1.0000001e9, 1.0000000e9, 1.0000002e10, 1.0000001e10, 1.0000000e10 ] initList = ["ddd = c(%s)" % ",".join(map(str, a))] # get expected result a.sort() expectedP = h2o_summ.percentileOnSortedList(a, QUANTILE, interpolate='linear') print "expectedP:", expectedP h2p.blue_print("sort result, expectedP:", expectedP) exprList = [ ("abc = quantile(ddd[,1], c(%s))" % QUANTILE, 1), ] class Basic(unittest.TestCase): def tearDown(self): h2o.check_sandbox_for_errors() @classmethod def setUpClass(cls): global SEED
1.0000000e6, 1.0000002e9, 1.0000001e9, 1.0000000e9, 1.0000002e10, 1.0000001e10, 1.0000000e10 ] initList = [ "ddd = c(%s)" % ",".join(map(str,a)) ] # get expected result a.sort() expectedP = h2o_summ.percentileOnSortedList(a, QUANTILE, interpolate='linear') print "expectedP:", expectedP h2p.blue_print("sort result, expectedP:", expectedP) exprList = [ ("abc = quantile(ddd[,1], c(%s))" % QUANTILE, 1), ] class Basic(unittest.TestCase): def tearDown(self): h2o.check_sandbox_for_errors() @classmethod def setUpClass(cls): global SEED SEED = h2o.setup_random_seed()
# an approx? (was good when comparing to h2o type 2) alphap=0.4 betap=0.4 # this is type 7 alphap=1 betap=1 from scipy import stats a1 = stats.scoreatpercentile(target, per=100*OTHER_T, interpolation_method='fraction') h2p.red_print("stats.scoreatpercentile:", a1) a2 = stats.mstats.mquantiles(targetFP, prob=[OTHER_T], alphap=alphap, betap=betap) h2p.red_print("scipy stats.mstats.mquantiles:", a2) targetFP.sort() b = h2o_summ.percentileOnSortedList(targetFP, OTHER_T, interpolate='linear') h2p.red_print("sort algo:", b) h2p.red_print( "from h2o (multi):", quantiles[0]) print "Now looking at the sorted list..same thing" h2p.blue_print("stats.scoreatpercentile:", a1) a2 = stats.mstats.mquantiles(targetFP, prob=[OTHER_T], alphap=alphap, betap=betap) h2p.blue_print("scipy stats.mstats.mquantiles:", a2) b = h2o_summ.percentileOnSortedList(targetFP, OTHER_T, interpolate='linear') h2p.blue_print("sort algo:", b) h2p.blue_print( "from h2o (multi):", quantiles[0])