def process_dataset(self, parseResult, Y, e_coefs, e_ndev, e_rdev, e_aic, **kwargs):
    """Run an unregularized GLM on a parsed dataset and print/compare its coefficients.

    parseResult: parse result dict from a prior import/parse.
    Y: response column (unused here; response is forced to 'CAPSULE' below).
    e_coefs, e_ndev, e_rdev, e_aic: expected coefficients/deviances/AIC for
        comparison (deviance/AIC comparisons are currently disabled — see FIX! notes).
    Returns a list of error strings (currently always empty, since the
    deviance comparisons are commented out).
    """
    # no regularization
    kwargs['alpha'] = 0
    kwargs['lambda'] = 0
    kwargs['response'] = 'CAPSULE'
    glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=10, **kwargs)
    (warnings, clist, intercept) = h2o_glm.simpleCheckGLM(self, glmResult, None, **kwargs)
    cstring = "".join([("%.5e  " % c) for c in clist])
    h2p.green_print("h2o coefficient list:", cstring)
    h2p.green_print("h2o intercept", "%.5e  " % intercept)

    # other stuff in the json response
    # the first submodel is the right one, if only one lambda is provided as a parameter above
    glm_model = glmResult['glm_model']
    submodels = glm_model['submodels'][0]
    validation = submodels['validation']
    null_deviance = validation['null_deviance']
    residual_deviance = validation['residual_deviance']

    errors = []
    # FIX! our null deviance doesn't seem to match
    h2o.verboseprint("Comparing:", null_deviance, e_ndev)
    # if abs(float(nullDev) - e_ndev) > (0.001 * e_ndev):
    #     errors.append('NullDeviance: %f != %s' % (e_ndev,nullDev))

    # FIX! our res deviance doesn't seem to match
    h2o.verboseprint("Comparing:", residual_deviance, e_rdev)
    # if abs(float(resDev) - e_rdev) > (0.001 * e_rdev):
    #     errors.append('ResDeviance: %f != %s' % (e_rdev,resDev))

    # FIX! we don't have an AIC to compare?
    return errors
def test_build_for_clone(self):
    """Hold an H2O cloud up for 4 hours so other tests can clone it via h2o-nodes.json.

    Variant 1: plain time.sleep loop; cloud-size verification is commented out,
    and sandbox log checking only happens when CHECK_WHILE_SLEEPING is set.
    NOTE(review): 'beginning' and CHECK_WHILE_SLEEPING are module-level names
    defined outside this view — confirm they exist at module scope.
    """
    # python gets confused about which 'start' if I used start here
    elapsed = time.time() - beginning
    print "\n%0.2f seconds to get here from start" % elapsed

    # might as well open a browser on it? (because the ip/port will vary
    # maybe just print the ip/port for now
    ## h2b.browseTheCloud()

    maxTime = 4*3600
    totalTime = 0
    incrTime = 60
    h2p.purple_print("\nSleeping for total of", (maxTime+0.0)/3600, "hours.")
    print "Will check h2o logs every", incrTime, "seconds"
    print "Should be able to run another test using h2o-nodes.json to clone cloud"
    print "i.e. h2o.build_cloud_with_json()"
    print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"

    h2p.green_print("To watch cloud in browser follow address:")
    h2p.green_print("   http://{0}:{1}/Cloud.html".format(h2o.nodes[0].http_addr, h2o.nodes[0].port))
    h2p.blue_print("You can start a test (or tests) now!")

    h2p.blue_print("Will Check cloud status every %s secs and kill cloud if wrong or no answer" % incrTime)
    if CHECK_WHILE_SLEEPING:
        h2p.blue_print("Will also look at redirected stdout/stderr logs in sandbox every %s secs" % incrTime)

    # NOTE(review): printed unconditionally, which contradicts the message above
    # when CHECK_WHILE_SLEEPING is true — confirm whether this belongs in an else.
    h2p.red_print("No checking of logs while sleeping, or check of cloud status")
    h2p.yellow_print("So if H2O stack traces, it's up to you to kill me if 4 hours is too long")
    h2p.yellow_print("ctrl-c will cause all jvms to die(thru psutil terminate, paramiko channel death or h2o shutdown...")

    while (totalTime<maxTime): # die after 4 hours
        time.sleep(incrTime)
        totalTime += incrTime
        # good to touch all the nodes to see if they're still responsive
        # give them up to 120 secs to respond (each individually)
        ### h2o.verify_cloud_size(timeoutSecs=120)
        if CHECK_WHILE_SLEEPING:
            print "Checking sandbox log files"
            h2o.check_sandbox_for_errors(cloudShutdownIsError=True)
        else:
            print str(datetime.datetime.now()), h2o_args.python_cmd_line, "still here", totalTime, maxTime, incrTime

    # don't do this, as the cloud may be hung?
    if 1==0:
        print "Shutting down cloud, but first delete all keys"
        start = time.time()
        h2i.delete_keys_at_all_nodes()
        elapsed = time.time() - start
        print "delete_keys_at_all_nodes(): took", elapsed, "secs"
def test_build_for_clone(self):
    """Hold an H2O cloud up for 4 hours so other tests can clone it via h2o-nodes.json.

    Variant 2: uses h2o.sleep() and actively verifies cloud size each minute
    (unlike the variant that comments verify_cloud_size out), and reads
    python_cmd_line from h2o rather than h2o_args.
    NOTE(review): 'beginning' and CHECK_WHILE_SLEEPING are module-level names
    defined outside this view — confirm they exist at module scope.
    """
    # python gets confused about which 'start' if I used start here
    elapsed = time.time() - beginning
    print "\n%0.2f seconds to get here from start" % elapsed

    # might as well open a browser on it? (because the ip/port will vary
    # maybe just print the ip/port for now
    ## h2b.browseTheCloud()

    maxTime = 4*3600
    totalTime = 0
    incrTime = 60
    h2p.purple_print("\nSleeping for total of", (maxTime+0.0)/3600, "hours.")
    print "Will check h2o logs every", incrTime, "seconds"
    print "Should be able to run another test using h2o-nodes.json to clone cloud"
    print "i.e. h2o.build_cloud_with_json()"
    print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"

    h2p.green_print("To watch cloud in browser follow address:")
    h2p.green_print("   http://{0}:{1}/Cloud.html".format(h2o.nodes[0].http_addr, h2o.nodes[0].port))
    h2p.blue_print("You can start a test (or tests) now!")

    h2p.blue_print("Will Check cloud status every %s secs and kill cloud if wrong or no answer" % incrTime)
    if CHECK_WHILE_SLEEPING:
        h2p.blue_print("Will also look at redirected stdout/stderr logs in sandbox every %s secs" % incrTime)

    # NOTE(review): printed unconditionally, which contradicts the message above
    # when CHECK_WHILE_SLEEPING is true — confirm whether this belongs in an else.
    h2p.red_print("No checking of logs while sleeping, or check of cloud status")
    h2p.yellow_print("So if H2O stack traces, it's up to you to kill me if 4 hours is too long")
    h2p.yellow_print("ctrl-c will cause all jvms to die(thru psutil terminate, paramiko channel death or h2o shutdown...")

    while (totalTime<maxTime): # die after 4 hours
        h2o.sleep(incrTime)
        totalTime += incrTime
        # good to touch all the nodes to see if they're still responsive
        # give them up to 120 secs to respond (each individually)
        h2o.verify_cloud_size(timeoutSecs=120)
        if CHECK_WHILE_SLEEPING:
            print "Checking sandbox log files"
            h2o.check_sandbox_for_errors(cloudShutdownIsError=True)
        else:
            print str(datetime.datetime.now()), h2o.python_cmd_line, "still here", totalTime, maxTime, incrTime

    # don't do this, as the cloud may be hung?
    if 1==0:
        print "Shutting down cloud, but first delete all keys"
        start = time.time()
        h2i.delete_keys_at_all_nodes()
        elapsed = time.time() - start
        print "delete_keys_at_all_nodes(): took", elapsed, "secs"
def test_build_for_clone(self):
    """Hold an H2O cloud up for 4 hours so other tests can clone it via h2o-nodes.json.

    Variant 3: unconditionally verifies cloud size and scans sandbox logs every
    minute, then deletes all keys at all nodes when the 4 hours are up.
    NOTE(review): 'beginning' is a module-level name defined outside this view.
    """
    # python gets confused about which 'start' if I used start here
    elapsed = time.time() - beginning
    print "\n%0.2f seconds to get here from start" % elapsed

    # might as well open a browser on it? (because the ip/port will vary
    # maybe just print the ip/port for now
    ## h2b.browseTheCloud()

    maxTime = 4 * 3600
    totalTime = 0
    incrTime = 60
    h2p.purple_print("\nSleeping for total of", (maxTime + 0.0) / 3600, "hours.")
    print "Will check h2o logs every", incrTime, "seconds"
    print "Should be able to run another test using h2o-nodes.json to clone cloud"
    print "i.e. h2o.build_cloud_with_json()"
    print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"

    h2p.green_print("To watch cloud in browser follow address:")
    h2p.green_print("   http://{0}:{1}/Cloud.html".format(
        h2o.nodes[0].http_addr, h2o.nodes[0].port))
    h2p.blue_print("You can start a test (or tests) now!")
    h2p.blue_print(
        "Will spin looking at redirected stdout/stderr logs in sandbox for h2o errors every %s secs"
        % incrTime)
    h2p.red_print("This is just for fun")
    h2p.yellow_print("So is this")

    while (totalTime < maxTime):  # die after 4 hours
        h2o.sleep(incrTime)
        totalTime += incrTime
        # good to touch all the nodes to see if they're still responsive
        # give them up to 120 secs to respond (each individually)
        h2o.verify_cloud_size(timeoutSecs=120)
        print "Checking sandbox log files"
        h2o.check_sandbox_for_errors(cloudShutdownIsError=True)

    # NOTE(review): placed after the poll loop here — confirm against the
    # original layout (sibling variants guard this with 'if 1==0').
    start = time.time()
    h2i.delete_keys_at_all_nodes()
    elapsed = time.time() - start
    print "delete_keys_at_all_nodes(): took", elapsed, "secs"
def test_build_for_clone(self):
    """Hold an H2O cloud up for 4 hours so other tests can clone it via h2o-nodes.json.

    Variant 4: behaviorally identical to variant 3 (unconditional cloud-size
    verify + sandbox log scan each minute, then delete all keys); only the
    source formatting differed.
    NOTE(review): 'beginning' is a module-level name defined outside this view.
    """
    # python gets confused about which 'start' if I used start here
    elapsed = time.time() - beginning
    print "\n%0.2f seconds to get here from start" % elapsed

    # might as well open a browser on it? (because the ip/port will vary
    # maybe just print the ip/port for now
    ## h2b.browseTheCloud()

    maxTime = 4*3600
    totalTime = 0
    incrTime = 60
    h2p.purple_print("\nSleeping for total of", (maxTime+0.0)/3600, "hours.")
    print "Will check h2o logs every", incrTime, "seconds"
    print "Should be able to run another test using h2o-nodes.json to clone cloud"
    print "i.e. h2o.build_cloud_with_json()"
    print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"

    h2p.green_print("To watch cloud in browser follow address:")
    h2p.green_print("   http://{0}:{1}/Cloud.html".format(h2o.nodes[0].http_addr, h2o.nodes[0].port))
    h2p.blue_print("You can start a test (or tests) now!")
    h2p.blue_print("Will spin looking at redirected stdout/stderr logs in sandbox for h2o errors every %s secs" % incrTime)
    h2p.red_print("This is just for fun")
    h2p.yellow_print("So is this")

    while (totalTime<maxTime): # die after 4 hours
        h2o.sleep(incrTime)
        totalTime += incrTime
        # good to touch all the nodes to see if they're still responsive
        # give them up to 120 secs to respond (each individually)
        h2o.verify_cloud_size(timeoutSecs=120)
        print "Checking sandbox log files"
        h2o.check_sandbox_for_errors(cloudShutdownIsError=True)

    # NOTE(review): placed after the poll loop here — confirm against the
    # original layout (sibling variants guard this with 'if 1==0').
    start = time.time()
    h2i.delete_keys_at_all_nodes()
    elapsed = time.time() - start
    print "delete_keys_at_all_nodes(): took", elapsed, "secs"
def do_scipy_glm(self, bucket, csvPathname, L, family='binomial'):
    """Fit sklearn LogisticRegression (L2 then L1) on a CSV for comparison with h2o GLM.

    bucket, csvPathname: resolved via h2i.find_folder_and_filename to a local path.
    L: regularization strength; sklearn's C is its inverse (C = 1/L).
    family: only 'binomial' is supported — anything else raises.
    Prints coefficients/intercept/score for both penalties; returns nothing.
    NOTE(review): assumes col 0 is an ID, col 1 is the CAPSULE response, and the
    file has a one-line header (skiprows=1) — confirm against the dataset writer.
    """
    h2p.red_print("Now doing sklearn")
    h2p.red_print("\nsee http://scikit-learn.org/0.11/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression")

    import numpy as np
    import scipy as sp
    from sklearn.linear_model import LogisticRegression
    from numpy import loadtxt

    csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)

    # make sure it does fp divide
    C = 1/(L+0.0)
    print "C regularization:", C
    dataset = np.loadtxt( open(csvPathnameFull,'r'),
        skiprows=1, # skip the header
        delimiter=',',
        dtype='float');
    print "\ncsv read for training, done"

    n_features = len(dataset[0]) - 1;
    print "n_features:", n_features

    # don't want ID (col 0) or CAPSULE (col 1)
    # get CAPSULE
    target = [x[1] for x in dataset]
    # slice off the first 2
    train  = np.array ( [x[2:] for x in dataset] )

    n_samples, n_features = train.shape
    print "n_samples:", n_samples, "n_features:", n_features

    print "histogram of target"
    print sp.histogram(target,3)

    print "len(train):",  len(train)
    print "len(target):", len(target)
    print "dataset shape:", dataset.shape

    if family!='binomial':
        raise Exception("Only have binomial logistic for scipy")

    print "\nTrying l2"
    clf2 = LogisticRegression(
        C=C,
        dual=False,
        fit_intercept=True,
        intercept_scaling=1,
        penalty='l2',
        tol=0.0001);

    # train the classifier
    start = time.time()
    clf2.fit(train, target)
    print "L2 fit took", time.time() - start, "seconds"

    # print "coefficients:", clf2.coef_
    cstring = "".join([("%.5e  " % c) for c in clf2.coef_[0]])
    h2p.green_print("sklearn L2 C", C)
    h2p.green_print("sklearn coefficients:", cstring)
    h2p.green_print("sklearn intercept:", "%.5e" % clf2.intercept_[0])
    h2p.green_print("sklearn score:", clf2.score(train,target))

    print "\nTrying l1"
    clf1 = LogisticRegression(
        C=C,
        dual=False,
        fit_intercept=True,
        intercept_scaling=1,
        penalty='l1',
        tol=0.0001);

    # train the classifier
    start = time.time()
    clf1.fit(train, target)
    print "L1 fit took", time.time() - start, "seconds"

    # print "coefficients:", clf1.coef_
    cstring = "".join([("%.5e  " % c) for c in clf1.coef_[0]])
    h2p.green_print("sklearn L1 C", C)
    h2p.green_print("sklearn coefficients:", cstring)
    h2p.green_print("sklearn intercept:", "%.5e" % clf1.intercept_[0])
    h2p.green_print("sklearn score:", clf1.score(train,target))

    # attributes are accessed in the normal python way
    dx = clf1.__dict__
    dx.keys()
def test_summary2_NY0(self):
    """Summary on synthetic N/Y/0-style enum columns: NA counts and histogram totals.

    Builds random CSVs from three-value choice tuples (with and without leading
    whitespace — h2o strips whitespace, so the expected values are the stripped
    forms), parses them, then runs a per-column Summary and checks that the
    missing count matches what the dataset writer reported and that the
    histogram bins account for every non-missing row.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()

    choicesList = [
        ('N', 'Y', '0'),
        ('n', 'y', '0'),
        ('F', 'T', '0'),
        ('f', 't', '0'),
        (' N', ' Y', ' 0'),
        (' n', ' y', ' 0'),
        (' F', ' T', ' 0'),
        (' f', ' t', ' 0'),
    ]

    # white space is stripped
    expectedList = [
        ('N', 'Y', '0'),
        ('n', 'y', '0'),
        ('F', 'T', '0'),
        ('f', 't', '0'),
        ('N', 'Y', '0'),
        ('n', 'y', '0'),
        ('F', 'T', '0'),
        ('f', 't', '0'),
    ]

    tryList = [
        # colname, (min, 25th, 50th, 75th, max)
        (100, 200, 'x.hex', choicesList[4], expectedList[4]),
        (100, 200, 'x.hex', choicesList[5], expectedList[5]),
        (100, 200, 'x.hex', choicesList[6], expectedList[6]),
        (100, 200, 'x.hex', choicesList[7], expectedList[7]),
        (100, 200, 'x.hex', choicesList[3], expectedList[3]),
        (1000, 200, 'x.hex', choicesList[2], expectedList[2]),
        (10000, 200, 'x.hex', choicesList[1], expectedList[1]),
        (100000, 200, 'x.hex', choicesList[0], expectedList[0]),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, choices, expected) in tryList:
        # max error = half the bin size?
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
            colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvPathnameFull = h2i.find_folder_and_filename(None,
                                                       csvPathname,
                                                       returnFullPath=True)

        print "Creating random", csvPathname
        # writer returns per-column NA counts so we can assert on missing_count below
        expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount,
                                          SEEDPERFILE, choices)
        parseResult = h2i.import_parse(path=csvPathname,
                                       schema='put',
                                       hex_key=hex_key,
                                       timeoutSecs=10,
                                       doSummary=False)

        pA = h2o_cmd.ParseObj(parseResult,
                              expectedNumRows=rowCount,
                              expectedNumCols=colCount)
        print pA.numRows, pA.numCols, pA.parse_key

        iA = h2o_cmd.InspectObj(pA.parse_key,
                                expectedNumRows=rowCount,
                                expectedNumCols=colCount,
                                expectedMissinglist=[])
        print iA.missingList, iA.labelList, iA.numRows, iA.numCols

        for i in range(colCount):
            # walks across the columns triggering a summary on the col desired
            # runSummary returns a column object now. inspect and parse don't. They return json.
            # maybe eventually will make them return object? But I also pass expected stuff to them
            # should I pass expected to summary? no, more complex?
            co = h2o_cmd.runSummary(key=hex_key, column=i)
            print co.label, co.type, co.missing_count, co.domain, sum(
                co.histogram_bins)

            print "\nComparing column %s to expected" % i
            self.assertEqual(expectedNaCnt[i], co.missing_count,
                "Column %s Expected %s. missing: %s is incorrect" % \
                (i, expectedNaCnt[i], co.missing_count))
            # every non-missing row must land in some histogram bin
            self.assertEqual(rowCount - expectedNaCnt[i],
                             sum(co.histogram_bins))

        h2p.green_print("\nDone with trial", trial)
        trial += 1

    h2i.delete_keys_at_all_nodes()
def test_summary2_small(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) # if rowCount is None, we'll just use the data values # None in expected values means no compare (None, 1, 'x.hex', [-1,0,1], ('C1', None, None, -1, None, None)), (None, 2, 'x.hex', [-1,0,1], ('C1', None, None, -1, None, None)), (None, 10, 'x.hex', [-1,0,1], ('C1', None, None, -1, None, None)), (None, 100, 'x.hex', [-1,0,1], ('C1', None, None, -1, None, None)), (None, 1000, 'x.hex', [-1,0,1], ('C1', None, None, -1, None, None)), (None, 10000, 'x.hex', [-1,0,1], ('C1', None, None, -1, None, None)), # (COLS, 1, 'x.hex', [1,0,-1], ('C1', None, None, None, None, None)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, values, expected) in tryList: # max error = half the bin size? expectedMax = max(values) expectedMin = min(values) maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0 # add 5% for fp errors? maxDelta = 1.05 * maxDelta # hmm...say we should be 100% accurate for these tests? 
maxDelta = 0 h2o.beta_features = False SEEDPERFILE = random.randint(0, sys.maxint) x += 1 if not rowCount: rowFile = len(values) else: rowFile = rowCount csvFilename = 'syn_' + "binary" + "_" + str(rowFile) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, values, SEEDPERFILE) h2o.beta_features = False csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["num_rows"] numCols = inspect["num_cols"] h2o.beta_features = True summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS, timeoutSecs=45) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) quantile = 0.5 if DO_MEDIAN else .999 q = h2o.nodes[0].quantiles(source_key=hex_key, column=0, interpolation_type=7, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=1) qresult = q['result'] qresult_single = q['result_single'] qresult_iterations = q['iterations'] qresult_interpolated = q['interpolated'] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", qresult_iterations) h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated) print h2o.dump_json(q) self.assertLess(qresult_iterations, 16, msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?") # only one column column = summaryResult['summaries'][0] colname = column['colname'] coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] # FIX! we should compare mean and sd to expected? 
mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", twoDecimals(sd) zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99] pctile = stats['pctile'] if expected[0]: self.assertEqual(colname, expected[0]) if expected[1]: h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') if expected[2]: h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') if expected[3]: h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') if expected[4]: h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') if expected[5]: h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "" print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows/len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution) # don't check the edge bins self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, msg="Bins not right. b: %s e: %s" % (b, e)) pt = twoDecimals(pctile) mx = twoDecimals(maxs) mn = twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! 
we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2o.nodes[0].remove_all_keys() scipyCol = 0 if DO_TRY_SCIPY and colname!='': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() print scipyCol, pctile[10] generate_scipy_comparison(csvPathnameFull, col=scipyCol, # h2oMedian=pctile[5 if DO_MEDIAN else 10], result_single) h2oMedian=pctile[5 if DO_MEDIAN else 10], h2oMedian2=qresult) h2i.delete_keys_at_all_nodes()
def test_summary2_int2B(self):
    """Summary2 percentile sanity check on a single >2^31 integer column.

    Writes 100k random ints in [2533255332, 2633256000] (values exceed 32-bit
    int range), parses, runs Summary2, and would compare percentiles against
    'expected' — here all expected stats are None, so only the colname check
    and the printing paths are exercised.
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # colname, (min, 25th, 50th, 75th, max)
        (100000, 1, 'B.hex', 2533255332, 2633256000,   ('C1',  None, None, None, None, None)),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
        # max error = half the bin size?
        maxDelta = ((expectedMax - expectedMin)/(MAX_QBINS + 0.0))
        # add 5% for fp errors?
        maxDelta = 1.05 * maxDelta
        # also need to add some variance due to random distribution?
        # maybe a percentage of the mean
        distMean = (expectedMax - expectedMin) / 2
        maxShift = distMean * .01
        maxDelta = maxDelta + maxShift

        h2o.beta_features = False
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)

        h2o.beta_features = False
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=60, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename

        numRows = inspect["numRows"]
        numCols = inspect["numCols"]

        h2o.beta_features = True
        summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
        h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

        # only one column
        column = summaryResult['summaries'][0]

        colname = column['colname']
        if expected[0]:
            self.assertEqual(colname, expected[0])

        coltype = column['type']
        nacnt = column['nacnt']

        stats = column['stats']
        stattype= stats['type']

        # FIX! we should compare mean and sd to expected?
        mean = stats['mean']
        sd = stats['sd']

        print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
        print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

        zeros = stats['zeros']
        mins = stats['mins']
        maxs = stats['maxs']
        pct = stats['pct']
        # the thresholds h2o used, should match what we expected
        expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

        pctile = stats['pctile']
        # NOTE(review): all five percentile asserts are gated on expected[1] here;
        # the sibling tests gate each one individually — confirm intended nesting.
        if expected[1]:
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

        hstart = column['hstart']
        hstep = column['hstep']
        hbrk = column['hbrk']
        hcnt = column['hcnt']

        print "pct:", pct
        print "hcnt:", hcnt
        print "len(hcnt)", len(hcnt)

        # don't check the last bin
        for b in hcnt[1:-1]:
            # should we be able to check for a uniform distribution in the files?
            e = numRows/len(hcnt)
            # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
            # apparently we can't estimate any more
            # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount,
            #     msg="Bins not right. b: %s e: %s" % (b, e))

        pt = h2o_util.twoDecimals(pctile)
        mx = h2o_util.twoDecimals(maxs)
        mn = h2o_util.twoDecimals(mins)
        print "colname:", colname, "pctile (2 places):", pt
        print "colname:", colname, "maxs: (2 places):", mx
        print "colname:", colname, "mins: (2 places):", mn

        # FIX! we should do an exec and compare using the exec quantile too
        compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
        h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
        print "maxs colname:", colname, "(2 places):", mx
        print "mins colname:", colname, "(2 places):", mn

        trial += 1

        scipyCol = 0
def test_summary2_exp(self):
    """Summary2 percentile checks on exponentially-distributed synthetic columns.

    Draws a per-run lambda in [0.005, 0.5]; for each tryList entry the dataset
    writer returns the actual (min, max) so maxDelta is derived from the real
    data range rather than rangeMin/rangeMax (which are unused). Percentile
    asserts are per-entry gated on the 'expected' tuple, and the median is
    cross-checked with a scipy/sort comparison at the end.
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    LAMBD = random.uniform(0.005, 0.5)
    tryList = [
        # colname, (min, 25th, 50th, 75th, max)
        (10, 1, 'x.hex', 1, 20000, ('C1', None, None, None, None, None)),
        (100, 1, 'x.hex', 1, 20000, ('C1', None, None, None, None, None)),
        (1000, 1, 'x.hex', -5000, 0, ('C1', None, None, None, None, None)),
        (10000, 1, 'x.hex', -100000, 100000, ('C1', None, None, None, None, None)),
        (100000, 1, 'x.hex', -1, 1, ('C1', None, None, None, None, None)),
        (1000000, 1, 'A.hex', 1, 100, ('C1', None, None, None, None, None)),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    timeoutSecs = 60
    # rangeMin and rangeMax are not used right now
    for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
            colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname, "lambd:", LAMBD
        (expectedMin, expectedMax) = write_syn_dataset(csvPathname,
                                                       rowCount,
                                                       colCount,
                                                       lambd=LAMBD,
                                                       SEED=SEEDPERFILE)
        print "expectedMin:", expectedMin, "expectedMax:", expectedMax
        maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
        # add 5% for fp errors?
        maxDelta = 1.05 * maxDelta

        csvPathnameFull = h2i.find_folder_and_filename(None,
                                                       csvPathname,
                                                       returnFullPath=True)
        parseResult = h2i.import_parse(path=csvPathname,
                                       schema='put',
                                       hex_key=hex_key,
                                       timeoutSecs=30,
                                       doSummary=False)
        print "Parse result['destination_key']:", parseResult[
            'destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename
        numRows = inspect["numRows"]
        numCols = inspect["numCols"]

        summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
        h2o.verboseprint("Summary2 summaryResult:",
                         h2o.dump_json(summaryResult))

        # only one column
        column = summaryResult['summaries'][0]

        colname = column['colname']

        coltype = column['type']
        nacnt = column['nacnt']

        stats = column['stats']
        stattype = stats['type']

        # FIX! we should compare mean and sd to expected?
        mean = stats['mean']
        sd = stats['sd']

        print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
            mean)
        print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
            sd)

        zeros = stats['zeros']
        mins = stats['mins']
        maxs = stats['maxs']
        pct = stats['pct']
        expectedPct = [
            0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99
        ]

        pctile = stats['pctile']
        # the thresholds h2o used, should match what we expected
        if expected[0]:
            self.assertEqual(colname, expected[0])
        if expected[1]:
            h2o_util.assertApproxEqual(mins[0],
                                       expected[1],
                                       tol=maxDelta,
                                       msg='min is not approx. expected')
        if expected[2]:
            h2o_util.assertApproxEqual(
                pctile[3],
                expected[2],
                tol=maxDelta,
                msg='25th percentile is not approx. expected')
        if expected[3]:
            h2o_util.assertApproxEqual(
                pctile[5],
                expected[3],
                tol=maxDelta,
                msg='50th percentile (median) is not approx. expected')
        if expected[4]:
            h2o_util.assertApproxEqual(
                pctile[7],
                expected[4],
                tol=maxDelta,
                msg='75th percentile is not approx. expected')
        if expected[5]:
            h2o_util.assertApproxEqual(maxs[0],
                                       expected[5],
                                       tol=maxDelta,
                                       msg='max is not approx. expected')

        hstart = column['hstart']
        hstep = column['hstep']
        hbrk = column['hbrk']
        hcnt = column['hcnt']

        print "pct:", pct
        print ""
        print "hcnt:", hcnt
        print "len(hcnt)", len(hcnt)

        print "Can't estimate the bin distribution"

        pt = h2o_util.twoDecimals(pctile)
        mx = h2o_util.twoDecimals(maxs)
        mn = h2o_util.twoDecimals(mins)
        print "colname:", colname, "pctile (2 places):", pt
        print "colname:", colname, "maxs: (2 places):", mx
        print "colname:", colname, "mins: (2 places):", mn

        # FIX! we should do an exec and compare using the exec quantile too
        compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
        h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):",
                        compareActual)
        print "maxs colname:", colname, "(2 places):", mx
        print "mins colname:", colname, "(2 places):", mn

        trial += 1
        h2o.nodes[0].remove_all_keys()

        scipyCol = 0
        if colname != '' and expected[scipyCol]:
            # don't do for enums
            # also get the median with a sort (h2o_summ.percentileOnSortedlist()
            h2o_summ.quantile_comparisons(
                csvPathnameFull,
                skipHeader=True,
                col=scipyCol,
                datatype='float',
                quantile=0.5 if DO_MEDIAN else 0.999,
                h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                # h2oQuantilesApprox=qresult_single,
                # h2oQuantilesExact=qresult,
            )
def import_only(node=None, schema='local', bucket=None, path=None,
    timeoutSecs=30, retryDelaySecs=0.1, initialDelaySecs=0, pollTimeoutSecs=180, noise=None,
    benchmarkLogging=None, noPoll=False, doSummary=True, src_key=None, noPrint=False,
    importParentDir=True, **kwargs):
    """Import a file/folder into h2o without parsing it.

    schema selects the transport: 'put' (upload one file — currently hacked to
    'local', see below), 'local' (nfs folder import), 's3', 's3n', 'maprfs',
    or 'hdfs'. bucket+path are resolved via find_folder_and_filename; the
    basename of path may be a regex pattern, the folder part may not.
    Returns (importResult, importPattern); for schema='put' returns (None, key)
    since no import operation happens.
    Raises on regex in a disallowed path component, on a missing path, and on
    h2o_args.abort_after_import (the -aai debugging switch).
    NOTE(review): several timeout/poll/noise parameters are accepted but not
    read in this function — presumably kept for call-signature compatibility.
    """
    # FIX! hack all put to local, since h2o-dev doesn't have put yet?
    # multi-machine put will fail as a result.
    if schema=='put':
        h2p.yellow_print("WARNING: hacking schema='put' to 'local'..h2o-dev doesn't support upload." +
            "\nMeans multi-machine with 'put' will fail")
        schema = 'local'

    if src_key and schema!='put':
        raise Exception("can only specify a 'src_key' param for schema='put'. You have %s %s" % (schema, src_key))

    # no bucket is sometimes legal (fixed path)
    if not node: node = h2o_nodes.nodes[0]

    if path is None:
        raise Exception("import_only: path parameter needs to be specified")

    if "/" in path:
        (head, pattern) = os.path.split(path)
    else:
        (head, pattern) = ("", path)

    verboseprint("head:", head)
    verboseprint("pattern:", pattern)

    # to train users / okay here
    # normally we import the folder above, but if we import exactly, the path can't have regex
    # the folder can't have regex in any case
    if importParentDir:
        if re.search(r"[\*<>{}[\]~`]", head):
            raise Exception("h2o folder path %s can't be regex. path= was %s" % (head, path))
    else:
        if re.search(r"[\*<>{}[\]~`]", path):
            raise Exception("h2o path %s can't be regex. path= was %s" % (head, path))

    if schema=='put':
        # to train users
        if re.search(r"[/\*<>{}[\]~`]", pattern):
            raise Exception("h2o putfile basename %s can't be regex. path= was %s" % (pattern, path))

        if not path:
            raise Exception("path= didn't say what file to put")

        (folderPath, filename) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, filename)
        verboseprint("put filename:", filename, "folderPath:", folderPath, "filePath:", filePath)

        if not noPrint:
            h2p.green_print("\nimport_only:", h2o_args.python_test_name, "uses put:/%s" % filePath)
            h2p.green_print("Local path to file that will be uploaded: %s" % filePath)
            h2p.blue_print("That path resolves as:", os.path.realpath(filePath))

        if h2o_args.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        key = node.put_file(filePath, key=src_key, timeoutSecs=timeoutSecs)

        # hmm.. what should importResult be in the put case
        # set it to None. No import is done, and shouldn't be used if you're doing schema='put'
        importResult = None
        return (None, key)

    if schema=='local' and not \
            (node.redirect_import_folder_to_s3_path or node.redirect_import_folder_to_s3n_path):
        (folderPath, pattern) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, pattern)
        h2p.green_print("\nimport_only:", h2o_args.python_test_name, "uses local:/%s" % filePath)
        h2p.green_print("Path h2o will be told to use: %s" % filePath)
        h2p.blue_print("If local jvms, path resolves locally as:", os.path.realpath(filePath))
        if h2o_args.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        # FIX! why are we returning importPattern here..it's different than finalImportString if we import a folder?
        # is it used for key matching by others?
        # FIX! hack ..h2o-dev is creating key names with the absolute path, not the sym link path
        # messes up for import folders that go thru /home/<user>/home-0xdiag-datasets
        # importPattern = folderURI + "/" + pattern
        # could include this on the entire importPattern if we no longer have regex basename in h2o-dev?
        # folderURI = 'nfs:/' + folderPath
        folderURI = 'nfs:/' + os.path.realpath(folderPath)
        if importParentDir:
            finalImportString = folderPath
        else:
            finalImportString = folderPath + "/" + pattern
        importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

    else:
        if bucket is not None and re.match("/", head):
            verboseprint("You said bucket:", bucket, "so stripping incorrect leading '/' from", head)
            head = head.lstrip('/')

        # strip leading / in head if present
        if bucket and head!="":
            folderOffset = bucket + "/" + head
        elif bucket:
            folderOffset = bucket
        else:
            folderOffset = head

        if h2o_args.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        n = h2o_nodes.nodes[0]
        if schema=='s3' or node.redirect_import_folder_to_s3_path:
            # this is just like s3n now? i.e. we can point down inside the s3 bucket like s3n?
            folderOffset = re.sub("smalldata", "h2o-smalldata", folderOffset)
            folderURI = "s3://" + folderOffset
            if not n.aws_credentials:
                print "aws_credentials: %s" % n.aws_credentials
                # raise Exception("Something was missing for s3 on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3 on the java -jar cmd line when the cloud was built"

            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

        elif schema=='s3n' or node.redirect_import_folder_to_s3n_path:
            # FIX! hack for now...when we change import folder to import s3, point to unique bucket name for h2o
            # should probably deal with this up in the bucket resolution
            # this may change other cases, but smalldata should only exist as a "bucket" for us?
            folderOffset = re.sub("smalldata", "h2o-smalldata", folderOffset)
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s" % (n.use_hdfs, n.hdfs_version, n.hdfs_name_node)
                if n.hdfs_config:
                    print "hdfs_config: %s" % n.hdfs_config
                # raise Exception("Something was missing for s3n on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3n on the java -jar cmd line when the cloud was built"

            folderURI = "s3n://" + folderOffset
            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

        elif schema=='maprfs':
            if not n.use_maprfs:
                print "use_maprfs: %s" % n.use_maprfs
                # raise Exception("Something was missing for maprfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for maprfs on the java -jar cmd line when the cloud was built"

            # if I use the /// and default, the key names that get created by h2o only have 1 slash
            # so the parse doesn't find the key name
            if n.hdfs_name_node:
                folderURI = "maprfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                # folderURI = "maprfs:///" + folderOffset
                folderURI = "maprfs:/" + folderOffset
            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

        elif schema=='hdfs':
            # check that some state from the cloud building time was right
            # the requirements for this may change and require updating
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s" % (n.use_hdfs, n.hdfs_version, n.hdfs_name_node)
                if n.hdfs_config:
                    print "hdfs_config: %s" % n.hdfs_config
                # raise Exception("Something was missing for hdfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for hdfs on the java -jar cmd line when the cloud was built"

            if n.hdfs_name_node:
                folderURI = "hdfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                folderURI = "hdfs://" + folderOffset
            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

        else:
            raise Exception("schema not understood: %s" % schema)

    print "\nimport_only:", h2o_args.python_test_name, schema, "uses", finalImportString
    importPattern = folderURI + "/" + pattern
    return (importResult, importPattern)
def import_only(
    node=None,
    schema="local",
    bucket=None,
    path=None,
    timeoutSecs=30,
    retryDelaySecs=0.1,
    initialDelaySecs=0,
    pollTimeoutSecs=180,
    noise=None,
    benchmarkLogging=None,
    noPoll=False,
    doSummary=True,
    src_key=None,
    noPrint=False,
    importParentDir=True,
    **kwargs
):
    """Import (but do not parse) a dataset into the h2o cloud.

    Resolves `bucket`/`path` to a URI for the requested `schema` ('put',
    'local', 's3', 's3n', 'maprfs', 'hdfs') and triggers the upload/import
    on `node` (defaults to h2o.nodes[0]).

    Returns a (importResult, importPattern) tuple; for schema='put' the
    import result is None and the second element is the uploaded key.
    Raises Exception on bad arguments, regex-looking folder paths, an
    unknown schema, or when h2o.abort_after_import is set.

    NOTE(review): retryDelaySecs/initialDelaySecs/pollTimeoutSecs/noise/
    benchmarkLogging/noPoll/doSummary and **kwargs are accepted but unused
    in this body — presumably kept for signature compatibility with callers
    that also call parse; confirm before removing.
    """
    # src_key only makes sense for an upload ('put'); reject it elsewhere
    if src_key and schema != "put":
        raise Exception("can only specify a 'src_key' param for schema='put'. You have %s %s" % (schema, src_key))

    # no bucket is sometimes legal (fixed path)
    if not node:
        node = h2o.nodes[0]

    if path is None:
        raise Exception("import_only: path parameter needs to be specified")

    # split into folder part (head) and basename/pattern part
    if "/" in path:
        (head, pattern) = os.path.split(path)
    else:
        (head, pattern) = ("", path)

    h2o.verboseprint("head:", head)
    h2o.verboseprint("pattern:", pattern)

    # to train users / okay here
    # normally we import the folder above, but if we import exactly, the path can't have regex
    # the folder can't have regex in any case
    if importParentDir:
        if re.search(r"[\*<>{}[\]~`]", head):
            raise Exception("h2o folder path %s can't be regex. path= was %s" % (head, path))
    else:
        if re.search(r"[\*<>{}[\]~`]", path):
            raise Exception("h2o path %s can't be regex. path= was %s" % (head, path))

    if schema == "put":
        # to train users: basenames for put may not contain regex (or '/')
        if re.search(r"[/\*<>{}[\]~`]", pattern):
            raise Exception("h2o putfile basename %s can't be regex. path= was %s" % (pattern, path))

        if not path:
            raise Exception("path= didn't say what file to put")

        (folderPath, filename) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, filename)
        h2o.verboseprint("put filename:", filename, "folderPath:", folderPath, "filePath:", filePath)

        if not noPrint:
            h2p.green_print("\nimport_only:", h2o.python_test_name, "uses put:/%s" % filePath)
            h2p.green_print("Local path to file that will be uploaded: %s" % filePath)
            h2p.blue_print("That path resolves as:", os.path.realpath(filePath))

        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        key = node.put_file(filePath, key=src_key, timeoutSecs=timeoutSecs)

        # hmm.. what should importResult be in the put case
        # set it to None. No import is done, and shouldn't be used if you're doing schema='put'
        importResult = None
        return (None, key)

    if schema == "local" and not (node.redirect_import_folder_to_s3_path or node.redirect_import_folder_to_s3n_path):
        (folderPath, pattern) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, pattern)
        h2p.green_print("\nimport_only:", h2o.python_test_name, "uses local:/%s" % filePath)
        h2p.green_print("Path h2o will be told to use: %s" % filePath)
        h2p.blue_print("If local jvms, path resolves locally as:", os.path.realpath(filePath))
        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        # folderURI is needed below to build importPattern for the return value
        folderURI = "nfs:/" + folderPath
        if importParentDir:
            importResult = node.import_files(folderPath, timeoutSecs=timeoutSecs)
        else:
            importResult = node.import_files(folderPath + "/" + pattern, timeoutSecs=timeoutSecs)
    else:
        # remote schemas: build folderOffset = bucket[/head] then a scheme URI
        if bucket is not None and re.match("/", head):
            h2o.verboseprint("You said bucket:", bucket, "so stripping incorrect leading '/' from", head)
            head = head.lstrip("/")

        # strip leading / in head if present
        if bucket and head != "":
            folderOffset = bucket + "/" + head
        elif bucket:
            folderOffset = bucket
        else:
            folderOffset = head

        print "\nimport_only:", h2o.python_test_name, schema, "uses", schema + "://" + folderOffset + "/" + pattern
        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        # cloud-build state (credentials, hdfs config) is checked on node 0
        n = h2o.nodes[0]
        if schema == "s3" or node.redirect_import_folder_to_s3_path:
            # FIX! hack for now...when we change import folder to import s3, point to unique bucket name for h2o
            # should probably deal with this up in the bucket resolution
            # this may change other cases, but smalldata should only exist as a "bucket" for us?
            folderOffset = re.sub("smalldata", "h2o-smalldata", folderOffset)
            folderURI = "s3://" + folderOffset
            if not n.aws_credentials:
                print "aws_credentials: %s" % n.aws_credentials
                # raise Exception("Something was missing for s3 on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3 on the java -jar cmd line when the cloud was built"

            if importParentDir:
                importResult = node.import_files(folderURI, timeoutSecs=timeoutSecs)
            else:
                importResult = node.import_files(folderURI + "/" + pattern, timeoutSecs=timeoutSecs)

        elif schema == "s3n" or node.redirect_import_folder_to_s3n_path:
            # FIX! hack for now...when we change import folder to import s3, point to unique bucket name for h2o
            # should probably deal with this up in the bucket resolution
            # this may change other cases, but smalldata should only exist as a "bucket" for us?
            folderOffset = re.sub("smalldata", "h2o-smalldata", folderOffset)
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s" % (
                    n.use_hdfs, n.hdfs_version, n.hdfs_name_node,
                )
                if n.hdfs_config:
                    print "hdfs_config: %s" % n.hdfs_config
                # raise Exception("Something was missing for s3n on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3n on the java -jar cmd line when the cloud was built"

            folderURI = "s3n://" + folderOffset
            if importParentDir:
                importResult = node.import_files(folderURI, timeoutSecs=timeoutSecs)
            else:
                importResult = node.import_files(folderURI + "/" + pattern, timeoutSecs=timeoutSecs)

        elif schema == "maprfs":
            if not n.use_maprfs:
                print "use_maprfs: %s" % n.use_maprfs
                # raise Exception("Something was missing for maprfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for maprfs on the java -jar cmd line when the cloud was built"

            # if I use the /// and default, the key names that get created by h2o only have 1 slash
            # so the parse doesn't find the key name
            if n.hdfs_name_node:
                folderURI = "maprfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                # folderURI = "maprfs:///" + folderOffset
                folderURI = "maprfs:/" + folderOffset

            if importParentDir:
                importResult = node.import_files(folderURI, timeoutSecs=timeoutSecs)
            else:
                importResult = node.import_files(folderURI + "/" + pattern, timeoutSecs=timeoutSecs)

        elif schema == "hdfs":
            # check that some state from the cloud building time was right
            # the requirements for this may change and require updating
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s" % (
                    n.use_hdfs, n.hdfs_version, n.hdfs_name_node,
                )
                if n.hdfs_config:
                    print "hdfs_config: %s" % n.hdfs_config
                # raise Exception("Something was missing for hdfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for hdfs on the java -jar cmd line when the cloud was built"

            if n.hdfs_name_node:
                folderURI = "hdfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                folderURI = "hdfs://" + folderOffset

            if importParentDir:
                importResult = node.import_files(folderURI, timeoutSecs=timeoutSecs)
            else:
                importResult = node.import_files(folderURI + "/" + pattern, timeoutSecs=timeoutSecs)

        else:
            raise Exception("schema not understood: %s" % schema)

    # NOTE(review): importPattern always appends "/" + pattern even when
    # importParentDir imported the whole folder — callers apparently use it
    # for key matching; confirm before changing.
    importPattern = folderURI + "/" + pattern
    return (importResult, importPattern)
def do_h2o_glm(self, bucket, csvPathname, L, family="binomial"):
    """Run a GLM on the prostate dataset in h2o and print the fit for comparison.

    Parses `csvPathname` (from the 'smalldata' bucket — the `bucket` param is
    accepted but not used by this body; presumably kept for signature parity
    with a sklearn/scikit counterpart — confirm), fits a GLM with alpha=0 and
    lambda=L on response 'CAPSULE' ignoring 'ID', then prints coefficients,
    intercept, and the first submodel's validation stats.

    :param L: the lambda regularization value passed to h2o GLM.
    :param family: GLM family string, default 'binomial'.
    Returns None; output is printed for eyeballing/comparison.
    """
    h2p.red_print("\nNow doing h2o")
    h2o.beta_features = True
    parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, schema="local", timeoutSecs=180)

    # save the resolved pathname for use in the sklearn csv read below
    inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
    print inspect
    print "\n" + csvPathname, " numRows:", "{:,}".format(inspect["numRows"]), " numCols:", "{:,}".format(
        inspect["numCols"]
    )

    x = "ID"          # column ignored by the model
    y = "CAPSULE"     # response column
    family = family
    alpha = "0"       # alpha=0 => ridge-only penalty mix
    lambda_ = L
    nfolds = "0"
    f = "prostate"
    modelKey = "GLM_" + f

    kwargs = {
        "response": y,
        "ignored_cols": x,
        "family": family,
        "lambda": lambda_,
        "alpha": alpha,
        "n_folds": nfolds,  # passes if 0, fails otherwise
        "destination_key": modelKey,
    }

    timeoutSecs = 60
    start = time.time()
    glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)

    # this stuff was left over from when we got the result after polling the jobs list
    # okay to do it again
    # GLM2: when it redirects to the model view, we no longer have the job_key! (unlike the first response and polling)
    (warnings, clist, intercept) = h2o_glm.simpleCheckGLM(self, glmResult, None, **kwargs)
    cstring = "".join([("%.5e " % c) for c in clist])
    h2p.green_print("h2o alpha ", alpha)
    h2p.green_print("h2o lambda ", lambda_)
    h2p.green_print("h2o coefficient list:", cstring)
    h2p.green_print("h2o intercept", "%.5e " % intercept)

    # other stuff in the json response
    glm_model = glmResult["glm_model"]
    _names = glm_model["_names"]
    coefficients_names = glm_model["coefficients_names"]

    # the first submodel is the right one, if onely one lambda is provided as a parameter above
    submodels = glm_model["submodels"][0]
    beta = submodels["beta"]
    h2p.red_print("beta:", beta)
    norm_beta = submodels["norm_beta"]
    iteration = submodels["iteration"]

    validation = submodels["validation"]
    avg_err = validation["avg_err"]
    auc = validation["auc"]
    aic = validation["aic"]
    null_deviance = validation["null_deviance"]
    residual_deviance = validation["residual_deviance"]

    print "_names", _names
    print "coefficients_names", coefficients_names
    # did beta get shortened? the simple check confirms names/beta/norm_beta are same length
    print "beta", beta
    print "iteration", iteration
    print "avg_err", avg_err
    print "auc", auc
def test_summary2_uniform(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (ROWS, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00)), (ROWS, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)), (ROWS, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)), (ROWS, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50, 1.00)), (ROWS, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00, 100.0)), (ROWS, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)), (ROWS, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00)), (ROWS, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7, 100,00)), (ROWS, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00)), (ROWS, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28, 100.00)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta h2o.beta_features = False SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) h2o.beta_features = False csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["num_rows"] numCols = inspect["num_cols"] h2o.beta_features = True summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) quantile = 0.5 if DO_MEDIAN else .999 # get both answers since we feed both below for checking q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'], quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # linear qresult = q['result'] qresult_single = q['result_single'] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", q['iterations']) h2p.blue_print("h2o quantiles interpolated:", q['interpolated']) print h2o.dump_json(q) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. 
(2 places):", h2o_util.twoDecimals(sd) zeros = stats['zeros'] mins = stats['mins'] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') maxs = stats['maxs'] h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99] pctile = stats['pctile'] h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin # too hard to estimate when there are ints now, due to floor/ceil int alignment? # don't check the last two bins for b in hcnt[1:(-2 if len(hcnt)>2 else -1)]: # should we be able to check for a uniform distribution in the files? e = numRows/len(hcnt) self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! 
we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 # don't check if colname is empty..means it's a string and scipy doesn't parse right? if colname!='': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, ) h2o.nodes[0].remove_all_keys()
def test_summary2_uniform(self):
    """Uniform-distribution Summary2 test with data-driven expected min/max.

    Unlike the sibling variant, write_syn_dataset here returns the actual
    (max, min) generated, and the expected list is patched in place with
    those values so min/max can be checked tightly (rel=.00001) while the
    25/50/75 percentiles use a looser, bin-size-derived tolerance.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # colname, (min, 25th, 50th, 75th, max)
        # NOTE: expected is a *list* (not tuple) because entries 1 and 5
        # are overwritten below with the actual generated min/max.
        (ROWS, 1, 'x.hex', 0.0, 20000.0, ['C1', 0, 5000.0, 10000.0, 15000.0, 20000.0]),
        (ROWS, 1, 'x.hex', -5000.0, 0.0, ['C1', -5000.0, -3750.0, -2500.0, -1250.0, 0.0]),
        (ROWS, 1, 'x.hex', -100000.0, 100000.0, ['C1', -100000.0, -50000.0, 0.0, 50000.0, 100000.0]),
        (ROWS, 1, 'x.hex', -1.0, 1.0, ['C1', -1.0, -0.50, 0.0, 0.50, 1.0]),
        (ROWS, 1, 'A.hex', 1.0, 100.0, ['C1', 1.0, 26.0, 51.0, 76.0, 100.0]),
        (ROWS, 1, 'A.hex', -99.0, 99.0, ['C1', -99.0, -50.0, 0.0, 50.0, 99.0]),
        (ROWS, 1, 'B.hex', 1.0, 10000.0, ['C1', 1.0, 2501.0, 5001.0, 7501.0, 10000.0]),
        (ROWS, 1, 'B.hex', -100.0, 100.0, ['C1', -100.0, -50.0, 0.0, 50.0, 100.0]),
        (ROWS, 1, 'C.hex', 1.0, 100000.0, ['C1', 1.0, 25001.0, 50001.0, 75001.0, 100000.0]),
        (ROWS, 1, 'C.hex', -100.0, 100.0, ['C1', -100.0, -50.0, 0.0, 50.0, 100.0]),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
            colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        (actualMax, actualMin) = write_syn_dataset(csvPathname, rowCount,
            colCount, expectedMin, expectedMax, SEEDPERFILE)
        # adjust the min/max depending on what the min/max actually was!
        # the expected 25%/50%/75% will still be off
        expected[1] = actualMin
        expected[5] = actualMax

        # max error = half the bin size?
        # use this for comparing to sklearn/sort
        expectedRange = expectedMax - expectedMin
        # because of floor and ceil effects due we potentially lose 2 bins (worst case)
        # the extra bin for the max value, is an extra bin..ignore
        expectedBin = expectedRange / (MAX_QBINS - 2)
        maxDelta = 1 * expectedBin

        # how much error do we get in the random distribution gen? pain. It's a probability issue
        # smaller error likely with larger # of values.
        # the maxDelta used for the scipy/sort compare can be tighter, since it's looking
        # at actual data
        # this is way too coarse. can't get the distribution tight?
        maxDeltaPlusDistVariance = 10 * maxDelta
        # allow some fuzz in the comparison to scipy/sort
        maxDelta = 1.1 * maxDelta

        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)
        print "Parse result['destination_key']:", parseResult[
            'destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename
        numRows = inspect["numRows"]
        numCols = inspect["numCols"]

        summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
        h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

        # only one column
        column = summaryResult['summaries'][0]

        colname = column['colname']
        self.assertEqual(colname, expected[0])

        quantile = 0.5 if DO_MEDIAN else .999
        # get both answers since we feed both below for checking
        q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'],
            quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2,
            interpolation_type=7)  # linear
        qresult = q['result']
        qresult_single = q['result_single']
        h2p.blue_print("h2o quantiles result:", qresult)
        h2p.blue_print("h2o quantiles result_single:", qresult_single)
        h2p.blue_print("h2o quantiles iterations:", q['iterations'])
        h2p.blue_print("h2o quantiles interpolated:", q['interpolated'])
        print h2o.dump_json(q)

        coltype = column['type']
        nacnt = column['nacnt']

        stats = column['stats']
        stattype = stats['type']

        # FIX! we should compare mean and sd to expected?
        mean = stats['mean']
        sd = stats['sd']

        print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
            mean)
        print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
            sd)

        zeros = stats['zeros']
        mins = stats['mins']
        # these should match exactly except for fp compare error?
        h2o_util.assertApproxEqual(mins[0], expected[1], rel=.00001, msg='min is not expected')
        maxs = stats['maxs']
        h2o_util.assertApproxEqual(maxs[0], expected[5], rel=.00001, msg='max is not expected')

        pct = stats['pct']
        # the thresholds h2o used, should match what we expected
        expectedPct = [
            0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99
        ]

        pctile = stats['pctile']
        h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDeltaPlusDistVariance,
            msg='25th percentile is not approx. expected for generated uniform range %s %s' %\
            (expectedMin, expectedMax))
        h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDeltaPlusDistVariance,
            msg='50th percentile is not approx. expected for generated uniform range %s %s' %\
            (expectedMin, expectedMax))
        h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDeltaPlusDistVariance,
            msg='75th percentile is not approx. expected for generated uniform range %s %s' %\
            (expectedMin, expectedMax))

        hstart = column['hstart']
        hstep = column['hstep']
        hbrk = column['hbrk']
        hcnt = column['hcnt']

        print "pct:", pct
        print "hcnt:", hcnt
        print "len(hcnt)", len(hcnt)

        # don't check the last bin
        # too hard to estimate when there are ints now, due to floor/ceil int alignment?
        # don't check the last two bins
        for b in hcnt[1:(-2 if len(hcnt) > 2 else -1)]:
            # should we be able to check for a uniform distribution in the files?
            e = numRows / len(hcnt)
            self.assertAlmostEqual(b, rowCount / len(hcnt),
                delta=.01 * rowCount,
                msg="Bins not right. b: %s e: %s" % (b, e))

        pt = h2o_util.twoDecimals(pctile)
        mx = h2o_util.twoDecimals(maxs)
        mn = h2o_util.twoDecimals(mins)
        print "colname:", colname, "pctile (2 places):", pt
        print "colname:", colname, "maxs: (2 places):", mx
        print "colname:", colname, "mins: (2 places):", mn

        # FIX! we should do an exec and compare using the exec quantile too
        compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
        h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
        print "maxs colname:", colname, "(2 places):", mx
        print "mins colname:", colname, "(2 places):", mn

        trial += 1

        # don't check if colname is empty..means it's a string and scipy doesn't parse right?
        if colname != '':
            # don't do for enums
            # also get the median with a sort (h2o_summ.percentileOnSortedlist()
            h2o_summ.quantile_comparisons(
                csvPathnameFull,
                col=0,  # what col to extract from the csv
                datatype='float',
                quantile=0.5 if DO_MEDIAN else 0.999,
                h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                h2oQuantilesApprox=qresult_single,
                h2oQuantilesExact=qresult,
                h2oSummary2MaxErr=maxDelta,
                )

        h2o.nodes[0].remove_all_keys()
def findQuantile(d, dmin, dmax, threshold): # return the value at the threshold, or the mean of the two rows that bound it. # fixed bin count per pass. Stops at maxIterations if not resolved to one true answer maxIterations = 30 # totalRows should be cleansed of NAs. assume d doesn't have NAs (cleaned elsewhere) totalRows = len(d) # Used to have desiredBinCnt = BIN_COUNT maxBinCnt = desiredBinCnt + 1 # might go one over due to FP issues # initialize newValStart = dmin newValEnd = dmax newValRange = newValEnd - newValStart desiredBinCnt = BIN_COUNT # Could do per-pass adjustment, but fixed works fine. newValBinSize = newValRange / (desiredBinCnt + 0.0) newLowCount = 0 # count of rows below the bins # yes there is no newHighCount. Created during the pass, though. # state shared by each pass assert maxBinCnt > 0 hcnt2 = [None for b in range(maxBinCnt)] hcnt2_min = [None for b in range(maxBinCnt)] hcnt2_max = [None for b in range(maxBinCnt)] hcnt2_low = 0 hcnt2_high = 0 assert newValBinSize != 0 # can be negative assert newValEnd > newValStart assert newValRange > 0 # break out on stopping condition # reuse the histogram array hcnt2[] iteration = 0 done = False # append to a list of best guesses per pass best_result = [] def htot2(): return sum(hcnt2) + hcnt2_low + hcnt2_high while iteration <= maxIterations and not done: h2p.green_print("newValStart", newValStart) h2p.green_print("newValEnd", newValEnd) h2p.green_print("newValRange", newValRange) h2p.green_print("newValBinSize", newValBinSize) h2p.green_print("newLowCount", newLowCount) h2p.green_print("threshold", threshold) valStart = newValStart valEnd = newValEnd valRange = newValRange valBinSize = newValBinSize lowCount = newLowCount desiredBinCnt = BIN_COUNT maxBinCnt = desiredBinCnt + 1 # might go one over due to FP issues # playing with creating relative NUDGE values to make sure bin range # is always inclusive of target. # ratio it down from valBinSize. # It doesn't need to be as big as valBinSize. 
# implicitly, it shouldn't need to be as large as valBinSize # can't seem to make it work yet. leave NUDGE=0 NUDGE = 0 # init to zero for each pass for b in range(maxBinCnt): hcnt2[b] = 0.0 # Init counts outside of the bins hcnt2_low = 0 hcnt2_high = 0 # minimum value for higher than the bin. Needed for interpolation hcnt2_high_min = None for val in d: # Need to count the stuff outside the bin-gathering, # since threshold compare is based on total row compare # on first pass, shouldn't see anything exceed the start/end bounds # since those are min/max for the column? (shouldn't be any fp precision issue? or ??) # oh wait, this valOffset math creates possible precision issue? # maybe we should address it with the NUDGE value below? but what about first pass? valOffset = val - valStart # where are we zeroing in? (start) binIdx2 = int(math.floor(valOffset / (valBinSize + 0.0))) # make sure it's always an fp divide? # do some close looking for possible fp arith issues cA = valOffset < 0 cB = binIdx2 < 0 t = {True: 1, False: 0} # we get the 10 case if ((cA and not cB) or (not cA and cB)): h2p.red_print("AB Interesting lower bin edge case %s%s" % (t[cA], t[cB]), "cA", cA, "cB", cB, "valOffSet", valOffSet, \ "binIdx2", binIdx2) cC = val > valEnd cD = binIdx2 >= (maxBinCnt-1) # tighten the compare for printing if ((cC and not cD) or (not cC and cD)): h2p.red_print("CD Interesting upper bin edge case %s%s" % (t[cC], t[cD]), "cC", cC, "cB", cD, "val", val, "valEnd", valEnd, \ "binIdx2", binIdx2, "maxBinCnt", maxBinCnt) # example hits this case..i.e. the max value # CD Interesting upper bin edge case 01 cC False cB True val 100.995097486 valEnd 100.995097486 binIdx2 2 maxBinCnt 3 if valOffset < 0 or binIdx2<0: # if valOffset < 0: # if binIdx2<0: hcnt2_low += 1 # prevent the extra bin from being used..i.e. eliminate the fuzziness for sure! 
# have to use both compares, since can wrap the index (due to start/end shift) # elif val > valEnd or binIdx2>=(maxBinCnt-1): # should this really be a valOffset compare? elif val > valEnd or binIdx2 >= maxBinCnt: # elif val > valEnd: # elif binIdx2>=(maxBinCnt-1): if (hcnt2_high==0) or (val < hcnt2_high_min): hcnt2_high_min = val; print "hcnt2_high_min update:", hcnt2_high_min, valOffset, val, valStart, hcnt2_high, val, valEnd hcnt2_high += 1 else: # print "(multi) val: ",val," valOffset: ",valOffset," valBinSize: ",valBinSize assert binIdx2 >=0 and binIdx2<=(maxBinCnt-1), "val %s %s %s %s binIdx2: %s maxBinCnt: %s valBinSize: %s" % \ (val, valStart, valEnd, valOffset, binIdx2, maxBinCnt, valBinSize) if hcnt2[binIdx2]==0 or (val < hcnt2_min[binIdx2]): hcnt2_min[binIdx2] = val; if hcnt2[binIdx2]==0 or (val > hcnt2_max[binIdx2]): hcnt2_max[binIdx2] = val; hcnt2[binIdx2] += 1 # check if we went into the magic extra bin if binIdx2 == (maxBinCnt-1): print "\nFP! val went into the extra maxBinCnt bin:", \ binIdx2, hcnt2_high_min, valOffset, val, valStart, hcnt2_high, val, valEnd,"\n" # check the legal states for these two # we don't have None for checking hcnt2_high_min in java assert hcnt2_high==0 or (hcnt2_high_min is not None) assert (hcnt2_high_min is None) or hcnt2_high!=0 # everything should either be in low, the bins, or high totalBinnedRows = htot2() print "totalRows check: %s htot2(): %s should be equal. hcnt2_low: %s hcnt2_high: %s" % \ (totalRows, totalBinnedRows, hcnt2_low, hcnt2_high) assert totalRows==totalBinnedRows, "totalRows: %s htot2() %s not equal. 
hcnt2_low: %s hcnt2_high: %s" % \ (totalRows, totalBinnedRows, hcnt2_low, hcnt2_high) # now walk thru and find out what bin to look inside currentCnt = hcnt2_low targetCntFull = threshold * (totalRows-1) # zero based indexing targetCntInt = int(math.floor(threshold * (totalRows-1))) targetCntFract = targetCntFull - targetCntInt assert targetCntFract>=0 and targetCntFract<=1 print "targetCntInt:", targetCntInt, "targetCntFract", targetCntFract k = 0 while ((currentCnt + hcnt2[k]) <= targetCntInt): # print "looping for k (multi): ",k," ",currentCnt," ",targetCntInt," ",totalRows," ",hcnt2[k]," ",hcnt2_min[k]," ",hcnt2_max[k] currentCnt += hcnt2[k] # ugly but have to break out if we'd cycle along with == adding h0's until we go too far # are we supposed to advance to a none zero bin? k += 1 # goes over in the equal case? # if currentCnt >= targetCntInt: # break if k==maxBinCnt: break assert k<maxBinCnt, "k too large, k: %s maxBinCnt %s %s %s %s" % (k, maxBinCnt, currentCnt, targetCntInt, hcnt2[k-1]) # format string to match java Log.info() in Quantiles.java print "Found k (multi): ",k," ",currentCnt," ",targetCntInt," ",totalRows," ",hcnt2[k]," ",hcnt2_min[k]," ",hcnt2_max[k] assert hcnt2[k]!=1 or hcnt2_min[k]==hcnt2_max[k] # some possibily interpolating guesses first, in guess we have to iterate (best guess) done = False guess = (hcnt2_max[k] - hcnt2_min[k]) / 2 if currentCnt==targetCntInt: if hcnt2[k]>2 and (hcnt2_min[k]==hcnt2_max[k]): guess = hcnt2_min[k] print "Guess A", guess, k, hcnt2[k] if hcnt2[k]==2: print "\nTwo values in this bin but we could be aligned to the 2nd. 
so can't stop" # no mattter what size the fraction it would be on this number guess = (hcnt2_max[k] + hcnt2_min[k]) / 2.0 # no mattter what size the fraction it would be on this number if INTERPOLATION_TYPE==2: # type 2 (mean) guess = (hcnt2_max[k] + hcnt2_min[k]) / 2.0 else: # default to type 7 (linear interpolation) # Unlike mean, which just depends on two adjacent values, this adjustment # adds possible errors related to the arithmetic on the total # of rows. dDiff = hcnt2_max[k] - hcnt2_min[k] # two adjacent..as if sorted! pctDiff = targetCntFract # This is the fraction of total rows guess = hcnt2_min[k] + (pctDiff * dDiff) done = False print "Guess B", guess if hcnt2[k]==1 and targetCntFract==0: assert hcnt2_min[k]==hcnt2_max[k] guess = hcnt2_min[k] done = True print "k", k print "Guess C", guess if hcnt2[k]==1 and targetCntFract!=0: assert hcnt2_min[k]==hcnt2_max[k] print "\nSingle value in this bin, but fractional means we need to interpolate to next non-zero" if k<maxBinCnt: nextK = k + 1 # could put it over maxBinCnt else: nextK = k while nextK<maxBinCnt and hcnt2[nextK]==0: nextK += 1 # have the "extra bin" for this if nextK >= maxBinCnt: assert hcnt2_high!=0 print "Using hcnt2_high_min for interpolate:", hcnt2_high_min nextVal = hcnt2_high_min else: print "Using nextK for interpolate:", nextK assert hcnt2[nextK]!=0 nextVal = hcnt2_min[nextK] guess = (hcnt2_max[k] + nextVal) / 2.0 # OH! fixed bin as opposed to sort. Of course there are gaps between k and nextK if INTERPOLATION_TYPE==2: # type 2 (mean) guess = (hcnt2_max[k] + nextVal) / 2.0 pctDiff = 0.5 else: # default to type 7 (linear interpolation) dDiff = nextVal - hcnt2_max[k] # two adjacent, as if sorted! pctDiff = targetCntFract # This is the fraction of total rows guess = hcnt2_max[k] + (pctDiff * dDiff) done = True # has to be one above us when needed. 
(or we're at end) print 'k', 'hcnt2_max[k]', 'nextVal' print "hello3:", k, hcnt2_max[k], nextVal print "\nInterpolating result using nextK: %s nextVal: %s" % (nextK, nextVal) print "Guess D", guess if not done: print "Not done, setting new range",\ "k: ", k,\ "currentCnt: ", currentCnt,\ "hcnt2_min[k]: ", hcnt2_min[k],\ "hcnt2_max[k]: ", hcnt2_max[k] # possible bin leakage at start/end edges due to fp arith. # the bin index arith may resolve OVER the boundary created by the compare for hcnt2_high compare # rather than using NUDGE, see if there's a non-zero bin below (min) or above (max) you. # Just need to check the one bin below and above k, if they exist. if k > 0 and hcnt2[k-1]>0 and (hcnt2_max[k-1]<hcnt2_min[k]): newValStart = hcnt2_max[k-1] else: newValStart = hcnt2_min[k] # subtle. we do put stuff in the extra end bin (see the print above that happens) # k might be pointing to one less than that (like k=0 for 1 bin case) if k < maxBinCnt and hcnt2[k+1]>0 and (hcnt2_min[k+1]>hcnt2_max[k]): print "hello" newValEnd = hcnt2_min[k+1] else: newValEnd = hcnt2_max[k] newValRange = newValEnd - newValStart # maxBinCnt is always binCount + 1, since we might cover over due to rounding/fp issues? newValBinSize = newValRange / (desiredBinCnt + 0.0) # the start/end should never change if we're just using one bin # this is a bin leakage test, if you use one bin. (we should never resolve exactly stop at max iterations # assumes NUDGE is 0 if NUDGE == 0.0: assert desiredBinCnt>1 or (valStart==newValStart and valEnd==newValEnd),\ "if 1 bin, should be no per-pass edge leakage %s %s %s %s %s %s" % (k, hcnt2_high, valStart, newValStart, valEnd, newValEnd) newLowCount = currentCnt if newValBinSize==0: # assert done or newValBinSize!=0 and live with current guess print "Assuming done because newValBinSize is 0." 
print "newValRange: %s, hcnt2[k]: %s hcnt2_min[k]: %s hcnt2_max[k]: %s" %\ (newValRange, hcnt2[k], hcnt2_min[k], hcnt2_max[k]) guess = newValStart print "Guess E", guess done = True # if we have to interpolate # if it falls into this bin, interpolate to this bin means one answer? # cover the case above with multiple entris in a bin, all the same value # will be zero on the last pass? # assert newValBinSize != 0 or done # need the count up to but not including newValStart best_result.append(guess) iteration += 1 h2p.blue_print("Ending Pass", iteration) h2p.blue_print("best_result:", best_result, "done:", done, "hcnt2[k]", hcnt2[k]) print "currentCnt", currentCnt, "targetCntInt", targetCntInt, "hcnt2_low", hcnt2_low, "hcnt2_high", hcnt2_high print "was", valStart, valEnd, valRange, valBinSize print "next", newValStart, newValEnd, newValRange, newValBinSize return best_result[-1]
def test_summary2_uniform_int_w_NA(self):
    """Generate uniform-integer synthetic CSVs, parse them into h2o, and check
    that Summary2's min/max and 25/50/75th percentiles land within a tolerance
    derived from the bin size; finally cross-check one quantile against a
    local sort via h2o_summ.quantile_comparisons().

    NOTE(review): relies on module-level ROWS, MAX_QBINS, DO_REAL, DO_MEDIAN and
    write_syn_dataset() — not visible in this chunk; confirm against file top.
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    M = 100
    # each entry: (rowCount, colCount, hex_key, min, max, expected summary)
    tryList = [
        # colname, (min, 25th, 50th, 75th, max)
        (ROWS, 1, 'B.hex', 1, 1000 * M, ('C1', 1.0 * M, 250.0 * M, 500.0 * M, 750.0 * M, 1000.0 * M)),
        (ROWS, 1, 'B.hex', 1, 1000, ('C1', 1.0, 250.0, 500.0, 750.0, 1000.0)),
        (ROWS, 1, 'x.hex', 1, 20000, ('C1', 1.0, 5000.0, 10000.0, 15000.0, 20000.0)),
        (ROWS, 1, 'x.hex', -5000, 0, ('C1', -5000.00, -3750.0, -2500.0, -1250.0, 0)),
        (ROWS, 1, 'x.hex', -100000, 100000, ('C1', -100000.0, -50000.0, 0, 50000.0, 100000.0)),
        # (ROWS, 1, 'A.hex', 1, 101, ('C1', 1.0, 26.00, 51.00, 76.00, 101.0)),
        # (ROWS, 1, 'A.hex', -99, 99, ('C1', -99, -49.0, 0, 49.00, 99)),
        (ROWS, 1, 'B.hex', 1, 10000, ('C1', 1.0, 2501.0, 5001.0, 7501.0, 10000.0)),
        (ROWS, 1, 'B.hex', -100, 100, ('C1', -100.0, -50.0, 0.0, 50.0, 100.0)),
        (ROWS, 1, 'C.hex', 1, 100000, ('C1', 1.0, 25001.0, 50001.0, 75001.0, 100000.0)),
        # (ROWS, 1, 'C.hex', -101, 101, ('C1', -101, -51, -1, 49.0, 100.0)),
    ]
    if not DO_REAL:
        # only 3 integer values!
        tryList.append(\
            (1000000, 1, 'x.hex', -1, 1, ('C1', -1.0, -1, 0.000, 1, 1.00)) \
        )

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    # NOTE(review): this reassignment makes the earlier timeoutSecs = 10 dead
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
        # max error = half the bin size?
        maxDelta = ((expectedMax - expectedMin) / (MAX_QBINS + 0.0))
        # add 5% for fp errors?
        maxDelta = 1.05 * maxDelta
        # also need to add some variance due to random distribution?
        # maybe a percentage of the mean
        distMean = (expectedMax - expectedMin) / 2
        maxShift = distMean * .01
        maxDelta = maxDelta + maxShift

        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=60, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename
        numRows = inspect["numRows"]
        numCols = inspect["numCols"]

        summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
        h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

        # only one column
        column = summaryResult['summaries'][0]
        colname = column['colname']
        self.assertEqual(colname, expected[0])
        coltype = column['type']
        nacnt = column['nacnt']
        stats = column['stats']
        stattype = stats['type']

        # FIX! we should compare mean and sd to expected?
        mean = stats['mean']
        sd = stats['sd']
        print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
        print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

        zeros = stats['zeros']
        mins = stats['mins']
        maxs = stats['maxs']
        h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
        h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

        pct = stats['pct']
        # the thresholds h2o used, should match what we expected
        # NOTE(review): expectedPct is assigned but never compared against pct here
        expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]
        pctile = stats['pctile']
        # indices 3/5/7 correspond to the 0.25/0.5/0.75 thresholds in expectedPct
        h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
        h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
        h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')

        hstart = column['hstart']
        hstep = column['hstep']
        hbrk = column['hbrk']
        hcnt = column['hcnt']

        print "pct:", pct
        print "hcnt:", hcnt
        print "len(hcnt)", len(hcnt)

        # don't check the last bin
        for b in hcnt[1:-1]:
            # should we be able to check for a uniform distribution in the files?
            e = numRows / len(hcnt)  # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
            # don't check the edge bins
            self.assertAlmostEqual(b, rowCount / len(hcnt), delta=.01 * rowCount,
                msg="Bins not right. b: %s e: %s" % (b, e))

        pt = h2o_util.twoDecimals(pctile)
        mx = h2o_util.twoDecimals(maxs)
        mn = h2o_util.twoDecimals(mins)
        print "colname:", colname, "pctile (2 places):", pt
        print "colname:", colname, "maxs: (2 places):", mx
        print "colname:", colname, "mins: (2 places):", mn

        # FIX! we should do an exec and compare using the exec quantile too
        compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
        h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
        print "maxs colname:", colname, "(2 places):", mx
        print "mins colname:", colname, "(2 places):", mn

        trial += 1

        scipyCol = 0
        # don't check if colname is empty..means it's a string and scipy doesn't parse right?
        if colname != '':
            # don't do for enums
            # also get the median with a sort (h2o_summ.percentileOnSortedlist()
            h2o_summ.quantile_comparisons(
                csvPathnameFull,
                col=0,  # what col to extract from the csv
                datatype='float',
                quantile=0.5 if DO_MEDIAN else 0.999,
                h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                # h2oQuantilesApprox=qresult_single,
                # h2oQuantilesExact=qresult,
                )

        h2o.nodes[0].remove_all_keys()
def import_only(node=None, schema='local', bucket=None, path=None,
    timeoutSecs=30, retryDelaySecs=0.1, initialDelaySecs=0, pollTimeoutSecs=180, noise=None,
    benchmarkLogging=None, noPoll=False, doSummary=True, src_key=None, noPrint=False,
    importParentDir=True, **kwargs):
    """Make a dataset visible to h2o without parsing it.

    Dispatches on `schema`:
      - 'put'    : upload a single local file via node.put_file(); returns
                   (synthetic importResult dict, key).
      - 'local'  : import a local folder/file via node.import_files() (unless
                   the node redirects local imports to s3/s3n).
      - 's3'/'s3n'/'maprfs'/'hdfs': import from the respective filesystem,
                   building a folderURI from bucket/head; cloud-build state on
                   node[0] is sanity-printed (errors are printed, not raised).
    All non-'put' paths return (importResult, importPattern) where
    importPattern is folderURI + "/" + pattern.

    Raises Exception for: src_key with non-'put' schema, missing path, regex
    characters in the non-pattern part of the path, unknown schema, or when
    h2o_args.abort_after_import is set.

    NOTE(review): retryDelaySecs/initialDelaySecs/pollTimeoutSecs/noise/
    benchmarkLogging/noPoll/doSummary/**kwargs are accepted but unused here —
    presumably consumed by callers/wrappers; confirm before removing.
    """

    # FIX! hack all put to local, since h2o-dev doesn't have put yet?
    # multi-machine put will fail as a result.
    # if schema=='put':
    #     h2p.yellow_print("WARNING: hacking schema='put' to 'local'..h2o-dev doesn't support upload." +
    #         "\nMeans multi-machine with 'put' will fail")
    #     schema = 'local'

    if src_key and schema != 'put':
        raise Exception("can only specify a 'src_key' param for schema='put'. You have %s %s" % (schema, src_key))

    # no bucket is sometimes legal (fixed path)
    if not node:
        node = h2o_nodes.nodes[0]

    if path is None:
        raise Exception("import_only: path parameter needs to be specified")

    # split path into the folder part (head) and the basename/regex part (pattern)
    if "/" in path:
        (head, pattern) = os.path.split(path)
    else:
        (head, pattern) = ("", path)

    verboseprint("head:", head)
    verboseprint("pattern:", pattern)

    # to train users / okay here
    # normally we import the folder above, but if we import exactly, the path can't have regex
    # the folder can't have regex in any case
    if importParentDir:
        if re.search(r"[\*<>{}[\]~`]", head):
            raise Exception("h2o folder path %s can't be regex. path= was %s" % (head, path))
    else:
        if re.search(r"[\*<>{}[\]~`]", path):
            raise Exception("h2o path %s can't be regex. path= was %s" % (head, path))

    if schema == 'put':
        # to train users: the put basename may not contain regex (or '/')
        if re.search(r"[/\*<>{}[\]~`]", pattern):
            raise Exception("h2o putfile basename %s can't be regex. path= was %s" % (pattern, path))

        if not path:
            raise Exception("path= didn't say what file to put")

        (folderPath, filename) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, filename)
        verboseprint("put filename:", filename, "folderPath:", folderPath, "filePath:", filePath)

        if not noPrint:
            h2p.green_print("\nimport_only:", h2o_args.python_test_name, "uses put:/%s" % filePath)
            h2p.green_print("Local path to file that will be uploaded: %s" % filePath)
            h2p.blue_print("That path resolves as:", os.path.realpath(filePath))

        if h2o_args.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        # h2o-dev: it always wants a key name
        if src_key is None:
            src_key = filename
        key = node.put_file(filePath, key=src_key, timeoutSecs=timeoutSecs)

        # hmm.. what should importResult be in the put case
        # set it to None. No import is done, and shouldn't be used if you're doing schema='put'
        # ..make it look like an import files result..This is just for test consistency
        importResult = json.loads('{\
            "dels": [],\
            "fails": [],\
            "files": ["%s"],\
            "keys": ["%s"],\
            "path": "%s",\
            "schema_name": null, "schema_type": null, "schema_version": null\
        }' % (filename, src_key, filePath))

        return (importResult, key)

    if schema=='local' and not \
            (node.redirect_import_folder_to_s3_path or node.redirect_import_folder_to_s3n_path):
        (folderPath, pattern) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, pattern)
        h2p.green_print("\nimport_only:", h2o_args.python_test_name, "uses local:/%s" % filePath)
        h2p.green_print("Path h2o will be told to use: %s" % filePath)
        h2p.blue_print("If local jvms, path resolves locally as:", os.path.realpath(filePath))
        if h2o_args.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        # FIX! why are we returning importPattern here..it's different than finalImportString if we import a folder?
        # is it used for key matching by others?

        # FIX! hack ..h2o-dev is creating key names with the absolute path, not the sym link path
        # messes up for import folders that go thru /home/<user>/home-0xdiag-datasets
        # importPattern = folderURI + "/" + pattern
        # could include this on the entire importPattern if we no longer have regex basename in h2o-dev?

        folderURI = 'nfs:/' + folderPath
        # folderURI = 'nfs:/' + os.path.realpath(folderPath)
        if importParentDir:
            finalImportString = folderPath
        else:
            finalImportString = folderPath + "/" + pattern
        importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

    else:
        # remote filesystems: build folderOffset = bucket[/head]
        if bucket is not None and re.match("/", head):
            verboseprint("You said bucket:", bucket, "so stripping incorrect leading '/' from", head)
            head = head.lstrip('/')

        # strip leading / in head if present
        if bucket and head != "":
            folderOffset = bucket + "/" + head
        elif bucket:
            folderOffset = bucket
        else:
            folderOffset = head

        if h2o_args.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        # cloud-build sanity checks below read state from node[0], not `node`
        n = h2o_nodes.nodes[0]
        if schema == 's3' or node.redirect_import_folder_to_s3_path:
            # this is just like s3n now? i.e. we can point down inside the s3 bucket like s3n?
            folderOffset = re.sub("smalldata", "h2o-smalldata", folderOffset)
            folderURI = "s3://" + folderOffset
            if not n.aws_credentials:
                print "aws_credentials: %s" % n.aws_credentials
                # raise Exception("Something was missing for s3 on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3 on the java -jar cmd line when the cloud was built"

            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

        elif schema == 's3n' or node.redirect_import_folder_to_s3n_path:
            # FIX! hack for now...when we change import folder to import s3, point to unique bucket name for h2o
            # should probably deal with this up in the bucket resolution
            # this may change other cases, but smalldata should only exist as a "bucket" for us?
            folderOffset = re.sub("smalldata", "h2o-smalldata", folderOffset)
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s" % (n.use_hdfs, n.hdfs_version, n.hdfs_name_node)
                if n.hdfs_config:
                    print "hdfs_config: %s" % n.hdfs_config
                # raise Exception("Something was missing for s3n on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3n on the java -jar cmd line when the cloud was built"

            folderURI = "s3n://" + folderOffset
            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

        elif schema == 'maprfs':
            if not n.use_maprfs:
                print "use_maprfs: %s" % n.use_maprfs
                # raise Exception("Something was missing for maprfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for maprfs on the java -jar cmd line when the cloud was built"

            # if I use the /// and default, the key names that get created by h2o only have 1 slash
            # so the parse doesn't find the key name
            if n.hdfs_name_node:
                folderURI = "maprfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                # folderURI = "maprfs:///" + folderOffset
                folderURI = "maprfs:/" + folderOffset
            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

        elif schema == 'hdfs':
            # check that some state from the cloud building time was right
            # the requirements for this may change and require updating
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s" % (n.use_hdfs, n.hdfs_version, n.hdfs_name_node)
                if n.hdfs_config:
                    print "hdfs_config: %s" % n.hdfs_config
                # raise Exception("Something was missing for hdfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for hdfs on the java -jar cmd line when the cloud was built"

            if n.hdfs_name_node:
                folderURI = "hdfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                folderURI = "hdfs://" + folderOffset
            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

        else:
            raise Exception("schema not understood: %s" % schema)

    print "\nimport_only:", h2o_args.python_test_name, schema, "uses", finalImportString
    importPattern = folderURI + "/" + pattern
    return (importResult, importPattern)
def test_summary2_uniform(self):
    """Generate uniform-float synthetic CSVs, parse into h2o, and check
    Summary2's percentiles (loose tolerance for distribution noise) and
    min/max (tight relative tolerance, after substituting the actual
    generated min/max into `expected`). Also runs the Quantiles endpoint
    (multi-pass, linear interpolation) and cross-checks both its answers
    against a local sort via h2o_summ.quantile_comparisons().

    NOTE(review): relies on module-level ROWS, MAX_QBINS, DO_MEDIAN and
    write_syn_dataset() — not visible in this chunk; confirm at file top.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # each entry: (rowCount, colCount, hex_key, min, max, expected summary);
    # expected is a list (not tuple) because [1] and [5] are overwritten below
    tryList = [
        # colname, (min, 25th, 50th, 75th, max)
        (ROWS, 1, 'x.hex', 0.0, 20000.0, ['C1', 0, 5000.0, 10000.0, 15000.0, 20000.0]),
        (ROWS, 1, 'x.hex', -5000.0, 0.0, ['C1', -5000.0, -3750.0, -2550.0, -1250.0, 0.0]),
        (ROWS, 1, 'x.hex', -100000.0, 100000.0, ['C1', -100000.0, -50000.0, 0.0, 50000.0, 100000.0]),
        (ROWS, 1, 'x.hex', -1.0, 1.0, ['C1', -1.0, -0.50, 0.0, 0.50, 1.0]),
        (ROWS, 1, 'A.hex', 1.0, 100.0, ['C1', 1.0, 26.0, 51.0, 76.0, 100.0]),
        (ROWS, 1, 'A.hex', -99.0, 99.0, ['C1', -99.0, -50.0, 0.0, 50.0, 99.0]),
        (ROWS, 1, 'B.hex', 1.0, 10000.0, ['C1', 1.0, 2501.0, 5001.0, 7501.0, 10000.0]),
        (ROWS, 1, 'B.hex', -100.0, 100.0, ['C1', -100.0, -50.0, 0.0, 50.0, 100.0]),
        (ROWS, 1, 'C.hex', 1.0, 100000.0, ['C1', 1.0, 25001.0, 50001.0, 75001.0, 100000.0]),
        (ROWS, 1, 'C.hex', -100.0, 100.0, ['C1', -100.0, -50.0, 0.0, 50.0, 100.0]),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    # NOTE(review): this reassignment makes the earlier timeoutSecs = 10 dead
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        (actualMax, actualMin) = write_syn_dataset(csvPathname, rowCount, colCount,
            expectedMin, expectedMax, SEEDPERFILE)
        # adjust the min/max depending on what the min/max actually was!
        # the expected 25%/50%/75% will still be off
        expected[1] = actualMin
        expected[5] = actualMax

        # max error = half the bin size?
        # use this for comparing to sklearn/sort
        expectedRange = expectedMax - expectedMin
        # because of floor and ceil effects due we potentially lose 2 bins (worst case)
        # the extra bin for the max value, is an extra bin..ignore
        expectedBin = expectedRange/(MAX_QBINS-2)
        maxDelta = 0.5 * expectedBin

        # how much error do we get in the random distribution gen? pain. It's a probability issue
        # smaller error likely with larger # of values.
        # the maxDelta used for the scipy/sort compare can be tighter, since it's looking
        # at actual data
        # this is way too coarse. can't get the distribution tight?
        maxDeltaPlusDistVariance = 10 * maxDelta
        # allow some fuzz in the comparison to scipy/sort
        maxDelta = 1.1 * maxDelta

        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename
        numRows = inspect["numRows"]
        numCols = inspect["numCols"]

        summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
        h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

        # only one column
        column = summaryResult['summaries'][0]

        colname = column['colname']
        self.assertEqual(colname, expected[0])

        quantile = 0.5 if DO_MEDIAN else .999
        # get both answers since we feed both below for checking
        q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'],
            quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # linear
        qresult = q['result']
        qresult_single = q['result_single']
        h2p.blue_print("h2o quantiles result:", qresult)
        h2p.blue_print("h2o quantiles result_single:", qresult_single)
        h2p.blue_print("h2o quantiles iterations:", q['iterations'])
        h2p.blue_print("h2o quantiles interpolated:", q['interpolated'])
        print h2o.dump_json(q)

        coltype = column['type']
        nacnt = column['nacnt']

        stats = column['stats']
        stattype= stats['type']

        # FIX! we should compare mean and sd to expected?
        mean = stats['mean']
        sd = stats['sd']

        print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
        print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

        zeros = stats['zeros']
        mins = stats['mins']
        # these should match exactly except for fp compare error?
        h2o_util.assertApproxEqual(mins[0], expected[1], rel=.00001, msg='min is not expected')
        maxs = stats['maxs']
        h2o_util.assertApproxEqual(maxs[0], expected[5], rel=.00001, msg='max is not expected')

        pct = stats['pct']
        # the thresholds h2o used, should match what we expected
        # NOTE(review): expectedPct is assigned but never compared against pct here
        expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

        pctile = stats['pctile']
        # indices 3/5/7 correspond to the 0.25/0.5/0.75 thresholds in expectedPct
        h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDeltaPlusDistVariance,
            msg='25th percentile is not approx. expected for generated uniform range %s %s' %\
                (expectedMin, expectedMax))
        h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDeltaPlusDistVariance,
            msg='50th percentile is not approx. expected for generated uniform range %s %s' %\
                (expectedMin, expectedMax))
        h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDeltaPlusDistVariance,
            msg='75th percentile is not approx. expected for generated uniform range %s %s' %\
                (expectedMin, expectedMax))

        hstart = column['hstart']
        hstep = column['hstep']
        hbrk = column['hbrk']
        hcnt = column['hcnt']

        print "pct:", pct
        print "hcnt:", hcnt
        print "len(hcnt)", len(hcnt)

        # don't check the last bin
        # too hard to estimate when there are ints now, due to floor/ceil int alignment?
        # don't check the last two bins
        for b in hcnt[1:(-2 if len(hcnt)>2 else -1)]:
            # should we be able to check for a uniform distribution in the files?
            e = numRows/len(hcnt)
            self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount,
                msg="Bins not right. b: %s e: %s" % (b, e))

        pt = h2o_util.twoDecimals(pctile)
        mx = h2o_util.twoDecimals(maxs)
        mn = h2o_util.twoDecimals(mins)
        print "colname:", colname, "pctile (2 places):", pt
        print "colname:", colname, "maxs: (2 places):", mx
        print "colname:", colname, "mins: (2 places):", mn

        # FIX! we should do an exec and compare using the exec quantile too
        compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
        h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
        print "maxs colname:", colname, "(2 places):", mx
        print "mins colname:", colname, "(2 places):", mn

        trial += 1

        # don't check if colname is empty..means it's a string and scipy doesn't parse right?
        if colname!='':
            # don't do for enums
            # also get the median with a sort (h2o_summ.percentileOnSortedlist()
            h2o_summ.quantile_comparisons(
                csvPathnameFull,
                col=0,  # what col to extract from the csv
                datatype='float',
                quantile=0.5 if DO_MEDIAN else 0.999,
                h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                h2oQuantilesApprox=qresult_single,
                h2oQuantilesExact=qresult,
                h2oSummary2MaxErr=maxDelta,
                )

        h2o.nodes[0].remove_all_keys()
def test_exec2_quant_cmp_uniform(self):
    """Generate uniform synthetic CSVs, check Summary2 percentiles against
    hand-picked expected values, then compare each Summary2 threshold result
    to a single-threshold exec `quantile(...)` expression, and finally
    cross-check the last quantile against a local sort.

    NOTE(review): iterates over `thresholds`, which is not defined in this
    method (only a commented-out string version appears below) — presumably a
    module-level list aligned with expectedPct; confirm at file top.
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # colname, (min, 25th, 50th, 75th, max)
        (500000, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00)),
        (500000, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)),
        (100000, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)),
        (100000, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50, 1.00)),

        (100000, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00, 100.0)),
        (100000, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)),

        (100000, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00)),
        # NOTE(review): "100,00" below is likely a typo for 100.00 — it makes this a
        # 7-element tuple (..., 100, 00); expected[5] still reads 100 so checks pass
        (100000, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7, 100,00)),

        (100000, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00)),
        (100000, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28, 100.00)),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    # NOTE(review): this reassignment makes the earlier timeoutSecs = 10 dead
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
        # max error = half the bin size?
        maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
        # add 5% for fp errors?
        maxDelta = 1.05 * maxDelta

        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)

        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename
        numRows = inspect["numRows"]
        numCols = inspect["numCols"]

        summaryResult = h2o_cmd.runSummary(key=hex_key)
        h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

        # only one column
        column = summaryResult['summaries'][0]

        colname = column['colname']
        self.assertEqual(colname, expected[0])

        coltype = column['type']
        nacnt = column['nacnt']

        stats = column['stats']
        stattype= stats['type']

        # FIX! we should compare mean and sd to expected?
        mean = stats['mean']
        sd = stats['sd']

        print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
        print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

        zeros = stats['zeros']
        mins = stats['mins']
        h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
        maxs = stats['maxs']
        h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

        pct = stats['pct']
        # the thresholds h2o used, should match what we expected
        # NOTE(review): expectedPct is assigned but never compared against pct here
        expectedPct = [0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999]

        pctile = stats['pctile']
        # indices 3/5/7 correspond to the 0.25/0.5/0.75 thresholds in expectedPct
        h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
        h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
        h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')

        hstart = column['hstart']
        hstep = column['hstep']
        hbrk = column['hbrk']
        hcnt = column['hcnt']

        print "pct:", pct
        print "hcnt:", hcnt
        print "len(hcnt)", len(hcnt)

        # don't check the last bin
        for b in hcnt[1:-1]:
            # should we be able to check for a uniform distribution in the files?
            e = numRows/len(hcnt)
            # apparently we're not able to estimate for these datasets
            # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount,
            #     msg="Bins not right. b: %s e: %s" % (b, e))

        pt = h2o_util.twoDecimals(pctile)
        mx = h2o_util.twoDecimals(maxs)
        mn = h2o_util.twoDecimals(mins)
        print "colname:", colname, "pctile (2 places):", pt
        print "colname:", colname, "maxs: (2 places):", mx
        print "colname:", colname, "mins: (2 places):", mn

        # FIX! we should do an exec and compare using the exec quantile too
        compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
        print "min/25/50/75/max colname:", colname, "(2 places):", compareActual
        print "maxs colname:", colname, "(2 places):", mx
        print "mins colname:", colname, "(2 places):", mn

        trial += 1
        h2p.blue_print("\nTrying exec quantile")
        # thresholds = "c(0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99)"
        # do the equivalent exec quantile?
        # execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds)

        print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile"
        for i, threshold in enumerate(thresholds):
            # FIX! do two of the same?..use same one for the 2nd
            if i!=0:
                # execExpr = "r2=c(1); r2=quantile(%s[,4],c(0,.05,0.3,0.55,0.7,0.95,0.99))" % hex_key
                execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s,%s));" % (hex_key, threshold, threshold)
                (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                h2p.green_print("\nresultExec: %s" % h2o.dump_json(resultExec))
                h2p.blue_print("\nthreshold: %.2f Exec quantile: %s Summary2: %s" % (threshold, result, pt[i]))
                if not result:
                    raise Exception("exec result: %s for quantile: %s is bad" % (result, threshold))
                h2o_util.assertApproxEqual(result, pctile[i], tol=maxDelta,
                    msg='exec percentile: %s too different from expected: %s' % (result, pctile[i]))
            # for now, do one with all, but no checking
            else:
                # This seemed to "work" but how do I get the key name for the list of values returned
                # the browser result field seemed right, but nulls in the key
                if 1==0:
                    execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % (hex_key, ",".join(map(str,thresholds)))
                else:
                    # does this way work (column getting)j
                    execExpr = "r2=c(1); r2=quantile(%s$C1, c(%s));" % (hex_key, ",".join(map(str,thresholds)))
                (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                inspect = h2o_cmd.runInspect(key='r2')
                numCols = inspect['numCols']
                numRows = inspect['numRows']

                self.assertEqual(numCols,1)
                self.assertEqual(numRows,len(thresholds))
                # FIX! should run thru the values in the col? how to get

        # compare the last one
        # NOTE(review): `result` here is whatever the final loop iteration produced
        if colname!='':
            # don't do for enums
            # also get the median with a sort (h2o_summ.percentileOnSortedlist()
            h2o_summ.quantile_comparisons(
                csvPathnameFull,
                col=0,  # what col to extract from the csv
                datatype='float',
                quantile=thresholds[-1],
                # h2oSummary2=pctile[-1],
                # h2oQuantilesApprox=result, # from exec
                h2oExecQuantiles=result,
                )

        h2o.nodes[0].remove_all_keys()
def test_summary2_uniform_int_w_NA(self):
    """Parse synthetic uniform-integer datasets and check Summary2 results.

    For each tryList case: writes a random uniform CSV, parses it into h2o,
    runs Summary (max_qbins=MAX_QBINS), then asserts min/max and the
    25/50/75th percentiles land within maxDelta of the expected tuple.
    Bin counts are also checked against a uniform-distribution estimate,
    and the 0.5 (or 0.999) quantile is cross-checked against a local
    sort-based computation via h2o_summ.quantile_comparisons.
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    M = 100
    # each entry: (rowCount, colCount, hex_key, expectedMin, expectedMax,
    #              (colname, min, 25th, 50th, 75th, max))
    tryList = [
        # colname, (min, 25th, 50th, 75th, max)
        (ROWS, 1, "B.hex", 1, 1000 * M, ("C1", 1.0 * M, 250.0 * M, 500.0 * M, 750.0 * M, 1000.0 * M)),
        (ROWS, 1, "B.hex", 1, 1000, ("C1", 1.0, 250.0, 500.0, 750.0, 1000.0)),
        (ROWS, 1, "x.hex", 1, 20000, ("C1", 1.0, 5000.0, 10000.0, 15000.0, 20000.0)),
        (ROWS, 1, "x.hex", -5000, 0, ("C1", -5000.00, -3750.0, -2500.0, -1250.0, 0)),
        (ROWS, 1, "x.hex", -100000, 100000, ("C1", -100000.0, -50000.0, 0, 50000.0, 100000.0)),
        # (ROWS, 1, 'A.hex', 1, 101, ('C1', 1.0, 26.00, 51.00, 76.00, 101.0)),
        # (ROWS, 1, 'A.hex', -99, 99, ('C1', -99, -49.0, 0, 49.00, 99)),
        (ROWS, 1, "B.hex", 1, 10000, ("C1", 1.0, 2501.0, 5001.0, 7501.0, 10000.0)),
        (ROWS, 1, "B.hex", -100, 100, ("C1", -100.0, -50.0, 0.0, 50.0, 100.0)),
        (ROWS, 1, "C.hex", 1, 100000, ("C1", 1.0, 25001.0, 50001.0, 75001.0, 100000.0)),
        # (ROWS, 1, 'C.hex', -101, 101, ('C1', -101, -51, -1, 49.0, 100.0)),
    ]
    if not DO_REAL:
        # only 3 integer values!
        tryList.append((1000000, 1, "x.hex", -1, 1, ("C1", -1.0, -1, 0.000, 1, 1.00)))

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
        # max error = half the bin size?
        maxDelta = (expectedMax - expectedMin) / (MAX_QBINS + 0.0)
        # add 5% for fp errors?
        maxDelta = 1.05 * maxDelta
        # also need to add some variance due to random distribution?
        # maybe a percentage of the mean
        distMean = (expectedMax - expectedMin) / 2
        maxShift = distMean * 0.01
        maxDelta = maxDelta + maxShift

        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1
        csvFilename = "syn_" + "binary" + "_" + str(rowCount) + "x" + str(colCount) + ".csv"
        csvPathname = SYNDATASETS_DIR + "/" + csvFilename
        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(
            path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=60, doSummary=False
        )
        print "Parse result['destination_key']:", parseResult["destination_key"]

        inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
        print "\n" + csvFilename
        numRows = inspect["numRows"]
        numCols = inspect["numCols"]

        summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
        h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

        # only one column
        column = summaryResult["summaries"][0]
        colname = column["colname"]
        self.assertEqual(colname, expected[0])
        coltype = column["type"]
        nacnt = column["nacnt"]
        stats = column["stats"]
        stattype = stats["type"]

        # FIX! we should compare mean and sd to expected?
        mean = stats["mean"]
        sd = stats["sd"]
        print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
        print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

        zeros = stats["zeros"]
        mins = stats["mins"]
        maxs = stats["maxs"]
        h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg="min is not approx. expected")
        h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg="max is not approx. expected")

        pct = stats["pct"]
        # the thresholds h2o used, should match what we expected
        expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]
        pctile = stats["pctile"]
        # pctile indices 3/5/7 correspond to the 0.25/0.5/0.75 thresholds above
        h2o_util.assertApproxEqual(
            pctile[3], expected[2], tol=maxDelta, msg="25th percentile is not approx. expected"
        )
        h2o_util.assertApproxEqual(
            pctile[5], expected[3], tol=maxDelta, msg="50th percentile (median) is not approx. expected"
        )
        h2o_util.assertApproxEqual(
            pctile[7], expected[4], tol=maxDelta, msg="75th percentile is not approx. expected"
        )

        hstart = column["hstart"]
        hstep = column["hstep"]
        hbrk = column["hbrk"]
        hcnt = column["hcnt"]

        print "pct:", pct
        print "hcnt:", hcnt
        print "len(hcnt)", len(hcnt)

        # don't check the last bin
        for b in hcnt[1:-1]:
            # should we be able to check for a uniform distribution in the files?
            e = numRows / len(hcnt)
            # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
            # don't check the edge bins
            self.assertAlmostEqual(
                b, rowCount / len(hcnt), delta=0.01 * rowCount, msg="Bins not right. b: %s e: %s" % (b, e)
            )

        pt = h2o_util.twoDecimals(pctile)
        mx = h2o_util.twoDecimals(maxs)
        mn = h2o_util.twoDecimals(mins)
        print "colname:", colname, "pctile (2 places):", pt
        print "colname:", colname, "maxs: (2 places):", mx
        print "colname:", colname, "mins: (2 places):", mn

        # FIX! we should do an exec and compare using the exec quantile too
        compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
        h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
        print "maxs colname:", colname, "(2 places):", mx
        print "mins colname:", colname, "(2 places):", mn

        trial += 1

        scipyCol = 0

        # don't check if colname is empty..means it's a string and scipy doesn't parse right?
        if colname != "":
            # don't do for enums
            # also get the median with a sort (h2o_summ.percentileOnSortedlist()
            h2o_summ.quantile_comparisons(
                csvPathnameFull,
                col=0,  # what col to extract from the csv
                datatype="float",
                quantile=0.5 if DO_MEDIAN else 0.999,
                h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                # h2oQuantilesApprox=qresult,
                # h2oQuantilesExact=qresult,
            )

        # NOTE(review): indentation was lost in the source; per-trial key
        # cleanup (inside the loop) matches the pattern of the sibling
        # tests in this file -- confirm against upstream.
        h2o.nodes[0].remove_all_keys()
def test_summary2_uniform(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (5000000, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00)), (5000000, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)), (1000000, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)), (1000000, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50, 1.00)), (1000000, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00, 100.0)), (1000000, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)), (1000000, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00)), (1000000, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7, 100,00)), (1000000, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00)), (1000000, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28, 100.00)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta h2o.beta_features = False SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) h2o.beta_features = False csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["num_rows"] numCols = inspect["num_cols"] h2o.beta_features = True summaryResult = h2o_cmd.runSummary(key=hex_key) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", twoDecimals(sd) zeros = stats['zeros'] mins = stats['mins'] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') maxs = stats['maxs'] h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct = [0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999] pctile = stats['pctile'] h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. 
expected') h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows/len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution) # don't check the edge bins self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, msg="Bins not right. b: %s e: %s" % (b, e)) pt = twoDecimals(pctile) mx = twoDecimals(maxs) mn = twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] print "min/25/50/75/max colname:", colname, "(2 places):", compareActual print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2p.blue_print("\nTrying exec quantile") # thresholds = "c(0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99)" # do the equivalent exec quantile? 
# execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds) print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile" for i, trial in enumerate(thresholds): execExpr = "quantile(%s[,1], c(%s));" % (hex_key, trial) (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) h2p.green_print("\nresultExec: %s" % h2o.dump_json(resultExec)) ex = twoDecimals(result) h2p.blue_print("\nthreshold: %.2f Exec quantile: %s Summary2: %s" % (trial, ex, pt[i])) h2o_util.assertApproxEqual(result, pctile[i], tol=maxDelta, msg='percentile: % is not expected: %s' % (result, pctile[i])) if DO_TRY_SCIPY: generate_scipy_comparison(csvPathnameFull)
def test_summary2_exp(self):
    """Parse a synthetic exponential(lambd) dataset and check Summary results.

    Writes an exponential-distributed CSV (write_syn_dataset returns the
    actual min/max, which are patched into the expected list), parses it,
    runs Summary on column C1 via the 'frames' response, and asserts each
    non-None expected quantile is within maxDelta. Finally cross-checks one
    quantile (0.5 or 0.999) against a local sort-based computation.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    LAMBD = random.uniform(0.005, 0.5)
    # each entry: (rowCount, colCount, hex_key, rangeMin, rangeMax,
    #              [label, min, 25th, 50th, 75th, max]) -- None means "don't check";
    # min/max (indices 1 and 5) are filled in from the dataset writer below.
    tryList = [
        # co.label, (min, 25th, 50th, 75th, max)
        # parse setup error
        # (1, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
        (5, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
        # (10, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
        # (100, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
        # (1000, 1, 'x.hex', -5000, 0, ['C1', None, None, None, None, None]),
        # (10000, 1, 'x.hex', -100000, 100000, ['C1', None, None, None, None, None]),
        # (100000, 1, 'x.hex', -1, 1, ['C1', None, None, None, None, None]),
        # (1000000, 1, 'A.hex', 1, 100, ['C1', None, None, None, None, None]),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    timeoutSecs = 60

    class Column(object):
        # thin attribute-access wrapper over the summary's column dict
        def __init__(self, column):
            assert isinstance(column, dict)
            for k, v in column.iteritems():
                setattr(self, k, v)  # achieves self.k = v

    for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname, "lambd:", LAMBD
        (expectedMin, expectedMax) = write_syn_dataset(csvPathname, rowCount, colCount,
            lambd=LAMBD, SEED=SEEDPERFILE)
        print "expectedMin:", expectedMin, "expectedMax:", expectedMax
        maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
        # add 5% for fp errors?
        maxDelta = 1.05 * maxDelta

        # min/max come from the actual generated data, not the tryList
        expected[1] = expectedMin
        expected[5] = expectedMax

        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, doSummary=False)
        numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

        inspect = h2o_cmd.runInspect(key=parse_key)
        missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
        print "\n" + csvFilename

        # column 0?
        summaryResult = h2o_cmd.runSummary(key=hex_key, column='C1')
        h2o.verboseprint("Summary2 summaryResult:", h2o.dump_json(summaryResult))

        # default_pctiles
        # isText
        # rows
        # off
        # key
        # checksum

        # only one column
        columns = summaryResult['frames'][0]['columns']
        default_pctiles = summaryResult['frames'][0]['default_pctiles']
        co = Column(columns[0])
        # how are enums binned. Stride of 1? (what about domain values)
        coList = [
            co.base, len(co.bins), len(co.data), co.domain,
            co.label, co.maxs, co.mean, co.mins,
            co.missing, co.ninfs, co.pctiles, co.pinfs,
            co.precision, co.sigma, co.str_data, co.stride,
            co.type, co.zeros,
        ]
        for c in coList:
            print c

        print "len(co.bins):", len(co.bins)

        print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean)
        # what is precision. -1?
        print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma)

        # pad so indices 3/5/7 line up with the 25/50/75th expected slots
        print "FIX! hacking the co.pctiles because it's short by two"
        pctiles = [0] + co.pctiles + [0]

        # the thresholds h2o used, should match what we expected
        if expected[0]:
            self.assertEqual(co.label, expected[0])
        if expected[1]:
            h2o_util.assertApproxEqual(co.mins[0], expected[1], tol=maxDelta,
                msg='min is not approx. expected')
        if expected[2]:
            h2o_util.assertApproxEqual(pctiles[3], expected[2], tol=maxDelta,
                msg='25th percentile is not approx. expected')
        if expected[3]:
            h2o_util.assertApproxEqual(pctiles[5], expected[3], tol=maxDelta,
                msg='50th percentile (median) is not approx. expected')
        if expected[4]:
            h2o_util.assertApproxEqual(pctiles[7], expected[4], tol=maxDelta,
                msg='75th percentile is not approx. expected')
        if expected[5]:
            h2o_util.assertApproxEqual(co.maxs[0], expected[5], tol=maxDelta,
                msg='max is not approx. expected')

        # figure out the expected max error
        # use this for comparing to sklearn/sort
        if expected[1] and expected[5]:
            expectedRange = expected[5] - expected[1]
            # because of floor and ceil effects due we potentially lose 2 bins (worst case)
            # the extra bin for the max value, is an extra bin..ignore
            expectedBin = expectedRange / (MAX_QBINS - 2)
            maxErr = expectedBin  # should we have some fuzz for fp?
        else:
            print "Test won't calculate max expected error"
            maxErr = 0

        pt = h2o_util.twoDecimals(pctiles)
        mx = h2o_util.twoDecimals(co.maxs)
        mn = h2o_util.twoDecimals(co.mins)

        print "co.label:", co.label, "co.pctiles (2 places):", pt
        print "default_pctiles:", default_pctiles
        print "co.label:", co.label, "co.maxs: (2 places):", mx
        print "co.label:", co.label, "co.mins: (2 places):", mn

        # FIX! we should do an exec and compare using the exec quantile too
        compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
        h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):", compareActual)
        print "co.label:", co.label, "co.maxs (2 places):", mx
        print "co.label:", co.label, "co.mins (2 places):", mn

        trial += 1
        h2o.nodes[0].remove_all_keys()

        scipyCol = 0
        print "h2oSummary2MaxErr", maxErr
        if co.label != '' and expected[scipyCol]:
            # don't do for enums
            # also get the median with a sort (h2o_summ.percentileOnSortedlist()
            h2o_summ.quantile_comparisons(
                csvPathnameFull,
                skipHeader=False,
                col=scipyCol,  # what col to extract from the csv
                datatype='float',
                quantile=0.5 if DO_MEDIAN else 0.999,
                h2oSummary2=pctiles[5 if DO_MEDIAN else 10],
                # h2oQuantilesApprox=qresult_single,
                # h2oQuantilesExact=qresult,
                h2oSummary2MaxErr=maxErr,
            )
def findQuantile(d, dmin, dmax, threshold): # return the value at the threshold, or the mean of the two rows that bound it. # fixed bin count per pass. Stops at maxIterations if not resolved to one true answer maxIterations = 30 # totalRows should be cleansed of NAs. assume d doesn't have NAs (cleaned elsewhere) totalRows = len(d) # Used to have desiredBinCnt = BIN_COUNT maxBinCnt = desiredBinCnt + 1 # might go one over due to FP issues # initialize newValStart = dmin newValEnd = dmax newValRange = newValEnd - newValStart desiredBinCnt = BIN_COUNT # Could do per-pass adjustment, but fixed works fine. newValBinSize = newValRange / (desiredBinCnt + 0.0) newLowCount = 0 # count of rows below the bins # yes there is no newHighCount. Created during the pass, though. # state shared by each pass assert maxBinCnt > 0 hcnt2 = [None for b in range(maxBinCnt)] hcnt2_min = [None for b in range(maxBinCnt)] hcnt2_max = [None for b in range(maxBinCnt)] hcnt2_low = 0 hcnt2_high = 0 assert newValBinSize != 0 # can be negative assert newValEnd > newValStart assert newValRange > 0 # break out on stopping condition # reuse the histogram array hcnt2[] iteration = 0 done = False # append to a list of best guesses per pass best_result = [] def htot2(): return sum(hcnt2) + hcnt2_low + hcnt2_high while iteration <= maxIterations and not done: h2p.green_print("newValStart", newValStart) h2p.green_print("newValEnd", newValEnd) h2p.green_print("newValRange", newValRange) h2p.green_print("newValBinSize", newValBinSize) h2p.green_print("newLowCount", newLowCount) h2p.green_print("threshold", threshold) valStart = newValStart valEnd = newValEnd valRange = newValRange valBinSize = newValBinSize lowCount = newLowCount desiredBinCnt = BIN_COUNT maxBinCnt = desiredBinCnt + 1 # might go one over due to FP issues # playing with creating relative NUDGE values to make sure bin range # is always inclusive of target. # ratio it down from valBinSize. # It doesn't need to be as big as valBinSize. 
# implicitly, it shouldn't need to be as large as valBinSize # can't seem to make it work yet. leave NUDGE=0 NUDGE = 0 # init to zero for each pass for b in range(maxBinCnt): hcnt2[b] = 0.0 # Init counts outside of the bins hcnt2_low = 0 hcnt2_high = 0 # minimum value for higher than the bin. Needed for interpolation hcnt2_high_min = None for val in d: # Need to count the stuff outside the bin-gathering, # since threshold compare is based on total row compare # on first pass, shouldn't see anything exceed the start/end bounds # since those are min/max for the column? (shouldn't be any fp precision issue? or ??) # oh wait, this valOffset math creates possible precision issue? # maybe we should address it with the NUDGE value below? but what about first pass? valOffset = val - valStart # where are we zeroing in? (start) binIdx2 = int(math.floor( valOffset / (valBinSize + 0.0))) # make sure it's always an fp divide? # do some close looking for possible fp arith issues cA = valOffset < 0 cB = binIdx2 < 0 t = {True: 1, False: 0} # we get the 10 case if ((cA and not cB) or (not cA and cB)): h2p.red_print("AB Interesting lower bin edge case %s%s" % (t[cA], t[cB]), "cA", cA, "cB", cB, "valOffSet", valOffSet, \ "binIdx2", binIdx2) cC = val > valEnd cD = binIdx2 >= (maxBinCnt - 1) # tighten the compare for printing if ((cC and not cD) or (not cC and cD)): h2p.red_print("CD Interesting upper bin edge case %s%s" % (t[cC], t[cD]), "cC", cC, "cB", cD, "val", val, "valEnd", valEnd, \ "binIdx2", binIdx2, "maxBinCnt", maxBinCnt) # example hits this case..i.e. the max value # CD Interesting upper bin edge case 01 cC False cB True val 100.995097486 valEnd 100.995097486 binIdx2 2 maxBinCnt 3 if valOffset < 0 or binIdx2 < 0: # if valOffset < 0: # if binIdx2<0: hcnt2_low += 1 # prevent the extra bin from being used..i.e. eliminate the fuzziness for sure! 
# have to use both compares, since can wrap the index (due to start/end shift) # elif val > valEnd or binIdx2>=(maxBinCnt-1): # should this really be a valOffset compare? elif val > valEnd or binIdx2 >= maxBinCnt: # elif val > valEnd: # elif binIdx2>=(maxBinCnt-1): if (hcnt2_high == 0) or (val < hcnt2_high_min): hcnt2_high_min = val print "hcnt2_high_min update:", hcnt2_high_min, valOffset, val, valStart, hcnt2_high, val, valEnd hcnt2_high += 1 else: # print "(multi) val: ",val," valOffset: ",valOffset," valBinSize: ",valBinSize assert binIdx2 >=0 and binIdx2<=(maxBinCnt-1), "val %s %s %s %s binIdx2: %s maxBinCnt: %s valBinSize: %s" % \ (val, valStart, valEnd, valOffset, binIdx2, maxBinCnt, valBinSize) if hcnt2[binIdx2] == 0 or (val < hcnt2_min[binIdx2]): hcnt2_min[binIdx2] = val if hcnt2[binIdx2] == 0 or (val > hcnt2_max[binIdx2]): hcnt2_max[binIdx2] = val hcnt2[binIdx2] += 1 # check if we went into the magic extra bin if binIdx2 == (maxBinCnt - 1): print "\nFP! val went into the extra maxBinCnt bin:", \ binIdx2, hcnt2_high_min, valOffset, val, valStart, hcnt2_high, val, valEnd,"\n" # check the legal states for these two # we don't have None for checking hcnt2_high_min in java assert hcnt2_high == 0 or (hcnt2_high_min is not None) assert (hcnt2_high_min is None) or hcnt2_high != 0 # everything should either be in low, the bins, or high totalBinnedRows = htot2() print "totalRows check: %s htot2(): %s should be equal. hcnt2_low: %s hcnt2_high: %s" % \ (totalRows, totalBinnedRows, hcnt2_low, hcnt2_high) assert totalRows==totalBinnedRows, "totalRows: %s htot2() %s not equal. 
hcnt2_low: %s hcnt2_high: %s" % \ (totalRows, totalBinnedRows, hcnt2_low, hcnt2_high) # now walk thru and find out what bin to look inside currentCnt = hcnt2_low targetCntFull = threshold * (totalRows - 1) # zero based indexing targetCntInt = int(math.floor(threshold * (totalRows - 1))) targetCntFract = targetCntFull - targetCntInt assert targetCntFract >= 0 and targetCntFract <= 1 print "targetCntInt:", targetCntInt, "targetCntFract", targetCntFract k = 0 while ((currentCnt + hcnt2[k]) <= targetCntInt): # print "looping for k (multi): ",k," ",currentCnt," ",targetCntInt," ",totalRows," ",hcnt2[k]," ",hcnt2_min[k]," ",hcnt2_max[k] currentCnt += hcnt2[k] # ugly but have to break out if we'd cycle along with == adding h0's until we go too far # are we supposed to advance to a none zero bin? k += 1 # goes over in the equal case? # if currentCnt >= targetCntInt: # break if k == maxBinCnt: break assert k < maxBinCnt, "k too large, k: %s maxBinCnt %s %s %s %s" % ( k, maxBinCnt, currentCnt, targetCntInt, hcnt2[k - 1]) # format string to match java Log.info() in Quantiles.java print "Found k (multi): ", k, " ", currentCnt, " ", targetCntInt, " ", totalRows, " ", hcnt2[ k], " ", hcnt2_min[k], " ", hcnt2_max[k] assert hcnt2[k] != 1 or hcnt2_min[k] == hcnt2_max[k] # some possibily interpolating guesses first, in guess we have to iterate (best guess) done = False guess = (hcnt2_max[k] - hcnt2_min[k]) / 2 # we maight not have gottent all the way if currentCnt == targetCntInt: if hcnt2[k] > 2 and (hcnt2_min[k] == hcnt2_max[k]): guess = hcnt2_min[k] print "Guess A", guess, k, hcnt2[k] if hcnt2[k] == 2: print "hello" print "\nTwo values in this bin but we could be aligned to the 2nd. 
so can't stop" # no mattter what size the fraction it would be on this number guess = (hcnt2_max[k] + hcnt2_min[k]) / 2.0 # no mattter what size the fraction it would be on this number if INTERPOLATION_TYPE == 2: # type 2 (mean) guess = (hcnt2_max[k] + hcnt2_min[k]) / 2.0 else: # default to type 7 (linear interpolation) # Unlike mean, which just depends on two adjacent values, this adjustment # adds possible errors related to the arithmetic on the total # of rows. dDiff = hcnt2_max[k] - hcnt2_min[ k] # two adjacent..as if sorted! pctDiff = targetCntFract # This is the fraction of total rows guess = hcnt2_min[k] + (pctDiff * dDiff) done = False print "Guess B", guess if hcnt2[k] == 1 and targetCntFract == 0: assert hcnt2_min[k] == hcnt2_max[k] guess = hcnt2_min[k] done = True print "k", k print "Guess C", guess if hcnt2[k] == 1 and targetCntFract != 0: assert hcnt2_min[k] == hcnt2_max[k] print "\nSingle value in this bin, but fractional means we need to interpolate to next non-zero" if k < maxBinCnt: nextK = k + 1 # could put it over maxBinCnt else: nextK = k while nextK < maxBinCnt and hcnt2[nextK] == 0: nextK += 1 # have the "extra bin" for this if nextK >= maxBinCnt: assert hcnt2_high != 0 print "Using hcnt2_high_min for interpolate:", hcnt2_high_min nextVal = hcnt2_high_min else: print "Using nextK for interpolate:", nextK assert hcnt2[nextK] != 0 nextVal = hcnt2_min[nextK] guess = (hcnt2_max[k] + nextVal) / 2.0 # OH! fixed bin as opposed to sort. Of course there are gaps between k and nextK if INTERPOLATION_TYPE == 2: # type 2 (mean) guess = (hcnt2_max[k] + nextVal) / 2.0 pctDiff = 0.5 else: # default to type 7 (linear interpolation) dDiff = nextVal - hcnt2_max[ k] # two adjacent, as if sorted! pctDiff = targetCntFract # This is the fraction of total rows guess = hcnt2_max[k] + (pctDiff * dDiff) done = True # has to be one above us when needed. 
(or we're at end) print 'k', 'hcnt2_max[k]', 'nextVal' print "hello3:", k, hcnt2_max[k], nextVal print "\nInterpolating result using nextK: %s nextVal: %s" % ( nextK, nextVal) print "Guess D", guess if not done: print "%s %s %s %s Not done, setting new range" % (hcnt2[k], currentCnt, targetCntInt, targetCntFract),\ "k: ", k,\ "currentCnt: ", currentCnt,\ "hcnt2_min[k]: ", hcnt2_min[k],\ "hcnt2_max[k]: ", hcnt2_max[k] # possible bin leakage at start/end edges due to fp arith. # the bin index arith may resolve OVER the boundary created by the compare for hcnt2_high compare # rather than using NUDGE, see if there's a non-zero bin below (min) or above (max) you. # Just need to check the one bin below and above k, if they exist. if k > 0 and hcnt2[k - 1] > 0 and (hcnt2_max[k - 1] < hcnt2_min[k]): print "1" newValStart = hcnt2_max[k - 1] else: print "2" newValStart = hcnt2_min[k] # subtle. we do put stuff in the extra end bin (see the print above that happens) # k might be pointing to one less than that (like k=0 for 1 bin case) if k < maxBinCnt and hcnt2[k + 1] > 0 and (hcnt2_min[k + 1] > hcnt2_max[k]): print "3" newValEnd = hcnt2_min[k + 1] else: print "4" newValEnd = hcnt2_max[k] newValRange = newValEnd - newValStart # maxBinCnt is always binCount + 1, since we might cover over due to rounding/fp issues? newValBinSize = newValRange / (desiredBinCnt + 0.0) # the start/end should never change if we're just using one bin # this is a bin leakage test, if you use one bin. (we should never resolve exactly stop at max iterations # assumes NUDGE is 0 if NUDGE == 0.0: assert desiredBinCnt>1 or (valStart==newValStart and valEnd==newValEnd),\ "if 1 bin, should be no per-pass edge leakage %s %s %s %s %s %s" % (k, hcnt2_high, valStart, newValStart, valEnd, newValEnd) newLowCount = currentCnt if newValBinSize == 0: # assert done or newValBinSize!=0 and live with current guess print "Assuming done because newValBinSize is 0." 
print "newValRange: %s, hcnt2[k]: %s hcnt2_min[k]: %s hcnt2_max[k]: %s" %\ (newValRange, hcnt2[k], hcnt2_min[k], hcnt2_max[k]) guess = newValStart print "Guess E", guess # was done = True 3/20/14 done = True # if we have to interpolate # if it falls into this bin, interpolate to this bin means one answer? # cover the case above with multiple entries in a bin, all the same value # will be zero on the last pass? # assert newValBinSize != 0 or done # need the count up to but not including newValStart best_result.append(guess) iteration += 1 h2p.blue_print("Ending Pass", iteration) h2p.blue_print("best_result:", best_result, "done:", done, "hcnt2[k]", hcnt2[k]) print "currentCnt", currentCnt, "targetCntInt", targetCntInt, "hcnt2_low", hcnt2_low, "hcnt2_high", hcnt2_high print "was", valStart, valEnd, valRange, valBinSize print "next", newValStart, newValEnd, newValRange, newValBinSize return best_result[-1]
def runSummary(node=None, key=None, column=None, expected=None, maxDelta=None, noPrint=False, **kwargs): if not key: raise Exception('No key for Summary') if not node: node = h2o_nodes.nodes[0] # return node.summary(key, **kwargs) i = InspectObj(key=key) # just so I don't have to change names below missingList = i.missingList labelList = i.labelList numRows = i.numRows numCols = i.numCols print "labelList:", labelList assert labelList is not None # doesn't take indices? only column labels? # return first column, unless specified if not (column is None or isinstance(column, (basestring, int))): raise Exception( "column param should be string or integer index or None %s %s" % (type(column), column)) # either return the first col, or the col indentified by label. the column identifed could be string or index? if column is None: # means the summary json when we ask for col 0, will be what we return (do all though) colNameToDo = labelList colIndexToDo = range(len(labelList)) elif isinstance(column, int): colNameToDo = [labelList[column]] colIndexToDo = [column] elif isinstance(column, basestring): colNameToDo = [column] if column not in labelList: raise Exception("% not in labellist: %s" % (column, labellist)) colIndexToDo = [labelList.index(column)] else: raise Exception("wrong type %s for column %s" % (type(column), column)) # we get the first column as result after walking across all, if no column parameter desiredResult = None for (colIndex, colName) in zip(colIndexToDo, colNameToDo): print "doing summary on %s %s" % (colIndex, colName) # ugly looking up the colIndex co = SummaryObj(key=key, colIndex=colIndex, colName=colName) if not desiredResult: desiredResult = co if not noPrint: for k, v in co: # only print [0] of mins and maxs because of the e308 values when they don't have dataset values if k == 'mins' or k == 'maxs': print "%s[0]" % k, v[0] else: print k, v if expected is not None: print "len(co.histogram_bins):", len(co.histogram_bins) print "co.label:", 
co.label, "mean (2 places):", h2o_util.twoDecimals( co.mean) # what is precision. -1? print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals( co.sigma) # print "FIX! hacking the co.percentiles because it's short by two" # if co.percentiles: # percentiles = [0] + co.percentiles + [0] # else: # percentiles = None percentiles = co.percentiles assert len(co.percentiles) == len(co.default_percentiles) # the thresholds h2o used, should match what we expected # expected = [0] * 5 # Fix. doesn't check for expected = 0? # max of one bin if maxDelta is None: maxDelta = (co.maxs[0] - co.mins[0]) / 1000 if expected[0]: h2o_util.assertApproxEqual(co.mins[0], expected[0], tol=maxDelta, msg='min is not approx. expected') if expected[1]: h2o_util.assertApproxEqual( percentiles[2], expected[1], tol=maxDelta, msg='25th percentile is not approx. expected') if expected[2]: h2o_util.assertApproxEqual( percentiles[4], expected[2], tol=maxDelta, msg='50th percentile (median) is not approx. expected') if expected[3]: h2o_util.assertApproxEqual( percentiles[6], expected[3], tol=maxDelta, msg='75th percentile is not approx. expected') if expected[4]: h2o_util.assertApproxEqual(co.maxs[0], expected[4], tol=maxDelta, msg='max is not approx. expected') # figure out the expected max error # use this for comparing to sklearn/sort MAX_QBINS = 1000 if expected[0] and expected[4]: expectedRange = expected[4] - expected[0] # because of floor and ceil effects due we potentially lose 2 bins (worst case) # the extra bin for the max value, is an extra bin..ignore expectedBin = expectedRange / (MAX_QBINS - 2) maxErr = expectedBin # should we have some fuzz for fp? 
else: print "Test won't calculate max expected error" maxErr = 0 pt = h2o_util.twoDecimals(percentiles) # only look at [0] for now...bit e308 numbers if unpopulated due to not enough unique values in dataset column mx = h2o_util.twoDecimals(co.maxs[0]) mn = h2o_util.twoDecimals(co.mins[0]) print "co.label:", co.label, "co.percentiles (2 places):", pt print "co.default_percentiles:", co.default_percentiles print "co.label:", co.label, "co.maxs: (2 places):", mx print "co.label:", co.label, "co.mins: (2 places):", mn # FIX! why would percentiles be None? enums? if pt is None: compareActual = mn, [None] * 3, mx else: compareActual = mn, pt[2], pt[4], pt[6], mx h2p.green_print("actual min/25/50/75/max co.label:", co.label, "(2 places):", compareActual) h2p.green_print("expected min/25/50/75/max co.label:", co.label, "(2 places):", expected) return desiredResult
def test_rand_inspect(self): ### h2b.browseTheCloud() ### h2b.browseTheCloud() csvFilename = 'covtype.data' csvPathname = 'UCI/UCI-large/covtype/'+ csvFilename hex_key = csvFilename + ".hex" print "\n" + csvPathname parseResult = h2i.import_parse(bucket='datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10) destination_key = parseResult['destination_key'] print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", destination_key def inspect_and_check(nodeX, destination_key, offset, view, inspectOld=None): inspectNew = h2o_cmd.runInspect(h2o.nodes[nodeX], destination_key, offset=offset, view=view) if h2o.beta_features: pass # print "Inspect2:", h2o.dump_json(inspectNew) else: pass # print "Inspect:", h2o.dump_json(inspectNew) # FIX! get min/max/mean/variance for a col too? constantNames = [ ('num_cols', 'numCols'), ('num_rows', 'numRows'), ('value_size_bytes', 'byteSize'), ('cols', 'cols'), ] colNames = [ ('num_missing_values', 'naCnt'), ] for (i,j) in constantNames: # check the fields, even if you don't have a previous one to compare to if h2o.beta_features: # hack in extra info for now, from the new names to old names if not j in inspectNew: raise Exception("Can't find %s, Inspect2 result should have it?" % j) inspectNew[i] = inspectNew[j] # don't compare if cols if inspectOld and i != 'cols': if h2o.beta_features and i=='value_size_bytes': # Inspect2 should be smaller self.assertGreater(inspectOld[i], inspectNew[i]) else: # for cols it will just compare length? self.assertEqual(inspectOld[i], inspectNew[i]) if i=='cols': for (m,n) in colNames: if h2o.beta_features: if not n in inspectNew[i][0]: print h2o.dump_json(inspectNew[i][0]) raise Exception("Can't find %s, Inspect2 result['cols'][0] should have it?" 
% n) inspectNew[i][0][m] = inspectNew[i][0][n] # just compare 0 if inspectOld is not None: self.assertEqual(inspectOld[i][0][m], inspectNew[i][0][m]) return inspectNew # going to use this to compare against future. num_rows/num_cols should always # be the same, regardless of the view. just a coarse sanity check origInspect = inspect_and_check(0, destination_key, 0, 1, None) h2o.verboseprint(h2o.dump_json(origInspect)) origStoreViewResult = h2o_cmd.runStoreView(offset=0, view=1024, timeoutSecs=60) num_rows = origInspect['num_rows'] num_cols = origInspect['num_cols'] lenNodes = len(h2o.nodes) for trial in range (10): h2p.green_print("\nTrial", trial) # we want to use the boundary conditions, so have two level of random choices offset = good_choices(num_rows) view = good_choices(num_cols) # randomize the node used nodeX = random.randint(0,lenNodes-1) print "nodeX:", nodeX, "offset:", offset, "view:", view h2o.beta_features = False inspect_and_check(nodeX,destination_key,offset,view,origInspect) print "trying Inspect2 by flipping h2o.nodes[0].beta_features" h2o.beta_features = True # delay between the two inspects...bug around not getting autoframe in storeview? 
time.sleep(1) inspect_and_check(nodeX,destination_key,offset,view,origInspect) h2o.beta_features = False # a fvec frame should have been created in the storeView time.sleep(1) # loop looking for the autoframe to show up # o = len(origStoreViewResult['keys']) o = h2i.count_keys_at_all_nodes() retry = 0 okay = False while retry==0 or not okay: newStoreViewResult = h2o_cmd.runStoreView(offset=0, view=1024, timeoutSecs=60) ## p = len(newStoreViewResult['keys']) p = h2i.count_keys_at_all_nodes() print "number of keys in the two StoreViews, o:", o, "p:", p ## print "newStoreViewResult:", h2o.dump_json(newStoreViewResult) oOkay = {1, 2, 3, 4, 5, 6} pOkay = {1, 2, 3, 4, 5} print o, pOkay, p, oOkay if (o in oOkay) and (p in pOkay): print "Good" okay = True else: print "Unexpected o,p after autoframe, looking at total keys in system: %s %s" % (o,p) if retry==10: raise Exception("StoreView didn't get autoframe, after %s retries" % retry) ## h2b.browseJsonHistoryAsUrlLastMatch("StoreView") # so he gets recreated?? deleted = h2i.delete_keys_at_all_nodes(pattern='autoframe') # The autoframe key may not show up!! if INVISIBLE_AUTOFRAME: # can be 1 or 2 if not(deleted==0 or deleted==1): msg = "Should have deleted a total of 0 or 1 keys, looking at all nodes. Did %s" % deleted raise Exception(msg) else: # can be 1 or 2 if not(deleted==1): msg = "Should have deleted a total of 1 keys, looking at all nodes. Did %s" % deleted time.sleep(1) retry += 1
def test_quant_cmp_uniform(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (5 * ROWS, 1, 'x.hex', 1, 20000, ['C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00]), (5 * ROWS, 1, 'x.hex', -5000, 0, ['C1', -5001.00, -3750.0, -2445, -1200.0, 99]), (1 * ROWS, 1, 'x.hex', -100000, 100000, ['C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0]), (1 * ROWS, 1, 'x.hex', -1, 1, ['C1', -1.05, -0.48, 0.0087, 0.50, 1.00]), (1 * ROWS, 1, 'A.hex', 1, 100, ['C1', 1.05, 26.00, 51.00, 76.00, 100.0]), (1 * ROWS, 1, 'A.hex', -99, 99, ['C1', -99, -50.0, 0, 50.00, 99]), (1 * ROWS, 1, 'B.hex', 1, 10000, ['C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00]), (1 * ROWS, 1, 'B.hex', -100, 100, ['C1', -100.10, -50.0, 0.85, 51.7, 100, 00]), (1 * ROWS, 1, 'C.hex', 1, 100000, ['C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00]), (1 * ROWS, 1, 'C.hex', -101, 101, ['C1', -100.10, -50.45, -1.18, 49.28, 100.00]), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? colname = expected[0] maxDelta = ((expectedMax - expectedMin) / 1000.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) # need the full pathname when python parses the csv for numpy/sort csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) #*************************** # Parse parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount) numRows = pA.numRows numCols = pA.numCols parse_key = pA.parse_key # this guy can take json object as first thing, or re-read with key iA = h2o_cmd.InspectObj(parse_key, expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[]) #*************************** # Summary co = h2o_cmd.runSummary(key=parse_key) default_pctiles = co.default_pctiles coList = [ co.base, len(co.bins), len(co.data), co.domain, co.label, co.maxs, co.mean, co.mins, co.missing, co.ninfs, co.pctiles, co.pinfs, co.precision, co.sigma, co.str_data, co.stride, co.type, co.zeros ] for c in coList: print c print "len(co.bins):", len(co.bins) print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals( co.mean) print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals( co.sigma) print "FIX! hacking the co.pctiles because it's short by two" summ_pctiles = [0] + co.pctiles + [0] pt = h2o_util.twoDecimals(summ_pctiles) mx = h2o_util.twoDecimals(co.maxs) mn = h2o_util.twoDecimals(co.mins) exp = h2o_util.twoDecimals(expected[1:]) print "co.label:", co.label, "co.pctiles (2 places):", pt print "default_pctiles:", default_pctiles print "co.label:", co.label, "co.maxs: (2 places):", mx print "co.label:", co.label, "co.mins: (2 places):", mn # FIX! 
we should do an exec and compare using the exec quantile too h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\ mn[0], pt[3], pt[5], pt[7], mx[0]) h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\ exp[0], exp[1], exp[2], exp[3], exp[4]) #*************************** # Quantile # the thresholds h2o used, should match what we expected # using + here seems to result in an odd tuple..doesn't look right to h2o param # so went with this. Could add '[' and ']' to the list first, before the join. probsStr = "[%s]" % ",".join(map(str, probsList)) parameters = { 'model_id': "a.hex", 'training_frame': parse_key, 'validation_frame': parse_key, 'ignored_columns': None, 'probs': probsStr, } model_key = 'qhex' bmResult = h2o.n0.build_model(algo='quantile', model_id=model_key, training_frame=parse_key, parameters=parameters, timeoutSecs=10) bm = OutputObj(bmResult, 'bm') msec = bm.jobs[0]['msec'] print "bm msec", msec # quantile result is just a job result to a key modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0], 'model') print "model.output:", model.output print "model.output:['quantiles']", model.output['quantiles'] print "model.output:['iterations']", model.output['iterations'] print "model.output:['names']", model.output['names'] quantiles = model.output['quantiles'][ 0] # why is this a double array iterations = model.output['iterations'] assert iterations == 11, iterations print "quantiles: ", quantiles print "iterations: ", iterations # cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) # cmm = OutputObj(cmmResult, 'cmm') # mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) # mm = OutputObj(mmResult, 'mm') # prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) # pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr') h2o_cmd.runStoreView() trial += 1 # compare the last threshold if colname != 
'': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=CHECK_PCTILE, # h2oSummary2=pctile[-1], # h2oQuantilesApprox=result, # from exec h2oExecQuantiles=quantiles[CHECK_PCTILE_INDEX], ) h2o.nodes[0].remove_all_keys()
def test_summary2_exp(self):
    """Generate exponential-distributed synthetic datasets (one shared random
    lambda), parse them, run Summary2, and sanity-check the reported
    min/max/percentiles against the actual data range. Also cross-checks the
    median (or 99.9th pctile) against a scipy/sort reference.

    Note: rangeMin/rangeMax in tryList are unused; the real min/max come back
    from write_syn_dataset and are patched into expected[1]/expected[5].
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    LAMBD = random.uniform(0.005, 0.5)
    tryList = [
        # colname, (min, 25th, 50th, 75th, max)
        (5, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
        (10, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
        (100, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
        (1000, 1, 'x.hex', -5000, 0, ['C1', None, None, None, None, None]),
        (10000, 1, 'x.hex', -100000, 100000, ['C1', None, None, None, None, None]),
        (100000, 1, 'x.hex', -1, 1, ['C1', None, None, None, None, None]),
        (1000000, 1, 'A.hex', 1, 100, ['C1', None, None, None, None, None]),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    timeoutSecs = 60
    # rangeMin and rangeMax are not used right now
    for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname, "lambd:", LAMBD
        # writer returns the actual data range; tolerance is derived from it below
        (expectedMin, expectedMax) = write_syn_dataset(csvPathname, rowCount, colCount,
            lambd=LAMBD, SEED=SEEDPERFILE)
        print "expectedMin:", expectedMin, "expectedMax:", expectedMax
        # half of a 1/20th-of-range bin as tolerance
        maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
        # add 5% for fp errors?
        maxDelta = 1.05 * maxDelta
        expected[1] = expectedMin
        expected[5] = expectedMax

        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', header=0,
            hex_key=hex_key, timeoutSecs=30, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename
        numRows = inspect["numRows"]
        numCols = inspect["numCols"]

        summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
        h2o.verboseprint("Summary2 summaryResult:", h2o.dump_json(summaryResult))

        # only one column
        column = summaryResult['summaries'][0]
        colname = column['colname']
        coltype = column['type']
        nacnt = column['nacnt']
        stats = column['stats']
        stattype = stats['type']

        # FIX! we should compare mean and sd to expected?
        mean = stats['mean']
        sd = stats['sd']
        print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
        print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

        zeros = stats['zeros']
        mins = stats['mins']
        maxs = stats['maxs']
        pct = stats['pct']
        expectedPct = [0.001, 0.001, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999]
        pctile = stats['pctile']

        # the thresholds h2o used, should match what we expected
        # None entries in 'expected' (and 0) skip the corresponding check
        if expected[0]:
            self.assertEqual(colname, expected[0])
        if expected[1]:
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta,
                msg='min is not approx. expected')
        if expected[2]:
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta,
                msg='25th percentile is not approx. expected')
        if expected[3]:
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta,
                msg='50th percentile (median) is not approx. expected')
        if expected[4]:
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta,
                msg='75th percentile is not approx. expected')
        if expected[5]:
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta,
                msg='max is not approx. expected')

        hstart = column['hstart']
        hstep = column['hstep']
        hbrk = column['hbrk']
        hcnt = column['hcnt']

        print "pct:", pct
        print ""
        print "hcnt:", hcnt
        print "len(hcnt)", len(hcnt)
        print "Can't estimate the bin distribution"

        # figure out the expected max error
        # use this for comparing to sklearn/sort
        if expected[1] and expected[5]:
            expectedRange = expected[5] - expected[1]
            # because of floor and ceil effects due we potentially lose 2 bins (worst case)
            # the extra bin for the max value, is an extra bin..ignore
            expectedBin = expectedRange / (MAX_QBINS - 2)
            maxErr = expectedBin
            # should we have some fuzz for fp?
        else:
            print "Test won't calculate max expected error"
            maxErr = 0

        pt = h2o_util.twoDecimals(pctile)
        mx = h2o_util.twoDecimals(maxs)
        mn = h2o_util.twoDecimals(mins)
        print "colname:", colname, "pctile (2 places):", pt
        print "colname:", colname, "maxs: (2 places):", mx
        print "colname:", colname, "mins: (2 places):", mn

        # FIX! we should do an exec and compare using the exec quantile too
        compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
        h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
        print "maxs colname:", colname, "(2 places):", mx
        print "mins colname:", colname, "(2 places):", mn

        trial += 1
        h2o.nodes[0].remove_all_keys()

        scipyCol = 0
        if colname != '' and expected[scipyCol]:
            # don't do for enums
            # also get the median with a sort (h2o_summ.percentileOnSortedlist()
            h2o_summ.quantile_comparisons(
                csvPathnameFull,
                skipHeader=False,
                col=scipyCol,
                datatype='float',
                quantile=0.5 if DO_MEDIAN else 0.999,
                h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                # h2oQuantilesApprox=qresult_single,
                # h2oQuantilesExact=qresult,
                h2oSummary2MaxErr=maxErr,
            )
def test_summary2_small(self):
    """Run Summary2 and the quantiles endpoint on tiny datasets built from a
    small fixed set of values (e.g. [-1, 0, 1]), where results should be exact
    (maxDelta is forced to 0). Checks percentiles, histogram-bin uniformity,
    and cross-checks the median/99.9th pctile against a scipy/sort reference.
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # colname, (min, 25th, 50th, 75th, max)
        # if rowCount is None, we'll just use the data values
        # None in expected values means no compare
        (None, 1, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)),
        (None, 2, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)),
        (None, 10, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)),
        (None, 100, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)),
        (None, 1000, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)),
        # (None, 10000, 'x.hex', [-1,0,1], ('C1', None, None, 0, None, None)),
        # (COLS, 1, 'x.hex', [1,0,-1], ('C1', None, None, None, None, None)),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, values, expected) in tryList:
        # max error = half the bin size?
        expectedMax = max(values)
        expectedMin = min(values)
        maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
        # add 5% for fp errors?
        maxDelta = 1.05 * maxDelta

        # hmm...say we should be 100% accurate for these tests?
        maxDelta = 0

        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        # rowCount of None means one row per value in 'values'
        if not rowCount:
            rowFile = len(values)
        else:
            rowFile = rowCount
        csvFilename = 'syn_' + "binary" + "_" + str(rowFile) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, values, SEEDPERFILE)

        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename
        numRows = inspect["numRows"]
        numCols = inspect["numCols"]

        summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS, timeoutSecs=45)
        h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

        # also exercise the standalone quantiles endpoint on the same column
        quantile = 0.5 if DO_MEDIAN else .999
        q = h2o.nodes[0].quantiles(source_key=hex_key, column=0, interpolation_type=7,
            quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2)
        qresult = q['result']
        qresult_single = q['result_single']
        qresult_iterations = q['iterations']
        qresult_interpolated = q['interpolated']
        h2p.blue_print("h2o quantiles result:", qresult)
        h2p.blue_print("h2o quantiles result_single:", qresult_single)
        h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
        h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
        print h2o.dump_json(q)
        self.assertLess(qresult_iterations, 16,
            msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?")

        # only one column
        column = summaryResult['summaries'][0]

        colname = column['colname']

        coltype = column['type']
        nacnt = column['nacnt']

        stats = column['stats']
        stattype = stats['type']

        # FIX! we should compare mean and sd to expected?
        mean = stats['mean']
        sd = stats['sd']

        print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
        print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

        zeros = stats['zeros']
        mins = stats['mins']
        maxs = stats['maxs']
        pct = stats['pct']
        # the thresholds h2o used, should match what we expected
        expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

        pctile = stats['pctile']
        print "pctile:", pctile
        # None entries in 'expected' (and 0) skip the corresponding check
        if expected[0]:
            self.assertEqual(colname, expected[0])
        if expected[1]:
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta,
                msg='min is not approx. expected')
        if expected[2]:
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta,
                msg='25th percentile is not approx. expected')
        if expected[3]:
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta,
                msg='50th percentile (median) is not approx. expected')
        if expected[4]:
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta,
                msg='75th percentile is not approx. expected')
        if expected[5]:
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta,
                msg='max is not approx. expected')

        hstart = column['hstart']
        hstep = column['hstep']
        hbrk = column['hbrk']
        hcnt = column['hcnt']

        print "pct:", pct
        print ""
        print "hcnt:", hcnt
        print "len(hcnt)", len(hcnt)

        # don't check the last bin
        for b in hcnt[1:-1]:
            # should we be able to check for a uniform distribution in the files?
            e = numRows / len(hcnt)  # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
            # don't check the edge bins
            self.assertAlmostEqual(b, numRows / len(hcnt), delta=1 + .01 * numRows,
                msg="Bins not right. b: %s e: %s" % (b, e))

        pt = h2o_util.twoDecimals(pctile)
        mx = h2o_util.twoDecimals(maxs)
        mn = h2o_util.twoDecimals(mins)
        print "colname:", colname, "pctile (2 places):", pt
        print "colname:", colname, "maxs: (2 places):", mx
        print "colname:", colname, "mins: (2 places):", mn

        # FIX! we should do an exec and compare using the exec quantile too
        compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
        h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
        print "maxs colname:", colname, "(2 places):", mx
        print "mins colname:", colname, "(2 places):", mn

        trial += 1
        h2o.nodes[0].remove_all_keys()

        scipyCol = 0
        # don't check if colname is empty..means it's a string and scipy doesn't parse right?
        if colname != '':
            # don't do for enums
            # also get the median with a sort (h2o_summ.percentileOnSortedlist()
            h2o_summ.quantile_comparisons(
                csvPathnameFull,
                col=scipyCol,  # what col to extract from the csv
                datatype='float',
                quantile=0.5 if DO_MEDIAN else 0.999,
                h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                # h2oQuantilesApprox=qresult_single,
                h2oQuantilesExact=qresult,
            )
def test_summary2_int2B(self):
    """Run Summary2 on a synthetic column whose values are large ints
    (~2.5e9, beyond 32-bit range) and sanity-check min/max/percentiles.
    All expected percentile entries are None here, so the approx checks are
    skipped; this mainly exercises big-integer handling in Summary2.
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # colname, (min, 25th, 50th, 75th, max)
        (100000, 1, 'B.hex', 2533255332, 2633256000, ('C1', None, None, None, None, None)),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
        # max error = half the bin size?
        maxDelta = ((expectedMax - expectedMin) / (MAX_QBINS + 0.0))
        # add 5% for fp errors?
        maxDelta = 1.05 * maxDelta
        # also need to add some variance due to random distribution?
        # maybe a percentage of the mean
        distMean = (expectedMax - expectedMin) / 2
        maxShift = distMean * .01
        maxDelta = maxDelta + maxShift

        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)

        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=60, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename
        numRows = inspect["numRows"]
        numCols = inspect["numCols"]

        summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
        h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

        # only one column
        column = summaryResult['summaries'][0]

        colname = column['colname']
        if expected[0]:
            self.assertEqual(colname, expected[0])

        coltype = column['type']
        nacnt = column['nacnt']

        stats = column['stats']
        stattype = stats['type']

        # FIX! we should compare mean and sd to expected?
        mean = stats['mean']
        sd = stats['sd']

        print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
        print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

        zeros = stats['zeros']
        mins = stats['mins']
        maxs = stats['maxs']
        pct = stats['pct']
        # the thresholds h2o used, should match what we expected
        expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

        pctile = stats['pctile']
        # all approx checks are gated on expected[1] (None for this tryList)
        if expected[1]:
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta,
                msg='min is not approx. expected')
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta,
                msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta,
                msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta,
                msg='75th percentile is not approx. expected')
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta,
                msg='max is not approx. expected')

        hstart = column['hstart']
        hstep = column['hstep']
        hbrk = column['hbrk']
        hcnt = column['hcnt']

        print "pct:", pct
        print "hcnt:", hcnt
        print "len(hcnt)", len(hcnt)

        # don't check the last bin
        for b in hcnt[1:-1]:
            # should we be able to check for a uniform distribution in the files?
            e = numRows / len(hcnt)
            # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
            # apparently we can't estimate any more
            # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount,
            #     msg="Bins not right. b: %s e: %s" % (b, e))

        pt = h2o_util.twoDecimals(pctile)
        mx = h2o_util.twoDecimals(maxs)
        mn = h2o_util.twoDecimals(mins)
        print "colname:", colname, "pctile (2 places):", pt
        print "colname:", colname, "maxs: (2 places):", mx
        print "colname:", colname, "mins: (2 places):", mn

        # FIX! we should do an exec and compare using the exec quantile too
        compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
        h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
        print "maxs colname:", colname, "(2 places):", mx
        print "mins colname:", colname, "(2 places):", mn

        trial += 1

        scipyCol = 0
def runSummary(node=None, key=None, expected=None, column=None, **kwargs): if not key: raise Exception('No key for Summary') if not node: node = h2o_nodes.nodes[0] # return node.summary(key, **kwargs) class Column(object): def __init__(self, column): assert isinstance(column, dict) for k,v in column.iteritems(): setattr(self, k, v) # achieves self.k = v def __iter__(self): for attr, value in self.__dict__.iteritems(): yield attr, value inspect = runInspect(key=key) # change missingList definition: None if all empty, otherwise align to cols. 0 if 0? missingList, labelList, numRows, numCols = infoFromInspect(inspect) # doesn't take indices? only column labels? lastChecksum = None # return first column, unless specified desiredResult = None for label in labelList: print "doing summary on %s" % label summaryResult = node.summary(key=key, column=label) if not desiredResult or (column and column==label): desiredResult = summaryResult verboseprint("column", column, "summaryResult:", dump_json(summaryResult)) # this should be the same for all the cols? Or does the checksum change? frame = summaryResult['frames'][0] default_pctiles = frame['default_pctiles'] checksum = frame['checksum'] rows = frame['rows'] columns = frame['columns'] # assert len(columns) == numCols assert rows == numRows assert checksum !=0 and checksum is not None assert rows!=0 and rows is not None assert not frame['isText'] # FIX! why is frame['key'] = None here? # assert frame['key'] == key, "%s %s" % (frame['key'], key) # it changes? # assert not lastChecksum or lastChecksum == checksum lastChecksum = checksum # only one column co = Column(columns[0]) # how are enums binned. Stride of 1? 
(what about domain values) coList = [co.base, len(co.bins), len(co.data), co.domain, co.label, co.maxs, co.mean, co.mins, co.missing, co.ninfs, co.pctiles, co.pinfs, co.precision, co.sigma, co.str_data, co.stride, co.type, co.zeros] # for c in coList: # print c for k,v in co: print k, v print "len(co.bins):", len(co.bins) print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean) # what is precision. -1? print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma) print "FIX! hacking the co.pctiles because it's short by two" if co.pctiles: pctiles = [0] + co.pctiles + [0] else: pctiles = None # the thresholds h2o used, should match what we expected if expected ==None: expected = [0] * 5 # Fix. doesn't check for expected = 0? if expected[0]: h2o_util.assertApproxEqual(co.mins[0], expected[0], tol=maxDelta, msg='min is not approx. expected') if expected[1]: h2o_util.assertApproxEqual(pctiles[3], expected[1], tol=maxDelta, msg='25th percentile is not approx. expected') if expected[2]: h2o_util.assertApproxEqual(pctiles[5], expected[2], tol=maxDelta, msg='50th percentile (median) is not approx. expected') if expected[3]: h2o_util.assertApproxEqual(pctiles[7], expected[3], tol=maxDelta, msg='75th percentile is not approx. expected') if expected[4]: h2o_util.assertApproxEqual(co.maxs[0], expected[4], tol=maxDelta, msg='max is not approx. expected') # figure out the expected max error # use this for comparing to sklearn/sort MAX_QBINS = 1000 if expected[0] and expected[4]: expectedRange = expected[4] - expected[0] # because of floor and ceil effects due we potentially lose 2 bins (worst case) # the extra bin for the max value, is an extra bin..ignore expectedBin = expectedRange/(MAX_QBINS-2) maxErr = expectedBin # should we have some fuzz for fp? 
else: print "Test won't calculate max expected error" maxErr = 0 pt = h2o_util.twoDecimals(pctiles) mx = h2o_util.twoDecimals(co.maxs) mn = h2o_util.twoDecimals(co.mins) print "co.label:", co.label, "co.pctiles (2 places):", pt print "default_pctiles:", default_pctiles print "co.label:", co.label, "co.maxs: (2 places):", mx print "co.label:", co.label, "co.mins: (2 places):", mn # FIX! why would pctiles be None? enums? if pt is None: compareActual = mn[0], [None] * 3, mx[0] else: compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("actual min/25/50/75/max co.label:", co.label, "(2 places):", compareActual) h2p.green_print("expected min/25/50/75/max co.label:", co.label, "(2 places):", expected) return desiredResult
def test_summary2_uniform_w_NA(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (ROWS, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00)), (ROWS, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)), (ROWS, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)), (ROWS, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50, 1.00)), (ROWS, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00, 100.0)), (ROWS, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)), (ROWS, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00)), (ROWS, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7, 100, 00)), (ROWS, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00)), (ROWS, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28, 100.00)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta h2o.beta_features = False SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) h2o.beta_features = False parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["num_rows"] numCols = inspect["num_cols"] h2o.beta_features = True summaryResult = h2o_cmd.runSummary(key=hex_key, noPrint=False, max_qbins=MAX_QBINS, numRows=numRows, numCols=numCols) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals( mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals( sd) zeros = stats['zeros'] mins = stats['mins'] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') maxs = stats['maxs'] h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. 
expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct = [ 0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99 ] pctile = stats['pctile'] h2o_util.assertApproxEqual( pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') h2o_util.assertApproxEqual( pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') h2o_util.assertApproxEqual( pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "" print "hcnt:", hcnt print "len(hcnt)", len(hcnt) print "numRows:", numRows, "rowCount: ", rowCount self.assertEqual((1 + NA_ROW_RATIO) * rowCount, numRows, msg="numRows %s should be %s" % (numRows, (1 + NA_ROW_RATIO) * rowCount)) # don't check the last bin # we sometimes get a messed up histogram for all NA cols? just don't let them go thru here for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = rowCount / len( hcnt ) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution) # don't check the edge bins # NA rows should be ignored self.assertAlmostEqual(b, e, delta=2 * e, msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 scipyCol = 1 h2i.delete_keys_at_all_nodes()
def import_only(node=None, schema='local', bucket=None, path=None,
    timeoutSecs=30, retryDelaySecs=0.5, initialDelaySecs=0.5, pollTimeoutSecs=180, noise=None,
    benchmarkLogging=None, noPoll=False, doSummary=True, src_key=None, noPrint=False,
    importParentDir=True, **kwargs):
    """Import (without parsing) a file or folder into H2O.

    schema selects the transport: 'put' (single-file upload), 'local' (nfs),
    's3', 's3n', 'maprfs', or 'hdfs'.

    Returns:
        (None, key) for schema='put', where key is the uploaded file's key.
        (importResult, importPattern) for all folder schemas, where
        importPattern is folderURI + "/" + pattern for use by a later parse.

    Raises:
        Exception on bad parameter combinations, regex characters in paths
        that must be literal, an unknown schema, or when
        h2o.abort_after_import (-aai) is set.

    NOTE(review): this file contains two definitions of import_only; the later
    one shadows this one at import time — confirm which is intended.
    """
    # src_key only makes sense for the single-file 'put' upload path
    if src_key and schema!='put':
        raise Exception("can only specify a 'src_key' param for schema='put'. You have %s %s" % (schema, src_key))

    # no bucket is sometimes legal (fixed path)
    if not node: node = h2o.nodes[0]

    if path is None:
        raise Exception("import_only: path parameter needs to be specified")

    # split into the containing folder ("head") and the file/glob part ("pattern")
    if "/" in path:
        (head, pattern) = os.path.split(path)
    else:
        (head, pattern) = ("", path)

    h2o.verboseprint("head:", head)
    h2o.verboseprint("pattern:", pattern)

    # to train users / okay here
    # normally we import the folder above, but if we import exactly, the path can't have regex
    # the folder can't have regex in any case
    if importParentDir:
        if re.search(r"[\*<>{}[\]~`]", head):
            raise Exception("h2o folder path %s can't be regex. path= was %s" % (head, path))
    else:
        if re.search(r"[\*<>{}[\]~`]", path):
            raise Exception("h2o path %s can't be regex. path= was %s" % (head, path))

    if schema=='put':
        # to train users
        if re.search(r"[/\*<>{}[\]~`]", pattern):
            raise Exception("h2o putfile basename %s can't be regex. path= was %s" % (pattern, path))

        if not path:
            raise Exception("path= didn't say what file to put")

        (folderPath, filename) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, filename)
        h2o.verboseprint("put filename:", filename, "folderPath:", folderPath, "filePath:", filePath)

        if not noPrint:
            h2p.green_print("\nimport_only:", h2o.python_test_name, "uses put:/%s" % filePath)
            h2p.green_print("Local path to file that will be uploaded: %s" % filePath)
            h2p.blue_print("That path resolves as:", os.path.realpath(filePath))

        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        key = node.put_file(filePath, key=src_key, timeoutSecs=timeoutSecs)
        return (None, key)

    # 'local' uses nfs unless the node was built redirecting local imports to s3/s3n
    if schema=='local' and not \
            (node.redirect_import_folder_to_s3_path or node.redirect_import_folder_to_s3n_path):
        (folderPath, pattern) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, pattern)
        h2p.green_print("\nimport_only:", h2o.python_test_name, "uses local:/%s" % filePath)
        h2p.green_print("Path h2o will be told to use: %s" % filePath)
        h2p.blue_print("If local jvms, path resolves locally as:", os.path.realpath(filePath))
        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        folderURI = 'nfs:/' + folderPath
        if importParentDir:
            importResult = node.import_files(folderPath, timeoutSecs=timeoutSecs)
        else:
            importResult = node.import_files(folderPath + "/" + pattern, timeoutSecs=timeoutSecs)

    else:
        if bucket is not None and re.match("/", head):
            h2o.verboseprint("You said bucket:", bucket, "so stripping incorrect leading '/' from", head)
            head = head.lstrip('/') # strip leading / in head if present

        # build the path offset under the schema root from bucket and head
        if bucket and head!="":
            folderOffset = bucket + "/" + head
        elif bucket:
            folderOffset = bucket
        else:
            folderOffset = head

        print "\nimport_only:", h2o.python_test_name, schema, "uses", schema + "://" + folderOffset + "/" + pattern
        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        n = h2o.nodes[0]
        if schema=='s3' or node.redirect_import_folder_to_s3_path:
            folderURI = "s3://" + folderOffset
            if not n.aws_credentials:
                print "aws_credentials: %s" % n.aws_credentials
                # raise Exception("Something was missing for s3 on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3 on the java -jar cmd line when the cloud was built"
            importResult = node.import_s3(bucket, timeoutSecs=timeoutSecs)

        elif schema=='s3n' or node.redirect_import_folder_to_s3n_path:
            # s3n goes through the hdfs import path, so hdfs config must be present
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s hdfs_config: %s" % \
                    (n.use_hdfs, n.hdfs_version, n.hdfs_name_node, n.hdfs_config)
                # raise Exception("Something was missing for s3n on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3n on the java -jar cmd line when the cloud was built"
            folderURI = "s3n://" + folderOffset
            if importParentDir:
                importResult = node.import_hdfs(folderURI, timeoutSecs=timeoutSecs)
            else:
                importResult = node.import_hdfs(folderURI + "/" + pattern, timeoutSecs=timeoutSecs)

        elif schema=='maprfs':
            if not n.use_maprfs:
                print "use_maprfs: %s" % n.use_maprfs
                # raise Exception("Something was missing for maprfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for maprfs on the java -jar cmd line when the cloud was built"
            # if I use the /// and default, the key names that get created by h2o only have 1 slash
            # so the parse doesn't find the key name
            if n.hdfs_name_node:
                folderURI = "maprfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                # folderURI = "maprfs:///" + folderOffset
                folderURI = "maprfs:/" + folderOffset
            if importParentDir:
                importResult = node.import_hdfs(folderURI, timeoutSecs=timeoutSecs)
            else:
                importResult = node.import_hdfs(folderURI + "/" + pattern, timeoutSecs=timeoutSecs)

        elif schema=='hdfs':
            # check that some state from the cloud building time was right
            # the requirements for this may change and require updating
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s hdfs_config: %s" % \
                    (n.use_hdfs, n.hdfs_version, n.hdfs_name_node, n.hdfs_config)
                # raise Exception("Something was missing for hdfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for hdfs on the java -jar cmd line when the cloud was built"
            if n.hdfs_name_node:
                folderURI = "hdfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                folderURI = "hdfs://" + folderOffset
            if importParentDir:
                importResult = node.import_hdfs(folderURI, timeoutSecs=timeoutSecs)
            else:
                importResult = node.import_hdfs(folderURI + "/" + pattern, timeoutSecs=timeoutSecs)

        else:
            raise Exception("schema not understood: %s" % schema)

    importPattern = folderURI + "/" + pattern
    return (importResult, importPattern)
def import_only(node=None, schema='local', bucket=None, path=None,
    timeoutSecs=30, retryDelaySecs=0.5, initialDelaySecs=0.5, pollTimeoutSecs=180, noise=None,
    benchmarkLogging=None, noPoll=False, doSummary=True, src_key=None, noPrint=False, **kwargs):
    """Import (without parsing) a file or folder into H2O.

    Older variant: always imports the exact path (no importParentDir option)
    and does not validate src_key against the schema.

    Returns:
        (None, key) for schema='put'.
        (importResult, importPattern) for 'local', 's3', 's3n', 'maprfs', 'hdfs'.

    NOTE(review): this is the second definition of import_only in this file;
    being later, it shadows the earlier (importParentDir-capable) one when the
    module is imported — confirm which definition is intended to survive.
    """
    # no bucket is sometimes legal (fixed path)
    if not node: node = h2o.nodes[0]

    if path is None:
        raise Exception("import_only: path parameter needs to be specified")

    # split into containing folder ("head") and file/glob part ("pattern")
    if "/" in path:
        (head, pattern) = os.path.split(path)
    else:
        (head, pattern) = ("", path)

    h2o.verboseprint("head:", head)
    h2o.verboseprint("pattern:", pattern)

    # to train users / okay here: the folder part can never contain regex chars
    if re.search(r"[\*<>{}[\]~`]", head):
        raise Exception("h2o folder path %s can't be regex. path= was %s" % (head, path))

    if schema=='put':
        # to train users
        if re.search(r"[/\*<>{}[\]~`]", pattern):
            raise Exception("h2o putfile basename %s can't be regex. path= was %s" % (pattern, path))

        if not path:
            raise Exception("path= didn't say what file to put")

        (folderPath, filename) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, filename)
        h2o.verboseprint("put filename:", filename, "folderPath:", folderPath, "filePath:", filePath)

        if not noPrint:
            h2p.green_print("\nimport_only:", h2o.python_test_name, "uses put:/%s" % filePath)
            h2p.green_print("Local path to file that will be uploaded: %s" % filePath)
            h2p.blue_print("That path resolves as:", os.path.realpath(filePath))

        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        key = node.put_file(filePath, key=src_key, timeoutSecs=timeoutSecs)
        return (None, key)

    # 'local' uses nfs unless the node was built redirecting local imports to s3/s3n
    if schema=='local' and not \
            (node.redirect_import_folder_to_s3_path or node.redirect_import_folder_to_s3n_path):
        (folderPath, pattern) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, pattern)
        h2p.green_print("\nimport_only:", h2o.python_test_name, "uses local:/%s" % filePath)
        h2p.green_print("Path h2o will be told to use: %s" % filePath)
        h2p.blue_print("If local jvms, path resolves locally as:", os.path.realpath(filePath))
        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        folderURI = 'nfs:/' + folderPath
        importResult = node.import_files(folderPath, timeoutSecs=timeoutSecs)

    else:
        if bucket is not None and re.match("/", head):
            h2o.verboseprint("You said bucket:", bucket, "so stripping incorrect leading '/' from", head)
            head = head.lstrip('/') # strip leading / in head if present

        # build the path offset under the schema root from bucket and head
        if bucket and head!="":
            folderOffset = bucket + "/" + head
        elif bucket:
            folderOffset = bucket
        else:
            folderOffset = head

        print "\nimport_only:", h2o.python_test_name, schema, "uses", schema + "://" + folderOffset + "/" + pattern
        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        n = h2o.nodes[0]
        if schema=='s3' or node.redirect_import_folder_to_s3_path:
            folderURI = "s3://" + folderOffset
            if not n.aws_credentials:
                print "aws_credentials: %s" % n.aws_credentials
                # raise Exception("Something was missing for s3 on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3 on the java -jar cmd line when the cloud was built"
            importResult = node.import_s3(bucket, timeoutSecs=timeoutSecs)

        elif schema=='s3n' or node.redirect_import_folder_to_s3n_path:
            # s3n goes through the hdfs import path, so hdfs config must be present
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s hdfs_config: %s" % \
                    (n.use_hdfs, n.hdfs_version, n.hdfs_name_node, n.hdfs_config)
                # raise Exception("Something was missing for s3n on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3n on the java -jar cmd line when the cloud was built"
            folderURI = "s3n://" + folderOffset
            importResult = node.import_hdfs(folderURI, timeoutSecs=timeoutSecs)

        elif schema=='maprfs':
            if not n.use_maprfs:
                print "use_maprfs: %s" % n.use_maprfs
                # raise Exception("Something was missing for maprfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for maprfs on the java -jar cmd line when the cloud was built"
            # if I use the /// and default, the key names that get created by h2o only have 1 slash
            # so the parse doesn't find the key name
            if n.hdfs_name_node:
                folderURI = "maprfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                folderURI = "maprfs:///" + folderOffset
            importResult = node.import_hdfs(folderURI, timeoutSecs=timeoutSecs)

        elif schema=='hdfs':
            # check that some state from the cloud building time was right
            # the requirements for this may change and require updating
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s hdfs_config: %s" % \
                    (n.use_hdfs, n.hdfs_version, n.hdfs_name_node, n.hdfs_config)
                # raise Exception("Something was missing for hdfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for hdfs on the java -jar cmd line when the cloud was built"
            if n.hdfs_name_node:
                folderURI = "hdfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                folderURI = "hdfs://" + folderOffset
            importResult = node.import_hdfs(folderURI, timeoutSecs=timeoutSecs)

        else:
            raise Exception("schema not understood: %s" % schema)

    importPattern = folderURI + "/" + pattern
    return (importResult, importPattern)
def test_summary2_NY0(self):
    """Summary2 on synthetic columns of N/Y/0 (and t/f style) enum-like values.

    Generates csv files whose cells are drawn from three-value choice tuples
    (some with leading whitespace — which parsing is expected to strip), then
    runs a per-column Summary and checks the NA counts and histogram totals
    against what write_syn_dataset reported.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # each tuple is the three values a generated cell can take;
    # the last four variants carry a leading space
    choicesList = [
        ('N', 'Y', '0'),
        ('n', 'y', '0'),
        ('F', 'T', '0'),
        ('f', 't', '0'),
        (' N', ' Y', ' 0'),
        (' n', ' y', ' 0'),
        (' F', ' T', ' 0'),
        (' f', ' t', ' 0'),
    ]

    # white space is stripped
    expectedList = [
        ('N', 'Y', '0'),
        ('n', 'y', '0'),
        ('F', 'T', '0'),
        ('f', 't', '0'),
        ('N', 'Y', '0'),
        ('n', 'y', '0'),
        ('F', 'T', '0'),
        ('f', 't', '0'),
    ]

    tryList = [
        # colname, (min, 25th, 50th, 75th, max)
        (100, 200, 'x.hex', choicesList[4], expectedList[4]),
        (100, 200, 'x.hex', choicesList[5], expectedList[5]),
        (100, 200, 'x.hex', choicesList[6], expectedList[6]),
        (100, 200, 'x.hex', choicesList[7], expectedList[7]),
        (100, 200, 'x.hex', choicesList[3], expectedList[3]),
        (1000, 200, 'x.hex', choicesList[2], expectedList[2]),
        (10000, 200, 'x.hex', choicesList[1], expectedList[1]),
        (100000, 200, 'x.hex', choicesList[0], expectedList[0]),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, choices, expected) in tryList:
        # max error = half the bin size?
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

        print "Creating random", csvPathname
        # expectedNaCnt: per-column NA counts the generator injected
        expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, choices)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False)

        pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
        print pA.numRows, pA.numCols, pA.parse_key

        iA = h2o_cmd.InspectObj(pA.parse_key,
            expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])
        print iA.missingList, iA.labelList, iA.numRows, iA.numCols

        for i in range(colCount):
            # walks across the columns triggering a summary on the col desired
            # runSummary returns a column object now. inspect and parse don't. They return json.
            # maybe eventually will make them return object? But I also pass expected stuff to them
            # should I pass expected to summary? no, more complex?
            co = h2o_cmd.runSummary(key=hex_key, column=i)
            print co.label, co.type, co.missing_count, co.domain, sum(co.histogram_bins)

            print "\nComparing column %s to expected" % i
            # NA count must match what the generator wrote...
            self.assertEqual(expectedNaCnt[i], co.missing_count, "Column %s Expected %s. missing: %s is incorrect" % \
                (i, expectedNaCnt[i], co.missing_count))
            # ...and every non-NA row must land in some histogram bin
            self.assertEqual(rowCount - expectedNaCnt[i], sum(co.histogram_bins))

        h2p.green_print("\nDone with trial", trial)
        trial += 1

        h2i.delete_keys_at_all_nodes()
def do_h2o_glm(self, bucket, csvPathname, L, family='binomial'): h2p.red_print("\nNow doing h2o") parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='local', timeoutSecs=180) # save the resolved pathname for use in the sklearn csv read below inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print inspect print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) x = 'ID' y = 'CAPSULE' family = family alpha = '0' lambda_ = L nfolds = '0' f = 'prostate' modelKey = 'GLM_' + f kwargs = { 'response' : y, 'ignored_cols' : x, 'family' : family, 'lambda' : lambda_, 'alpha' : alpha, 'n_folds' : nfolds, # passes if 0, fails otherwise 'destination_key' : modelKey, } timeoutSecs = 60 start = time.time() glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) # this stuff was left over from when we got the result after polling the jobs list # okay to do it again # GLM2: when it redirects to the model view, we no longer have the job_key! 
(unlike the first response and polling) (warnings, clist, intercept) = h2o_glm.simpleCheckGLM(self, glmResult, None, **kwargs) cstring = "".join([("%.5e " % c) for c in clist]) h2p.green_print("h2o alpha ", alpha) h2p.green_print("h2o lambda ", lambda_) h2p.green_print("h2o coefficient list:", cstring) h2p.green_print("h2o intercept", "%.5e " % intercept) # other stuff in the json response glm_model = glmResult['glm_model'] _names = glm_model['_names'] coefficients_names = glm_model['coefficients_names'] # the first submodel is the right one, if onely one lambda is provided as a parameter above submodels = glm_model['submodels'][0] beta = submodels['beta'] h2p.red_print("beta:", beta) norm_beta = submodels['norm_beta'] iteration = submodels['iteration'] validation = submodels['validation'] auc = validation['auc'] aic = validation['aic'] null_deviance = validation['null_deviance'] residual_deviance = validation['residual_deviance'] print '_names', _names print 'coefficients_names', coefficients_names # did beta get shortened? the simple check confirms names/beta/norm_beta are same length print 'beta', beta print 'iteration', iteration print 'auc', auc
def runSummary(node=None, key=None, column=None, expected=None, maxDelta=None, noPrint=False, **kwargs): if not key: raise Exception('No key for Summary') if not node: node = h2o_nodes.nodes[0] # return node.summary(key, **kwargs) i = InspectObj(key=key) # just so I don't have to change names below missingList = i.missingList labelList = i.labelList numRows = i.numRows numCols = i.numCols # doesn't take indices? only column labels? # return first column, unless specified if not (column is None or isinstance(column, (basestring, int))): raise Exception("column param should be string or integer index or None %s %s" % (type(column), column)) # either return the first col, or the col indentified by label. the column identifed could be string or index? if column is None: # means the summary json when we ask for col 0, will be what we return (do all though) colNameToDo = labelList colIndexToDo = range(len(labelList)) elif isinstance(column, int): colNameToDo = [labelList[column]] colIndexToDo = [column] elif isinstance(column, basestring): colNameToDo = [column] colIndexToDo = [labelList.index[column]] else: raise Exception("wrong type %s for column %s" % (type(column), column)) # we get the first column as result after walking across all, if no column parameter desiredResult = None for (colIndex, colName) in zip(colIndexToDo, colNameToDo): print "doing summary on %s %s" % (colIndex, colName) # ugly looking up the colIndex co = SummaryObj(key=key, colIndex=colIndex, colName=colName) if not desiredResult: desiredResult = co if not noPrint: for k,v in co: # only print [0] of mins and maxs because of the e308 values when they don't have dataset values if k=='mins' or k=='maxs': print "%s[0]" % k, v[0] else: print k, v if expected is not None: print "len(co.bins):", len(co.bins) print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean) # what is precision. -1? print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma) print "FIX! 
hacking the co.pctiles because it's short by two" if co.pctiles: pctiles = [0] + co.pctiles + [0] else: pctiles = None # the thresholds h2o used, should match what we expected # expected = [0] * 5 # Fix. doesn't check for expected = 0? # max of one bin if maxDelta is None: maxDelta = (co.maxs[0] - co.mins[0])/1000 if expected[0]: h2o_util.assertApproxEqual(co.mins[0], expected[0], tol=maxDelta, msg='min is not approx. expected') if expected[1]: h2o_util.assertApproxEqual(pctiles[3], expected[1], tol=maxDelta, msg='25th percentile is not approx. expected') if expected[2]: h2o_util.assertApproxEqual(pctiles[5], expected[2], tol=maxDelta, msg='50th percentile (median) is not approx. expected') if expected[3]: h2o_util.assertApproxEqual(pctiles[7], expected[3], tol=maxDelta, msg='75th percentile is not approx. expected') if expected[4]: h2o_util.assertApproxEqual(co.maxs[0], expected[4], tol=maxDelta, msg='max is not approx. expected') # figure out the expected max error # use this for comparing to sklearn/sort MAX_QBINS = 1000 if expected[0] and expected[4]: expectedRange = expected[4] - expected[0] # because of floor and ceil effects due we potentially lose 2 bins (worst case) # the extra bin for the max value, is an extra bin..ignore expectedBin = expectedRange/(MAX_QBINS-2) maxErr = expectedBin # should we have some fuzz for fp? else: print "Test won't calculate max expected error" maxErr = 0 pt = h2o_util.twoDecimals(pctiles) # only look at [0] for now...bit e308 numbers if unpopulated due to not enough unique values in dataset column mx = h2o_util.twoDecimals(co.maxs[0]) mn = h2o_util.twoDecimals(co.mins[0]) print "co.label:", co.label, "co.pctiles (2 places):", pt print "co.default_pctiles:", co.default_pctiles print "co.label:", co.label, "co.maxs: (2 places):", mx print "co.label:", co.label, "co.mins: (2 places):", mn # FIX! why would pctiles be None? enums? 
if pt is None: compareActual = mn, [None] * 3, mx else: compareActual = mn, pt[3], pt[5], pt[7], mx h2p.green_print("actual min/25/50/75/max co.label:", co.label, "(2 places):", compareActual) h2p.green_print("expected min/25/50/75/max co.label:", co.label, "(2 places):", expected) return desiredResult
def test_exec2_quant_cmp_uniform(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (500000, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00)), (500000, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)), (100000, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)), (100000, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50, 1.00)), (100000, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00, 100.0)), (100000, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)), (100000, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00)), (100000, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7, 100, 00)), (100000, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00)), (100000, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28, 100.00)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals( mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals( sd) zeros = stats['zeros'] mins = stats['mins'] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') maxs = stats['maxs'] h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct = [ 0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999 ] pctile = stats['pctile'] h2o_util.assertApproxEqual( pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. 
expected') h2o_util.assertApproxEqual( pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') h2o_util.assertApproxEqual( pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows / len(hcnt) # apparently we're not able to estimate for these datasets # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, # msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] print "min/25/50/75/max colname:", colname, "(2 places):", compareActual print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2p.blue_print("\nTrying exec quantile") # thresholds = "c(0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99)" # do the equivalent exec quantile? # execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds) print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile" for i, threshold in enumerate(thresholds): # FIX! 
do two of the same?..use same one for the 2nd if i != 0: # execExpr = "r2=c(1); r2=quantile(%s[,4],c(0,.05,0.3,0.55,0.7,0.95,0.99))" % hex_key execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s,%s));" % ( hex_key, threshold, threshold) (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) h2p.green_print("\nresultExec: %s" % h2o.dump_json(resultExec)) h2p.blue_print( "\nthreshold: %.2f Exec quantile: %s Summary2: %s" % (threshold, result, pt[i])) if not result: raise Exception( "exec result: %s for quantile: %s is bad" % (result, threshold)) h2o_util.assertApproxEqual( result, pctile[i], tol=maxDelta, msg= 'exec percentile: %s too different from expected: %s' % (result, pctile[i])) # for now, do one with all, but no checking else: # This seemed to "work" but how do I get the key name for the list of values returned # the browser result field seemed right, but nulls in the key if 1 == 0: execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % ( hex_key, ",".join(map(str, thresholds))) else: # does this way work (column getting)j execExpr = "r2=c(1); r2=quantile(%s$C1, c(%s));" % ( hex_key, ",".join(map(str, thresholds))) (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) inspect = h2o_cmd.runInspect(key='r2') numCols = inspect['numCols'] numRows = inspect['numRows'] self.assertEqual(numCols, 1) self.assertEqual(numRows, len(thresholds)) # FIX! should run thru the values in the col? how to get # compare the last one if colname != '': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=thresholds[-1], # h2oSummary2=pctile[-1], # h2oQuantilesApprox=result, # from exec h2oExecQuantiles=result, ) h2o.nodes[0].remove_all_keys()
def test_quant_cmp_uniform(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (5*ROWS, 1, 'x.hex', 1, 20000, ['C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00]), (5*ROWS, 1, 'x.hex', -5000, 0, ['C1', -5001.00, -3750.0, -2445, -1200.0, 99]), (1*ROWS, 1, 'x.hex', -100000, 100000, ['C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0]), (1*ROWS, 1, 'x.hex', -1, 1, ['C1', -1.05, -0.48, 0.0087, 0.50, 1.00]), (1*ROWS, 1, 'A.hex', 1, 100, ['C1', 1.05, 26.00, 51.00, 76.00, 100.0]), (1*ROWS, 1, 'A.hex', -99, 99, ['C1', -99, -50.0, 0, 50.00, 99]), (1*ROWS, 1, 'B.hex', 1, 10000, ['C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00]), (1*ROWS, 1, 'B.hex', -100, 100, ['C1', -100.10, -50.0, 0.85, 51.7, 100,00]), (1*ROWS, 1, 'C.hex', 1, 100000, ['C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00]), (1*ROWS, 1, 'C.hex', -101, 101, ['C1', -100.10, -50.45, -1.18, 49.28, 100.00]), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? colname = expected[0] maxDelta = ((expectedMax - expectedMin)/1000.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) # need the full pathname when python parses the csv for numpy/sort csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) #*************************** # Parse parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount) numRows = pA.numRows numCols = pA.numCols parse_key = pA.parse_key # this guy can take json object as first thing, or re-read with key iA = h2o_cmd.InspectObj(parse_key, expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[]) #*************************** # Summary co = h2o_cmd.runSummary(key=parse_key) default_pctiles = co.default_pctiles coList = [ co.base, len(co.bins), len(co.data), co.domain, co.label, co.maxs, co.mean, co.mins, co.missing, co.ninfs, co.pctiles, co.pinfs, co.precision, co.sigma, co.str_data, co.stride, co.type, co.zeros] for c in coList: print c print "len(co.bins):", len(co.bins) print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean) print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma) print "FIX! hacking the co.pctiles because it's short by two" summ_pctiles = [0] + co.pctiles + [0] pt = h2o_util.twoDecimals(summ_pctiles) mx = h2o_util.twoDecimals(co.maxs) mn = h2o_util.twoDecimals(co.mins) exp = h2o_util.twoDecimals(expected[1:]) print "co.label:", co.label, "co.pctiles (2 places):", pt print "default_pctiles:", default_pctiles print "co.label:", co.label, "co.maxs: (2 places):", mx print "co.label:", co.label, "co.mins: (2 places):", mn # FIX! 
we should do an exec and compare using the exec quantile too h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\ mn[0], pt[3], pt[5], pt[7], mx[0]) h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\ exp[0], exp[1], exp[2], exp[3], exp[4]) #*************************** # Quantile # the thresholds h2o used, should match what we expected # using + here seems to result in an odd tuple..doesn't look right to h2o param # so went with this. Could add '[' and ']' to the list first, before the join. probsStr = "[%s]" % ",".join(map(str,probsList)) parameters = { 'model_id': "a.hex", 'training_frame': parse_key, 'validation_frame': parse_key, 'ignored_columns': None, 'probs': probsStr, } model_key = 'qhex' bmResult = h2o.n0.build_model( algo='quantile', model_id=model_key, training_frame=parse_key, parameters=parameters, timeoutSecs=10) bm = OutputObj(bmResult, 'bm') msec = bm.jobs[0]['msec'] print "bm msec", msec # quantile result is just a job result to a key modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0], 'model') print "model.output:", model.output print "model.output:['quantiles']", model.output['quantiles'] print "model.output:['iterations']", model.output['iterations'] print "model.output:['names']", model.output['names'] quantiles = model.output['quantiles'][0] # why is this a double array iterations = model.output['iterations'] assert iterations == 11, iterations print "quantiles: ", quantiles print "iterations: ", iterations # cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) # cmm = OutputObj(cmmResult, 'cmm') # mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) # mm = OutputObj(mmResult, 'mm') # prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) # pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr') h2o_cmd.runStoreView() trial += 1 # compare the last threshold if colname!='': 
# don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=CHECK_PCTILE, # h2oSummary2=pctile[-1], # h2oQuantilesApprox=result, # from exec h2oExecQuantiles=quantiles[CHECK_PCTILE_INDEX], ) h2o.nodes[0].remove_all_keys()
def test_summary2_uniform_w_NA(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (ROWS, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00)), (ROWS, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)), (ROWS, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)), (ROWS, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50, 1.00)), (ROWS, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00, 100.0)), (ROWS, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)), (ROWS, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00)), (ROWS, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7, 100,00)), (ROWS, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00)), (ROWS, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28, 100.00)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key, noPrint=False, max_qbins=MAX_QBINS, numRows=numRows, numCols=numCols) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd) zeros = stats['zeros'] mins = stats['mins'] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') maxs = stats['maxs'] h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99] pctile = stats['pctile'] h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. 
expected') h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "" print "hcnt:", hcnt print "len(hcnt)", len(hcnt) print "numRows:", numRows, "rowCount: ", rowCount self.assertEqual((1+NA_ROW_RATIO) * rowCount, numRows, msg="numRows %s should be %s" % (numRows, (1+NA_ROW_RATIO) * rowCount)) # don't check the last bin # we sometimes get a messed up histogram for all NA cols? just don't let them go thru here for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = rowCount/len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution) # don't check the edge bins # NA rows should be ignored self.assertAlmostEqual(b, e, delta=2*e, msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 scipyCol = 1 h2i.delete_keys_at_all_nodes()
def test_rand_inspect(self): ### h2b.browseTheCloud() ### h2b.browseTheCloud() csvFilename = 'covtype.data' csvPathname = 'standard/' + csvFilename hex_key = csvFilename + ".hex" print "\n" + csvPathname parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10) destination_key = parseResult['destination_key'] print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", destination_key def inspect_and_check(nodeX, destination_key, offset, view, inspectOld=None): inspectNew = h2o_cmd.runInspect(h2o.nodes[nodeX], destination_key, offset=offset, view=view) if h2o.beta_features: pass # print "Inspect2:", h2o.dump_json(inspectNew) else: pass # print "Inspect:", h2o.dump_json(inspectNew) # FIX! get min/max/mean/variance for a col too? constantNames = [ ('num_cols', 'numCols'), ('num_rows', 'numRows'), ('value_size_bytes', 'byteSize'), ('cols', 'cols'), ] colNames = [ ('num_missing_values', 'naCnt'), ] for (i, j) in constantNames: # check the fields, even if you don't have a previous one to compare to if h2o.beta_features: # hack in extra info for now, from the new names to old names if not j in inspectNew: raise Exception( "Can't find %s, Inspect2 result should have it?" % j) inspectNew[i] = inspectNew[j] # don't compare if cols if inspectOld and i != 'cols': if h2o.beta_features and i == 'value_size_bytes': # Inspect2 should be smaller self.assertGreater(inspectOld[i], inspectNew[i]) else: # for cols it will just compare length? self.assertEqual(inspectOld[i], inspectNew[i]) if i == 'cols': for (m, n) in colNames: if h2o.beta_features: if not n in inspectNew[i][0]: print h2o.dump_json(inspectNew[i][0]) raise Exception( "Can't find %s, Inspect2 result['cols'][0] should have it?" 
% n) inspectNew[i][0][m] = inspectNew[i][0][n] # just compare 0 if inspectOld is not None: self.assertEqual(inspectOld[i][0][m], inspectNew[i][0][m]) return inspectNew # going to use this to compare against future. num_rows/num_cols should always # be the same, regardless of the view. just a coarse sanity check origInspect = inspect_and_check(0, destination_key, 0, 1, None) h2o.verboseprint(h2o.dump_json(origInspect)) origStoreViewResult = h2o_cmd.runStoreView(offset=0, view=1024, timeoutSecs=60) num_rows = origInspect['num_rows'] num_cols = origInspect['num_cols'] lenNodes = len(h2o.nodes) for trial in range(10): h2p.green_print("\nTrial", trial) # we want to use the boundary conditions, so have two level of random choices offset = good_choices(num_rows) view = good_choices(num_cols) # randomize the node used nodeX = random.randint(0, lenNodes - 1) print "nodeX:", nodeX, "offset:", offset, "view:", view h2o.beta_features = False inspect_and_check(nodeX, destination_key, offset, view, origInspect) print "trying Inspect2 by flipping h2o.nodes[0].beta_features" h2o.beta_features = True # delay between the two inspects...bug around not getting autoframe in storeview? 
time.sleep(1) inspect_and_check(nodeX, destination_key, offset, view, origInspect) h2o.beta_features = False # a fvec frame should have been created in the storeView time.sleep(1) # loop looking for the autoframe to show up # o = len(origStoreViewResult['keys']) o = h2i.count_keys_at_all_nodes() retry = 0 okay = False while retry == 0 or not okay: newStoreViewResult = h2o_cmd.runStoreView(offset=0, view=1024, timeoutSecs=60) ## p = len(newStoreViewResult['keys']) p = h2i.count_keys_at_all_nodes() print "number of keys in the two StoreViews, o:", o, "p:", p ## print "newStoreViewResult:", h2o.dump_json(newStoreViewResult) oOkay = {1, 2, 3, 4, 5, 6, 7, 8} pOkay = {1, 2, 3, 4, 5} print o, pOkay, p, oOkay if (o in oOkay) and (p in pOkay): print "Good" okay = True else: print "Unexpected o,p after autoframe, looking at total keys in system: %s %s" % ( o, p) if retry == 10: raise Exception( "StoreView didn't get autoframe, after %s retries" % retry) ## h2b.browseJsonHistoryAsUrlLastMatch("StoreView") # so he gets recreated?? deleted = h2i.delete_keys_at_all_nodes(pattern='autoframe') # The autoframe key may not show up!! if INVISIBLE_AUTOFRAME: # can be 1 or 2 if not (deleted == 0 or deleted == 1): msg = "Should have deleted a total of 0 or 1 keys, looking at all nodes. Did %s" % deleted raise Exception(msg) else: # can be 1 or 2 if not (deleted == 1): msg = "Should have deleted a total of 1 keys, looking at all nodes. Did %s" % deleted time.sleep(1) retry += 1
def test_summary2_small(self):
    # Summary2 check on tiny synthetic datasets built from a short list of
    # known values, so the median can be predicted exactly (maxDelta is forced
    # to 0 below). Also cross-checks the Quantiles endpoint and, at the end,
    # a scipy-based comparison against the raw csv.
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # colname, (min, 25th, 50th, 75th, max)
        # if rowCount is None, we'll just use the data values
        # None in expected values means no compare
        (None, 1, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
        (None, 2, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
        (None, 10, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
        (None, 100, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
        (None, 1000, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
        # (None, 10000, 'x.hex', [-1,0,1], ('C1', None, None, 0, None, None)),
        # (COLS, 1, 'x.hex', [1,0,-1], ('C1', None, None, None, None, None)),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, values, expected) in tryList:
        # max error = half the bin size?
        expectedMax = max(values)
        expectedMin = min(values)
        maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
        # add 5% for fp errors?
        maxDelta = 1.05 * maxDelta
        # hmm...say we should be 100% accurate for these tests?
        maxDelta = 0

        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1
        # with rowCount None, the file has exactly one row per value
        if not rowCount:
            rowFile = len(values)
        else:
            rowFile = rowCount
        csvFilename = "syn_" + "binary" + "_" + str(rowFile) + "x" + str(colCount) + ".csv"
        csvPathname = SYNDATASETS_DIR + "/" + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, values, SEEDPERFILE)

        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(
            path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30, doSummary=False
        )
        print "Parse result['destination_key']:", parseResult["destination_key"]

        inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
        print "\n" + csvFilename
        numRows = inspect["numRows"]
        numCols = inspect["numCols"]

        summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS, timeoutSecs=45)
        h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

        # cross-check Summary2's percentile with the Quantiles endpoint
        quantile = 0.5 if DO_MEDIAN else 0.999
        q = h2o.nodes[0].quantiles(
            source_key=hex_key,
            column=0,
            interpolation_type=7,
            quantile=quantile,
            max_qbins=MAX_QBINS,
            multiple_pass=2,
        )
        qresult = q["result"]
        qresult_single = q["result_single"]
        qresult_iterations = q["iterations"]
        qresult_interpolated = q["interpolated"]

        h2p.blue_print("h2o quantiles result:", qresult)
        h2p.blue_print("h2o quantiles result_single:", qresult_single)
        h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
        h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
        print h2o.dump_json(q)

        self.assertLess(
            qresult_iterations,
            16,
            msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?",
        )

        # only one column
        column = summaryResult["summaries"][0]

        colname = column["colname"]
        coltype = column["type"]
        nacnt = column["nacnt"]

        stats = column["stats"]
        stattype = stats["type"]

        # FIX! we should compare mean and sd to expected?
        mean = stats["mean"]
        sd = stats["sd"]
        print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
        print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

        zeros = stats["zeros"]
        mins = stats["mins"]
        maxs = stats["maxs"]

        pct = stats["pct"]
        # the thresholds h2o used, should match what we expected
        expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]
        pctile = stats["pctile"]
        print "pctile:", pctile

        # expected entries that are None are skipped (no compare)
        if expected[0]:
            self.assertEqual(colname, expected[0])
        if expected[1]:
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg="min is not approx. expected")
        if expected[2]:
            h2o_util.assertApproxEqual(
                pctile[3], expected[2], tol=maxDelta, msg="25th percentile is not approx. expected"
            )
        if expected[3]:
            h2o_util.assertApproxEqual(
                pctile[5], expected[3], tol=maxDelta, msg="50th percentile (median) is not approx. expected"
            )
        if expected[4]:
            h2o_util.assertApproxEqual(
                pctile[7], expected[4], tol=maxDelta, msg="75th percentile is not approx. expected"
            )
        if expected[5]:
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg="max is not approx. expected")

        hstart = column["hstart"]
        hstep = column["hstep"]
        hbrk = column["hbrk"]
        hcnt = column["hcnt"]

        print "pct:", pct
        print ""
        print "hcnt:", hcnt
        print "len(hcnt)", len(hcnt)

        # don't check the last bin
        for b in hcnt[1:-1]:
            # should we be able to check for a uniform distribution in the files?
            e = numRows / len(hcnt)
            # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
            # don't check the edge bins
            self.assertAlmostEqual(
                b, numRows / len(hcnt), delta=1 + 0.01 * numRows, msg="Bins not right. b: %s e: %s" % (b, e)
            )

        pt = h2o_util.twoDecimals(pctile)
        mx = h2o_util.twoDecimals(maxs)
        mn = h2o_util.twoDecimals(mins)
        print "colname:", colname, "pctile (2 places):", pt
        print "colname:", colname, "maxs: (2 places):", mx
        print "colname:", colname, "mins: (2 places):", mn

        # FIX! we should do an exec and compare using the exec quantile too
        compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
        h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
        print "maxs colname:", colname, "(2 places):", mx
        print "mins colname:", colname, "(2 places):", mn

        trial += 1
        h2o.nodes[0].remove_all_keys()

        scipyCol = 0
        # don't check if colname is empty..means it's a string and scipy doesn't parse right?
        if colname != "":
            # don't do for enums
            # also get the median with a sort (h2o_summ.percentileOnSortedlist()
            h2o_summ.quantile_comparisons(
                csvPathnameFull,
                col=scipyCol,  # what col to extract from the csv
                datatype="float",
                quantile=0.5 if DO_MEDIAN else 0.999,
                h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                # h2oQuantilesApprox=qresult_single,
                h2oQuantilesExact=qresult,
            )
def test_summary2_exp(self):
    # Summary2 check on a single exponentially-distributed synthetic column
    # (rate lambd drawn once per run). Expected percentile entries are all
    # None here, so only colname and the endpoint plumbing are asserted;
    # a scipy comparison at the end would only run if expected[0] is set.
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    LAMBD = random.uniform(0.005, 0.5)
    tryList = [
        # colname, (min, 25th, 50th, 75th, max)
        (10, 1, "x.hex", 1, 20000, ("C1", None, None, None, None, None)),
        (100, 1, "x.hex", 1, 20000, ("C1", None, None, None, None, None)),
        (1000, 1, "x.hex", -5000, 0, ("C1", None, None, None, None, None)),
        (10000, 1, "x.hex", -100000, 100000, ("C1", None, None, None, None, None)),
        (100000, 1, "x.hex", -1, 1, ("C1", None, None, None, None, None)),
        (1000000, 1, "A.hex", 1, 100, ("C1", None, None, None, None, None)),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    timeoutSecs = 60
    # rangeMin and rangeMax are not used right now
    for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = "syn_" + "binary" + "_" + str(rowCount) + "x" + str(colCount) + ".csv"
        csvPathname = SYNDATASETS_DIR + "/" + csvFilename

        print "Creating random", csvPathname, "lambd:", LAMBD
        # the dataset writer returns the actual min/max it generated
        (expectedMin, expectedMax) = write_syn_dataset(
            csvPathname, rowCount, colCount, lambd=LAMBD, SEED=SEEDPERFILE
        )
        print "expectedMin:", expectedMin, "expectedMax:", expectedMax
        # max error = half the bin size?
        maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
        # add 5% for fp errors?
        maxDelta = 1.05 * maxDelta

        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(
            path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30, doSummary=False
        )
        print "Parse result['destination_key']:", parseResult["destination_key"]

        inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
        print "\n" + csvFilename
        numRows = inspect["numRows"]
        numCols = inspect["numCols"]

        summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
        h2o.verboseprint("Summary2 summaryResult:", h2o.dump_json(summaryResult))

        # only one column
        column = summaryResult["summaries"][0]

        colname = column["colname"]
        coltype = column["type"]
        nacnt = column["nacnt"]

        stats = column["stats"]
        stattype = stats["type"]

        # FIX! we should compare mean and sd to expected?
        mean = stats["mean"]
        sd = stats["sd"]
        print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
        print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

        zeros = stats["zeros"]
        mins = stats["mins"]
        maxs = stats["maxs"]
        pct = stats["pct"]
        expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]
        pctile = stats["pctile"]

        # the thresholds h2o used, should match what we expected
        # (None entries in expected are skipped — no compare)
        if expected[0]:
            self.assertEqual(colname, expected[0])
        if expected[1]:
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg="min is not approx. expected")
        if expected[2]:
            h2o_util.assertApproxEqual(
                pctile[3], expected[2], tol=maxDelta, msg="25th percentile is not approx. expected"
            )
        if expected[3]:
            h2o_util.assertApproxEqual(
                pctile[5], expected[3], tol=maxDelta, msg="50th percentile (median) is not approx. expected"
            )
        if expected[4]:
            h2o_util.assertApproxEqual(
                pctile[7], expected[4], tol=maxDelta, msg="75th percentile is not approx. expected"
            )
        if expected[5]:
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg="max is not approx. expected")

        hstart = column["hstart"]
        hstep = column["hstep"]
        hbrk = column["hbrk"]
        hcnt = column["hcnt"]

        print "pct:", pct
        print ""
        print "hcnt:", hcnt
        print "len(hcnt)", len(hcnt)

        # no per-bin check for the exponential case
        print "Can't estimate the bin distribution"

        pt = h2o_util.twoDecimals(pctile)
        mx = h2o_util.twoDecimals(maxs)
        mn = h2o_util.twoDecimals(mins)
        print "colname:", colname, "pctile (2 places):", pt
        print "colname:", colname, "maxs: (2 places):", mx
        print "colname:", colname, "mins: (2 places):", mn

        # FIX! we should do an exec and compare using the exec quantile too
        compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
        h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
        print "maxs colname:", colname, "(2 places):", mx
        print "mins colname:", colname, "(2 places):", mn

        trial += 1
        h2o.nodes[0].remove_all_keys()

        scipyCol = 0
        if colname != "" and expected[scipyCol]:
            # don't do for enums
            # also get the median with a sort (h2o_summ.percentileOnSortedlist()
            h2o_summ.quantile_comparisons(
                csvPathnameFull,
                skipHeader=True,
                col=scipyCol,
                datatype="float",
                quantile=0.5 if DO_MEDIAN else 0.999,
                h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                # h2oQuantilesApprox=qresult_single,
                # h2oQuantilesExact=qresult,
            )