def test_NOPASS_GLM2_tweedie_rand2(self):
    """Run 10 randomized GLM2 tweedie trials against covtype (beta_features API).

    Each trial starts from a fixed default param dict and perturbs it via
    pickRandGlmParams, then checks the GLM result and the sandbox logs.
    """
    h2o.beta_features = True
    # hardwired toggle: the True branch uses the full covtype dataset;
    # flip to 1==0 to use the small 20k-row sample instead
    if 1==1:
        csvPathname = 'standard/covtype.data'
        hex_key = 'covtype.hex'
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, schema='put')
    else:
        csvPathname = 'covtype/covtype.20k.data'
        hex_key = 'covtype.20k.hex'
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, schema='put')
    paramDict = define_params()
    for trial in range(10):
        # params is mutable. This is default.
        params = {
            'response': 54,
            'lambda': 0,
            'alpha': 0,
            'n_folds': 1,
            'family': 'tweedie'
        }
        # randomize some of the params in place (returns the chosen x columns)
        colX = h2o_glm.pickRandGlmParams(paramDict, params)
        kwargs = params.copy()
        start = time.time()
        glm = h2o_cmd.runGLM(timeoutSecs=180, parseResult=parseResult, **kwargs)
        # pass the kwargs with all the params, so we know what we asked for!
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        h2o.check_sandbox_for_errors()
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
        print "Trial #", trial, "completed\n"
def test_GLM_tweedie_rand2(self):
    """Run 10 randomized GLM tweedie trials on covtype (old GLM1 param names: y/case/case_mode)."""
    # hardwired toggle: the True branch uses the full covtype dataset;
    # flip to 1 == 0 to use the small 20k-row sample instead
    if 1 == 1:
        csvPathname = "standard/covtype.data"
        hex_key = "covtype.hex"
        parseResult = h2i.import_parse(
            bucket="home-0xdiag-datasets", path=csvPathname, hex_key=hex_key, schema="put"
        )
    else:
        csvPathname = "covtype/covtype.20k.data"
        hex_key = "covtype.20k.hex"
        parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, hex_key=hex_key, schema="put")
    paramDict = define_params()
    for trial in range(10):
        # params is mutable. This is default.
        params = {"y": 54, "case": 4, "case_mode": "=", "lambda": 0, "alpha": 0, "n_folds": 1, "family": "tweedie"}
        # randomize some of the params in place (returns the chosen x columns)
        colX = h2o_glm.pickRandGlmParams(paramDict, params)
        kwargs = params.copy()
        start = time.time()
        glm = h2o_cmd.runGLM(timeoutSecs=180, parseResult=parseResult, **kwargs)
        # pass the kwargs with all the params, so we know what we asked for!
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        h2o.check_sandbox_for_errors()
        print "glm end on ", csvPathname, "took", time.time() - start, "seconds"
        print "Trial #", trial, "completed\n"
def test_GLM2_airline(self):
    """Fit a binomial GLM2 on AirlinesTrain, then fit the same model spec on AirlinesTest.

    Fix: the result variables were swapped -- the model built on the TRAIN
    file was stored in 'glmtest' and the one built on the TEST file in
    'glmtrain'. They are now named consistently with the data they came from.
    """
    ############# Train ###############################
    csvFilename = 'AirlinesTrain.csv.zip'
    csvPathname = 'airlines' + '/' + csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=15)
    # IsDepDelayed_REC duplicates the response, so it must be ignored
    params = {'response': 'IsDepDelayed', 'ignored_cols': 'IsDepDelayed_REC', 'family': 'binomial'}
    kwargs = params.copy()
    starttime = time.time()
    glmtrain = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=15, **kwargs)
    elapsedtime = time.time() - starttime
    print("ELAPSED TIME TRAIN DATA ",elapsedtime)
    h2o_glm.simpleCheckGLM(self, glmtrain, None, **kwargs)

    ######### Test ######################################
    csvFilename = 'AirlinesTest.csv.zip'
    csvPathname = 'airlines' + '/' + csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=15)
    params = {'response': 'IsDepDelayed', 'ignored_cols': 'IsDepDelayed_REC', 'family': 'binomial'}
    kwargs = params.copy()
    starttime = time.time()
    glmtest = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=15, **kwargs)
    elapsedtime = time.time() - starttime
    print("ELAPSED TIME TEST DATA ",elapsedtime)
    h2o_glm.simpleCheckGLM(self, glmtest, None, **kwargs)
def test_prostate_poisson(self): errors = [] # First try on small data (1 chunk) parseResult = h2i.import_parse( bucket="smalldata", path="logreg/prostate.csv", schema="put", hex_key="poisson_p" ) # R results poisson_coefficients = { "Intercept": -4.107484, "ID": 0.000508, "AGE": -0.004357, "RACE": -0.149412, "DPROS": 0.230458, "DCAPS": 0.071546, "PSA": 0.002944, "VOL": -0.007488, "GLEASON": 0.441659, } poisson_nd = 278.4 poisson_rd = 215.7 poisson_aic = 539.7 errors = self.process_dataset( parseResult, "CAPSULE", poisson_coefficients, poisson_nd, poisson_rd, poisson_aic, family="poisson" ) if errors: self.fail(str(errors)) # Now try on larger data (replicated), will be chunked this time, should produce same results parseResult = h2i.import_parse( bucket="smalldata", path="logreg/prostate_long.csv.gz", schema="put", hex_key="poisson_long_p" ) errors = self.process_dataset( parseResult, "CAPSULE", poisson_coefficients, poisson_nd, poisson_rd, poisson_aic, family="poisson" ) if errors: self.fail(str(errors))
def test_prostate_binomial(self): errors = [] # First try on small data (1 chunk) parseResult = h2i.import_parse( bucket="smalldata", path="logreg/prostate.csv", schema="put", hex_key="prostate_b" ) # R results binomial_coefficients = { "Intercept": -8.126278, "ID": 0.001609, "AGE": -0.008138, "RACE": -0.617597, "DPROS": 0.553065, "DCAPS": 0.546087, "PSA": 0.027297, "VOL": -0.011540, "GLEASON": 1.010125, } binomial_nd = 512.3 binomial_rd = 376.9 binomial_aic = 394.9 errors = self.process_dataset( parseResult, "CAPSULE", binomial_coefficients, binomial_nd, binomial_rd, binomial_aic, family="binomial" ) if errors: self.fail(str(errors)) # Now try on larger data (replicated), will be chunked this time, should produce same results parseResult = h2i.import_parse( bucket="smalldata", path="logreg/prostate_long.csv.gz", schema="put", hex_key="prostate_long_b" ) errors = self.process_dataset( parseResult, "CAPSULE", binomial_coefficients, binomial_nd, binomial_rd, binomial_aic, family="binomial" ) if errors: self.fail(str(errors))
def test_parse_summary_c21(self):
    """Parse the c21 persona dataset twice (as train and test keys) and run
    Inspect + Summary on each parsed frame.

    Fix: the second Inspect/Summary pass re-inspected the training key
    (hex_key) instead of the test key it had just parsed; it now uses
    validation_key for both the Inspect and the Summary.
    """
    importFolderPath = '/mnt/0xcustomer-datasets/c21'
    timeoutSecs = 300

    # "train" copy
    csvPathname_train = importFolderPath + '/persona_clean_deep.tsv.zip'
    hex_key = 'train.hex'
    parseResult = h2i.import_parse(path=csvPathname_train, hex_key=hex_key, timeoutSecs=timeoutSecs)
    inspect = h2o_cmd.runInspect(key=hex_key)
    missingValuesList = h2o_cmd.infoFromInspect(inspect, csvPathname_train)
    # self.assertEqual(missingValuesList, [], "%s should have 3 cols of NAs: %s" % (csvPathname_train, missingValuesList)
    numCols = inspect['numCols']
    numRows = inspect['numRows']
    rSummary = h2o_cmd.runSummary(key=hex_key, rows=numRows, cols=numCols)
    h2o_cmd.infoFromSummary(rSummary)

    # "test" copy (same file, different destination key)
    csvPathname_test = importFolderPath + '/persona_clean_deep.tsv.zip'
    validation_key = 'test.hex'
    parseResult = h2i.import_parse(path=csvPathname_test, hex_key=validation_key, timeoutSecs=timeoutSecs)
    inspect = h2o_cmd.runInspect(key=validation_key)
    missingValuesList = h2o_cmd.infoFromInspect(inspect, csvPathname_test)
    # self.assertEqual(missingValuesList, [], "%s should have 3 cols of NAs: %s" % (csvPathname_test, missingValuesList)
    numCols = inspect['numCols']
    numRows = inspect['numRows']
    rSummary = h2o_cmd.runSummary(key=validation_key, rows=numRows, cols=numCols)
    h2o_cmd.infoFromSummary(rSummary)
def test_GLM2grid_covtype_many(self):
    """Launch 3 GLM2 grid jobs on covtype without polling, interleave extra
    no-poll parse jobs, then wait for everything and check each grid result.
    """
    h2o.beta_features = True
    csvFilename = 'covtype.data'
    csvPathname = 'standard/' + csvFilename
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', timeoutSecs=10)
    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    print "\n" + csvPathname, \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])

    print "WARNING: max_iter set to 8 for benchmark comparisons"
    max_iter = 8
    y = "54"
    # comma-separated lambda/alpha lists make this a grid search
    kwargs = {
        'response': y,
        'family': 'gaussian',
        'n_folds': 2,
        'max_iter': max_iter,
        'beta_epsilon': 1e-3,
        'lambda': '0,0.5,0.8',
        'alpha': '0,1e-8,1e-4',
    }

    start = time.time()
    jobs = []
    totalGLMGridJobs = 0

    for i in range(3):
        glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, noPoll=True, **kwargs)
        # print "glmResult:", h2o.dump_json(glmResult)
        # assuming it doesn't complete right away, this is the first response
        # it differs for the last response
        job_key = glmResult['job_key']
        grid_key = glmResult['destination_key']
        jobs.append( (job_key, grid_key) )
        totalGLMGridJobs += 1

        # do some parse work in parallel. Don't poll for parse completion
        # don't bother checking the parses when they are completed (pollWaitJobs looks at all)
        # NOTE(review): this inner loop reuses index name 'i', shadowing the
        # outer loop variable -- harmless here since the outer 'i' is unused
        # after this point, but worth renaming
        for i in range(4):
            time.sleep(3)
            hex_key = str(i) + ".hex"
            src_key = str(i) + ".src"
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put',
                src_key=src_key, hex_key=hex_key,
                timeoutSecs=10, noPoll=True, doSummary=False)

    # wait for all outstanding jobs (grids and parses) to finish
    h2o_jobs.pollWaitJobs(timeoutSecs=300)
    elapsed = time.time() - start

    # 2/GLMGridView.html?grid_key=asd
    # 2/GLMModelView.html?_modelKey=asd_0&lambda=NaN
    # 2/SaveModel.html?model=GLMGridResults__9a29646b78dd988aacd4f88e4d864ccd_1&path=adfs&force=1
    for job_key, grid_key in jobs:
        gridResult = h2o.nodes[0].glm_grid_view(grid_key=grid_key)
        h2o_glm.simpleCheckGLMGrid(self, gridResult, **kwargs)

    print "All GLMGrid jobs completed in", elapsed, "seconds."
    print "totalGLMGridJobs:", totalGLMGridJobs
def test_exec2_cbind_like_R(self):
    """Build two synthetic frames (df: 30000x150, indx: 30000x1), then run the
    module-level exprList through exec 10 times, checking the sandbox each pass.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    SEEDPERFILE = random.randint(0, sys.maxint)

    rowCount = 30000
    colCount = 150
    timeoutSecs = 60
    hex_key = "df"
    csvPathname = SYNDATASETS_DIR + "/" + "df.csv"
    write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
    parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
        timeoutSecs=3000, retryDelaySecs=2, doSummary=False)

    # single-column index frame
    colCount = 1
    hex_key = "indx"
    csvPathname = SYNDATASETS_DIR + "/" + "indx.csv"
    write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
    # NOTE(review): this parse uses schema='local' while the first uses 'put' --
    # confirm the difference is intentional
    parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key,
        timeoutSecs=3000, retryDelaySecs=2, doSummary=False)

    inspect = h2o_cmd.runInspect(key=hex_key)
    print "numRows:", inspect['numRows']
    print "numCols:", inspect['numCols']

    for trial in range(10):
        for execExpr in exprList:
            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300)
            execTime = time.time() - start
            print 'exec took', execTime, 'seconds'
            h2o.check_sandbox_for_errors()
def test_C_RF_poker100(self):
    """RF smoke test on poker100, then generate parity datasets via parity.pl
    and run RF (with growing tree counts) over a subset of them.
    """
    parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker100', schema='put')
    h2o_cmd.runRF(parseResult=parseResult, trees=6, timeoutSecs=10)
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    for x in xrange (11,100,10):
        # Have to split the string out to list for pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),timeout=30)
        # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    trees = 6
    timeoutSecs = 60
    # always match the gen above!
    # reduce to get intermittent failures to lessen, for now
    # (deliberately runs fewer datasets than were generated)
    for x in xrange (11,60,10):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='put')
        h2o_cmd.runRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs)
        trees += 10
def test_GLM_mnist_s3n_fvec(self): csvFilelist = [ ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ("mnist_testing.csv.gz", "mnist_training.csv.gz", 600), ("mnist_training.csv.gz", "mnist_training.csv.gz", 600), ] importFolderPath = "mnist" csvPathname = importFolderPath + "/*" (importHDFSResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', timeoutSecs=120) trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: # PARSE test**************************************** csvPathname = importFolderPath + "/" + testCsvFilename testHexKey = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', hex_key=testHexKey, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # PARSE train**************************************** csvPathname = importFolderPath + "/" + trainCsvFilename trainHexKey = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', hex_key=trainHexKey, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) # GLM**************************************** y = 0 # first column is pixel value print "y:" # don't need the intermediate Dicts produced from columnInfoFromInspect x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) print "x:", x kwargs = { 'response': y, # 'case_mode': '>', # 'case': 0, 'family': 'gaussian', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 5, 'n_folds': 1, 'beta_epsilon': 1.0E-4, } timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
def notest_C_Basic(self):
    """Disabled ('notest_' prefix): exercise H2O_BUCKETS_ROOT-relative bucket
    lookup for a local import+parse.
    """
    # this will do an import folder and parse. schema='local' is default. doesn't need to be specified
    # I guess this will be relative to current wd
    ## if os env variable H2O_BUCKETS_ROOT is set, it will start looking there for bucket, then path
    ## that covers the case where "walking upward" is not sufficient for where you but the bucket (locally)
    os.environ['H2O_BUCKETS_ROOT'] = '/home'
    h2i.import_parse(path='dir3/syn_sphere_gen3.csv', bucket='my-bucket3', schema='local')
    # clean up so later tests don't inherit the override
    del os.environ['H2O_BUCKETS_ROOT']
def test3(self):
    """Import+parse the same file via the s3n and s3 schemas."""
    # h2i.import_parse(path='standard/covtype.data', bucket='home-0xdiag-datasets', schema="s3n", timeoutSecs=60)
    ## This will get it from import hdfs with s3n. the hdfs_name_node and hdfs_version for s3
    # will have been passed at build_cloud, either from the test, or the <config>.json
    h2i.import_parse(path='standard/benign.csv', bucket='home-0xdiag-datasets', schema='s3n', timeoutSecs=60)

    # h2i.import_parse(path='leads.csv', bucket='datasets', schema="hdfs", timeoutSecs=60)
    # h2i.import_parse(path='/datasets/leads.csv', schema="hdfs", timeoutSecs=60)
    # h2i.import_parse(path='datasets/leads.csv', schema="hdfs", timeoutSecs=60)

    ## This will get it from import s3.
    h2i.import_parse(path='standard/benign.csv', bucket='home-0xdiag-datasets', schema='s3', timeoutSecs=60)
def test_rf_iris(self): # Train RF trainParseResult = h2i.import_parse(bucket='smalldata', path='iris/iris2.csv', hex_key='train_iris2.hex', schema='put') kwargs = paramsTrainRF.copy() trainResult = h2o_rf.trainRF(trainParseResult, **kwargs) scoreParseResult = h2i.import_parse(bucket='smalldata', path='iris/iris2.csv', hex_key='score_iris2.hex', schema='put') kwargs = paramsTestRF.copy() scoreResult = h2o_rf.scoreRF(scoreParseResult, trainResult, **kwargs) print "\nTrain\n=========={0}".format(h2o_rf.pp_rf_result(trainResult)) print "\nScoring\n========={0}".format(h2o_rf.pp_rf_result(scoreResult))
def test_GBMScore(self):
    """Train a 100-tree, depth-10 GBM on allyears2k and time how long scoring
    (generate_predictions) the training frame takes.
    """
    h2o.beta_features = True
    importFolderPath = "standard"
    csvTrainPath = importFolderPath + "/allyears2k.csv"
    # test data is the same file as train here
    csvTestPath = csvTrainPath
    # importFolderPath = 'newairlines'
    # csvTrainPath = importFolderPath + '/train/*train*'
    # csvTestPath = importFolderPath + '/train/*test*'
    trainhex = "train.hex"
    testhex = "test.hex"
    parseTrainResult = h2i.import_parse(
        bucket="home-0xdiag-datasets",
        path=csvTrainPath,
        schema="local",
        hex_key=trainhex,
        timeoutSecs=2400,
        doSummary=False,
    )
    parseTestResult = h2i.import_parse(
        bucket="home-0xdiag-datasets",
        path=csvTestPath,
        schema="local",
        hex_key=testhex,
        timeoutSecs=2400,
        doSummary=False,
    )
    inspect_test = h2o.nodes[0].inspect(testhex, timeoutSecs=8000)
    response = "IsDepDelayed"
    # drop leakage columns (anything known only after departure) plus identifiers
    ignored_cols = "DepTime,ArrTime,FlightNum,TailNum,ActualElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed"
    params = {
        "destination_key": "GBMScore",
        "response": response,
        "ignored_cols_by_name": ignored_cols,
        "classification": 1,
        "validation": None,
        "ntrees": 100,
        "max_depth": 10,
        "learn_rate": 0.00005,
    }
    parseResult = {"destination_key": trainhex}
    kwargs = params.copy()
    gbm = h2o_cmd.runGBM(parseResult=parseResult, timeoutSecs=4800, **kwargs)
    # time only the scoring step
    scoreStart = time.time()
    h2o.nodes[0].generate_predictions(model_key="GBMScore", data_key=trainhex)
    scoreElapsed = time.time() - scoreStart
    print "It took ", scoreElapsed, " seconds to score ", inspect_test[
        "numRows"
    ], " rows. Using a GBM with 100 10-deep trees."
    print "That's ", 1.0 * scoreElapsed / 100.0, " seconds per 10-deep tree."
def test_H_Basic(self):
    """Exercise header_from_file parsing and a symlinked-bucket parse."""
    # maybe best to extra the key from an import? first?
    # this isn't used much, maybe we don't care about this
    h2i.import_only(path="testdir_multi_jvm/syn_test/syn_header.csv")
    headerKey = h2i.find_key('syn_header.csv')
    # comma 44 is separator
    h2i.import_parse(path="testdir_multi_jvm/syn_test/syn[1-2].csv", header=1,
        header_from_file=headerKey, separator=44)

    # symbolic links work
    # ln -s /home/0xdiag/datasets home-0xdiag-datasets
    # lrwxrwxrwx 1 kevin kevin 21 Aug 26 22:05 home-0xdiag-datasets -> /home/0xdiag/datasets
    h2i.import_parse(path="standard/covtype.data", bucket="home-0xdiag-datasets")
def test_C_kmeans_prostate(self):
    """Run k=3 KMeans on prostate.csv twice with a fixed seed and check the
    centers match the expected values each time.
    """
    h2o.beta_features = True
    csvFilename = "prostate.csv"
    print "\nStarting", csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path='logreg/'+csvFilename, schema='local', hex_key=csvFilename+".hex")
    h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)

    # loop, to see if we get same centers
    # expected tuples: (center, cluster size, within-cluster error)
    expected = [
        ([55.63235294117647], 68, 667.8088235294117) ,
        ([63.93984962406015], 133, 611.5187969924812) ,
        ([71.55307262569832], 179, 1474.2458100558654) ,
    ]

    # all are multipliers of expected tuple value
    allowedDelta = (0.01, 0.01, 0.01)
    for trial in range(2):
        # fixed seed so the centers are reproducible across trials
        params = {'k': 3,
                  'initialization': 'Furthest',
                  'ignored_cols': "ID",
                  'destination_key': 'prostate_k.hex',
                  'max_iter': 100,
                  'seed': 265211114317615310
                 }
        kwargs = params.copy()
        kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs)
        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvFilename, parseResult, 'd', **kwargs)
        h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_rapids_basic(self):
    """Run each rapids expression in exprList against covtype; record which
    expressions created a non-empty result key.
    """
    bucket = 'home-0xdiag-datasets'
    csvPathname = 'standard/covtype.data'
    hexKey = 'p'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

    keys = []
    for execExpr in exprList:
        # every expression is expected to be a rapids assignment: "(= !<key> ..."
        r = re.match ('\(= \!([a-zA-Z0-9_]+) ', execExpr)
        resultKey = r.group(1)
        execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)
        if DO_ROLLUP:
            h2o_cmd.runInspect(key=resultKey)
        # rows might be zero!
        if execResult['num_rows'] or execResult['num_cols']:
            keys.append(execExpr)
        else:
            h2p.yellow_print("\nNo key created?\n", dump_json(execResult))

    print "\nExpressions that created keys. Shouldn't all of these expressions create keys"
    for k in keys:
        print k

    h2o.check_sandbox_for_errors()
def test_GLM_poisson_rand2(self):
    """Run 20 randomized poisson GLM trials on covtype, scaling the timeout
    with n_folds and max_iter.
    """
    csvPathname = 'standard/covtype.data'
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put')
    paramDict = define_params()
    for trial in range(20):
        # params is mutable. This is default.
        # FIX! does it never end if we don't have alpha specified?
        params = {
            'y': 54,
            'n_folds': 3,
            'family': "poisson",
            'alpha': 0.5,
            'lambda': 1e-4,
            'beta_epsilon': 0.001,
            'max_iter': 15,
            }
        # randomize some of the params in place (returns the chosen x columns)
        colX = h2o_glm.pickRandGlmParams(paramDict, params)
        kwargs = params.copy()

        # make timeout bigger with xvals
        timeoutSecs = 60 + (kwargs['n_folds']*40)
        # or double the 4 seconds per iteration (max_iter+1 worst case?)
        timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1)))

        start = time.time()
        glm = h2o_cmd.runGLM(timeoutSecs=timeoutSecs, parseResult=parseResult, **kwargs)
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        print "Trial #", trial, "completed\n"
def test_rapids_ifelse_nested(self): bucket = 'smalldata' csvPathname = 'iris/iris_wheader.csv' hexKey = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) keys = [] for trial in range(2): for execObj, expected in zip(objList, resultList): freshObj = copy(execObj) result = freshObj.do() # do some scalar result checking if expected is not None: # result is a string now?? print "result:", result print "expected:", expected assert float(result)==expected, "%s %s" (result,expected) # rows might be zero! print "freshObj:", dump_json(freshObj.execResult) if 'key' in freshObj.execResult and freshObj.execResult['key']: keys.append(freshObj.execExpr) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
def test_parse_1m_cols(self):
    """Parse a very wide (10 x 65000) synthetic csv with summary enabled,
    verify Inspect dimensions, then verify Inspect honors max_column_display.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [(10, 65000, "cH", 30)]
    h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = "syn_" + str(SEEDPERFILE) + "_" + str(rowCount) + "x" + str(colCount) + ".csv"
        csvPathname = SYNDATASETS_DIR + "/" + csvFilename
        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
        start = time.time()
        print "Summary should work with 65k"
        parseResult = h2i.import_parse(
            path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=True
        )
        print csvFilename, "parse time:", parseResult["response"]["time"]
        print "Parse and summary:", parseResult["destination_key"], "took", time.time() - start, "seconds"

        # We should be able to see the parse result?
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult["destination_key"], timeoutSecs=timeoutSecs)
        print "Inspect:", parseResult["destination_key"], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        print "\n" + csvPathname, " num_rows:", "{:,}".format(
            inspect["num_rows"]
        ), " num_cols:", "{:,}".format(inspect["num_cols"])

        # should match # of cols in header or ??
        self.assertEqual(
            inspect["num_cols"],
            colCount,
            "parse created result with the wrong number of cols %s %s" % (inspect["num_cols"], colCount),
        )
        self.assertEqual(
            inspect["num_rows"],
            rowCount,
            "parse created result with the wrong number of rows (header shouldn't count) %s %s"
            % (inspect["num_rows"], rowCount),
        )

        # we should obey max_column_display
        column_limits = [25, 25000, 50000]
        for column_limit in column_limits:
            inspect = h2o_cmd.runInspect(
                None, parseResult["destination_key"], max_column_display=column_limit, timeoutSecs=timeoutSecs
            )
            self.assertEqual(
                len(inspect["cols"]), column_limit, "inspect obeys max_column_display = " + str(column_limit)
            )
            for r in range(0, len(inspect["rows"])):
                # NB: +1 below because each row includes a row header row: #{row}
                self.assertEqual(
                    len(inspect["rows"][r]),
                    column_limit + 1,
                    "inspect data rows obeys max_column_display = " + str(column_limit),
                )
def test_GLM2_tweedie(self):
    """Fit a tweedie GLM2 on AutoClaim and compare coefficients against an R
    reference fit, within deltaExpected.
    """
    csvFilename = "AutoClaim.csv"
    csvPathname = 'standard/' + csvFilename
    print "\nStarting", csvPathname
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put')
    # columns start at 0
    # regress: glm(CLM_AMT ~ CAR_USE + REVOLKED + GENDER + AREA + MARRIED + CAR_TYPE, data=AutoClaim, family=tweedie(1.34))
    # NOTE(review): the R comment says tweedie(1.34) but the kwargs below use
    # tweedie_variance_power 1.36 -- confirm which is the intended reference
    coefs = [7, 13, 20, 27, 21, 11]
    y = 4
    # ignore everything except the regressor columns and the response
    ignored_cols = h2o_cmd.createIgnoredCols(key=parseResult['destination_key'], cols=coefs, response=y)

    # sapply(c('CLM_AMT', 'CAR_USE', 'REVOLKED', 'GENDER', 'AREA', 'MARRIED', 'CAR_TYPE'), function(x) which(x==colnames(AutoClaim)) - 1)
    kwargs = {
        'family': 'tweedie',
        'tweedie_variance_power': 1.36,
        'response': y,
        'ignored_cols' : ignored_cols,
        'max_iter': 10,
        'lambda': 0,
        'alpha': 0,
        'n_folds': 0,
        'beta_epsilon': 1e-4,
    }

    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=15, **kwargs)
    # expected coefficients from the R reference fit
    coefficientsExpected = {'Intercept': 0, 'GENDER.M': 0.0014842488782470984, 'CAR_TYPE.Sports Car': 0.07786742314454961, 'MARRIED.Yes': 0.0007748552195851079, 'CAR_TYPE.SUV': 0.07267702940249621, 'CAR_TYPE.Pickup': 0.04952083408742968, 'CAR_TYPE.Van': 0.026422137690691405, 'CAR_TYPE.Sedan': 0.05128350794060489, 'CAR_USE.Private': -0.03050194832853935, 'REVOLKED.Yes': -0.05095942737408699}
    deltaExpected = 0.05
    (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, None,
        coefficientsExpected=coefficientsExpected, deltaExpected=deltaExpected, **kwargs)
    print 'coefficients: %s' % (str(coefficients))
def test_GenParity1(self):
    """Generate parity datasets with parity.pl and run RF over each, growing
    the tree count and timeout as the datasets get larger.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    parityPl = h2o.find_file('syn_scripts/parity.pl')

    # two row dataset gets this. Avoiding it for now
    # java.lang.ArrayIndexOutOfBoundsException: 1
    # at hex.rf.Data.sample_fair(Data.java:149)

    # always match the run below!
    print "\nAssuming two row dataset is illegal. avoiding"

    for x in xrange (10,100,10):
        shCmdString = "perl " + parityPl + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split())
        # algorithm for creating the path and filename is hardwired in parity.pl.
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    trees = 6
    timeoutSecs = 20
    # always match the gen above!
    # FIX! we fail if min is 3
    for x in xrange (10,100,10):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='put')
        h2o_cmd.runRF(parseResult=parseResult, trees=trees, timeoutSecs=timeoutSecs)
        trees += 10
        timeoutSecs += 2
def import_frame(self, target_key, bucket, csvFilename, csvPathname, expected_rows, expected_cols):
    """Import+parse a csv into target_key, verify its row/col counts via
    Inspect, and return the parse's destination key.

    Raises AssertionError if the parsed frame's dimensions don't match
    expected_rows/expected_cols.
    """
    path = csvPathname + '/' + csvFilename
    parseResult = h2i.import_parse(bucket=bucket, path=path, hex_key=target_key, schema='put') # upload the file
    destination_key = parseResult['destination_key']  # we block until it's actually ready

    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    h2o_cmd.infoFromInspect(inspect, csvPathname)

    actual_rows = inspect['numRows']
    actual_cols = inspect['numCols']

    print 'loaded frame "' + target_key +'" from path: ' + path
    print 'rows: ', actual_rows
    print 'cols: ', actual_cols

    # Don't have access to the testCase assert methods here because they aren't class methods. :-(
    assert expected_rows == actual_rows, "Expected " + str(expected_rows) + " but got " + str(actual_rows) + " for path: " + path
    assert expected_cols == actual_cols, "Expected " + str(expected_cols) + " but got " + str(actual_cols) + " for path: " + path

    # TODO: other info we could check
    # (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
    #     h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)
    #
    # summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'])
    # h2o_cmd.infoFromSummary(summaryResult) # , noPrint=True
    return destination_key
def test_B_kmeans_benign(self):
    """Run k=3 KMeans on benign.csv twice with a fixed seed and check the
    centers/sizes/errors match the expected values each time.
    """
    h2o.beta_features = True
    csvPathname = "logreg"
    csvFilename = "benign.csv"
    print "\nStarting", csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname + "/"+csvFilename,
        schema='local', hex_key=csvFilename+".hex", noPoll=True, doSummary=False)
    h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)

    # expected tuples: (center, cluster size, within-cluster error)
    expected = [
        ([24.538961038961038, 2.772727272727273, 46.89032467532467, 0.1266233766233766, 12.012142857142857, 1.0105194805194804, 1.5222727272727272, 22.26039690646432, 12.582467532467534, 0.5275062016635049, 2.9477601050634767, 162.52136363636365, 41.94558441558441, 1.661883116883117], 77, 46889.32010560476) ,
        ([25.587719298245613, 2.2719298245614037, 45.64035087719298, 0.35964912280701755, 13.026315789473685, 1.4298245614035088, 1.3070175438596492, 24.393307707470925, 13.333333333333334, 0.5244431302976542, 2.7326039818647745, 122.46491228070175, 40.973684210526315, 1.6754385964912282], 114, 64011.20272144667) ,
        ([30.833333333333332, 2.9166666666666665, 46.833333333333336, 0.0, 13.083333333333334, 1.4166666666666667, 1.5833333333333333, 24.298220973782772, 11.666666666666666, 0.37640449438202245, 3.404494382022472, 224.91666666666666, 39.75, 1.4166666666666667], 12, 13000.485226507595) ,
    ]

    # all are multipliers of expected tuple value
    allowedDelta = (0.01, 0.01, 0.01)

    # loop, to see if we get same centers
    for trial in range(2):
        # fixed seed so the centers are reproducible across trials
        params = {'k': 3,
                  'initialization': 'Furthest',
                  'ignored_cols' : None,
                  'destination_key': 'benign_k.hex',
                  'max_iter': 50,
                  'seed': 265211114317615310,
                 }
        kwargs = params.copy()
        kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs)
        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvFilename, parseResult, 'd', **kwargs)
        h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_exec2_poppush2_fail(self):
    """Build 20 random exec expressions from 'phrases', append a known-failing
    expression, then run initList followed by all of them.
    """
    bucket = 'smalldata'
    csvPathname = 'iris/iris2.csv'
    hexKey = 'i.hex'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
    exprList = []
    while (len(exprList)!=20):
        # each expression is 1 or 2 randomly chosen phrases joined together
        exprs = [random.choice(phrases) for j in range(random.randint(1,2))]
        # check if we have mean2() before function defn
        functionFound = False
        for i, e in enumerate(exprs):
            if 'function' in e:
                functionFound = True
        # h2o has problems with assigns after functions
        # NOTE(review): both branches append the same expression, so this
        # conditional is currently a no-op -- presumably the True branch was
        # meant to skip ('pass'); confirm intent before simplifying
        if functionFound and len(exprs)> 1:
            # pass
            exprList.append("".join(exprs))
        else:
            exprList.append("".join(exprs))

    # add this one for good measure (known fail)
    # exprList += "crunk=function(x){x+98};r.hex[,3]=4;"
    exprList += ["function(x){x+98};r.hex[,3]=4;"]

    for resultKey, execExpr in initList:
        h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)

    for execExpr in exprList:
        h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=4)
def test_exec2_ddply_phrases(self):
    """For each of columns 1..9, define helper exec functions and run every
    ddply phrase from 'phrases' against covtype (10 pct sample).
    """
    h2o.beta_features = True
    bucket = 'home-0xdiag-datasets'
    # csvPathname = 'standard/covtype.data'
    csvPathname = "standard/covtype.shuffled.10pct.data"
    hexKey = 'i.hex'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=hexKey)

    for col in range(1,10):
        # define the helper functions, parameterized by the current column
        initList = [
            ('r.hex', 'r.hex=i.hex'),
            (None, "func1=function(x){max(x[,%s])}" % col),
            (None, "func2=function(x){a=3;nrow(x[,%s])*a}" % col),
            (None, "func3=function(x){apply(x[,%s],2,sum)/nrow(x[,%s])}" % (col, col) ),
            # (None, "function(x) { cbind( mean(x[,1]), mean(x[,%s]) ) }" % col),
            (None, "func4=function(x) { mean( x[,%s]) }" % col),
            (None, "func5=function(x) { sd( x[,%s]) }" % col),
            # (None, "func6=function(x) { quantile(x[,%s] , c(0.9) ) }" % col),
        ]
        for resultKey, execExpr in initList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=60)

        for p in phrases:
            # group by column 2 and apply the phrase
            execExpr = "ddply(r.hex, c(2), " + p + ")"
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60)
def test_GenParity1(self): SYNDATASETS_DIR = h2o.make_syn_dir() # always match the run below! for x in [10000]: # Have to split the string out to list for pipe shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad" h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4) # the algorithm for creating the path and filename is hardwired in parity.pl..i.e csvFilename = "parity_128_4_" + str(x) + "_quad.data" # always match the gen above! trial = 1 for x in xrange (1,10,1): sys.stdout.write('.') sys.stdout.flush() # just use one file for now csvFilename = "parity_128_4_" + str(10000) + "_quad.data" csvPathname = SYNDATASETS_DIR + '/' + csvFilename # broke out the put separately so we can iterate a test just on the RF parseResult = h2i.import_parse(path=csvPathname, schema='put') h2o.verboseprint("Trial", trial) h2o_cmd.runRF(parseResult=parseResult, trees=237, depth=45, timeoutSecs=480) # don't change tree count yet ## trees += 10 ### timeoutSecs += 2 trial += 1
def test_NOPASS_GLM2_weight_nan_fail(self): h2o.beta_features = True csvPathname = 'covtype/covtype.20k.data' hex_key = 'covtype.20k.hex' parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, schema='put') kwargs = { 'destination_key': 'GLM_model_python_0_default_0', 'family': 'tweedie', 'tweedie_variance_power': 1.9999999, 'max_iter': 10, 'alpha': 0, 'lambda': 0, 'response': 54, } for trial in range(3): # params is mutable. This is default. start = time.time() glm = h2o_cmd.runGLM(timeoutSecs=70, parseResult=parseResult, **kwargs) h2o.check_sandbox_for_errors() # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_parse_bad_30rows_fvec(self): # h2b.browseTheCloud() SYNDATASETS_DIR = h2o.make_syn_dir() csvPathname = SYNDATASETS_DIR + "/bad.data" dsf = open(csvPathname, "w+") dsf.write(datalines) dsf.close() for i in range(20): # every other one single_quotes = 1 # force header=1 to make it not fail (doesn't deduce correctly) parseResult = h2i.import_parse( path=csvPathname, schema="put", single_quotes=single_quotes, header=1, hex_key="trial" + str(i) + ".hex" ) inspect = h2o_cmd.runInspect(key=parseResult["destination_key"]) print "\n" + csvPathname, " numRows:", "{:,}".format(inspect["numRows"]), " numCols:", "{:,}".format( inspect["numCols"] ) numRows = inspect["numRows"] numCols = inspect["numCols"] self.assertEqual(numCols, 4, "Parsed wrong number of cols: %s" % numCols) self.assertNotEqual( numRows, 30, "Parsed wrong number of rows. Should be 29.\ Didn't deduce header?: %s" % numRows, ) self.assertEqual(numRows, 29, "Parsed wrong number of rows: %s" % numRows)
def test_parse_many_cols(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 5000, 'cA', 60), (100, 6000, 'cB', 60), (100, 7000, 'cC', 60), (100, 8000, 'cD', 60), (100, 8200, 'cE', 60), (100, 8500, 'cF', 60), (100, 9000, 'cG', 60), (100, 10000, 'cI', 60), (100, 11000, 'cH', 60), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=60) print "\n" + csvFilename if not h2o.browse_disable: h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(5)
def test_GLM2_covtype_train_predict_all_all(self): h2o.beta_features = True importFolderPath = "standard" csvFilename = 'covtype.shuffled.data' csvPathname = importFolderPath + "/" + csvFilename hex_key = csvFilename + ".hex" # Parse and Exec************************************************ parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=180) execExpr = "A.hex=%s" % parseResult['destination_key'] h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) # use exec to change the output col to binary, case_mode/case_val doesn't work if we use predict # will have to live with random extract. will create variance # class 4 = 1, everything else 0 y = 54 execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, 1) # class 1 h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) inspect = h2o_cmd.runInspect(key="A.hex") print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) print "Use same data (full) for train and test" trainDataKey = "A.hex" testDataKey = "A.hex" # start at 90% rows + 1 # GLM, predict, CM*******************************************************8 kwargs = { 'response': 'C' + str(y + 1), 'max_iter': 20, 'n_folds': 0, # 'alpha': 0.1, # 'lambda': 1e-5, 'alpha': 0.0, 'lambda': None, 'family': 'binomial', } timeoutSecs = 60 for trial in range(1): # test/train split **********************************************8 aHack = {'destination_key': trainDataKey} # GLM **********************************************8 start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult[ 'destination_key'], 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) modelKey = glm['glm_model']['_key'] submodels = glm['glm_model']['submodels'] # hackery to make it work when there's just one validation = submodels[-1]['validation'] 
best_threshold = validation['best_threshold'] thresholds = validation['thresholds'] # have to look up the index for the cm, from the thresholds list best_index = None for i, t in enumerate(thresholds): if t == best_threshold: best_index = i break cms = validation['_cms'] cm = cms[best_index] trainPctWrong = h2o_gbm.pp_cm_summary(cm['_arr']) # Score ********************************************** predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key=testDataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=testDataKey, vactual='C' + str(y + 1), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) self.assertEqual( pctWrong, trainPctWrong, "Should see the same error rate on train and predict? (same data set)" ) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) print "Trial #", trial, "completed"
def test_GBM_basic(self):
    """Parse covtype (forcing the response col to Enum), build GBM models for a
    couple of parameter sets, and inspect metrics/variable importances/predictions."""
    bucket = 'home-0xdiag-datasets'
    importFolderPath = 'standard'
    trainFilename = 'covtype.shuffled.90pct.data'
    train_key = 'covtype.train.hex'
    model_key = 'GBMModelKey'
    timeoutSecs = 1800
    csvPathname = importFolderPath + "/" + trainFilename

    # FIX! do I need to force enum for classification? what if I do regression after this?
    columnTypeDict = {54: 'Enum'}
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, columnTypeDict=columnTypeDict,
        schema='local', chunk_size=4194304, hex_key=train_key, timeoutSecs=timeoutSecs)
    pA = h2o_cmd.ParseObj(parseResult)
    iA = h2o_cmd.InspectObj(pA.parse_key)
    parse_key = pA.parse_key
    numRows = iA.numRows
    numCols = iA.numCols
    labelList = iA.labelList
    labelListUsed = list(labelList)
    numColsUsed = numCols

    # run through a couple of parameter sets
    parameters = []
    parameters.append({
        'response_column': 'C55',
        'ntrees': 2,
        'max_depth': 10,
        'min_rows': 3,
        'nbins': 40,
        'learn_rate': 0.2,
        'loss': 'multinomial',
        # FIX! doesn't like it?
        # 'loss': 'Bernoulli',
        # FIX..no variable importance for GBM yet?
        # 'variable_importance': False,
        # 'seed':
    })
    parameters.append({
        'response_column': 'C55',
        'loss': 'multinomial',
        # This does nothing! intent is solely based on type of response col
        'ntrees': 1,
        'max_depth': 20,
        'min_rows': 3,
        'nbins': 40,
        'learn_rate': 0.2,
    })

    # NOTE(review): shadows the GBMModelKey assigned above; the models below all land here
    model_key = 'covtype_gbm.hex'

    for p in parameters:
        # build, then fetch the model output and compute metrics against the training frame
        bmResult = h2o.n0.build_model(algo='gbm', destination_key=model_key,
            training_frame=train_key, validation_frame=train_key, parameters=p, timeoutSecs=60)
        bm = OutputObj(bmResult, 'bm')
        modelResult = h2o.n0.models(key=model_key)
        model = OutputObj(modelResult['models'][0]['output'], 'model')
        cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        cmm = OutputObj(cmmResult, 'cmm')
        print "\nLook!, can use dot notation: cmm.cm.confusion_matrix", cmm.cm.confusion_matrix, "\n"

        vis = OutputObj(model.variable_importances, 'vis')
        # just the first 10
        # NOTE(review): slice [0:9] takes 9 entries, not 10 as the comment above suggests
        visDataChopped = [v[0:9] for v in vis.data]
        names = visDataChopped[0]
        relativeImportance = visDataChopped[1]
        print "names:", names
        print "relativeImportance:", relativeImportance
        scaledImportance = visDataChopped[2]
        percentage = visDataChopped[3]
        print "\nvis\n", tabulate(visDataChopped[1:], headers=names)
        # print "\nrelativeImportance (10)\n", tabulate(relativeImportance, headers=names)
        # print "\nscaledImportance (10)\n", tabulate(scaledImportance, headers=names)
        # print "\npercentage (10)\n", tabulate(percentage, headers=names)

        print "will say Regression or Classification. no Multinomial?"
        print "model.model_category", model.model_category
        assert model.model_category == 'Multinomial', model.model_category

        print "FIX! why is mse 0 and mes_train Nan?"
        print "model.mse:", model.mse
        print "model.mse_train:", model.mse_train

        if 1 == 1:
            print ""
            for i, c in enumerate(cmm.cm):
                print "\ncmms.cm[%s]" % i, tabulate(c)
            print ""

        mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        mmResultShort = mmResult['model_metrics'][0]
        del mmResultShort['frame']  # too much!
        mm = OutputObj(mmResultShort, 'mm')

        prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
        pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
def test_GLM2_basic(self): importFolderPath = "logreg" csvFilename = 'prostate.csv' csvPathname = importFolderPath + "/" + csvFilename hex_key = csvFilename + ".hex" parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print inspect print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) x = 'ID' y = 'CAPSULE' family = 'binomial' alpha = '0.5' lambda_ = '1E-4' nfolds = '0' f = 'prostate' modelKey = 'GLM_' + f kwargs = { 'response': y, 'ignored_cols': x, 'family': family, 'lambda': lambda_, 'alpha': alpha, 'n_folds': nfolds, # passes if 0, fails otherwise 'destination_key': modelKey, } timeoutSecs = 60 start = time.time() glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=0.25, pollTimeoutSecs=180, **kwargs) # this stuff was left over from when we got the result after polling the jobs list # okay to do it again # GLM2: when it redirects to the model view, we no longer have the job_key! (unlike the first response and polling) if 1 == 0: job_key = glmResult['job_key'] # is the job finishing before polling would say it's done? 
params = {'job_key': job_key, 'destination_key': modelKey} glm = h2o.nodes[0].completion_redirect( jsonRequest="2/GLMProgressPage2.json", params=params) print "GLM result from completion_redirect:", h2o.dump_json(a) if 1 == 1: glm = h2o.nodes[0].glm_view(_modelKey=modelKey) ### print "GLM result from glm_view:", h2o.dump_json(a) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) glm_model = glm['glm_model'] _names = glm_model['_names'] coefficients_names = glm_model['coefficients_names'] submodels = glm_model['submodels'][0] beta = submodels['beta'] norm_beta = submodels['norm_beta'] iteration = submodels['iteration'] validation = submodels['validation'] auc = validation['auc'] aic = validation['aic'] null_deviance = validation['null_deviance'] residual_deviance = validation['residual_deviance'] print '_names', _names print 'coefficients_names', coefficients_names # did beta get shortened? the simple check confirms names/beta/norm_beta are same length print 'beta', beta print 'iteration', iteration print 'auc', auc
def test_KMeans_predict3(self): SYNDATASETS_DIR = h2o.make_syn_dir() timeoutSecs = 600 predictCsv = 'predict_0.csv' actualCsv = 'actual_0.csv' if 1 == 1: outputClasses = 3 y = 4 # last col response = 'response' skipSrcOutputHeader = 1 skipPredictHeader = 1 trees = 40 bucket = 'smalldata' csvPathname = 'iris/iris2.csv' hexKey = 'iris2.csv.hex' # Huh...now we apparently need the translate. Used to be: # No translate because we're using an Exec to get the data out?, and that loses the encoding? # translate = None # FIX! how do we know what the translate should be, when we predict? translate = {'setosa': 0.0, 'versicolor': 1.0, 'virginica': 2.0} # one wrong will be 0.66667. I guess with random, that can happen? expectedPctWrong = 0.7 elif 1 == 0: outputClasses = 6 y = 54 # last col response = 'C55' skipSrcOutputHeader = 1 skipPredictHeader = 1 trees = 6 # try smaller data set compared to covtype bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.shuffled.10pct.data' hexKey = 'covtype.shuffled.10pct.data.hex' translate = { '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7 } expectedPctWrong = 0.7 elif 1 == 0: outputClasses = 6 y = 54 # last col response = 'C55' skipSrcOutputHeader = 1 skipPredictHeader = 1 trees = 40 # try smaller data set compared to covtype bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.shuffled.10pct.data' hexKey = 'covtype.shuffled.10pct.data.hex' # translate = {1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0} translate = { '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7 } expectedPctWrong = 0.7 elif 1 == 0: outputClasses = 6 y = 54 # last col response = 'C55' skipSrcOutputHeader = 1 skipPredictHeader = 1 trees = 6 bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.data' hexKey = 'covtype.data.hex' translate = { '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7 } expectedPctWrong = 0.7 else: outputClasses = 10 y = 0 # first col response = 'C1' skipSrcOutputHeader = 1 
skipPredictHeader = 1 trees = 6 bucket = 'home-0xdiag-datasets' csvPathname = 'mnist/mnist_training.csv.gz' hexKey = 'mnist_training.hex' translate = { \ '0': 0, '1': 1, '2': 2, '3': 3, '4': 4, \ '5': 5, '6': 6, '7': 7, '8': 8, '9': 9 } expectedPctWrong = 0.7 csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv # for using below in csv reader csvFullname = h2i.find_folder_and_filename(bucket, csvPathname, schema='put', returnFullPath=True) def predict_and_compare_csvs(model_key, hex_key, predictHexKey, translate=None, y=0): # have to slice out col 0 (the output) and feed result to predict # cols are 0:784 (1 output plus 784 input features # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30) dataKey = "P.hex" if skipSrcOutputHeader: print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer" print "hack for now, can't chop out col 0 in Exec currently" dataKey = hex_key else: print "No header in dataset, can't chop out cols, since col numbers are used for names" dataKey = hex_key # +1 col index because R-like # FIX! apparently we lose the enum mapping when we slice out, and then csv download? we just get the number? 
# OH NO..it looks like we actually preserve the enum..it's in the csv downloaded # the prediction is the one that doesn't have it, because it's realated to clusters, which have no # notion of output classes h2e.exec_expr(execExpr="Z.hex=" + hex_key + "[," + str(y + 1) + "]", timeoutSecs=30) start = time.time() predictResult = h2o.nodes[0].generate_predictions( model_key=model_key, data_key=hexKey, destination_key=predictHexKey) print "generate_predictions end on ", hexKey, " took", time.time( ) - start, 'seconds' print "predictResult:", h2o.dump_json(predictResult) h2o.check_sandbox_for_errors() inspect = h2o_cmd.runInspect(key=predictHexKey) h2o_cmd.infoFromInspect(inspect, 'predict.hex') h2o.nodes[0].csv_download(src_key="Z.hex", csvPathname=csvSrcOutputPathname) h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname) h2o.check_sandbox_for_errors() print "Do a check of the original output col against predicted output" (rowNum1, originalOutput) = compare_csv_at_one_col( csvSrcOutputPathname, msg="Original", colIndex=0, translate=translate, skipHeader=skipSrcOutputHeader) (rowNum2, predictOutput) = compare_csv_at_one_col( csvPredictPathname, msg="Predicted", colIndex=0, skipHeader=skipPredictHeader) # no header on source if ((rowNum1 - skipSrcOutputHeader) != (rowNum2 - skipPredictHeader)): raise Exception( "original rowNum1: %s - %d not same as downloaded predict: rowNum2: %s - %d \ %s" % (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader)) wrong = 0 for rowNum, (o, p) in enumerate(zip(originalOutput, predictOutput)): # if float(o)!=float(p): if str(o) != str(p): if wrong == 10: print "Not printing any more mismatches\n" elif wrong < 10: msg = "Comparing original output col vs predicted. row %s differs. 
\ original: %s predicted: %s" % (rowNum, o, p) print msg wrong += 1 print "\nTotal wrong:", wrong print "Total:", len(originalOutput) pctWrong = (100.0 * wrong) / len(originalOutput) print "wrong/Total * 100 ", pctWrong # I looked at what h2o can do for modelling with binomial and it should get better than 25% error? # hack..need to fix this if 1 == 0: if pctWrong > 2.0: raise Exception( "pctWrong too high. Expect < 2% error because it's reusing training data" ) return pctWrong #***************************************************************************** parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) numCols = inspect["numCols"] numRows = inspect["numRows"] seed = random.randint(0, sys.maxint) # should pass seed # want to ignore the response col? we compare that to predicted # if we tell kmeans to ignore a column here, and then use the model on the same dataset to predict # does the column get ignored? (this is last col, trickier if first col. (are the centers "right" kwargs = { 'ignored_cols_by_name': response, 'seed': seed, # "seed": 4294494033083512223, 'k': outputClasses, 'initialization': 'PlusPlus', # sometimes get [24, 29, 97] result with PlusPlus. # change to Furthest. maybe have to fix the seed above, but we'll see # I provide two legal results below 'destination_key': 'kmeans_model', 'max_iter': 1000 } kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=60, **kwargs) # this is what the size of each cluster was, when reported by training size = kmeans['model']['size'] # tupleResultList is created like this: ( (centers[i], rows_per_cluster[i], sqr_error_per_cluster[i]) ) # THIS DOES A PREDICT in it (we used to have to do the predict to get more training result info?) (centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeans, csvPathname, parseResult, 'd', **kwargs) # the tupleResultList has the size during predict? 
compare it to the sizes during training # I assume they're in the same order. size2 = [t[1] for t in tupleResultList] if size != size2: raise Exception( "training cluster sizes: %s are not the same as what we got from predict on same data: %s", (size, size2)) # hack...hardwire for iris here # keep this with sizes sorted expectedSizes = [ [39, 50, 61], [38, 50, 62], # these are bad results that we get once in a while [22, 31, 97], [24, 29, 97], [24, 30, 96], [23, 31, 96], ] sortedSize = sorted(size) if sortedSize not in expectedSizes: raise Exception( "I got cluster sizes %s but expected one of these: %s " % (sortedSize, expectedSizes)) # check center list (first center) has same number of cols as source data print "centers:", centers # we said to ignore the output so subtract one from expected self.assertEqual( numCols - 1, len(centers[0]), "kmeans first center doesn't have same # of values as dataset row %s %s" % (numCols - 1, len(centers[0]))) # FIX! add expected # h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial) error = kmeans['model']['total_within_SS'] within_cluster_variances = kmeans['model']['within_cluster_variances'] print "within_cluster_variances:", within_cluster_variances print "Use H2O GeneratePredictionsPage with a H2O generated model and the same data key." print "Does this work? 
(feeding in same data key)if you're predicting, " print "don't you need one less column (the last is output?)" print "WARNING: max_iter set to 8 for benchmark comparisons" print "y=", y # zero-based index matches response col name print "" print "oh I see why I can't compare predict to actual, in kmeans" print "the cluster order doesn't have to match the output class enum order" print "so I don't know what cluster, each output class will be (kmeans)" print "all I can say is that the prediction distribution should match the original source distribution" print "have to figure out what to do" predictHexKey = 'predict_0.hex' pctWrong = predict_and_compare_csvs(model_key='kmeans_model', hex_key=hexKey, predictHexKey=predictHexKey, translate=translate, y=y) # we are predicting using training data...so error is really low # self.assertAlmostEqual(pctWrong, classification_error, delta = 0.2, # msg="predicted pctWrong: %s should be close to training classification error %s" % (pctWrong, classification_error)) # can be zero if memorized (iris is either 0 or 0.667?) # just make delta 0.7 for now # HACK ignoring error for now if 1 == 0: self.assertAlmostEqual( pctWrong, expectedPctWrong, delta=0.7, msg= "predicted pctWrong: %s should be small because we're predicting with training data" % pctWrong)
def rf_covtype_train_oobe(self, csvFilename, checkExpectedResults=True, expectedAuc=0.5):
    """Train DRF on a 90% random split of the given covtype file, optionally
    check OOBE accuracy against hardwired expected values, then score the
    held-out 10% (also checked against expected values and expectedAuc).

    Returns the scoring result dict from runScore.
    """
    # the expected results are only for the shuffled version
    # since getting 10% samples etc of the smallish dataset will vary between
    # shuffled and non-shuffled datasets
    importFolderPath = "standard"
    csvPathname = importFolderPath + "/" + csvFilename
    hex_key = csvFilename + ".hex"
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
        hex_key=hex_key, timeoutSecs=180)
    inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
    print "\n" + csvPathname, \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])

    numCols = inspect['numCols']
    numRows = inspect['numRows']
    pct10 = int(numRows * .1)
    # rowsForPct[i] = row count for i*10 percent of the dataset
    rowsForPct = [i * pct10 for i in range(0, 11)]
    # this can be slightly less than 10%
    last10 = numRows - rowsForPct[9]
    rowsForPct[10] = numRows
    # use mod below for picking "rows-to-do" in case we do more than 9 trials
    # use 10 if 0 just to see (we copied 10 to 0 above)
    rowsForPct[0] = rowsForPct[10]

    # index 0 isn't used; lists are indexed by trial number
    expectTrainPctRightList = [0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79]
    expectScorePctRightList = [0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78]

    # keep the 0 entry empty
    actualTrainPctRightList = [0]
    actualScorePctRightList = [0]

    trial = 0
    for rowPct in [0.9]:
        trial += 1
        # Not using this now (did use it for slicing)
        rowsToUse = rowsForPct[trial % 10]
        resultKey = "r_" + csvFilename + "_" + str(trial)

        # just do random split for now
        dataKeyTrain = 'rTrain.hex'
        dataKeyTest = 'rTest.hex'
        response = "C55"
        h2o_cmd.createTestTrain(hex_key, dataKeyTrain, dataKeyTest, trainPercent=90,
            outputClass=4, outputCol=numCols - 1, changeToBinomial=not DO_MULTINOMIAL)
        sliceResult = {'destination_key': dataKeyTrain}

        # adjust timeoutSecs with the number of trees
        kwargs = paramDict.copy()
        kwargs['destination_key'] = "model_" + csvFilename + "_" + str(trial)
        timeoutSecs = 30 + kwargs['ntrees'] * 20
        start = time.time()
        # have to pass validation= param to avoid getting no error results (since 100% sample..DRF2 doesn't like that)
        rfv = h2o_cmd.runRF(parseResult=sliceResult, timeoutSecs=timeoutSecs,
            validation=dataKeyTest, **kwargs)
        elapsed = time.time() - start
        print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        (error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, **kwargs)
        # oobeTrainPctRight = 100 * (1.0 - error)
        oobeTrainPctRight = 100 - error
        if checkExpectedResults:
            self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
                msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f" % \
                ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]),
                delta=ALLOWED_DELTA)
        actualTrainPctRightList.append(oobeTrainPctRight)

        print "Now score on the last 10%. Note this is silly if we trained on 100% of the data"
        print "Or sorted by output class, so that the last 10% is the last few classes"
        rf_model = rfv['drf_model']
        used_trees = rf_model['N']
        data_key = rf_model['_dataKey']
        model_key = rf_model['_key']

        rfvScoring = h2o_cmd.runScore(dataKey=dataKeyTest, modelKey=model_key,
            vactual=response, vpredict=1, expectedAuc=expectedAuc)
        print h2o.dump_json(rfvScoring)
        h2o_rf.simpleCheckRFScore(rfv=rfvScoring, **kwargs)
        print "hello7"
        (error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfvScoring, **kwargs)
        fullScorePctRight = 100 - error

        h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)

        if checkExpectedResults:
            self.assertAlmostEqual(fullScorePctRight, expectScorePctRightList[trial],
                msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f" % \
                ((trial*10), fullScorePctRight, expectScorePctRightList[trial]),
                delta=ALLOWED_DELTA)
        actualScorePctRightList.append(fullScorePctRight)

        print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse * 100.0 / numRows), "pct. of all rows"

    # report actual-vs-expected so hardwired lists can be refreshed by hand
    actualDelta = [abs(a - b) for a, b in zip(expectTrainPctRightList, actualTrainPctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualTrainPctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp

    actualDelta = [abs(a - b) for a, b in zip(expectScorePctRightList, actualScorePctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualScorePctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp

    return rfvScoring
def test_parse_covtype_2_maprfs(self):
    """Parse a list of datasets from maprfs, report parse MB/sec, and spot-check
    the parsed frame (summary on col 0, frames fetch, indexed-key check)."""
    # this was from a hdfs dfs -ls /datasets. ..bytes
    csvFilenameAll = [
        ("covtype.data", 75169317),
        ("TEST-poker1000.csv", 23582),
        ("WU_100KRows3KCols.csv", 1120591148),
        ("airlines_all.05p.csv", 607774430),
        ("and-testing.data", 23538333),
        ("arcene2_train.both", 2715738),
        ("arcene_train.both", 2715838),
        # ("bestbuy_test.csv", 152488777),
        # ("bestbuy_train.csv", 243806953),
        ("billion_rows.csv.gz", 1758523515),
        ("covtype.13x.data", 977210917),
        ("covtype.13x.shuffle.data", 977210917),
        ("covtype.4x.shuffle.data", 300678693),
        ("covtype4x.shuffle.data", 300678693),
        ("hhp.unbalanced.012.1x11.data.gz", 6566953),
        ("hhp.unbalanced.012.data.gz", 4233715),
        ("hhp.unbalanced.data.gz", 4235293),
        ("hhp2.os.noisy.0_1.data", 48381802),
        ("hhp2.os.noisy.9_4.data", 48397103),
        ("leads.csv", 2755),
        ("prostate_long_1G.csv", 1115287100),
        # ("3G_poker_shuffle", 3145728000),
        # ("covtype.169x.data", 12703751717),
    ]

    # pick 8 randomly!
    if (1==0):
        csvFilenameList = random.sample(csvFilenameAll, 8)
    # Alternatively: do the list in order! Note the order is easy to hard
    else:
        csvFilenameList = csvFilenameAll

    for (csvFilename, totalBytes) in csvFilenameList:
        totalBytes = float(totalBytes)
        timeoutSecs = 900
        multiplyExpected = 1
        # import_result = a_node.import_files(path=find_file("smalldata/logreg/prostate.csv"))
        importFolderPath = "datasets"
        csvPathname = importFolderPath + "/" + csvFilename
        start = time.time()
        parseResult = h2i.import_parse(path=csvPathname, schema='maprfs',
            timeoutSecs=timeoutSecs, doSummary=False)
        elapsed = time.time() - start
        fileMBS = (totalBytes/1e6)/elapsed
        # one-line perf summary for this file
        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f}MB {:6.2f} MB/sec for {:.2f} secs'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname,
            (totalBytes+0.0)/1e6, fileMBS, elapsed)
        print "\n"+l
        # h2o.cloudPerfH2O.message(l)
        # chunk_size=4194304*2

        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList
        print iA.missingList, iA.labelList, iA.numRows, iA.numCols

        for i in range(1):
            print "Summary on column", i
            co = h2o_cmd.runSummary(key=parse_key, column=i)

        k = parseResult['frames'][0]['frame_id']['name']
        # print "parseResult:", dump_json(parseResult)
        a_node = h2o.nodes[0]
        frames_result = a_node.frames(key=k, row_count=5)
        # print "frames_result from the first parseResult key", dump_json(frames_result)
        # FIX! switch this to look at the summary result
        parseKeyIndexedCheck(frames_result, multiplyExpected)

        # don't want to spill keys
        h2o.nodes[0].remove_all_keys()
def test_GLM2_convergence_1(self):
    """Run GLM2 on random synthetic datasets of growing column count; fail the
    test if any run produces a 'failed to converge' warning."""
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, hex_key, timeoutSecs)
    tryList = [
        (100, 50, 'cD', 300),
        (100, 100, 'cE', 300),
        (100, 200, 'cF', 300),
        (100, 300, 'cG', 300),
        (100, 400, 'cH', 300),
        (100, 500, 'cI', 300),
    ]
    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_%s_%sx%s.csv' % (SEEDPERFILE,rowCount,colCount)
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, timeoutSecs=10, schema='put')
        print "Parse result['destination_key']:", parseResult['destination_key']
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename

        # response is the last col (0-based col index == colCount)
        y = colCount
        kwargs = {
            'max_iter': 10,
            'lambda': 1e-8,
            'alpha': 0,
            'n_folds': 0,
            'beta_epsilon': 1e-4,
        }
        kwargs['response'] = y

        # emsg holds the first convergence-failure warning seen, if any
        emsg = None
        # FIX! how much should we loop here.
        for i in range(3):
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            print 'glm #', i, 'end on', csvPathname, 'took', time.time() - start, 'seconds'
            # we can pass the warning, without stopping in the test, so we can
            # redo it in the browser for comparison
            (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(
                self, glm, None, allowFailWarning=True, **kwargs)

            if 1==0:
                print "\n", "\ncoefficients in col order:"
                # since we're loading the x50 file all the time..the real colCount
                # should be 50 (0 to 49)
                showCols = colCount
                for c in range(showCols):
                    print "%s:\t%.6e" % (c, coefficients[c])
                print "intercept:\t %.6e" % intercept

            # gets the failed to converge, here, after we see it in the browser too
            x = re.compile("[Ff]ailed")
            if warnings:
                for w in warnings:
                    if (re.search(x,w)):
                        # first
                        if emsg is None:
                            emsg = w
                        print w
            if emsg:
                break

        if not h2o.browse_disable:
            h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            time.sleep(5)
            h2b.browseJsonHistoryAsUrlLastMatch("GLM")
            time.sleep(5)

        # gets the failed to converge, here, after we see it in the browser too
        if emsg is not None:
            raise Exception(emsg)
def test_ddply_plot_multi(self):
    # ddply scaling test: for each (minInt, maxInt) range, run the same
    # ddply expression twice (first run warms any caching), check the
    # resulting group count is close to (maxInt-minInt+1)^2, verify both
    # runs produced identical frames, then optionally plot elapsed time
    # against the number of groups.
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, hex_key, minInt, maxInt, timeoutSecs)
    tryList = [
        (1000000, 5, 'cD', 0, 10, 30),
        (1000000, 5, 'cD', 0, 20, 30),
        (1000000, 5, 'cD', 0, 30, 30),
        (1000000, 5, 'cD', 0, 40, 30),
        (1000000, 5, 'cD', 0, 50, 30),
        (1000000, 5, 'cD', 0, 70, 30),
        (1000000, 5, 'cD', 0, 100, 30),
        (1000000, 5, 'cD', 0, 130, 30),
        (1000000, 5, 'cD', 0, 160, 30),
        # (1000000, 5, 'cD', 0, 320, 30), # starts to fail here. too many groups?
        # (1000000, 5, 'cD', 0, 640, 30),
        # (1000000, 5, 'cD', 0, 1280, 30),
    ]
    ### h2b.browseTheCloud()
    # accumulated across trials for plotting: x = group count, e/f = elapsed
    xList = []
    eList = []
    fList = []
    trial = 0
    for (rowCount, colCount, hex_key, minInt, maxInt, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
            colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname, "with range", (maxInt - minInt) + 1
        write_syn_dataset(csvPathname, rowCount, colCount, minInt, maxInt, SEEDPERFILE)

        # PARSE train****************************************
        hexKey = 'r.hex'
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hexKey)
        for resultKey, execExpr in initList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=60)

        # do it twice..to get the optimal cached delay for time?
        execExpr = "a1 = ddply(r.hex, c(1,2), " + PHRASE + ")"
        start = time.time()
        h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60)
        ddplyElapsed = time.time() - start
        print "ddplyElapsed:", ddplyElapsed

        execExpr = "a2 = ddply(r.hex, c(1,2), " + PHRASE + ")"
        start = time.time()
        (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60)
        groups = execResult['num_rows']
        # ddply should produce (roughly) one row per group combination
        maxExpectedGroups = ((maxInt - minInt) + 1)**2
        h2o_util.assertApproxEqual(
            groups, maxExpectedGroups, rel=0.2,
            msg="groups %s isn't close to expected amount %s" % (groups, maxExpectedGroups))
        ddplyElapsed = time.time() - start
        print "ddplyElapsed:", ddplyElapsed
        print "execResult", h2o.dump_json(execResult)

        # should be same answer in both cases
        execExpr = "d=sum(a1!=a2)==0"
        (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60)
        print "execResult", h2o.dump_json(execResult)
        self.assertEqual(result, 1, "a1 and a2 weren't equal? %s" % result)

        # xList.append(ntrees)
        trial += 1
        # this is the biggest it might be ..depends on the random combinations
        # groups = ((maxInt - minInt) + 1) ** 2
        xList.append(groups)
        eList.append(ddplyElapsed)
        fList.append(ddplyElapsed)

    if DO_PLOT:
        xLabel = 'groups'
        eLabel = 'ddplyElapsed'
        fLabel = 'ddplyElapsed'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_c5_KMeans_sphere_26GB(self):
    # Large-scale KMeans benchmark: parse a ~183GB generated-spheres csv
    # (from HDFS or a local folder) six times, run k=15 KMeans on each
    # parse, rotate the initialization scheme per trial, and compare the
    # found centers against the expected sphere centers. Parse and KMeans
    # timings are logged to the benchmark log.
    h2o.beta_features = False  # a kludge
    h2o.setup_benchmark_log()
    csvFilename = 'syn_sphere_gen.csv'
    totalBytes = 183538602156
    if FROM_HDFS:
        importFolderPath = "datasets/kmeans_big"
        csvPathname = importFolderPath + '/' + csvFilename
    else:
        importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
        csvPathname = importFolderPath + '/' + csvFilename

    # FIX! put right values in
    # will there be different expected for random vs the other inits?
    # each tuple: (expected center coordinates, expected cluster size, expected error)
    expected = [
        ([0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0], 248846122, 1308149283316.2988),
        ([0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0], 276924291, 1800760152555.98),
        ([0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394], 235089554, 375419158808.3253),
        ([0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0], 166180630, 525423632323.6474),
        ([0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0], 167234179, 1845362026223.1094),
        ([0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985], 195420925, 197941282992.43475),
        ([0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0], 214401768, 11868360232.658035),
        ([0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907], 258853406, 598863991074.3276),
        ([0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0], 190979054, 1505088759456.314),
        ([0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0], 87794427, 1124697008162.3955),
        ([0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028], 78226988, 1151439441529.0215),
        ([0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574], 167273589, 693036940951.0249),
        ([0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539], 148426180, 35942838893.32379),
        ([0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707], 157533313, 88431531357.62982),
        ([0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0], 118361306, 1111537045743.7646),
    ]

    # successive assignments: last one wins (logging disabled)
    benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
    benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
    # IOStatus can hang?
    benchmarkLogging = ['cpu', 'disk', 'network']
    benchmarkLogging = []

    for trial in range(6):
        # IMPORT**********************************************
        # since H2O deletes the source key, re-import every iteration.
        # PARSE ****************************************
        print "Parse starting: " + csvFilename
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        timeoutSecs = 2 * 3600
        kwargs = {}
        if FROM_HDFS:
            parseResult = h2i.import_parse(
                path=csvPathname, schema='hdfs', hex_key=hex_key,
                timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                benchmarkLogging=benchmarkLogging, **kwargs)
        else:
            parseResult = h2i.import_parse(
                path=csvPathname, schema='local', hex_key=hex_key,
                timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                benchmarkLogging=benchmarkLogging, **kwargs)

        elapsed = time.time() - start
        fileMBS = (totalBytes / 1e6) / elapsed
        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed)
        print "\n" + l
        h2o.cloudPerfH2O.message(l)

        # KMeans ****************************************
        if not DO_KMEANS:
            continue

        print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
        kwargs = {
            'k': 15,
            'max_iter': 3,
            'normalize': 1,
            'initialization': 'Furthest',
            'destination_key': 'junk.hex',
            # we get NaNs if whole col is NA
            'cols': 'C1, C2, C3, C4, C5, C6, C7',
            # reuse the same seed, to get deterministic results
            'seed': 265211114317615310,
        }

        # rotate the initialization scheme across trials
        if (trial % 3) == 0:
            kwargs['initialization'] = 'PlusPlus'
        elif (trial % 3) == 1:
            kwargs['initialization'] = 'Furthest'
        else:
            kwargs['initialization'] = None

        timeoutSecs = 4 * 3600
        params = kwargs
        paramsString = json.dumps(params)

        start = time.time()
        kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs,
            benchmarkLogging=benchmarkLogging, **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
            (elapsed / timeoutSecs) * 100)
        print "kmeans result:", h2o.dump_json(kmeans)

        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB,
            "KMeans", "trial " + str(trial), csvFilename, elapsed, paramsString)
        print l
        h2o.cloudPerfH2O.message(l)

        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
            self, kmeans, csvPathname, parseResult, 'd', **kwargs)
        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta,
            allowError=True, trial=trial)
        h2i.delete_keys_at_all_nodes()
def test_kmeans_benign(self): h2o.beta_features = True # fvec importFolderPath = "logreg" csvFilename = "benign.csv" hex_key = "benign.hex" csvPathname = importFolderPath + "/" + csvFilename # FIX! hex_key isn't working with Parse2 ? parseResult['destination_key'] not right? print "\nStarting", csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, header=1, timeoutSecs=180, doSummary=False) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\nStarting", csvFilename expected = [ ([ 8.86, 2.43, 35.53, 0.31, 13.22, 1.47, 1.33, 20.06, 13.08, 0.53, 2.12, 128.61, 35.33, 1.57 ], 49, None), ([ 33.47, 2.29, 50.92, 0.34, 12.82, 1.33, 1.36, 21.43, 13.30, 0.37, 2.52, 125.40, 43.91, 1.79 ], 87, None), ([ 27.64, 2.87, 48.11, 0.09, 11.80, 0.98, 1.51, 21.02, 12.53, 0.58, 2.89, 171.27, 42.73, 1.53 ], 55, None), ([ 26.00, 2.67, 46.67, 0.00, 13.00, 1.33, 1.67, 21.56, 11.44, 0.22, 2.89, 234.56, 39.22, 1.56 ], 9, None), ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01, 0.01) # loop, to see if we get same centers for trial in range(1): kmeansSeed = random.randint(0, sys.maxint) # kmeansSeed = 6655548259421773879 kwargs = { 'k': 4, 'initialization': 'PlusPlus', 'destination_key': 'benign_k.hex', # 'seed': 265211114317615310, 'max_iter': 50, 'seed': kmeansSeed, } kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs) ## h2o.verboseprint("kmeans result:", h2o.dump_json(kmeans)) modelView = h2o.nodes[0].kmeans_view(model='benign_k.hex') h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView)) model = modelView['model'] clusters = model['centers'] within_cluster_variances = model['within_cluster_variances'] total_within_SS = model['total_within_SS'] print "within_cluster_variances:", within_cluster_variances print "total_within_SS:", total_within_SS # make this fvec legal? 
(centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeans, csvPathname, parseResult, 'd', **kwargs)
def test_kmeans_prostate(self): h2o.beta_features = True # fvec importFolderPath = "logreg" csvFilename = "prostate.csv" hex_key = "prostate.hex" csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, header=1, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\nStarting", csvFilename # loop, to see if we get same centers expected = [ ([0.37, 65.77, 1.07, 2.23, 1.11, 10.49, 4.24, 6.31], 215, 36955), ([0.36, 66.44, 1.09, 2.21, 1.06, 10.84, 34.16, 6.31], 136, 46045), ([0.83, 66.17, 1.21, 2.86, 1.34, 73.30, 15.57, 7.31], 29, 33412), ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) for trial in range(1): # kmeansSeed = random.randint(0, sys.maxint) # actually can get a slightly better error sum with a different seed # this seed gets the same result as scikit kmeansSeed = 6655548259421773879 kwargs = { 'ignored_cols': 'ID', 'k': 3, # 'initialization': 'Furthest', 'initialization': 'PlusPlus', 'destination_key': 'prostate_k.hex', 'max_iter': 500, 'seed': kmeansSeed, # reuse the same seed, to get deterministic results (otherwise sometimes fails # 'seed': 265211114317615310} } # for fvec only? kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs) # FIX! how do I get the kmeans result? 
### print "kmeans result:", h2o.dump_json(kmeans) # can't do this # inspect = h2o_cmd.runInspect(key='prostate_k.hex') modelView = h2o.nodes[0].kmeans_view(model='prostate_k.hex') h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView)) model = modelView['model'] clusters = model['centers'] within_cluster_variances = model['within_cluster_variances'] total_within_SS = model['total_within_SS'] print "within_cluster_variances:", within_cluster_variances print "total_within_SS:", total_within_SS (centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeans, csvPathname, parseResult, 'd', **kwargs) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_summary2_NY0(self):
    # Summary2 on synthetic enum-ish columns built from Y/N/0 (and T/F)
    # choice triples, some with leading whitespace (which parse strips).
    # For each column, run a summary and check the NA count and the bin
    # total against what the dataset writer reported.
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # each tuple is the three values a column can take
    choicesList = [
        ('N', 'Y', '0'),
        ('n', 'y', '0'),
        ('F', 'T', '0'),
        ('f', 't', '0'),
        (' N', ' Y', ' 0'),
        (' n', ' y', ' 0'),
        (' F', ' T', ' 0'),
        (' f', ' t', ' 0'),
    ]

    # white space is stripped
    expectedList = [
        ('N', 'Y', '0'),
        ('n', 'y', '0'),
        ('F', 'T', '0'),
        ('f', 't', '0'),
        ('N', 'Y', '0'),
        ('n', 'y', '0'),
        ('F', 'T', '0'),
        ('f', 't', '0'),
    ]

    tryList = [
        # colname, (min, 25th, 50th, 75th, max)
        (100, 200, 'x.hex', choicesList[4], expectedList[4]),
        (100, 200, 'x.hex', choicesList[5], expectedList[5]),
        (100, 200, 'x.hex', choicesList[6], expectedList[6]),
        (100, 200, 'x.hex', choicesList[7], expectedList[7]),
        (100, 200, 'x.hex', choicesList[3], expectedList[3]),
        (1000, 200, 'x.hex', choicesList[2], expectedList[2]),
        (10000, 200, 'x.hex', choicesList[1], expectedList[1]),
        (100000, 200, 'x.hex', choicesList[0], expectedList[0]),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, choices, expected) in tryList:
        # max error = half the bin size?
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
            colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

        print "Creating random", csvPathname
        # returns the per-column NA counts it generated
        expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, choices)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=10, doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
        print pA.numRows, pA.numCols, pA.parse_key
        iA = h2o_cmd.InspectObj(pA.parse_key, expectedNumRows=rowCount, expectedNumCols=colCount,
            expectedMissinglist=[])
        print iA.missingList, iA.labelList, iA.numRows, iA.numCols

        for i in range(colCount):
            # walks across the columns triggering a summary on the col desired
            # runSummary returns a column object now. inspect and parse don't. They return json.
            # maybe eventually will make them return object? But I also pass expected stuff to them
            # should I pass expected to summary? no, more complex?
            co = h2o_cmd.runSummary(key=hex_key, column=i)
            print co.label, co.type, co.missing, co.domain, sum(co.bins)

            print "\nComparing column %s to expected" % i
            self.assertEqual(expectedNaCnt[i], co.missing, "Column %s Expected %s. missing: %s is incorrect" % \
                (i, expectedNaCnt[i], co.missing))
            # non-NA rows must all land in some bin
            self.assertEqual(rowCount - expectedNaCnt[i], sum(co.bins))

        h2p.green_print("\nDone with trial", trial)
        trial += 1
        h2i.delete_keys_at_all_nodes()
def test_parse_summary_manyfiles_1_fvec(self):
    # Import/parse one nflx gz file, then exercise Inspect, goodX column
    # analysis, Summary, and StoreView on the parsed key, checking the
    # expected row/col counts (100000 x 542).
    # these will be used as directory imports/parse
    csvDirlist = [
        ("manyfiles-nflx-gz", 600),
    ]
    trial = 0
    for (csvDirname, timeoutSecs) in csvDirlist:
        csvPathname = csvDirname + "/file_1.dat.gz"
        (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets',
            path=csvPathname, schema='local', timeoutSecs=timeoutSecs)
        print "\nTrying StoreView after the import hdfs"
        h2o_cmd.runStoreView(timeoutSecs=120)

        trialStart = time.time()
        # PARSE****************************************
        hex_key = csvDirname + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
            schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs,
            retryDelaySecs=10, pollTimeoutSecs=120, doSummary=False)
        elapsed = time.time() - start
        print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # INSPECT******************************************
        # We should be able to see the parse result?
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
        print "Inspect:", parseResult[
            'destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        numRows = inspect['numRows']
        numCols = inspect['numCols']
        self.assertEqual(numCols, 542)
        self.assertEqual(numRows, 100000)

        # gives us some reporting on missing values, constant values, to see if we have x specified well
        # figures out everything from parseResult['destination_key']
        # needs y to avoid output column (which can be index or name)
        # assume all the configs have the same y..just check with the firs tone
        goodX = h2o_glm.goodXFromColumnInfo(
            y=54, key=parseResult['destination_key'], timeoutSecs=300)

        # SUMMARY****************************************
        # pass numRows, so we know when na cnt means row is all na's
        summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360,
            numCols=numCols, numRows=numRows)

        # STOREVIEW***************************************
        print "\nTrying StoreView after the parse"
        h2o_cmd.runStoreView(timeoutSecs=120)

        print "Trial #", trial, "completed in", time.time(
        ) - trialStart, "seconds."
        trial += 1
def test_parse_time(self):
    # Round-trip test for time-typed columns: parse a synthetic date csv,
    # csv_download it back out (H2O writes times as internal numbers),
    # re-parse the download, and check row/col/missing counts match.
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_time.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    headerData = None
    colCount = COLS
    # rowCount = 1000
    rowCount = ROWS
    write_syn_dataset(csvPathname, rowCount, colCount, headerData)

    for trial in range (20):
        rowData = rand_rowData()
        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        src_key = csvFilename + "_" + str(trial)
        hex_key = csvFilename + "_" + str(trial) + ".hex"

        start = time.time()
        parseResultA = h2i.import_parse(path=csvPathname, schema='put', src_key=src_key, hex_key=hex_key)
        print "\nA trial #", trial, "parse end on ", csvFilename, 'took', time.time() - start, 'seconds'
        inspect = h2o_cmd.runInspect(key=hex_key)
        missingValuesListA = h2o_cmd.infoFromInspect(inspect, csvPathname)
        print "missingValuesListA", missingValuesListA

        numColsA = inspect['numCols']
        numRowsA = inspect['numRows']
        byteSizeA = inspect['byteSize']

        self.assertEqual(missingValuesListA, [], "missingValuesList should be empty")
        self.assertEqual(numColsA, colCount)
        self.assertEqual(numRowsA, rowCount)

        # do a little testing of saving the key as a csv
        csvDownloadPathname = SYNDATASETS_DIR + "/csvDownload.csv"
        h2o.nodes[0].csv_download(src_key=hex_key, csvPathname=csvDownloadPathname)

        # remove the original parsed key. source was already removed by h2o
        h2o.nodes[0].remove_key(hex_key)

        # interesting. what happens when we do csv download with time data?
        start = time.time()
        parseResultB = h2i.import_parse(path=csvDownloadPathname, schema='put', src_key=src_key, hex_key=hex_key)
        print "B trial #", trial, "parse end on ", csvFilename, 'took', time.time() - start, 'seconds'
        inspect = h2o_cmd.runInspect(key=hex_key)
        missingValuesListB = h2o_cmd.infoFromInspect(inspect, csvPathname)
        print "missingValuesListB", missingValuesListB

        numColsB = inspect['numCols']
        numRowsB = inspect['numRows']
        byteSizeB = inspect['byteSize']

        self.assertEqual(missingValuesListA, missingValuesListB,
            "missingValuesList mismatches after re-parse of downloadCsv result")
        self.assertEqual(numColsA, numColsB,
            "numCols mismatches after re-parse of downloadCsv result")
        # H2O adds a header to the csv created. It puts quotes around the col numbers if no header
        # so I guess that's okay. So allow for an extra row here.
        self.assertEqual(numRowsA, numRowsB,
            "numRowsA: %s numRowsB: %s mismatch after re-parse of downloadCsv result" % (numRowsA, numRowsB) )

        print "H2O writes the internal format (number) out for time."
        # ==> syn_time.csv <==
        # 31-Oct-49, 25-NOV-10, 08-MAR-44, 23-Nov-34, 19-Feb-96, 23-JUN-30
        # 31-Oct-49, 25-NOV-10, 08-MAR-44, 23-Nov-34, 19-Feb-96, 23-JUN-30

        # ==> csvDownload.csv <==
        # "0","1","2","3","4","5"
        # 2.5219584E12,1.293264E12,2.3437116E12,2.0504736E12,3.9829788E12,1.9110204E12
        if 1==0:
            # extra line for column headers?
            self.assertEqual(byteSizeA, byteSizeB,
                "byteSize mismatches after re-parse of downloadCsv result %d %d" % (byteSizeA, byteSizeB) )

        # FIX! should do some comparison of values?
        # maybe can use exec to checksum the columns and compare column list.
        # or compare to expected values? (what are the expected values for the number for time inside h2o?)

        # FIX! should compare the results of the two parses. The infoFromInspect result?
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        h2o.check_sandbox_for_errors()
def sub_c2_fvec_long(self): # a kludge h2o.setup_benchmark_log() avgMichalSize = 116561140 bucket = 'home-0xdiag-datasets' ### importFolderPath = 'more1_1200_link' importFolderPath = 'manyfiles-nflx-gz' print "Using .gz'ed files in", importFolderPath csvFilenameList= [ ("*[1][0-4][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 1800), # ("*[1][0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600), ] if LOG_MACHINE_STATS: benchmarkLogging = ['cpu', 'disk', 'network'] else: benchmarkLogging = [] pollTimeoutSecs = 120 retryDelaySecs = 10 for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): csvPathname = importFolderPath + "/" + csvFilepattern # double import still causing problems? # (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local') # importFullList = importResult['files'] # importFailList = importResult['fails'] # print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList) # this accumulates performance stats into a benchmark log over multiple runs # good for tracking whether we're getting slower or faster h2o.cloudPerfH2O.change_logfile(csvFilename) h2o.cloudPerfH2O.message("") h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------") start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "Parse result['destination_key']:", parseResult['destination_key'] h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) if totalBytes is not None: fileMBS = (totalBytes/1e6)/elapsed msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed) print msg h2o.cloudPerfH2O.message(msg) if DO_GLM: # these are all the columns that are enums in the dataset...too many for GLM! x = range(542) # don't include the output column # remove the output too! (378) ignore_x = [] for i in [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541]: x.remove(i) ignore_x.append(i) # plus 1 because we are no longer 0 offset x = ",".join(map(lambda x: "C" + str(x+1), x)) ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x)) GLMkwargs = { 'ignored_cols': ignore_x, 'family': 'binomial', 'response': 'C379', 'max_iter': 4, 'n_folds': 1, 'family': 'binomial', 'alpha': 0.2, 'lambda': 1e-5 } # convert to binomial execExpr="A.hex=%s" % parseResult['destination_key'] h2e.exec_expr(execExpr=execExpr, timeoutSecs=180) execExpr="A.hex[,%s]=(A.hex[,%s]>%s)" % ('379', '379', 15) h2e.exec_expr(execExpr=execExpr, timeoutSecs=180) aHack = {'destination_key': "A.hex"} start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs) elapsed = time.time() - start h2o.check_sandbox_for_errors() h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed) print msg h2o.cloudPerfH2O.message(msg) h2o_cmd.checkKeyDistribution()
def test_GLM_many_cols(self):
    # GLM (v2 REST API via h2o.n0.build_model) on wide random synthetic
    # datasets (1000 and 3000 cols). The last column is the binomial
    # response. After building, fetch the model, compute model metrics,
    # and run a prediction on the training frame.
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, hex_key, timeoutSecs)
    tryList = [
        # (2, 100, 'cA', 300),
        # (4, 200, 'cA', 300),
        (10000, 1000, 'cB', 300),
        (10000, 3000, 'cC', 500),
    ]

    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
            colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=180, doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        expected = []
        allowedDelta = 0

        labelListUsed = list(labelList)
        # last column is the response; drop it from the predictor list
        response = 'C' + str(len(labelListUsed) - 1)  # last column
        labelListUsed.remove(response)
        numColsUsed = numCols - 1
        for trial in range(1):
            # family [u'gaussian', u'binomial', u'poisson', u'gamma', u'tweedie']
            # link [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie']
            # can we do classification with probabilities?
            # are only lambda and alpha grid searchable?
            parameters = {
                'validation_frame': parse_key,
                'ignored_columns': None,
                # FIX! for now just use a column that's binomial
                'response_column': response,  # can't take index now?
                # FIX! when is this needed? redundant for binomial?
                'balance_classes': False,
                'max_after_balance_size': None,
                'standardize': False,
                'family': 'binomial',
                'link': None,
                'alpha': '[1e-4]',
                'lambda': '[0.5,0.25, 0.1]',
                'prior1': None,
                'lambda_search': None,
                'nlambdas': None,
                'lambda_min_ratio': None,
                # 'use_all_factor_levels': False,
            }

            model_key = 'many_cols_glm.hex'
            bmResult = h2o.n0.build_model(algo='glm', model_id=model_key,
                training_frame=parse_key, parameters=parameters, timeoutSecs=300)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')
            h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed)

            cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            mm = OutputObj(mmResult, 'mm')

            prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
def test_RF_many_cols_enum(self):
    # SpeeDRF on wide random enum datasets: for each dataset shape, parse
    # it, then sweep max_depth over [5, 10, 20, 40] with 10 trees, logging
    # parse/train times to the benchmark log, collecting pct-wrong and
    # elapsed time per depth, and plotting the last dataset's sweep.
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # enum levels used by write_syn_dataset
    translateList = [
        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
        'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u'
    ]
    # (rowCount, colCount, hex_key, timeoutSecs)
    tryList = [
        (10000, 100, 'cA', 300),
        (10000, 300, 'cB', 500),
        # (10000, 500, 'cC', 700),
        # (10000, 700, 'cD', 3600),
        # (10000, 900, 'cE', 3600),
        # (10000, 1000, 'cF', 3600),
        # (10000, 1300, 'cG', 3600),
        # (10000, 1700, 'cH', 3600),
        # (10000, 2000, 'cI', 3600),
        # (10000, 2500, 'cJ', 3600),
        (10000, 3000, 'cK', 3600),
    ]

    ### h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
            colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList)

        # PARSE train****************************************
        start = time.time()
        xList = []
        eList = []
        fList = []
        modelKey = 'RFModelKey'

        # Parse (train)****************************************
        start = time.time()
        parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put',
            header=0, hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)
        elapsed = time.time() - start
        print "train parse end on ", csvPathname, 'took', elapsed, 'seconds', \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "train parse result:", parseTrainResult['destination_key']

        # Logging to a benchmark file
        algo = "Parse"
        l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
        print l
        h2o.cloudPerfH2O.message(l)

        inspect = h2o_cmd.runInspect(
            key=parseTrainResult['destination_key'])
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        numRows = inspect['numRows']
        numCols = inspect['numCols']

        ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])

        # RF(train iterate)****************************************
        ntrees = 10
        for max_depth in [5, 10, 20, 40]:
            params = {
                'nbins': 1024,
                'classification': 1,
                'ntrees': ntrees,
                'max_depth': max_depth,
                # last column is the response
                'response': 'C' + str(numCols - 1),
                'ignored_cols_by_name': None,
            }
            print "Using these parameters for RF: ", params
            kwargs = params.copy()

            trainStart = time.time()
            rfResult = h2o_cmd.runSpeeDRF(parseResult=parseTrainResult,
                timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
            trainElapsed = time.time() - trainStart
            print "RF training completed in", trainElapsed, "seconds. On dataset: ", csvPathname

            # Logging to a benchmark file
            algo = "RF " + " ntrees=" + str(ntrees) + " max_depth=" + str(
                max_depth)
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, trainElapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            # normalize the SpeeDRF response so the drf-style accessors below work
            rfResult["drf_model"] = rfResult.pop("speedrf_model")
            errsLast = rfResult['drf_model']['errs'][-1]
            print "RF 'errsLast'", errsLast

            cm = rfResult['drf_model']['cms'][-1][
                '_arr']  # use the last one
            pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)

            # xList.append(ntrees)
            xList.append(max_depth)
            eList.append(pctWrongTrain)
            fList.append(trainElapsed)

    # just plot the last one
    if 1 == 1:
        xLabel = 'max_depth'
        eLabel = 'pctWrong'
        fLabel = 'trainElapsed'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_c9_GLM_airlines_fvec(self): files = [ ('airlines', 'airlines_all.csv', 'airlines_all.hex', 1800, 'IsDepDelayed') ] for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files: # PARSE train**************************************** csvPathname = importFolderPath + "/" + csvFilename start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # GLM (train)**************************************** params = { # 'lambda': 1e-4, # 'alpha': 0.5, 'lambda': 1e-8, 'alpha': 0.0, 'max_iter': 10, 'n_folds': 3, 'family': 'binomial', 'destination_key': "GLMKEY", 'response': response, 'ignored_cols': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed' } kwargs = params.copy() timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs,**kwargs) elapsed = time.time() - start print "GLM training completed in", elapsed, "seconds. On dataset: ", csvFilename h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) modelKey = glm['glm_model']['_key'] submodels = glm['glm_model']['submodels'] # hackery to make it work when there's just one validation = submodels[-1]['validation'] best_threshold = validation['best_threshold'] thresholds = validation['thresholds'] # have to look up the index for the cm, from the thresholds list best_index = None for i,t in enumerate(thresholds): if t == best_threshold: best_index = i break cms = validation['_cms'] cm = cms[best_index] pctWrong = h2o_gbm.pp_cm_summary(cm['_arr']); # FIX! should look at prediction error/class error? 
# self.assertLess(pctWrong, 9,"Should see less than 40% error") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm['_arr']) # Score ******************************* # this messes up if you use case_mode/case_vale above predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key=trainKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=trainKey, vactual=response, predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); # self.assertLess(pctWrong, 40,"Should see less than 40% error") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) h2i.delete_keys_at_all_nodes(timeoutSecs=600)
def test_GLM_allstate_s3n_thru_hdfs(self): bucket = 'home-0xdiag-datasets' importFolderPath = 'allstate' csvFilename = "train_set.csv" csvPathname = importFolderPath + "/" + csvFilename timeoutSecs = 500 trialMax = 3 for trial in range(trialMax): trialStart = time.time() hex_key = csvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60) elapsed = time.time() - start print "parse end on ", hex_key, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] kwargs = { # allstate claim last col 'y': 34, 'case_mode': '>', 'case': 0, 'family': 'binomial', 'link': 'logit', 'n_folds': 2, 'max_iter': 8, 'beta_epsilon': 1e-3 } timeoutSecs = 500 # L2 kwargs.update({'alpha': 0, 'lambda': 0}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noise=('JStack', None), **kwargs) elapsed = time.time() - start print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) h2o.check_sandbox_for_errors() # Elastic kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noise=('JStack', None), **kwargs) elapsed = time.time() - start print "glm (Elastic) end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) h2o.check_sandbox_for_errors() # L1 kwargs.update({'alpha': 1.0, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noise=('JStack', None), **kwargs) elapsed = time.time() - start print "glm (L1) end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) h2o.check_sandbox_for_errors() print "Trial #", trial, "completed in", time.time() - trialStart, "seconds.", \
def test_four_billion_rows_fvec(self):
    """Parse a 4-billion-row csv, sanity-check its geometry (rows/cols/bytes),
    then run KMeans and a binomial GLM against it."""
    h2o.beta_features = True
    timeoutSecs = 1500

    importFolderPath = "billions"
    csvFilenameList = [
        "four_billion_rows.csv",
    ]
    for csvFilename in csvFilenameList:
        csvPathname = importFolderPath + "/" + csvFilename
        start = time.time()

        # Parse*********************************
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
                                       schema='local', timeoutSecs=timeoutSecs, pollTimeoutSecs=180)
        elapsed = time.time() - start
        print "Parse result['destination_key']:", parseResult['destination_key']
        print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)

        # Inspect*********************************
        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
        numCols = inspect['numCols']
        numRows = inspect['numRows']
        byteSize = inspect['byteSize']
        print "\n" + csvFilename, \
            " numRows:", "{:,}".format(numRows), \
            " numCols:", "{:,}".format(numCols), \
            " byteSize:", "{:,}".format(byteSize)

        expectedRowSize = numCols * 1  # plus output
        # expectedValueSize = expectedRowSize * numRows
        # hard-coded expected byte size for this exact dataset
        expectedValueSize = 8001271520
        self.assertEqual(byteSize, expectedValueSize,
            msg='byteSize %s is not expected: %s' % \
            (byteSize, expectedValueSize))

        summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], timeoutSecs=timeoutSecs)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

        # exact geometry checks: 2 cols, 4B rows
        self.assertEqual(2, numCols,
            msg="generated %s cols (including output). parsed to %s cols" % (2, numCols))
        self.assertEqual(4 * 1000000000, numRows,
            msg="generated %s rows, parsed to %s rows" % (4 * 1000000000, numRows))

        # KMeans*********************************
        kwargs = {
            'k': 3,
            'initialization': 'Furthest',
            'max_iter': 4,
            'normalize': 0,
            'destination_key': 'junk.hex',
            'seed': 265211114317615310,
        }
        timeoutSecs = 900
        start = time.time()
        kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)

        # GLM*********************************
        print "\n" + csvFilename
        kwargs = {
            'response': 'C1',
            'n_folds': 0,
            'family': 'binomial',
        }
        # one coefficient is checked a little more
        colX = 1

        # convert to binomial: copy to A.hex, then rewrite C1 as (C1 == 1)
        execExpr = "A.hex=%s" % parseResult['destination_key']
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=60)
        execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % ('C1', 'C1', 1)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=60)
        # minimal stand-in for a parseResult dict pointing at A.hex
        aHack = {'destination_key': "A.hex"}

        # L2
        timeoutSecs = 900
        kwargs.update({'alpha': 0, 'lambda': 0})
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
        h2o_glm.simpleCheckGLM(self, glm, "C" + str(colX), **kwargs)
def test_many_fp_formats_libsvm_fvec(self):
    """Generate sparse synthetic datasets in a randomly-chosen fp format,
    parse them, and verify per-column metadata and column sums against
    what the generator recorded.

    NOTE(review): classMin/classMax/valMin/valMax, zeroList/exprList,
    DO_SUMMARY, and write_syn_dataset are module-level names defined
    elsewhere in this file.
    """
    h2o.beta_features = True
    # h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (10, 10, 'cA', 30, 'sparse50'),
        (100, 10, 'cB', 30, 'sparse'),
        (100000, 100, 'cC', 30, 'sparse'),
        (1000, 10, 'cD', 30, 'sparse50'),
        (100, 100, 'cE', 30, 'sparse'),
        (100, 100, 'cF', 30, 'sparse50'),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
        NUM_CASES = h2o_util.fp_format()
        # single randomly-picked fp format case per dataset
        for sel in [random.randint(0, NUM_CASES - 1)]:  # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            (synColSumDict, colNumberMax) = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel, distribution)

            selKey2 = hex_key + "_" + str(sel)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2, timeoutSecs=timeoutSecs)
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            print "\n" + csvFilename

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the firs tone
            goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseResult['destination_key'], timeoutSecs=300)

            if DO_SUMMARY:
                summaryResult = h2o_cmd.runSummary(key=selKey2, timeoutSecs=360)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # we might have added some zeros at the end, that our colNumberMax won't include
            print synColSumDict.keys(), colNumberMax
            self.assertEqual(colNumberMax + 1, numCols,
                msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax + 1, numCols))

            # Exec (column sums)*************************************************
            h2e.exec_zero_list(zeroList)
            # how do we know the max dimension (synthetic may not generate anything for the last col)
            # use numCols?. numCols should be <= colCount.
            colSumList = h2e.exec_expr_list_across_cols(None, exprList, selKey2,
                maxCol=colNumberMax + 1, timeoutSecs=timeoutSecs)

            self.assertEqual(rowCount, numRows,
                msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))

            # need to fix this for compare to expected
            # we should be able to keep the list of fp sums per col above
            # when we generate the dataset
            print "\ncolSumList:", colSumList
            print "\nsynColSumDict:", synColSumDict

            for k, v in synColSumDict.iteritems():
                if k > colNumberMax:  # ignore any extra 0 cols at the end
                    continue

                # k should be integers that match the number of cols
                self.assertTrue(k >= 0 and k < len(colSumList),
                    msg="k: %s len(colSumList): %s numCols: %s" % (k, len(colSumList), numCols))

                # build the expected per-column metadata for this column index
                syn = {}
                if k == 0:
                    syn['name'] = "C1"
                    syn['type'] = {'Int'}
                    syn['min'] = classMin
                    syn['max'] = classMax
                    # don't check these for the col 0 'Target'
                    # syn['scale'] = {1}
                elif k == 1:  # we forced this to always be 0
                    syn['name'] = "C2"
                    syn['type'] = {'Int'}
                    syn['min'] = 0
                    syn['max'] = 0
                    # syn['scale'] = {1}
                else:
                    syn['name'] = "C" + str(k + 1)
                    syn['type'] = {'Int', 'Real'}
                    syn['min'] = valMin
                    syn['max'] = valMax
                    # syn['scale'] = {1,10,100,1000}

                syn['naCnt'] = 0
                syn['cardinality'] = -1
                # syn['min'] = 0
                # syn['max'] = 0
                # syn['mean'] = 0

                cols = inspect['cols'][k]
                for synKey in syn:
                    # we may not see the min/max range of values that was bounded by our gen, but
                    # we can check that it's a subset of the allowed range
                    if synKey == 'min':
                        self.assertTrue(syn[synKey] <= cols[synKey],
                            msg='col %s %s %s should be <= %s' % (k, synKey, cols[synKey], syn[synKey]))
                    elif synKey == 'max':
                        self.assertTrue(syn[synKey] >= cols[synKey],
                            msg='col %s %s %s should be >= %s' % (k, synKey, cols[synKey], syn[synKey]))
                    elif synKey == 'type':
                        if cols[synKey] not in syn[synKey]:
                            print "cols min/max:", cols['min'], cols['max']
                            print "syn min/max:", syn['min'], syn['max']
                            raise Exception('col %s %s %s should be in this allowed %s' % (k, synKey, cols[synKey], syn[synKey]))
                    else:
                        self.assertEqual(syn[synKey], cols[synKey],
                            msg='col %s %s %s should be %s' % (k, synKey, cols[synKey], syn[synKey]))

                colSum = colSumList[k]
                print "\nComparing col", k, "sums:", v, colSum
                # Even though we're comparing floating point sums, the operations probably should have
                # been done in same order, so maybe the comparison can be exact (or not!)
                self.assertAlmostEqual(float(v), colSum, places=0,
                    msg='%0.6f col sum is not equal to expected %0.6f' % (v, colSum))
def GLM_syn_eqns_data(self, ALGO='binomial', DATA_VALUE_MIN=-1, DATA_VALUE_MAX=1,
                      COEFF_VALUE_MIN=-1, COEFF_VALUE_MAX=1,
                      INTCPT_VALUE_MIN=-1, INTCPT_VALUE_MAX=1,
                      DATA_DISTS='unique_pos_neg'):
    """Generate a dataset from a random linear equation, fit a GLM to it, and
    check the recovered coefficients/intercept are close to the generated ones.

    NOTE(review): BINS, SEED, gen_rand_equation and write_syn_dataset are
    module-level names defined elsewhere in this file.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    if ALGO == 'poisson':
        tryList = [
            (50000, 5, 'cD', 300),
        ]
    else:
        tryList = [
            # (100, 1, 'cA', 300),
            # (100, 25, 'cB', 300),
            # (1000, 25, 'cC', 300),
            # 50 fails, 40 fails
            # (10000, 50, 'cD', 300),
            # 30 passes
            # (10000, 30, 'cD', 300),
            # 200 passed
            (500, 30, 'cD', 300),
            (500, 30, 'cD', 300),
        ]

    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        # encode the generation parameters into the filename for traceability
        modeString = \
            "_Bins" + str(BINS) + \
            "_Dmin" + str(DATA_VALUE_MIN) + \
            "_Dmax" + str(DATA_VALUE_MAX) + \
            "_Cmin" + str(COEFF_VALUE_MIN) + \
            "_Cmax" + str(COEFF_VALUE_MAX) + \
            "_Imin" + str(INTCPT_VALUE_MIN) + \
            "_Imax" + str(INTCPT_VALUE_MAX) + \
            "_Ddist" + str(DATA_DISTS)
        print "modeString:", modeString

        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + modeString + "_" + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname, \
            "using random coefficients and intercept and logit eqn. for output"
        (coefficientsGen, interceptGen) = gen_rand_equation(colCount,
            INTCPT_VALUE_MIN, INTCPT_VALUE_MAX, COEFF_VALUE_MIN, COEFF_VALUE_MAX, SEEDPERFILE)
        print coefficientsGen, interceptGen
        write_syn_dataset(csvPathname, rowCount, colCount, coefficientsGen, interceptGen,
            DATA_VALUE_MIN, DATA_VALUE_MAX, DATA_DISTS, ALGO, SEED)

        parseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, schema='put', timeoutSecs=60)
        print csvFilename, 'parse time:', parseResult['response']['time']
        print "Parse result['destination_key']:", parseResult['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename

        # response is the last (generated) column
        y = colCount
        print "GLM is ignoring the thresholds I give it? deciding what's best?"
        kwargs = {
            'family': ALGO,
            'y': y,
            'max_iter': 10,
            'lambda': 0,
            'alpha': 0,
            'n_folds': 0,
            'beta_epsilon': 1e-4,
            # 'thresholds': 0.5,
        }

        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, 'C1', **kwargs)
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

        # looser tolerances for poisson than binomial
        if ALGO == 'binomial':
            deltaCoeff = 0.1
            deltaIntcpt = 0.2
        else:  # poisson needs more?
            deltaCoeff = 0.4
            deltaIntcpt = 1.0

        for i, c in enumerate(coefficients):
            g = coefficientsGen[i]  # generated
            print "coefficient[%d]: %8.4f, generated: %8.4f, delta: %8.4f" % (i, c, g, abs(g - c))
            self.assertAlmostEqual(c, g, delta=deltaCoeff,
                msg="not close enough. coefficient[%d]: %s, generated %s" % (i, c, g))

        c = intercept
        g = interceptGen
        print "intercept: %8.4f, generated: %8.4f, delta: %8.4f" % (c, g, abs(g - c))
        print "need a larger delta compare for intercept?"
        self.assertAlmostEqual(c, g, delta=deltaIntcpt,
            msg="not close enough. intercept: %s, generated %s" % (c, g))
def test_DL_airlines_small(self):
    """Train a deeplearning model on AirlinesTrain, compute metrics and
    predictions against AirlinesTest, and check the validation error is
    within a relative tolerance of an expected value."""
    h2o.nodes[0].remove_all_keys()
    csvPathname_train = 'airlines/AirlinesTrain.csv.zip'
    csvPathname_test = 'airlines/AirlinesTest.csv.zip'
    hex_key = 'train.hex'
    validation_key = 'validation.hex'
    timeoutSecs = 60
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train,
                                   hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)
    pA = h2o_cmd.ParseObj(parseResult)
    iA = h2o_cmd.InspectObj(pA.parse_key)

    parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test,
                                    hex_key=validation_key, timeoutSecs=timeoutSecs, doSummary=False)
    pAV = h2o_cmd.ParseObj(parseResultV)
    iAV = h2o_cmd.InspectObj(pAV.parse_key)

    # Making random id
    identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
    model_key = 'deeplearning_' + identifier + '.hex'

    parameters = {
        'validation_frame': validation_key,  # KeyIndexed None
        'ignored_columns': "['IsDepDelayed_REC']",  # string[] None
        'response_column': 'IsDepDelayed',  # string None
        'loss': 'CrossEntropy'
    }
    expectedErr = 0.32  ## expected validation error for the above model
    relTol = 0.15  ## 15% rel. error tolerance due to Hogwild!

    timeoutSecs = 60
    start = time.time()
    bmResult = h2o.n0.build_model(algo='deeplearning', destination_key=model_key,
                                  training_frame=hex_key, parameters=parameters,
                                  timeoutSecs=timeoutSecs)
    bm = OutputObj(bmResult, 'bm')
    print 'deep learning took', time.time() - start, 'seconds'

    modelResult = h2o.n0.models(key=model_key)
    model = OutputObj(modelResult['models'][0]['output'], 'model')
    # print "model:", dump_json(model)

    cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=validation_key, timeoutSecs=60)
    cmm = OutputObj(cmmResult, 'cmm')

    mmResult = h2o.n0.model_metrics(model=model_key, frame=validation_key, timeoutSecs=60)
    mm = OutputObj(mmResult['model_metrics'][0], 'mm')

    prResult = h2o.n0.predict(model=model_key, frame=validation_key, timeoutSecs=60)
    pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

    h2o_cmd.runStoreView()

    # relative-error check; exact equality short-circuits the tolerance test
    actualErr = model['errors']['valid_err']
    print "expected classification error: " + format(expectedErr)
    print "actual classification error: " + format(actualErr)
    if actualErr != expectedErr and abs((expectedErr - actualErr) / expectedErr) > relTol:
        raise Exception("Scored classification error of %s is not within %s %% relative error of %s" %
                        (actualErr, float(relTol) * 100, expectedErr))
def test_hdfs_cdh5(self):
    """Import and parse a list of datasets from HDFS; optionally (DO_EXPORT,
    a module-level flag) export each parsed key back to HDFS and re-parse it.

    NOTE(review): nesting of the export/re-load tail under DO_EXPORT is
    reconstructed from the flattened source -- confirm against VCS history.
    """
    print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
    print "\nYou can try running as hduser/hduser if fail"

    # larger set in my local dir
    # fails because classes aren't integers
    # "allstate_claim_prediction_train_set.zip",
    csvFilenameAll = [
        # "3G_poker_shuffle"
        ("and-testing.data", 60),
        ### "arcene2_train.both",
        ### "arcene_train.both",
        ### "bestbuy_test.csv",
        ("covtype.data", 60),
        ("covtype4x.shuffle.data", 60),
        # "four_billion_rows.csv",
        ("hhp.unbalanced.012.data.gz", 60),
        ("hhp.unbalanced.data.gz", 60),
        ("leads.csv", 60),
        # ("covtype.169x.data", 1200),
        ("prostate_long_1G.csv", 200),
        ("airlines_all.csv", 1200),
    ]

    # pick 8 randomly!
    if (1 == 0):
        csvFilenameList = random.sample(csvFilenameAll, 8)
    # Alternatively: do the list in order! Note the order is easy to hard
    else:
        csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    # h2b.browseTheCloud()

    trial = 0
    print "try importing /tmp2"
    d = h2i.import_only(path="tmp2/*", schema='hdfs', timeoutSecs=1000)
    for (csvFilename, timeoutSecs) in csvFilenameList:
        # creates csvFilename.hex from file in hdfs dir
        print "Loading", csvFilename, 'from HDFS'
        start = time.time()
        hex_key = "a.hex"
        csvPathname = "datasets/" + csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key, timeoutSecs=1000)
        print "hdfs parse of", csvPathname, "took", time.time() - start, 'secs'

        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        if DO_EXPORT:
            start = time.time()
            print "Saving", csvFilename, 'to HDFS'
            print "Using /tmp2 to avoid the '.' prefixed files in /tmp2 (kills import)"
            print "Unique per-user to avoid permission issues"
            username = getpass.getuser()
            csvPathname = "tmp2/a%s.%s.csv" % (trial, username)
            # reuse the file name to avoid running out of space
            csvPathname = "tmp2/a%s.%s.csv" % ('_h2o_export_files', username)

            path = "hdfs://" + h2o.nodes[0].hdfs_name_node + "/" + csvPathname
            h2o.nodes[0].export_files(src_key=hex_key, path=path, force=1, timeoutSecs=timeoutSecs)
            print "export_files of", hex_key, "to", path, "took", time.time() - start, 'secs'
            trial += 1

            print "Re-Loading", csvFilename, 'from HDFS'
            start = time.time()
            hex_key = "a2.hex"
            time.sleep(2)
            d = h2i.import_only(path=csvPathname, schema='hdfs', timeoutSecs=1000)
            print h2o.dump_json(d)
            parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key, timeoutSecs=1000)
            print "hdfs re-parse of", csvPathname, "took", time.time() - start, 'secs'
def test_GLM_mnist_reals(self): importFolderPath = "mnist" csvFilelist = [ ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** csvPathname = importFolderPath + "/" + testCsvFilename testKey = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=testKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() csvPathname = importFolderPath + "/" + trainCsvFilename parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # GLM**************************************** print "This is the pruned x we'll use" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) print "x:", x params = { 'x': x, 'y': y, 'case_mode': '=', 'case': 0, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.0, 'max_iter': 5, 'thresholds': 0.5, 'n_folds': 1, 'weight': 1, 'beta_epsilon': 1.0E-4, } for c in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]: kwargs = params.copy() print "Trying binomial with case:", c kwargs['case'] = c timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] start = time.time() glmScore = h2o_cmd.runGLMScore(key=testKey, model_key=modelKey, thresholds="0.5", timeoutSecs=60) elapsed = time.time() - start print "GLMScore in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)
def test_fp_many_cols_fvec(self):
    """Generate very wide synthetic csvs (200k up to 1.2M cols), parse each,
    and verify the parsed row/col counts match what was generated.

    NOTE(review): H2O_SUPPORTS_OVER_500K_COLS and write_syn_dataset are
    module-level names defined elsewhere in this file.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    if H2O_SUPPORTS_OVER_500K_COLS:
        tryList = [
            (100, 200000, 'cG', 120, 120),
            (100, 300000, 'cH', 120, 120),
            (100, 400000, 'cI', 120, 120),
            (100, 500000, 'cJ', 120, 120),
            (100, 700000, 'cL', 120, 120),
            (100, 800000, 'cM', 120, 120),
            (100, 900000, 'cN', 120, 120),
            (100, 1000000, 'cO', 120, 120),
            (100, 1200000, 'cK', 120, 120),
        ]
    else:
        print "Restricting number of columns tested to <=500,000"
        tryList = [
            (100, 200000, 'cG', 400, 400),
            (100, 300000, 'cH', 400, 400),
            (100, 400000, 'cI', 400, 400),
            (100, 500000, 'cJ', 400, 400),
        ]

    for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        sel = 0
        csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel)

        start = time.time()
        print csvFilename, "parse starting"
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
                                       timeoutSecs=timeoutSecs, doSummary=False)
        h2o.check_sandbox_for_errors()
        print "Parse and summary:", parseResult['destination_key'], "took", time.time() - start, "seconds"

        # We should be able to see the parse result?
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs2)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect['numRows']), \
            " numCols:", "{:,}".format(inspect['numCols'])

        # should match # of cols in header or ??
        self.assertEqual(inspect['numCols'], colCount,
            "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount))
        self.assertEqual(inspect['numRows'], rowCount,
            "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
            (inspect['numRows'], rowCount))
def test_GBM_basic_regress(self):
    """Build a 2-tree GBM on the shuffled covtype training set, then pull the
    model, model metrics, and predictions back out of the cluster."""
    timeoutSecs = 1800
    bucket = 'home-0xdiag-datasets'
    importFolderPath = 'standard'
    trainFilename = 'covtype.shuffled.90pct.data'
    train_key = 'covtype.train.hex'
    model_key = 'GBMModelKey'

    # parse the training file, then collect its geometry via Inspect
    csvPathname = importFolderPath + "/" + trainFilename
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                                   hex_key=train_key, timeoutSecs=timeoutSecs)
    parsedObj = h2o_cmd.ParseObj(parseResult)
    parse_key = parsedObj.parse_key
    inspectObj = h2o_cmd.InspectObj(parse_key)
    numRows = inspectObj.numRows
    numCols = inspectObj.numCols
    labelList = inspectObj.labelList
    labelListUsed = list(labelList)
    numColsUsed = numCols

    parameters = {
        'validation_frame': train_key,
        'ignored_columns': None,
        'response_column': 'C55',
        # 'balance_classes':
        # 'max_after_balance_size':
        'ntrees': 2,
        'max_depth': 10,
        'min_rows': 3,
        'nbins': 40,
        'learn_rate': 0.2,
        # FIX! doesn't like it?
        # 'loss': 'Bernoulli',
        # FIX..no variable importance for GBM yet?
        # 'variable_importance': False,
        # 'seed':
    }

    # note: overrides the earlier model_key value, matching historic behavior
    model_key = 'covtype_gbm.hex'
    bm = OutputObj(
        h2o.n0.build_model(algo='gbm', destination_key=model_key,
                           training_frame=parse_key, parameters=parameters,
                           timeoutSecs=60),
        'bm')

    model = OutputObj(h2o.n0.models(key=model_key)['models'][0]['output'], 'model')

    cmm = OutputObj(h2o.n0.compute_model_metrics(model=model_key, frame=parse_key,
                                                 timeoutSecs=60), 'cmm')
    # just check that it's something non-zero
    # assert cmm.cm['prediction_error']!=0.0

    mmResultShort = h2o.n0.model_metrics(model=model_key, frame=parse_key,
                                         timeoutSecs=60)['model_metrics'][0]
    del mmResultShort['frame']  # too much!
    mm = OutputObj(mmResultShort, 'mm')

    pr = OutputObj(h2o.n0.predict(model=model_key, frame=parse_key,
                                  timeoutSecs=60)['model_metrics'][0]['predictions'], 'pr')
def test_csv_download_libsvm(self):
    """Parse a synthetic dataset, download it back out as csv, re-parse the
    downloaded file, and assert both parses agree on cols/rows/sizes and
    missing-value counts.

    NOTE(review): SEED and write_syn_dataset are module-level names defined
    elsewhere in this file.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (5000, 10000, 'cK', 60),
        (10000, 10000, 'cL', 60),
        (50000, 10000, 'cM', 60),
    ]

    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    trial = 0
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        trial += 1
        csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEED)

        # first parse ("A"): the generated file
        start = time.time()
        parseResultA = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs)
        print "\nA Trial #", trial, "rowCount:", rowCount, "colCount:", colCount, "parse end on ", \
            csvFilename, 'took', time.time() - start, 'seconds'

        inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=timeoutSecs)
        missingValuesListA = h2o_cmd.infoFromInspect(inspect, csvPathname)
        num_colsA = inspect['num_cols']
        num_rowsA = inspect['num_rows']
        row_sizeA = inspect['row_size']
        value_size_bytesA = inspect['value_size_bytes']

        # do a little testing of saving the key as a csv
        csvDownloadPathname = SYNDATASETS_DIR + "/csvDownload.csv"
        print "\nStarting csv download to", csvDownloadPathname, "rowCount:", rowCount, "colCount:", colCount
        start = time.time()
        h2o.nodes[0].csv_download(src_key=hex_key, csvPathname=csvDownloadPathname)
        print "csv_download end.", 'took', time.time() - start, 'seconds. Originally from:', csvFilename

        # remove the original parsed key. source was already removed by h2o
        h2o.nodes[0].remove_key(hex_key)

        # second parse ("B"): the downloaded csv, reusing the same hex_key
        start = time.time()
        parseResultB = h2i.import_parse(path=csvDownloadPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs)
        print "\nB Trial #", trial, "rowCount:", rowCount, "colCount:", colCount, "parse end on ", \
            csvFilename, 'took', time.time() - start, 'seconds'
        inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=timeoutSecs)
        missingValuesListB = h2o_cmd.infoFromInspect(inspect, csvPathname)
        num_colsB = inspect['num_cols']
        num_rowsB = inspect['num_rows']
        row_sizeB = inspect['row_size']
        value_size_bytesB = inspect['value_size_bytes']

        # the round-trip (parse -> download -> parse) must preserve geometry
        self.assertEqual(missingValuesListA, missingValuesListB,
            "missingValuesList mismatches after re-parse of downloadCsv result")
        self.assertEqual(num_colsA, num_colsB,
            "num_cols mismatches after re-parse of downloadCsv result %d %d" % (num_colsA, num_colsB))
        self.assertEqual(num_rowsA, num_rowsB,
            "num_rows mismatches after re-parse of downloadCsv result %d %d" % (num_rowsA, num_rowsB))
        self.assertEqual(row_sizeA, row_sizeB,
            "row_size mismatches after re-parse of downloadCsv result %d %d" % (row_sizeA, row_sizeB))
        self.assertEqual(value_size_bytesA, value_size_bytesB,
            "value_size_bytes mismatches after re-parse of downloadCsv result %d %d" % (value_size_bytesA, value_size_bytesB))

        h2o.check_sandbox_for_errors()
def test_rf_parity_cmp(self): SYNDATASETS_DIR = h2o.make_syn_dir() # always match the run below! # just using one file for now for x in [50000]: shCmdString = "perl " + h2o.find_file( "syn_scripts/parity.pl") + " 128 4 " + str( x) + " quad " + SYNDATASETS_DIR h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4) csvFilename = "parity_128_4_" + str(x) + "_quad.data" def doBoth(): h2o.verboseprint("Trial", trial) start = time.time() # make sure ntrees and max_depth are the same for both rfView = h2o_cmd.runRF(parseResult=parseResult, ntrees=ntrees, max_depth=40, response=response, timeoutSecs=600, retryDelaySecs=3) elapsed1 = time.time() - start (totalError1, classErrorPctList1, totalScores2) = h2o_rf.simpleCheckRFView(rfv=rfView) rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult, ntrees=ntrees, max_depth=40, response=response, timeoutSecs=600, retryDelaySecs=3) elapsed2 = time.time() - start (totalError2, classErrorPctList2, totalScores2) = h2o_rf.simpleCheckRFView(rfv=rfView) print "Checking that results are similar (within 20%)" print "DRF2 then SpeeDRF" print "per-class variance is large..basically we can't check very well for this dataset" for i, (j, k) in enumerate(zip(classErrorPctList1, classErrorPctList2)): print "classErrorPctList[%s]:i %s %s" % (i, j, k) # self.assertAlmostEqual(classErrorPctList1[i], classErrorPctList2[i], # delta=1 * classErrorPctList2[i], msg="Comparing RF class %s errors for DRF2 and SpeeDRF" % i) print "totalError: %s %s" % (totalError1, totalError2) self.assertAlmostEqual( totalError1, totalError2, delta=.2 * totalError2, msg="Comparing RF total error for DRF2 and SpeeDRF") print "elapsed: %s %s" % (elapsed1, elapsed2) self.assertAlmostEqual( elapsed1, elapsed2, delta=.5 * elapsed2, msg="Comparing RF times for DRF2 and SpeeDRF") # always match the gen above! 
for trial in range(1): csvPathname = SYNDATASETS_DIR + '/' + csvFilename hex_key = csvFilename + "_" + str(trial) + ".hex" parseResult = h2o_cmd.parseResult = h2i.import_parse( path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) inspect = h2o_cmd.runInspect(key=hex_key) numCols = inspect['numCols'] numRows = inspect['numRows'] response = "C" + str(numCols) ntrees = 30 doBoth() print "*****************************" print "end # %s RF compare" % trial, print "*****************************" print "Now change all cols to enums" for e in range(numCols): enumResult = h2o.nodes[0].to_enum(src_key=hex_key, column_index=(e + 1)) doBoth() print "*********************************" print "end # %s RF compare, with enums #" % trial, print "*********************************"
def test_impute_with_na(self): h2b.browseTheCloud() csvFilename = 'covtype.data' csvPathname = 'standard/' + csvFilename hex_key = "covtype.hex" parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, schema='local', timeoutSecs=20) print "Just insert some NAs and see what happens" inspect = h2o_cmd.runInspect(key=hex_key) origNumRows = inspect['numRows'] origNumCols = inspect['numCols'] missing_fraction = 0.5 # NOT ALLOWED TO SET AN ENUM COL? if 1 == 0: # since insert missing values (below) doesn't insert NA into enum rows, make it NA with exec? # just one in row 1 for enumCol in enumColList: print "hack: Putting NA in row 0 of col %s" % enumCol execExpr = '%s[1, %s+1] = NA' % (hex_key, enumCol) h2e.exec_expr(execExpr=execExpr, timeoutSecs=10) inspect = h2o_cmd.runInspect(key=hex_key) missingValuesList = h2o_cmd.infoFromInspect(inspect) print "missingValuesList after exec:", missingValuesList if len(missingValuesList) != len(enumColList): raise Exception( "Didn't get missing values in expected number of cols: %s %s" % (enumColList, missingValuesList)) for trial in range(1): # copy the dataset hex_key2 = 'c.hex' execExpr = '%s = %s' % (hex_key2, hex_key) h2e.exec_expr(execExpr=execExpr, timeoutSecs=10) imvResult = h2o.nodes[0].insert_missing_values( key=hex_key2, missing_fraction=missing_fraction, seed=SEED) print "imvResult", h2o.dump_json(imvResult) # maybe make the output col a factor column # maybe one of the 0,1 cols too? # java.lang.IllegalArgumentException: Method `mode` only applicable to factor columns. # ugh. ToEnum2 and ToInt2 take 1-based column indexing. This should really change back to 0 based for h2o-dev? 
(like Exec3) print "Doing the ToEnum2 AFTER the NA injection, because h2o doesn't work right if we do it before" expectedMissing = missing_fraction * origNumRows # per col enumColList = [49, 50, 51, 52, 53, 54] for e in enumColList: enumResult = h2o.nodes[0].to_enum(src_key=hex_key2, column_index=(e + 1)) inspect = h2o_cmd.runInspect(key=hex_key2) numRows = inspect['numRows'] numCols = inspect['numCols'] self.assertEqual(origNumRows, numRows) self.assertEqual(origNumCols, numCols) missingValuesList = h2o_cmd.infoFromInspect(inspect) print "missingValuesList", missingValuesList # this is an approximation because we can't force an exact # of missing using insert_missing_values if len(missingValuesList) != numCols: raise Exception( "Why is missingValuesList not right afer ToEnum2?: %s %s" % (enumColList, missingValuesList)) for mv in missingValuesList: h2o_util.assertApproxEqual( mv, expectedMissing, rel=0.1 * mv, msg='mv %s is not approx. expected %s' % (mv, expectedMissing)) summaryResult = h2o_cmd.runSummary(key=hex_key2) h2o_cmd.infoFromSummary(summaryResult) # h2o_cmd.infoFromSummary(summaryResult) print "I don't understand why the values don't increase every iteration. It seems to stay stuck with the first effect" print "trial", trial print "expectedMissing:", expectedMissing print "Now get rid of all the missing values, by imputing means. We know all columns should have NAs from above" print "Do the columns in random order" # don't do the enum cols ..impute doesn't support right? 
if AVOID_BUG: shuffledColList = range(0, 49) # 0 to 48 execExpr = '%s = %s[,1:49]' % (hex_key2, hex_key2) h2e.exec_expr(execExpr=execExpr, timeoutSecs=10) # summaryResult = h2o_cmd.runSummary(key=hex_key2) # h2o_cmd.infoFromSummary(summaryResult) inspect = h2o_cmd.runInspect(key=hex_key2) numCols = inspect['numCols'] missingValuesList = h2o_cmd.infoFromInspect(inspect) print "missingValuesList after impute:", missingValuesList if len(missingValuesList) != 49: raise Exception( "expected missing values in all cols after pruning enum cols: %s" % missingValuesList) else: shuffledColList = range(0, 55) # 0 to 54 origInspect = inspect random.shuffle(shuffledColList) for column in shuffledColList: # get a random set of column. no duplicate. random order? 0 is okay? will be [] groupBy = random.sample(range(55), random.randint(0, 54)) # header names start with 1, not 0. Empty string if [] groupByNames = ",".join( map(lambda x: "C" + str(x + 1), groupBy)) # what happens if column and groupByNames overlap?? Do we loop here and choose until no overlap columnName = "C%s" % (column + 1) print "don't use mode if col isn't enum" badChoices = True while badChoices: method = random.choice(["mean", "median", "mode"]) badChoices = column not in enumColList and method == "mode" NEWSEED = random.randint(0, sys.maxint) print "does impute modify the source key?" # we get h2o error (argument exception) if no NAs impResult = h2o.nodes[0].impute(source=hex_key2, column=column, method=method) print "Now check that there are no missing values" print "FIX! 
broken..insert missing values doesn't insert NAs in enum cols" inspect = h2o_cmd.runInspect(key=hex_key2) numRows2 = inspect['numRows'] numCols2 = inspect['numCols'] self.assertEqual( numRows, numRows2, "imput shouldn't have changed frame numRows: %s %s" % (numRows, numRows2)) self.assertEqual( numCols, numCols2, "imput shouldn't have changed frame numCols: %s %s" % (numCols, numCols2)) # check that the mean didn't change for the col # the enum cols with mode, we'll have to think of something else missingValuesList = h2o_cmd.infoFromInspect(inspect) print "missingValuesList after impute:", missingValuesList if missingValuesList: raise Exception( "Not expecting any missing values after imputing all cols: %s" % missingValuesList) cols = inspect['cols'] origCols = origInspect['cols'] print "\nFIX! ignoring these errors. have to figure out why." for i, (c, oc) in enumerate(zip(cols, origCols)): # I suppose since we impute to either median or mean, we can't assume the mean stays the same # but for this tolerance it's okay (maybe a different dataset, that wouldn't be true ### h2o_util.assertApproxEqual(c['mean'], oc['mean'], tol=0.000000001, ### msg="col %i original mean: %s not equal to mean after impute: %s" % (i, c['mean'], oc['mean'])) if not h2o_util.approxEqual( oc['mean'], c['mean'], tol=0.000000001): msg = "col %i original mean: %s not equal to mean after impute: %s" % ( i, oc['mean'], c['mean']) print msg