def test_import_covtype_parse_loop(self):
    csvFilename = "covtype.data"
    importFolderPath = "/home/0xdiag/datasets/standard"
    trialMax = 2
    localhost = h2o.decide_if_localhost()
    for tryHeap in [4, 3, 2, 1]:
        print "\n", tryHeap, "GB heap, 1 jvm, import folder, then loop parsing 'covtype.data' to unique keys"
        if localhost:
            h2o.build_cloud(node_count=1, java_heap_GB=tryHeap)
        else:
            h2o_hosts.build_cloud_with_hosts(node_count=1, java_heap_GB=tryHeap)

        for trial in range(trialMax):
            # import each time, because h2o deletes the source file after parse
            h2i.setupImportFolder(None, importFolderPath)
            key2 = csvFilename + "_" + str(trial) + ".hex"
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
                key2=key2, timeoutSecs=20)

        # sticky ports?
        h2o.tear_down_cloud()
        time.sleep(2)
def test_rf_allyears2k_oobe(self):
    importFolderPath = '/home/0xdiag/datasets'
    csvFilename = 'allyears2k.csv'
    csvPathname = importFolderPath + "/" + csvFilename
    h2i.setupImportFolder(None, importFolderPath)
    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=60)
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    h2o_cmd.infoFromInspect(inspect, csvPathname)

    for trial in range(10):
        kwargs = paramDict
        timeoutSecs = 30 + kwargs['ntree'] * 2
        start = time.time()
        # randomize the node
        node = h2o.nodes[random.randint(0, len(h2o.nodes)-1)]
        rfView = h2o_cmd.runRFOnly(node=node, parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        classification_error = rfView['confusion_matrix']['classification_error']
        rows_skipped = rfView['confusion_matrix']['rows_skipped']
        mtry = rfView['mtry']
        mtry_nodes = rfView['mtry_nodes']
        print "mtry:", mtry
        print "mtry_nodes:", mtry_nodes
        self.assertEqual(classification_error, 0, "Should have zero oobe error")
        self.assertEqual(rows_skipped, 39, "Should have exactly 39 rows skipped")
        print "Trial #", trial, "completed"
def test_slice(self):
    importFolderPath = "/home/0xdiag/datasets/standard"
    h2o_import.setupImportFolder(None, importFolderPath)
    csvFilenameAll = [
        ("covtype.data", "cA", 5),
    ]
    ### csvFilenameList = random.sample(csvFilenameAll, 1)
    csvFilenameList = csvFilenameAll
    lenNodes = len(h2o.nodes)

    for (csvFilename, key2, timeoutSecs) in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2o_import.parseImportFolderFile(None, csvFilename, importFolderPath,
            key2=key2, timeoutSecs=2000)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename

        h2e.exec_zero_list(zeroList)
        # try the error case list
        # I suppose we should test that the expected error is correct.
        # Right now just make sure things don't blow up.
        h2e.exec_expr_list_rand(lenNodes, exprErrorCaseList, key2,
            maxCol=53, maxRow=400000, maxTrials=5,
            timeoutSecs=timeoutSecs, ignoreH2oError=True)
        # we use colX+1 so keep it to 53
        h2e.exec_expr_list_rand(lenNodes, exprList, key2,
            maxCol=53, maxRow=400000, maxTrials=100, timeoutSecs=timeoutSecs)
def test_B_kmeans_benign(self):
    importFolderPath = "/home/0xdiag/datasets/standard"
    csvFilename = "benign.csv"
    key2 = "benign.hex"
    csvPathname = importFolderPath + "/" + csvFilename
    h2i.setupImportFolder(None, importFolderPath)
    # FIX! key2 isn't working with Parse2? parseKey['destination_key'] not right?
    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
        key2=key2, header=1, timeoutSecs=180)
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    print "\nStarting", csvFilename

    expected = [
        ([24.538961038961038, 2.772727272727273, 46.89032467532467, 0.1266233766233766,
          12.012142857142857, 1.0105194805194804, 1.5222727272727272, 22.26039690646432,
          12.582467532467534, 0.5275062016635049, 2.9477601050634767, 162.52136363636365,
          41.94558441558441, 1.661883116883117], 77, 46889.32010560476),
        ([25.587719298245613, 2.2719298245614037, 45.64035087719298, 0.35964912280701755,
          13.026315789473685, 1.4298245614035088, 1.3070175438596492, 24.393307707470925,
          13.333333333333334, 0.5244431302976542, 2.7326039818647745, 122.46491228070175,
          40.973684210526315, 1.6754385964912282], 114, 64011.20272144667),
        ([30.833333333333332, 2.9166666666666665, 46.833333333333336, 0.0,
          13.083333333333334, 1.4166666666666667, 1.5833333333333333, 24.298220973782772,
          11.666666666666666, 0.37640449438202245, 3.404494382022472, 224.91666666666666,
          39.75, 1.4166666666666667], 12, 13000.485226507595),
    ]
    # all are multipliers of the expected tuple value
    allowedDelta = (0.01, 0.01, 0.01)
    # loop, to see if we get the same centers
    for trial in range(2):
        kwargs = {
            'k': 3, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'benign_k.hex',
            # reuse the same seed, to get deterministic results (otherwise it sometimes fails)
            'seed': 265211114317615310,
        }
        # for fvec only?
        kwargs.update({'max_iter': 50, 'max_iter2': 1, 'iterations': 5})
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans,
            csvPathname, parseKey, 'd', **kwargs)
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected,
            allowedDelta, trial=trial)
def test_exec_import_hosts_bigfiles(self):
    # just do the import folder once
    importFolderPath = "/home/0xdiag/datasets/standard"
    h2i.setupImportFolder(None, importFolderPath)
    timeoutSecs = 4000

    # "covtype169x.data",
    # "covtype.13x.shuffle.data",
    # "3G_poker_shuffle"
    # Update: need unique key names apparently. can't overwrite a prior parse output key?
    # replicating lines means they'll get reparsed. good! (but give new key names)
    csvFilenameList = [
        ("covtype.data", "c"),
        ("covtype20x.data", "c20"),
        ("covtype200x.data", "c200"),
        ("billion_rows.csv.gz", "b"),
    ]
    h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    for (csvFilename, key2) in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            key2=key2, timeoutSecs=2000)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename
        exec_list(exprList, lenNodes, csvFilename, key2)
def test_C_kmeans_prostate(self):
    importFolderPath = "/home/0xdiag/datasets/standard"
    csvFilename = "prostate.csv"
    key2 = "prostate.hex"
    csvPathname = importFolderPath + "/" + csvFilename
    h2i.setupImportFolder(None, importFolderPath)
    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
        key2=key2, header=1, timeoutSecs=180)
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    print "\nStarting", csvFilename

    # loop, to see if we get the same centers
    expected = [
        ([55.63235294117647], 68, 667.8088235294117),
        ([63.93984962406015], 133, 611.5187969924812),
        ([71.55307262569832], 179, 1474.2458100558654),
    ]
    # all are multipliers of the expected tuple value
    allowedDelta = (0.01, 0.01, 0.01)
    for trial in range(2):
        kwargs = {
            'k': 3, 'epsilon': 1e-6, 'cols': 2, 'destination_key': 'prostate_k.hex',
            # reuse the same seed, to get deterministic results (otherwise it sometimes fails)
            'seed': 265211114317615310,
        }
        # for fvec only?
        kwargs.update({'max_iter': 50, 'max_iter2': 1, 'iterations': 5})
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans,
            csvPathname, parseKey, 'd', **kwargs)
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected,
            allowedDelta, trial=trial)
def test_exec_import_hosts(self):
    importFolderPath = "/home/0xdiag/datasets/standard"
    h2o_import.setupImportFolder(None, importFolderPath)

    # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
    # so probably 10x that for covtype200
    csvFilenameAll = [
        ("covtype.data", "cA", 5),
    ]
    ### csvFilenameList = random.sample(csvFilenameAll, 1)
    csvFilenameList = csvFilenameAll
    lenNodes = len(h2o.nodes)

    for (csvFilename, key2, timeoutSecs) in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2o_import.parseImportFolderFile(None, csvFilename, importFolderPath,
            key2=key2, timeoutSecs=2000)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename

        h2e.exec_zero_list(zeroList)
        # we use colX+1 so keep it to 53
        h2e.exec_expr_list_rand(lenNodes, exprList, key2,
            maxCol=53, maxRow=400000, maxTrials=200, timeoutSecs=timeoutSecs)
def test_GLM_from_import_hosts(self):
    if localhost:
        csvFilenameList = ["covtype.data"]
    else:
        csvFilenameList = [
            "covtype200x.data",
            "covtype200x.data",
            "covtype.data",
            "covtype.data",
            "covtype20x.data",
            "covtype20x.data",
        ]

    # a browser window too, just because we can
    h2b.browseTheCloud()
    importFolderPath = "/home/0xdiag/datasets/standard"

    validations1 = {}
    coefficients1 = {}
    for csvFilename in csvFilenameList:
        # have to re-import each iteration now, since the source key
        # is removed and if we re-parse it, it's not there
        h2i.setupImportFolder(None, importFolderPath, timeoutSecs=60)
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000)
        print csvFilename, "parse time:", parseKey["response"]["time"]
        print "Parse result['destination_key']:", parseKey["destination_key"]

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseKey["destination_key"])
        print "\n" + csvFilename

        start = time.time()
        # can't pass lambda as a kwarg because it's a python reserved word
        # FIX! just look at X=0:1 for speed, for now
        kwargs = {"y": 54, "n_folds": 2, "family": "binomial", "case": 1}
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2000, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        h2o.verboseprint("\nglm:", glm)
        h2b.browseJsonHistoryAsUrlLastMatch("GLM")

        GLMModel = glm["GLMModel"]
        coefficients = GLMModel["coefficients"]
        validationsList = GLMModel["validations"]
        validations = validationsList.pop()
        # validations['err']

        if validations1:
            h2o_glm.compareToFirstGlm(self, "err", validations, validations1)
        else:
            validations1 = copy.deepcopy(validations)

        if coefficients1:
            h2o_glm.compareToFirstGlm(self, "0", coefficients, coefficients1)
        else:
            coefficients1 = copy.deepcopy(coefficients)

        sys.stdout.write(".")
        sys.stdout.flush()
def test_parse_covtype20x_loop(self):
    csvFilename = "covtype20x.data"
    importFolderPath = "/home/0xdiag/datasets"
    trialMax = 2
    for tryJvms in [1, 2, 3, 4]:
        for tryHeap in [1, 3]:
            print "\n", tryHeap, "GB heap,", tryJvms, "jvm per host, import folder,", \
                "then loop parsing 'covtype20x.data' to unique keys"
            h2o_hosts.build_cloud_with_hosts(node_count=tryJvms, java_heap_GB=tryHeap)
            timeoutSecs = 300

            for trial in range(trialMax):
                # since we delete the key, we have to re-import every iteration, to get it again
                h2i.setupImportFolder(None, importFolderPath)
                key2 = csvFilename + "_" + str(trial) + ".hex"
                start = time.time()
                parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
                    key2=key2, timeoutSecs=timeoutSecs, retryDelaySecs=4, pollTimeoutSecs=60)
                elapsed = time.time() - start
                print "Trial #", trial, "completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \
                    "Otherwise it would just parse the cached key."
                storeView = h2o.nodes[0].store_view()
                ### print "storeView:", h2o.dump_json(storeView)
                # h2o removes the key after parse now
                ## print "Removing", parseKey['source_key']
                ## removeKeyResult = h2o.nodes[0].remove_key(key=parseKey['source_key'])
                ### print "removeKeyResult:", h2o.dump_json(removeKeyResult)

            # sticky ports?
            h2o.tear_down_cloud()
            time.sleep(tryJvms * 5)
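# The elapsed-vs-timeout report above ("%d pct. of timeout") recurs in almost every
# test in this section. A small helper like the sketch below would factor out that
# pattern; it is hypothetical, not part of the original harness.
import time

def report_elapsed(label, start, timeoutSecs):
    # print how long a step took, and what fraction of its timeout budget it used
    elapsed = time.time() - start
    print label, "took", elapsed, "seconds.", \
        "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)
    return elapsed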
def test_GLM_from_import_hosts(self):
    if localhost:
        csvFilenameList = ['YearPredictionMSD.txt']
    else:
        csvFilenameList = ['YearPredictionMSD.txt']

    # a browser window too, just because we can
    h2b.browseTheCloud()
    importFolderPath = '/home/0xdiag/datasets'
    h2i.setupImportFolder(None, importFolderPath)

    validations1 = {}
    coefficients1 = {}
    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=120)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
        print "\n" + csvFilename

        start = time.time()
        # can't pass lambda as a kwarg because it's a python reserved word
        # FIX! just look at X=0:1 for speed, for now
        kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1}
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2000, **kwargs)

        # different when n-fold validation is used? No trainingErrorDetails?
        h2o.verboseprint("\nglm:", glm)
        h2b.browseJsonHistoryAsUrlLastMatch("GLM")

        GLMModel = glm['GLMModel']
        print "GLM time", GLMModel['time']

        coefficients = GLMModel['coefficients']
        validationsList = GLMModel['validations']
        validations = validationsList.pop()
        # validations['err']

        if validations1:
            h2o_glm.compareToFirstGlm(self, 'err', validations, validations1)
        else:
            validations1 = copy.deepcopy(validations)

        if coefficients1:
            h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1)
        else:
            coefficients1 = copy.deepcopy(coefficients)

        sys.stdout.write('.')
        sys.stdout.flush()
def test_B_importFolder_files(self):
    # just do the import folder once
    # importFolderPath = "/home/hduser/hdfs_datasets"
    importFolderPath = "/home/0xdiag/datasets"
    h2i.setupImportFolder(None, importFolderPath)
    timeoutSecs = 500

    # "covtype169x.data",
    # "covtype.13x.shuffle.data",
    # "3G_poker_shuffle"
    # "billion_rows.csv.gz",
    csvFilenameAll = [
        # quick test first
        "covtype.data",
        # then the real thing
        "billion_rows.csv.gz",
    ]
    # csvFilenameList = random.sample(csvFilenameAll, 1)
    csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    h2b.browseTheCloud()

    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            timeoutSecs=500, pollTimeoutSecs=60)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
        print "\n" + csvFilename
        start = time.time()

        # poker and the water.UDP.set3(UDP.java) fail issue..
        # constrain depth to 25
        # RF seems to get memory allocation errors on single machine (16GB dram)
        ### RFview = h2o_cmd.runRFOnly(trees=1, depth=5, parseKey=parseKey, timeoutSecs=timeoutSecs)
        ### h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        # now some GLM
        kwargs = {'x': 0, 'y': 1, 'num_cross_validation_folds': 0, 'case_mode': '=', 'case': 1}
        # one coefficient is checked a little more
        colX = 0

        # L2
        kwargs.update({'alpha': 0, 'lambda': 0})
        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs)

        sys.stdout.write('\n.')
        sys.stdout.flush()
def test_sum_import_hosts(self):
    # just do the import folder once
    # importFolderPath = "/home/hduser/hdfs_datasets"
    importFolderPath = "/home/0xdiag/datasets"

    # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
    # so probably 10x that for covtype200
    if localhost:
        csvFilenameAll = [
            ("covtype.data", "cA", 5, 1),
            ("covtype.data", "cB", 5, 1),
            ("covtype.data", "cC", 5, 1),
        ]
    else:
        csvFilenameAll = [
            ("covtype.data", "cA", 5, 1),
            ("covtype20x.data", "cD", 50, 20),
            ("covtype200x.data", "cE", 50, 200),
        ]

    ### csvFilenameList = random.sample(csvFilenameAll, 1)
    csvFilenameList = csvFilenameAll
    h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    firstDone = False
    for (csvFilename, key2, timeoutSecs, resultMult) in csvFilenameList:
        # have to import each time, because h2o deletes the source after parse
        h2i.setupImportFolder(None, importFolderPath)
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            key2=key2, timeoutSecs=2000)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename

        h2e.exec_zero_list(zeroList)
        colResultList = h2e.exec_expr_list_across_cols(lenNodes, exprList, key2,
            maxCol=54, timeoutSecs=timeoutSecs)
        print "\n*************"
        print "colResultList", colResultList
        print "*************"

        if not firstDone:
            colResultList0 = list(colResultList)
            good = [float(x) for x in colResultList0]
            firstDone = True
        else:
            print "\n", colResultList0, "\n", colResultList
            # create the expected answer...i.e. N * first
            compare = [float(x) / resultMult for x in colResultList]
            print "\n", good, "\n", compare
            self.assertEqual(good, compare,
                'compare is not equal to good (first try * resultMult)')
def test_from_import(self):
    # just do the import folder once
    # importFolderPath = "/home/hduser/hdfs_datasets"
    importFolderPath = '/home/0xdiag/datasets'
    h2i.setupImportFolder(None, importFolderPath)
    timeoutSecs = 500

    # "covtype169x.data",
    # "covtype.13x.shuffle.data",
    # "3G_poker_shuffle"
    # "covtype20x.data",
    # "billion_rows.csv.gz",
    csvFilenameAll = [
        "covtype.data",
        "covtype20x.data",
        # "covtype200x.data",
        # "100million_rows.csv",
        # "200million_rows.csv",
        # "a5m.csv",
        # "a10m.csv",
        # "a100m.csv",
        # "a200m.csv",
        # "a400m.csv",
        # "a600m.csv",
        # "billion_rows.csv.gz",
        # "new-poker-hand.full.311M.txt.gz",
    ]
    # csvFilenameList = random.sample(csvFilenameAll, 1)
    csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    h2b.browseTheCloud()

    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=500)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
        print "\n" + csvFilename

        start = time.time()
        # poker and the water.UDP.set3(UDP.java) fail issue..
        # constrain depth to 25
        RFview = h2o_cmd.runRFOnly(trees=1, depth=25, parseKey=parseKey, timeoutSecs=timeoutSecs)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")
        # wait in case it recomputes it
        time.sleep(10)
        sys.stdout.write('.')
        sys.stdout.flush()
def test_KMeans_params_rand2(self):
    SEED = random.randint(0, sys.maxint)
    # if you have to force a redo of a test
    # SEED = random.seed(SEED)
    print "\nUsing random seed:", SEED

    if localhost:
        csvFilenameList = [
            # ('covtype.data', 60),
            ('covtype20x.data', 400),
        ]
    else:
        csvFilenameList = [
            ('covtype20x.data', 400),
            ('covtype200x.data', 2000),
        ]

    importFolderPath = '/home/0xdiag/datasets/standard'
    h2i.setupImportFolder(None, importFolderPath)
    for csvFilename, timeoutSecs in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            timeoutSecs=2000, pollTimeoutSecs=60)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        csvPathname = importFolderPath + "/" + csvFilename
        print "\n" + csvPathname, \
            " num_rows:", "{:,}".format(inspect['num_rows']), \
            " num_cols:", "{:,}".format(inspect['num_cols'])

        paramDict = define_params()
        for trial in range(3):
            randomV = paramDict['k']
            k = random.choice(randomV)
            randomV = paramDict['epsilon']
            epsilon = random.choice(randomV)
            randomV = paramDict['cols']
            cols = random.choice(randomV)

            kwargs = {'k': k, 'epsilon': epsilon, 'cols': cols,
                'destination_key': csvFilename + "_" + str(trial) + '.hex'}
            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey,
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            ### print h2o.dump_json(kmeans)
            inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key'])
            print h2o.dump_json(inspect)
            print "Trial #", trial, "completed\n"
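# define_params() isn't shown in this section. From the way the test above indexes
# its result ('k', 'epsilon', 'cols' each mapping to a list of choices for
# random.choice), it plausibly looks like the sketch below; the concrete values
# are assumptions, not the original helper.
def define_params():
    paramDict = {
        'k': [2, 5, 7],               # cluster counts to sample from
        'epsilon': [1e-4, 1e-6],      # convergence tolerances
        'cols': [None, '0,1,2,3'],    # column subsets; None means all columns
    }
    return paramDict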
def test_rf_covtype_fvec(self):
    importFolderPath = "/home/0xdiag/datasets/standard"
    csvFilename = 'covtype.data'
    csvPathname = importFolderPath + "/" + csvFilename
    key2 = csvFilename + ".hex"
    h2i.setupImportFolder(None, importFolderPath)
    print "\nUsing header=0 on the normal covtype.data"
    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
        key2=key2, header=0, timeoutSecs=180)
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

    rfViewInitial = []
    for jobDispatch in range(1):
        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        kwargs = paramDict.copy()
        timeoutSecs = 30 + kwargs['ntree'] * 20
        start = time.time()
        # do oobe
        kwargs['out_of_bag_error_estimate'] = 1
        kwargs['model_key'] = "model_" + str(jobDispatch)
        # don't poll for fvec
        rfResult = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs,
            noPoll=True, rfView=False, **kwargs)
        elapsed = time.time() - start
        print "RF dispatch end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        print h2o.dump_json(rfResult)
        # FIX! are these already in there?
        rfView = {}
        rfView['data_key'] = key2
        rfView['model_key'] = kwargs['model_key']
        rfView['ntree'] = kwargs['ntree']
        rfViewInitial.append(rfView)

        print "rf job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds'
        print "\njobDispatch #", jobDispatch

    h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=180, pollTimeoutSecs=120, retryDelaySecs=5)

    # We saved the initial response. If we poll again, the jobs should be done now;
    # better to get the result that way rather than via inspect (to match what
    # simpleCheckGLM expects).
    print "rfViewInitial", rfViewInitial
    for rfView in rfViewInitial:
        print "Checking completed job:", rfView
        print "rfView", h2o.dump_json(rfView)
        data_key = rfView['data_key']
        model_key = rfView['model_key']
        ntree = rfView['ntree']
        # allow it to poll to complete
        rfViewResult = h2o_cmd.runRFView(None, data_key, model_key, ntree=ntree,
            timeoutSecs=60, noPoll=False)
def test_from_import(self):
    # just do the import folder once
    # importFolderPath = "/home/hduser/hdfs_datasets"
    importFolderPath = '/home/0xdiag/datasets'
    timeoutSecs = 500

    # "covtype169x.data",
    # "covtype.13x.shuffle.data",
    # "3G_poker_shuffle"
    # "covtype20x.data",
    # "billion_rows.csv.gz",
    csvFilenameAll = [
        "covtype.data",
        "covtype20x.data",
        # "covtype200x.data",
        # "100million_rows.csv",
        # "200million_rows.csv",
        # "a5m.csv",
        # "a10m.csv",
        # "a100m.csv",
        # "a200m.csv",
        # "a400m.csv",
        # "a600m.csv",
        # "billion_rows.csv.gz",
        # "new-poker-hand.full.311M.txt.gz",
    ]
    # csvFilenameList = random.sample(csvFilenameAll, 1)
    csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    ### h2b.browseTheCloud()

    for trial in range(3):
        for csvFilename in csvFilenameList:
            h2i.setupImportFolder(None, importFolderPath)
            # creates csvFilename.hex from file in importFolder dir
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=500)
            elapsed = time.time() - start
            print csvFilename, "parsed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs), "\n"
            print csvFilename, 'H2O reports parse time:', parseKey['response']['time']
            # h2o doesn't produce this, but h2o_import.py adds it for us.
            print "Parse result['source_key']:", parseKey['source_key']
            print "Parse result['destination_key']:", parseKey['destination_key']
            print "\n" + csvFilename

            storeView = h2o.nodes[0].store_view()
            ### print "storeView:", h2o.dump_json(storeView)
            # h2o deletes the key after parse now
            ## print "Removing", parseKey['source_key'], "so we can re-import it"
            ## removeKeyResult = h2o.nodes[0].remove_key(key=parseKey['source_key'])
            ## print "removeKeyResult:", h2o.dump_json(removeKeyResult)

        print "\nTrial", trial, "completed\n"
def test_import_multi_syn_datasets(self):
    # just do the import folder once
    # importFolderPath = "/home/hduser/hdfs_datasets"
    importFolderPath = '/home/0xdiag/datasets'
    print "This imports a folder of csv files, i.e. points to syn_datasets with no regex"
    print "Doesn't put anything in syn_datasets. When run with import folder redirected"
    print "to import S3, there is a syn_datasets with 100 files"
    print "FIX! When run locally, I should have some multi-files in", importFolderPath, "/syn_datasets?"
    timeoutSecs = 500

    if h2o.nodes[0].redirect_import_folder_to_s3_path:
        csvFilenameAll = [
            # FIX! just a folder doesn't appear to work. add a regex
            # need a destination_key...h2o seems to use the regex if I don't provide one
            ### "syn_datasets/*",
            "syn_datasets/*_10000x200*",
        ]
    else:
        csvFilenameAll = [
            # FIX! just a folder doesn't appear to work. add a regex
            # need a destination_key...h2o seems to use the regex if I don't provide one
            ### "syn_datasets/*",
            "syn_datasets/*",
        ]
    # csvFilenameList = random.sample(csvFilenameAll, 1)
    csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    ### h2b.browseTheCloud()

    for csvFilename in csvFilenameList:
        # have to import each time, because h2o deletes the source after parse
        h2i.setupImportFolder(None, importFolderPath)
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            key2="syn_datasets.hex", timeoutSecs=500)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        csvPathname = importFolderPath + "/" + csvFilename
        print "\n" + csvPathname, \
            "from all files num_rows:", "{:,}".format(inspect['num_rows']), \
            "num_cols:", "{:,}".format(inspect['num_cols'])
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        kwargs = {'sample': 75, 'depth': 25, 'ntree': 1}
        start = time.time()
        RFview = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        # so we can see!
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")
        time.sleep(5)
def test_rf_kddcup_1999(self):
    # since we'll be waiting, pop a browser
    h2b.browseTheCloud()
    importFolderPath = '/home/0xdiag/datasets/standard'
    h2i.setupImportFolder(None, importFolderPath)
    csvFilename = 'kddcup_1999.data.gz'

    print "Want to see that I get similar results when using H2O RF defaults (no params to json)" + \
        " compared to running with the parameters specified and matching the browser RF query defaults." + \
        " Also run the param for full scoring vs OOBE scoring."

    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=300)
    print csvFilename, 'parse time:', parseKey['response']['time']
    print "Parse result['destination_key']:", parseKey['destination_key']
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

    for trials in range(4):
        print "\n" + csvFilename, "Trial #", trials
        start = time.time()
        kwargs = {
            'response_variable': 'classifier',
            'ntree': 200,
            'gini': 1,
            'class_weights': None,
            'stratify': 0,
            # 'features': None,
            'features': 7,
            'ignore': None,
            'sample': 67,
            'bin_limit': 1024,
            'depth': 2147483647,
            'seed': 784834182943470027,
            'parallel': 1,
            'exclusive_split_limit': None,
        }
        if trials == 0:
            kwargs = {}
        elif trials == 1:
            kwargs['out_of_bag_error_estimate'] = None
        elif trials == 2:
            kwargs['out_of_bag_error_estimate'] = 0
        elif trials == 3:
            kwargs['out_of_bag_error_estimate'] = 1

        start = time.time()
        RFview = h2o_cmd.runRFOnly(trees=50, parseKey=parseKey,
            timeoutSecs=300, retryDelaySecs=1.0, **kwargs)
        print "RF end on ", csvFilename, 'took', time.time() - start, 'seconds'
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")
def test_KMeans_winesPCA(self):
    if localhost:
        csvFilenameList = [
            # with winesPCA2.csv specify cols = "1,2"
            ('winesPCA.csv', 480, 'cA'),
        ]
    else:
        # None is okay for key2
        csvFilenameList = [
            ('winesPCA.csv', 480, 'cA'),
            # ('covtype200x.data', 1000, 'cE'),
        ]

    importFolderPath = os.path.abspath(h2o.find_file('smalldata'))
    h2i.setupImportFolder(None, importFolderPath)
    for csvFilename, timeoutSecs, key2 in csvFilenameList:
        csvPathname = importFolderPath + "/" + csvFilename
        # creates csvFilename.hex from file in importFolder dir
        start = time.time()
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            timeoutSecs=2000, key2=key2)  # noise=('JStack', None)
        print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o.check_sandbox_for_errors()
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvPathname, \
            " num_rows:", "{:,}".format(inspect['num_rows']), \
            " num_cols:", "{:,}".format(inspect['num_cols'])

        kwargs = {
            # appears not to take 'cols'?
            'cols': None,
            'epsilon': 1e-6,
            'k': 3,
        }
        start = time.time()
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey,
            timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

        centers = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
        print "Expected centers: [-2.276318, -0.965151], with 59 rows."
        print "                  [0.0388763, 1.63886039], with 71 rows."
        print "                  [2.740469, -1.237816], with 48 rows."
        model_key = kmeans['destination_key']
        kmeansScoreResult = h2o.nodes[0].kmeans_score(
            key=parseKey['destination_key'], model_key=model_key)
        score = kmeansScoreResult['score']
def test_GLM_covtype20x(self):
    if localhost:
        csvFilenameList = [
            # 68 secs on my laptop?
            ('covtype20x.data', 480, 'cA'),
        ]
    else:
        # None is okay for key2
        csvFilenameList = [
            ('covtype20x.data', 480, 'cA'),
            # ('covtype200x.data', 1000, 'cE'),
        ]

    # a browser window too, just because we can
    h2b.browseTheCloud()
    importFolderPath = '/home/0xdiag/datasets'
    h2i.setupImportFolder(None, importFolderPath)
    for csvFilename, timeoutSecs, key2 in csvFilenameList:
        csvPathname = importFolderPath + "/" + csvFilename
        # creates csvFilename.hex from file in importFolder dir
        start = time.time()
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            timeoutSecs=2000, key2=key2, noise=('JStack', None))
        print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o.check_sandbox_for_errors()
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvPathname, \
            " num_rows:", "{:,}".format(inspect['num_rows']), \
            " num_cols:", "{:,}".format(inspect['num_cols'])

        kwargs = {
            'cols': None,
            'epsilon': 1e-4,
            'k': 2,
        }
        start = time.time()
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey,
            timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
        ### print h2o.dump_json(kmeans)
        inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key'])
        print h2o.dump_json(inspect)
def test_B_importFolder_files(self):
    # just do the import folder once
    importFolderPath = "/home/0xdiag/datasets/standard"
    h2i.setupImportFolder(None, importFolderPath)
    timeoutSecs = 1500

    csvFilenameAll = [
        # quick test first
        "covtype.data",
        # then the real thing
        "billion_rows.csv.gz",
    ]
    # csvFilenameList = random.sample(csvFilenameAll, 1)
    csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    ### h2b.browseTheCloud()

    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        start = time.time()
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            timeoutSecs=500, pollTimeoutSecs=60)
        elapsed = time.time() - start
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']
        print csvFilename, "completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
        print "\n" + csvFilename

        kwargs = {'x': 0, 'y': 1, 'n_folds': 0, 'case_mode': '=', 'case': 1}
        # one coefficient is checked a little more
        colX = 0

        # L2
        kwargs.update({'alpha': 0, 'lambda': 0})
        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs)

        sys.stdout.write('\n.')
        sys.stdout.flush()
def test_KMeans_params_rand2(self):
    if localhost:
        csvFilenameList = [
            # ('covtype.data', 60),
            ('covtype.data', 800),
        ]
    else:
        csvFilenameList = [
            ('covtype.data', 800),
        ]

    importFolderPath = '/home/0xdiag/datasets/standard'
    h2i.setupImportFolder(None, importFolderPath)
    for csvFilename, timeoutSecs in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            timeoutSecs=2000, pollTimeoutSecs=60)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        csvPathname = importFolderPath + "/" + csvFilename
        print "\n" + csvPathname, \
            " num_rows:", "{:,}".format(inspect['num_rows']), \
            " num_cols:", "{:,}".format(inspect['num_cols'])

        paramDict = define_params(SEED)
        for trial in range(3):
            # default
            params = {
                'k': 1,
                # 'destination_key': csvFilename + "_" + str(trial) + '.hex'
            }
            h2o_kmeans.pickRandKMeansParams(paramDict, params)
            kwargs = params.copy()

            start = time.time()
            kmeans = h2o_cmd.runKMeansGridOnly(parseKey=parseKey,
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans grid end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            ### print h2o.dump_json(kmeans)
            inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key'])
            print h2o.dump_json(inspect)
            print "Trial #", trial, "completed\n"
def test_KMeans_covtype20x(self):
    if localhost:
        csvFilenameList = [
            # 68 secs on my laptop?
            ('covtype20x.data', 480, 'cA'),
        ]
    else:
        # None is okay for key2
        csvFilenameList = [
            ('covtype20x.data', 480, 'cA'),
            # ('covtype200x.data', 1000, 'cE'),
        ]

    importFolderPath = '/home/0xdiag/datasets/standard'
    h2i.setupImportFolder(None, importFolderPath)
    for csvFilename, timeoutSecs, key2 in csvFilenameList:
        csvPathname = importFolderPath + "/" + csvFilename
        # creates csvFilename.hex from file in importFolder dir
        start = time.time()
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            timeoutSecs=2000, key2=key2)  # noise=('JStack', None)
        print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o.check_sandbox_for_errors()
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvPathname, \
            " num_rows:", "{:,}".format(inspect['num_rows']), \
            " num_cols:", "{:,}".format(inspect['num_cols'])

        kwargs = {
            'cols': None,
            'epsilon': 1e-4,
            'k': 2,
            # reuse the same seed, to get deterministic results (otherwise it sometimes fails)
            'seed': 265211114317615310,
        }
        start = time.time()
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey,
            timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans,
            csvPathname, parseKey, 'd', **kwargs)
def test_RF_poker_311M(self):
    # since we'll be waiting, pop a browser
    h2b.browseTheCloud()
    importFolderPath = '/home/0xdiag/datasets'
    h2i.setupImportFolder(None, importFolderPath)
    csvFilename = 'new-poker-hand.full.311M.txt.gz'

    for trials in range(2):
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=500)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename

        start = time.time()
        RFview = h2o_cmd.runRFOnly(trees=5, depth=5, parseKey=parseKey,
            timeoutSecs=600, retryDelaySecs=10.0)
        print "RF end on ", csvFilename, 'took', time.time() - start, 'seconds'
def test_exec_import_hosts(self):
    # just do the import folder once
    # importFolderPath = "/home/hduser/hdfs_datasets"
    importFolderPath = "/home/0xdiag/datasets"
    h2i.setupImportFolder(None, importFolderPath)

    # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
    # so probably 10x that for covtype200
    if localhost:
        maxTrials = 200
        csvFilenameAll = [
            ("covtype.data", "cA", 15),
        ]
    else:
        maxTrials = 20
        csvFilenameAll = [
            ("covtype.data", "cB", 15),
            ("covtype20x.data", "cD", 60),
        ]

    ### csvFilenameList = random.sample(csvFilenameAll, 1)
    csvFilenameList = csvFilenameAll
    h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    cnum = 0
    for (csvFilename, key2, timeoutSecs) in csvFilenameList:
        cnum += 1
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            key2=key2, timeoutSecs=2000)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename

        h2e.exec_zero_list(zeroList)
        # we use colX+1 so keep it to 53
        # we use factor in this test...so the timeout has to be bigger!
        h2e.exec_expr_list_rand(lenNodes, exprList, key2,
            maxCol=53, maxRow=400000, maxTrials=maxTrials, timeoutSecs=timeoutSecs)
def parseFile(self, s3bucket, localbucket, pathname, timeoutSecs, header, **kwargs):
    if USE_LOCAL:
        # this can get redirected to s3/s3n by jenkins
        (importFolderPath, csvFilename) = os.path.split("/" + localbucket + pathname)
        h2i.setupImportFolder(None, importFolderPath)
        start = time.time()
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=180)
    else:
        schema = "s3n://"
        bucket = s3bucket
        URI = schema + bucket + pathname
        importResult = h2o.nodes[0].import_hdfs(URI)
        start = time.time()
        parseKey = h2o.nodes[0].parse("*" + pathname, timeoutSecs=timeoutSecs, header=header)

    parse_time = time.time() - start
    h2o.verboseprint("py-S3 parse took {0} sec".format(parse_time))
    parseKey['python_call_timer'] = parse_time
    return parseKey
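# Hedged usage sketch for parseFile above: the bucket names and pathname are
# hypothetical, and USE_LOCAL is assumed to be a module-level flag set by the
# test harness before this runs.
# parseKey = self.parseFile(
#     s3bucket='h2o-datasets',              # hypothetical S3 bucket
#     localbucket='home/0xdiag/datasets',   # hypothetical local mirror of the bucket
#     pathname='/standard/covtype.data',
#     timeoutSecs=300,
#     header=0)
# print "parse took", parseKey['python_call_timer'], "seconds"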
def test_short(self):
    csvFilename = 'part-00000b'
    ### csvFilename = 'short'
    importFolderPath = '/home/hduser/data'
    importFolderResult = h2i.setupImportFolder(None, importFolderPath)
    csvPathname = importFolderPath + "/" + csvFilename

    # FIX! does 'separator=' take ints or ?? hex format?
    # looks like it takes the hex string (two chars)
    start = time.time()
    # hardwire TAB as a separator, as opposed to white space (9)
    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
        timeoutSecs=500, separator=9)
    print "Parse of", parseKey['destination_key'], "took", time.time() - start, "seconds"
    print csvFilename, 'parse time:', parseKey['response']['time']
    print "Parse result['destination_key']:", parseKey['destination_key']

    start = time.time()
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=500)
    print "Inspect:", parseKey['destination_key'], "took", time.time() - start, "seconds"
    h2o_cmd.infoFromInspect(inspect, csvPathname)
    # num_rows = inspect['num_rows']
    # num_cols = inspect['num_cols']

    keepPattern = "oly_|mt_|b_"
    y = "is_purchase"
    print "y:", y
    # don't need the intermediate Dicts produced from columnInfoFromInspect
    x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern,
        key=parseKey['destination_key'], timeoutSecs=300)
    print "x:", x

    kwargs = {
        'x': x,
        'y': y,
        # 'case_mode': '>',
        # 'case': 0,
        'family': 'binomial',
        'lambda': 1.0E-5,
        'alpha': 0.5,
        'max_iter': 5,
        'thresholds': 0.5,
        'n_folds': 1,
        'weight': 100,
        'beta_epsilon': 1.0E-4,
    }

    timeoutSecs = 1800
    start = time.time()
    glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs,
        pollTimeoutSecs=60, **kwargs)
    elapsed = time.time() - start
    print "glm completed in", elapsed, "seconds.", \
        "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
    h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
def test_vector_filter_factor(self):
    # just do the import folder once
    # importFolderPath = "/home/hduser/hdfs_datasets"
    importFolderPath = "/home/0xdiag/datasets"

    # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
    # so probably 10x that for covtype200
    if localhost:
        maxTrials = 200
        csvFilenameAll = [
            ("covtype.data", "cA", 5),
            ("covtype.data", "cB", 5),
        ]
    else:
        maxTrials = 20
        csvFilenameAll = [
            ("covtype.data", "cA", 5),
            ("covtype20x.data", "cC", 50),
        ]

    ### csvFilenameList = random.sample(csvFilenameAll, 1)
    csvFilenameList = csvFilenameAll
    h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    for (csvFilename, key2, timeoutSecs) in csvFilenameList:
        # have to import each time, because h2o deletes the source file after parse
        h2i.setupImportFolder(None, importFolderPath)
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            key2=key2, timeoutSecs=2000)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename

        h2e.exec_zero_list(zeroList)
        # does n+1 so use maxCol 53
        h2e.exec_expr_list_rand(lenNodes, exprList, key2,
            maxCol=53, maxRow=400000, maxTrials=maxTrials, timeoutSecs=timeoutSecs)
def test_import_covtype_parse_loop(self):
    csvFilename = "covtype.data"
    importFolderPath = "/home/0xdiag/datasets"
    trialMax = 2
    for tryHeap in [4, 3, 2, 1]:
        print "\n", tryHeap, "GB heap, 2 jvms, import folder, then loop parsing 'covtype.data' to unique keys"
        localhost = h2o.decide_if_localhost()
        if localhost:
            h2o.build_cloud(2, java_heap_GB=tryHeap)
        else:
            h2o_hosts.build_cloud_with_hosts(node_count=2, java_heap_GB=tryHeap)

        h2i.setupImportFolder(None, importFolderPath)
        for trial in range(trialMax):
            key2 = csvFilename + "_" + str(trial) + ".hex"
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
                key2=key2, timeoutSecs=20)

        # sticky ports?
        h2o.tear_down_cloud()
        print "Waiting 60 secs for TIME_WAIT sockets to go away"
        time.sleep(60)
def test_from_import(self):
    importFolderPath = "/home/0xdiag/datasets/standard"
    h2i.setupImportFolder(None, importFolderPath)
    timeoutSecs = 500

    csvFilenameAll = [
        "covtype.data",
        "covtype20x.data",
        # "covtype200x.data",
        # "100million_rows.csv",
        # "200million_rows.csv",
        # "a5m.csv",
        # "a10m.csv",
        # "a100m.csv",
        # "a200m.csv",
        # "a400m.csv",
        # "a600m.csv",
        # "billion_rows.csv.gz",
        # "new-poker-hand.full.311M.txt.gz",
    ]
    # csvFilenameList = random.sample(csvFilenameAll, 1)
    csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    h2b.browseTheCloud()

    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=500)
        print csvFilename, "parse time:", parseKey["response"]["time"]
        print "Parse result['destination_key']:", parseKey["destination_key"]

        inspect = h2o_cmd.runInspect(key=parseKey["destination_key"])
        print "\n" + csvFilename

        start = time.time()
        RFview = h2o_cmd.runRFOnly(trees=1, depth=25, parseKey=parseKey, timeoutSecs=timeoutSecs)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")
        time.sleep(10)
        sys.stdout.write(".")
        sys.stdout.flush()
def test_GLM_covtype_train(self):
    print "\nMichal will hate me for another file needed: covtype.shuffled.data"
    importFolderPath = "/home/0xdiag/datasets"
    csvFilename = 'covtype.shuffled.data'
    csvPathname = importFolderPath + "/" + csvFilename
    key2 = csvFilename + ".hex"
    h2i.setupImportFolder(None, importFolderPath)
    print "\nUsing header=0 on the normal covtype.data"
    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
        key2=key2, header=0, timeoutSecs=180)

    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    print "\n" + csvPathname, \
        " num_rows:", "{:,}".format(inspect['num_rows']), \
        " num_cols:", "{:,}".format(inspect['num_cols'])

    # how many rows for each pct?
    num_rows = inspect['num_rows']
    pct10 = int(num_rows * .1)
    rowsForPct = [i * pct10 for i in range(0, 11)]
    # this can be slightly less than 10%
    last10 = num_rows - rowsForPct[9]
    rowsForPct[10] = num_rows  # last bucket covers all rows
    # use mod below for picking "rows-to-do" in case we do more than 9 trials
    # use 10 if 0, just to see (we copied 10 to 0 above)
    rowsForPct[0] = rowsForPct[10]

    print "Creating the key of the last 10% data, for scoring"
    dataKeyTest = "rTest"
    # start at 90% rows + 1
    execExpr = dataKeyTest + " = slice(" + key2 + "," + str(rowsForPct[9] + 1) + ")"
    h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTest, timeoutSecs=10)

    kwargs = {
        'y': 54,
        'max_iter': 20,
        'n_folds': 0,
        'thresholds': 0.5,
        'alpha': 0.1,
        'lambda': 1e-5,
        'family': 'binomial',
        'case_mode': '=',
        'case': 2
    }
    timeoutSecs = 60

    for trial in range(10):
        # always slice from the beginning
        rowsToUse = rowsForPct[trial % 10]
        resultKey = "r" + str(trial)
        execExpr = resultKey + " = slice(" + key2 + ",1," + str(rowsToUse) + ")"
        h2o_exec.exec_expr(None, execExpr, resultKey=resultKey, timeoutSecs=10)
        parseKey['destination_key'] = resultKey

        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
            timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
        print "glm end on ", parseKey['destination_key'], 'took', time.time() - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

        GLMModel = glm['GLMModel']
        modelKey = GLMModel['model_key']

        start = time.time()
        glmScore = h2o_cmd.runGLMScore(key=dataKeyTest, model_key=modelKey,
            thresholds="0.5", timeoutSecs=timeoutSecs)
        print "glmScore end on ", dataKeyTest, 'took', time.time() - start, 'seconds'
        ### print h2o.dump_json(glmScore)
        classErr = glmScore['validation']['classErr']
        auc = glmScore['validation']['auc']
        err = glmScore['validation']['err']
        print "classErr:", classErr
        print "err:", err
        print "auc:", auc

        print "Trial #", trial, "completed", \
            "using %6.2f" % (rowsToUse * 100.0 / num_rows), "pct. of all rows"
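# Worked example of the rowsForPct bucketing used above, with a hypothetical row
# count; pure arithmetic, no H2O needed. Integer truncation makes each step
# slightly under a true 10%, which is why the last entry is patched to cover all
# rows and entry 0 is aliased to it.
num_rows = 1005                                  # hypothetical row count
pct10 = int(num_rows * .1)                       # 100, slightly under a true 10%
rowsForPct = [i * pct10 for i in range(0, 11)]   # [0, 100, 200, ..., 1000]
rowsForPct[10] = num_rows                        # last bucket covers all rows
rowsForPct[0] = rowsForPct[10]                   # trial % 10 == 0 -> use all rows
print rowsForPct                                 # [1005, 100, 200, ..., 900, 1005]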
def rf_covtype_train_oobe(self, csvFilename, checkExpectedResults=True):
    # the expected results are only for the shuffled version,
    # since getting 10% samples etc. of the smallish dataset will vary between
    # shuffled and non-shuffled datasets
    importFolderPath = "/home/0xdiag/datasets/standard"
    csvPathname = importFolderPath + "/" + csvFilename
    key2 = csvFilename + ".hex"
    h2i.setupImportFolder(None, importFolderPath)
    print "\nUsing header=0 on", csvFilename
    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
        key2=key2, header=0, timeoutSecs=180)

    inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
    print "\n" + csvPathname, \
        " num_rows:", "{:,}".format(inspect['num_rows']), \
        " num_cols:", "{:,}".format(inspect['num_cols'])

    # how many rows for each pct?
    num_rows = inspect['num_rows']
    pct10 = int(num_rows * .1)
    rowsForPct = [i * pct10 for i in range(0, 11)]
    # this can be slightly less than 10%
    last10 = num_rows - rowsForPct[9]
    rowsForPct[10] = num_rows
    # use mod below for picking "rows-to-do" in case we do more than 9 trials
    # use 10 if 0, just to see (we copied 10 to 0 above)
    rowsForPct[0] = rowsForPct[10]

    # 0 isn't used
    expectTrainPctRightList = [0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79]
    expectScorePctRightList = [0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78]

    print "Creating the key of the last 10% data, for scoring"
    dataKeyTest = "rTest"
    # start at 90% rows + 1
    execExpr = dataKeyTest + " = slice(" + key2 + "," + str(rowsForPct[9]+1) + ")"
    h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTest, timeoutSecs=10)

    # keep the 0 entry empty
    actualTrainPctRightList = [0]
    actualScorePctRightList = [0]

    # don't use the smaller samples..bad error rates, plus for sorted covtype,
    # you can get just one class!
    for trial in range(8, 9):
        # always slice from the beginning
        rowsToUse = rowsForPct[trial % 10]
        resultKey = "r_" + csvFilename + "_" + str(trial)
        execExpr = resultKey + " = slice(" + key2 + ",1," + str(rowsToUse) + ")"
        h2o_exec.exec_expr(None, execExpr, resultKey=resultKey, timeoutSecs=10)
        # hack so the RF will use the sliced result
        # FIX! don't use the sliced bit..use the whole data for rf training below
        ### parseKey['destination_key'] = resultKey

        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        kwargs = paramDict.copy()
        timeoutSecs = 30 + kwargs['ntree'] * 20
        # do oobe
        kwargs['out_of_bag_error_estimate'] = 1
        kwargs['model_key'] = "model_" + csvFilename + "_" + str(trial)
        # kwargs['model_key'] = "model"

        # double check the rows/cols
        inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
        h2o_cmd.infoFromInspect(inspect, "going into RF")

        start = time.time()
        rfv = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        oobeTrainPctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
        if checkExpectedResults:
            self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
                msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f" % \
                ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=ALLOWED_DELTA)
        actualTrainPctRightList.append(oobeTrainPctRight)

        print "Now score on the last 10%. Note this is silly if we trained on 100% of the data"
        print "Or sorted by output class, so that the last 10% is the last few classes"

        # pop the stuff from kwargs that we were passing as params
        model_key = rfv['model_key']
        kwargs.pop('model_key', None)
        data_key = rfv['data_key']
        kwargs.pop('data_key', None)
        ntree = rfv['ntree']
        kwargs.pop('ntree', None)
        kwargs['iterative_cm'] = 1
        kwargs['no_confusion_matrix'] = 0
        # do full scoring
        kwargs['out_of_bag_error_estimate'] = 0

        # double check the rows/cols
        inspect = h2o_cmd.runInspect(key=dataKeyTest)
        h2o_cmd.infoFromInspect(inspect, "dataKeyTest")

        rfvScoring = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree,
            timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)
        h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)

        fullScorePctRight = 100 * (1.0 - rfvScoring['confusion_matrix']['classification_error'])
        if checkExpectedResults:
            self.assertAlmostEqual(fullScorePctRight, expectScorePctRightList[trial],
                msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f" % \
                ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=ALLOWED_DELTA)
        actualScorePctRightList.append(fullScorePctRight)

        print "Trial #", trial, "completed", \
            "using %6.2f" % (rowsToUse*100.0/num_rows), "pct. of all rows"

    actualDelta = [abs(a-b) for a, b in zip(expectTrainPctRightList, actualTrainPctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualTrainPctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp

    actualDelta = [abs(a-b) for a, b in zip(expectScorePctRightList, actualScorePctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualScorePctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp

    # return the last rfv done during training
    return rfv
def test_GLM_100Mx70_hosts(self):
    # enable this if you need to re-create the file
    if 1 == 0:
        SYNDATASETS_DIR = h2o.make_syn_dir()
        createList = [
            (100000000, 70, 'cA', 10000),
        ]
        for (rowCount, colCount, key2, timeoutSecs) in createList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
        # Have to copy it to /home/0xdiag/datasets!

    if localhost:
        csvFilenameList = [
            # ('rand_logreg_500Kx70.csv.gz', 500, 'rand_500Kx70'),
            # ('rand_logreg_1Mx70.csv.gz', 500, 'rand_1Mx70'),
            ('rand_logreg_100000000x70.csv.gz', 500, 'rand_100Mx70.hex'),
        ]
    else:
        # None is okay for key2
        csvFilenameList = [
            # ('rand_logreg_500Kx70.csv.gz', 500, 'rand_500Kx70'),
            # ('rand_logreg_1Mx70.csv.gz', 500, 'rand_1Mx70'),
            ('rand_logreg_100000000x70.csv.gz', 500, 'rand_100Mx70.hex'),
        ]

    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)
    importFolderPath = '/home/0xdiag/datasets'
    h2i.setupImportFolder(None, importFolderPath)

    for csvFilename, timeoutSecs, key2 in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            key2=key2, timeoutSecs=2000, retryDelaySecs=5, initialDelaySecs=10, pollTimeoutSecs=60)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        csvPathname = importFolderPath + "/" + csvFilename
        num_rows = inspect['num_rows']
        num_cols = inspect['num_cols']
        print "\n" + csvPathname, \
            " num_rows:", "{:,}".format(num_rows), \
            " num_cols:", "{:,}".format(num_cols)

        y = num_cols - 1
        kwargs = {
            'family': 'binomial',
            'link': 'logit',
            'y': y,
            'max_iter': 8,
            'n_folds': 0,
            'beta_epsilon': 1e-4,
            'alpha': 0,
            'lambda': 0
        }

        for trial in range(3):
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "glm", trial, "end on ", csvPathname, 'took', elapsed, 'seconds.',
            print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
def test_KMeans_params_rand2(self):
    SEED = random.randint(0, sys.maxint)
    # if you have to force a redo of a test
    # SEED = random.seed(SEED)
    print "\nUsing random seed:", SEED

    if localhost:
        csvFilenameList = [
            # ('covtype.data', 60),
            ('covtype20x.data', 400),
        ]
    else:
        csvFilenameList = [
            ('covtype20x.data', 400),
            ('covtype200x.data', 2000),
        ]

    importFolderPath = '/home/0xdiag/datasets'
    h2i.setupImportFolder(None, importFolderPath)
    for csvFilename, timeoutSecs in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            timeoutSecs=2000, pollTimeoutSecs=60)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        csvPathname = importFolderPath + "/" + csvFilename
        print "\n" + csvPathname, \
            " num_rows:", "{:,}".format(inspect['num_rows']), \
            " num_cols:", "{:,}".format(inspect['num_cols'])

        paramDict = define_params()
        for trial in range(3):
            randomV = paramDict['k']
            k = random.choice(randomV)
            randomV = paramDict['epsilon']
            epsilon = random.choice(randomV)
            randomV = paramDict['cols']
            cols = random.choice(randomV)

            kwargs = {
                'k': k,
                'epsilon': epsilon,
                'cols': cols,
                'destination_key': csvFilename + "_" + str(trial) + '.hex'
            }
            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey,
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            ### print h2o.dump_json(kmeans)
            inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key'])
            print h2o.dump_json(inspect)
            print "Trial #", trial, "completed\n"
def test_poisson_covtype20x(self):
    if localhost:
        csvFilenameList = [
            ('covtype20x.data', 400),
        ]
    else:
        csvFilenameList = [
            ('covtype20x.data', 400),
            ('covtype200x.data', 2000),
        ]

    # a browser window too, just because we can
    h2b.browseTheCloud()

    importFolderPath = '/home/0xdiag/datasets'
    h2i.setupImportFolder(None, importFolderPath)
    for csvFilename, timeoutSecs in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            timeoutSecs=2000, pollTimeoutSecs=60)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        csvPathname = importFolderPath + "/" + csvFilename
        print "\n" + csvPathname, \
            " num_rows:", "{:,}".format(inspect['num_rows']), \
            " num_cols:", "{:,}".format(inspect['num_cols'])

        if (1 == 0):
            print "WARNING: just doing the first 33 features, for comparison to ??? numbers"
            # pythonic!
            x = ",".join(map(str, range(33)))
        else:
            x = ""

        print "WARNING: max_iter set to 8 for benchmark comparisons"
        max_iter = 8

        y = "54"
        kwargs = {
            'x': x,
            'y': y,
            'family': 'binomial',
            'link': 'logit',
            'n_folds': 0,
            'case_mode': '=',
            'case': 1,
            'max_iter': max_iter,
            'beta_epsilon': 1e-3,
        }

        # L2
        kwargs.update({'alpha': 0, 'lambda': 0})
        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "glm (L2) end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)

        # Elastic
        kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "glm (Elastic) end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)

        # L1
        kwargs.update({'alpha': 1.0, 'lambda': 1e-4})
        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "glm (L1) end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)
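# The L2 / Elastic / L1 sweep above (alpha 0 -> 0.5 -> 1.0 with a matching
# lambda) repeats nearly verbatim in test_GLM_covtype2000x and
# test_B_importFolder_files below. A hypothetical refactoring sketch that
# expresses the sweep once; the helper name and signature are mine, not the
# suite's:
def run_regularization_sweep(self, parseKey, csvPathname, timeoutSecs, kwargs, coeffIdxToCheck=13):
    for (label, alpha, lam) in [('L2', 0, 0), ('Elastic', 0.5, 1e-4), ('L1', 1.0, 1e-4)]:
        kwargs.update({'alpha': alpha, 'lambda': lam})
        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "glm (%s) end on " % label, csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        h2o_glm.simpleCheckGLM(self, glm, coeffIdxToCheck, **kwargs)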
def test_rf_covtype_train_oobe3(self):
    print "\nUse randomFilter to sample the dataset randomly. then slice it"
    importFolderPath = "/home/0xdiag/datasets/standard"
    csvFilename = 'covtype.data'
    csvPathname = importFolderPath + "/" + csvFilename
    key2 = csvFilename + ".hex"
    h2i.setupImportFolder(None, importFolderPath)

    print "\nUsing header=0 on the normal covtype.data"
    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
        key2=key2, header=0, timeoutSecs=100)
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    print "\n" + csvPathname, \
        " num_rows:", "{:,}".format(inspect['num_rows']), \
        " num_cols:", "{:,}".format(inspect['num_cols'])

    # how many rows for each pct?
    num_rows = inspect['num_rows']
    pct10 = int(num_rows * .1)
    rowsForPct = [i * pct10 for i in range(0, 11)]
    # this can be slightly less than 10%
    last10 = num_rows - rowsForPct[9]
    rowsForPct[10] = num_rows
    # use mod below for picking "rows-to-do" in case we do more than 9 trials
    # use 10 if 0 just to see (we copied 10 to 0 above)
    rowsForPct[0] = rowsForPct[10]

    expectTrainPctRightList = [0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79]
    expectScorePctRightList = [0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78]

    print "Creating the key of the last 10% data, for scoring"
    dataKeyTest = "rTest"
    dataKeyTrain = "rTrain"
    # FIX! too many digits (10) in the 2nd param seems to cause stack trace
    execExpr = dataKeyTest + "=randomFilter(" + key2 + "," + str(pct10) + ",12345)"
    h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTest, timeoutSecs=10)

    execExpr = dataKeyTrain + "=randomFilter(" + key2 + "," + str(rowsForPct[9]) + ",12345)"
    h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTrain, timeoutSecs=10)

    # keep the 0 entry empty
    actualTrainPctRightList = [0]
    actualScorePctRightList = [0]

    for trial in range(1, 10):
        # always slice from the beginning
        rowsToUse = rowsForPct[trial % 10]
        resultKey = "r" + str(trial)
        execExpr = resultKey + "=slice(" + dataKeyTrain + ",1," + str(rowsToUse) + ")"
        # execExpr = resultKey + "=slice(" + dataKeyTrain + ",1)"
        h2o_exec.exec_expr(None, execExpr, resultKey=resultKey, timeoutSecs=10)
        parseKey['destination_key'] = resultKey

        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        kwargs = paramDict.copy()
        timeoutSecs = 30 + kwargs['ntree'] * 20
        start = time.time()
        # do oobe
        kwargs['out_of_bag_error_estimate'] = 1
        kwargs['model_key'] = "model_" + str(trial)

        rfv = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        oobeTrainPctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
        self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
            msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f" % \
                ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]),
            delta=0.2)
        actualTrainPctRightList.append(oobeTrainPctRight)

        print "Now score on the last 10%"
        # pop the stuff from kwargs that were passing as params
        model_key = rfv['model_key']
        kwargs.pop('model_key', None)
        data_key = rfv['data_key']
        kwargs.pop('data_key', None)
        ntree = rfv['ntree']
        kwargs.pop('ntree', None)
        kwargs['iterative_cm'] = 1
        # do full scoring
        kwargs['out_of_bag_error_estimate'] = 0
        rfv = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree,
            timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)

        h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)

        fullScorePctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
        self.assertAlmostEqual(fullScorePctRight, expectScorePctRightList[trial],
            msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f" % \
                ((trial*10), fullScorePctRight, expectScorePctRightList[trial]),
            delta=0.2)
        actualScorePctRightList.append(fullScorePctRight)

        print "Trial #", trial, "completed", \
            "using %6.2f" % (rowsToUse*100.0/num_rows), "pct. of all rows"

    actualDelta = [abs(a-b) for a, b in zip(expectTrainPctRightList, actualTrainPctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
    print "actualTrainPctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp

    actualDelta = [abs(a-b) for a, b in zip(expectScorePctRightList, actualScorePctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
    print "maybe should update the expected list with these actuals. Remove single quotes"
    print "actualScorePctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp
def test_GLM_covtype2000x(self):
    if localhost:
        csvFilenameList = [
            # 68 secs on my laptop?
            ('covtype20x.data', 480, 'cA'),
        ]
    else:
        # None is okay for key2
        csvFilenameList = [
            ('covtype2000x.data', 3600, 'cA'),
            # ('covtype200x.data', 1000, 'cE'),
        ]

    # a browser window too, just because we can
    h2b.browseTheCloud()

    importFolderPath = '/home2/0xdiag/datasets'
    h2i.setupImportFolder(None, importFolderPath)
    for csvFilename, timeoutSecs, key2 in csvFilenameList:
        csvPathname = importFolderPath + "/" + csvFilename
        # creates csvFilename.hex from file in importFolder dir
        start = time.time()
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            timeoutSecs=2000, key2=key2)
        print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o.check_sandbox_for_errors()

        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvPathname, \
            " num_rows:", "{:,}".format(inspect['num_rows']), \
            " num_cols:", "{:,}".format(inspect['num_cols'])

        if (1 == 0):
            print "WARNING: just first 33 features. Comparison to allstate"
            # pythonic!
            x = ",".join(map(str, range(33)))
        else:
            x = ""

        y = "54"
        kwargs = {
            'x': x,
            'y': y,
            'family': 'binomial',
            'link': 'logit',
            'n_folds': 0,
            'case_mode': '=',
            'case': 1,
            'max_iter': 8,
            'beta_epsilon': 1e-3,
        }
        print "WARNING: max_iter set to 8 for benchmark comparisons"

        # L2
        kwargs.update({'alpha': 0, 'lambda': 0})
        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        print "glm (L2) end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)
        h2o.check_sandbox_for_errors()

        # Elastic
        kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        print "glm (Elastic) end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)
        h2o.check_sandbox_for_errors()

        # L1
        kwargs.update({'alpha': 1.0, 'lambda': 1e-4})
        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        print "glm (L1) end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)
        h2o.check_sandbox_for_errors()
def test_benchmark_import(self):
    # typical size of the michal files
    avgMichalSizeUncompressed = 237270000
    avgMichalSize = 116561140
    avgSynSize = 4020000
    covtype200xSize = 15033863400
    synSize = 183

    if 1 == 0:
        importFolderPath = '/home/0xdiag/datasets/more1_1200_link'
        print "Using .gz'ed files in", importFolderPath
        csvFilenameAll = [
            # this should hit the "more" files too?
            # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800),
            # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800),
            # ("*[1][0-2][0-9].dat.gz", "file_30.dat.gz", 50 * avgMichalSize, 1800),
            ("*file_[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800),
            ("*file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 1800),
            ("*file_[34][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 1800),
            ("*file_[56][0-9][0-9].dat.gz", "file_200_C.dat.gz", 200 * avgMichalSize, 1800),
            ("*file_[78][0-9][0-9].dat.gz", "file_200_D.dat.gz", 200 * avgMichalSize, 1800),
            # ("*.dat.gz", "file_1200.dat.gz", 1200 * avgMichalSize, 3600),
        ]

    if 1 == 1:
        importFolderPath = '/home/0xdiag/datasets/more1_1200_link'
        print "Using .gz'ed files in", importFolderPath
        csvFilenameAll = [
            # this should hit the "more" files too?
            # ("*10[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 3600),
            # ("*1[0-4][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 3600),
            # ("*[1][0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600),
            # ("*3[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600),
            # ("*1[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800),
            # ("*[1-2][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600),
            # ("*[3-4][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600),
            ("*[3-4][0-4][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
            ("*[3-4][0-4][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600),
            ("*[3-4][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600),
            ("*[3-4][0-5][0-9].dat.gz", "file_120_B.dat.gz", 120 * avgMichalSize, 3600),
            ("*[3-4][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 * avgMichalSize, 3600),
            ("*[3-4][0-6][0-9].dat.gz", "file_140_B.dat.gz", 140 * avgMichalSize, 3600),
            ("*[3-4][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600),
            ("*[3-4][0-7][0-9].dat.gz", "file_160_B.dat.gz", 160 * avgMichalSize, 3600),
            ("*[3-4][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600),
            ("*[3-4][0-8][0-9].dat.gz", "file_180_B.dat.gz", 180 * avgMichalSize, 3600),
            ("*[3-4][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600),
            ("*[3-4][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 3600),
            ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
            ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
            # for now, takes too long on 2x100GB heap on 164
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
        ]

    if 1 == 0:
        importFolderPath = '/home/0xdiag/datasets/manyfiles-nflx-gz'
        print "Using .gz'ed files in", importFolderPath
        csvFilenameAll = [
            # this should hit the "more" files too?
            ("*_[123][0-9][0-9]*.dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
            ("*_[1][5-9][0-9]*.dat.gz", "file_100.dat.gz", 50 * avgMichalSize, 3600),
        ]

    if 1 == 0:
        importFolderPath = '/home2/0xdiag/datasets'
        print "Using non-.gz'ed files in", importFolderPath
        csvFilenameAll = [
            # I use different files to avoid OS caching effects
            ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700),
            ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700),
            ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700),
            # ("onefile-nflx/file_1_to_100.dat", "file_single.dat", 100 * avgMichalSizeUncompressed, 1200),
            # ("manyfiles-nflx/file_1.dat", "file_1.dat", 1 * avgMichalSizeUncompressed, 700),
            # ("manyfiles-nflx/file_[2][0-9].dat", "file_10.dat", 10 * avgMichalSizeUncompressed, 700),
            # ("manyfiles-nflx/file_[34][0-9].dat", "file_20.dat", 20 * avgMichalSizeUncompressed, 700),
            # ("manyfiles-nflx/file_[5-9][0-9].dat", "file_50.dat", 50 * avgMichalSizeUncompressed, 700),
        ]

    if 1 == 0:
        importFolderPath = '/home/0xdiag/datasets/standard'
        print "Using .gz'ed files in", importFolderPath
        # all exactly the same prior to gzip!
        # could use this, but remember import folder -> import folder s3 for jenkins?
        # how would it get it right?
        # os.path.getsize(f)
        csvFilenameAll = [
            # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 700),
            # 100 files takes too long on two machines?
            # ("covtype200x.data", "covtype200x.data", 15033863400, 700),
            # I use different files to avoid OS caching effects
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[0-9][0-9]", "syn_100.csv", 100 * avgSynSize, 700),
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_00000", "syn_1.csv", avgSynSize, 700),
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_0001[0-9]", "syn_10.csv", 10 * avgSynSize, 700),
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[23][0-9]", "syn_20.csv", 20 * avgSynSize, 700),
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[45678][0-9]", "syn_50.csv", 50 * avgSynSize, 700),
            # ("manyfiles-nflx-gz/file_10.dat.gz", "file_10_1.dat.gz", 1 * avgMichalSize, 700),
            # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_[5-9][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz", "file_100.dat.gz", 50 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_[12][0-9][0-9].dat.gz", "file_200.dat.gz", 50 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_[12]?[0-9][0-9].dat.gz", "file_300.dat.gz", 50 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_*.dat.gz", "file_384.dat.gz", 100 * avgMichalSize, 1200),
            ("covtype200x.data", "covtype200x.data", covtype200xSize, 700),
            # do it twice
            # ("covtype.data", "covtype.data"),
            # ("covtype20x.data", "covtype20x.data"),
            # "covtype200x.data",
            # "100million_rows.csv",
            # "200million_rows.csv",
            # "a5m.csv",
            # "a10m.csv",
            # "a100m.csv",
            # "a200m.csv",
            # "a400m.csv",
            # "a600m.csv",
            # "billion_rows.csv.gz",
            # "new-poker-hand.full.311M.txt.gz",
        ]

    # csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    # split out the pattern match and the filename used for the hex
    trialMax = 1
    # rebuild the cloud for each file
    base_port = 54321
    tryHeap = 28
    # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?)
    DO_GLM = False
    noPoll = False

    # several alternatives were tried; the last assignment wins
    # benchmarkLogging = ['cpu','disk', 'iostats', 'jstack']
    # benchmarkLogging = None
    benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
    benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
    # IOStatus can hang?
    benchmarkLogging = ['cpu', 'disk', 'network']
    pollTimeoutSecs = 120
    retryDelaySecs = 10

    # several alternatives were tried; the last assignment wins
    jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails' + ' -Dh2o.find-ByteBuffer-leaks'
    jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails'
    jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC"
    jea = ' -Dcom.sun.management.jmxremote.port=54330' + \
          ' -Dcom.sun.management.jmxremote.authenticate=false' + \
          ' -Dcom.sun.management.jmxremote.ssl=false' + \
          ' -Dcom.sun.management.jmxremote' + \
          ' -Dcom.sun.management.jmxremote.local.only=false'
    jea = ' -Dlog.printAll=true'

    for i, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(2, java_heap_GB=tryHeap, base_port=base_port,
                # java_extra_args=jea,
                enable_benchmark_log=True)
        else:
            h2o_hosts.build_cloud_with_hosts(base_port=base_port,
                # java_extra_args=jea,
                enable_benchmark_log=True)
        # pop open a browser on the cloud
        ### h2b.browseTheCloud()

        # to avoid sticky ports?
        ### base_port += 2

        for trial in range(trialMax):
            importFolderResult = h2i.setupImportFolder(None, importFolderPath)
            importFullList = importFolderResult['files']
            importFailList = importFolderResult['fails']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            # creates csvFilename.hex from file in importFolder dir
            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

            # initialize, so the accumulation below is safe even when we don't prefetch
            totalBytes2 = 0
            totalBytes3 = 0

            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath,
                key2=csvFilename + ".hex", timeoutSecs=timeoutSecs,
                retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs,
                noPoll=noPoll, benchmarkLogging=benchmarkLogging)

            if noPoll:
                if (i + 1) < len(csvFilenameList):
                    time.sleep(1)
                    h2o.check_sandbox_for_errors()
                    (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i + 1]
                    parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath,
                        key2=csvFilename + ".hex", timeoutSecs=timeoutSecs,
                        retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs,
                        noPoll=noPoll, benchmarkLogging=benchmarkLogging)

                if (i + 2) < len(csvFilenameList):
                    time.sleep(1)
                    h2o.check_sandbox_for_errors()
                    (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i + 2]
                    parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath,
                        key2=csvFilename + ".hex", timeoutSecs=timeoutSecs,
                        retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs,
                        noPoll=noPoll, benchmarkLogging=benchmarkLogging)

            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # print stats on all three if noPoll
            if noPoll:
                # does it take a little while to show up in Jobs, from where we issued the parse?
                time.sleep(2)
                # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                h2o_jobs.pollWaitJobs(pattern=csvFilename, timeoutSecs=timeoutSecs,
                    benchmarkLogging=benchmarkLogging)
                # for getting the MB/sec closer to 'right'
                totalBytes += totalBytes2 + totalBytes3
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()

            if totalBytes is not None:
                fileMBS = (totalBytes / 1e6) / elapsed
                l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename,
                    fileMBS, elapsed)
                print l
                h2o.cloudPerfH2O.message(l)

            print csvFilepattern, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # BUG here?
            if not noPoll:
                # We should be able to see the parse result?
                h2o_cmd.columnInfoFromInspect(parseKey['destination_key'],
                    exceptionOnMissingValues=False)

            # the nflx data doesn't have a small enough # of classes in any col
            # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone?
            origKey = parseKey['destination_key']
            # execExpr = 'a = randomFilter('+origKey+',200,12345678)'
            execExpr = 'a = slice(' + origKey + ',1,200)'
            h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)

            # runRFOnly takes the parseKey directly
            newParseKey = {'destination_key': 'a'}

            print "\n" + csvFilepattern
            # poker and the water.UDP.set3(UDP.java) fail issue..
            # constrain depth to 25
            print "Temporarily hacking to do nothing instead of RF on the parsed file"
            ### RFview = h2o_cmd.runRFOnly(trees=1, depth=25, parseKey=newParseKey, timeoutSecs=timeoutSecs)
            ### h2b.browseJsonHistoryAsUrlLastMatch("RFView")

            #**********************************************************************************
            # Do GLM too
            # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
            if DO_GLM:
                # these are all the columns that are enums in the dataset...too many for GLM!
                x = range(542)  # don't include the output column
                # remove the output too! (378)
                # (loop var renamed so it doesn't shadow the enumerate() index above)
                for col in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20,
                            424, 425, 426, 540, 541, 378]:
                    x.remove(col)
                x = ",".join(map(str, x))

                GLMkwargs = {
                    'x': x,
                    'y': 378,
                    'case': 15,
                    'case_mode': '>',
                    'max_iter': 10,
                    'n_folds': 1,
                    'alpha': 0.2,
                    'lambda': 1e-5,
                }
                start = time.time()
                glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **GLMkwargs)
                h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
                print l
                h2o.cloudPerfH2O.message(l)

            #**********************************************************************************
            h2o_cmd.checkKeyDistribution()
            h2o_cmd.deleteCsvKey(csvFilename, importFolderResult)
            ### time.sleep(3600)

        h2o.tear_down_cloud()
        if not localhost:
            print "Waiting 30 secs before building cloud again (sticky ports?)"
            ### time.sleep(30)

        sys.stdout.write('.')
        sys.stdout.flush()
def test_RF_mnist_reals(self):
    importFolderPath = "/home/0xdiag/datasets/mnist"
    csvFilelist = [
        # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600),
        # ("a.csv", "b.csv", 60),
        # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600),
        ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600),
    ]
    # IMPORT**********************************************
    # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list
    importFolderResult = h2i.setupImportFolder(None, importFolderPath)
    ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
    if 'files' in importFolderResult:
        succeededList = importFolderResult['files']
    else:
        succeededList = importFolderResult['succeeded']
    ### print "succeededList:", h2o.dump_json(succeededList)

    self.assertGreater(len(succeededList), 1, "Should see more than 1 file in the import?")
    # why does this hang? can't look at storeview after import?
    print "\nTrying StoreView after the import folder"
    h2o_cmd.runStoreView(timeoutSecs=30)

    trial = 0
    for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
        trialStart = time.time()

        # PARSE test****************************************
        testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseKey = h2i.parseImportFolderFile(None, testCsvFilename, importFolderPath,
            key2=testKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds', \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseKey['destination_key']

        print "We won't use this pruning of x on test data. See if it prunes the same as the training"
        y = 0  # first column is the response (digit label)
        print "y:", y
        x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300)

        # PARSE train****************************************
        trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseKey = h2i.parseImportFolderFile(None, trainCsvFilename, importFolderPath,
            key2=trainKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds', \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseKey['destination_key']

        # RF+RFView (train)****************************************
        print "This is the 'ignore=' we'll use"
        ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'],
            timeoutSecs=300, forRF=True)
        ntree = 10
        params = {
            'response_variable': 0,
            'ignore': ignore_x,
            'ntree': ntree,
            'iterative_cm': 1,
            'out_of_bag_error_estimate': 1,
            # 'data_key': 'mnist_reals_training.csv.hex',
            'features': 28,  # fix because we ignore some cols, which will change the sqrt(cols) calc?
            'exclusive_split_limit': None,
            'depth': 2147483647,
            'stat_type': 'ENTROPY',
            'sampling_strategy': 'RANDOM',
            'sample': 67,
            # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77',
            'model_key': 'RF_model',
            'bin_limit': 1024,
            'seed': 784834182943470027,
            'parallel': 1,
            'use_non_local_data': 0,
            'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0',
        }
        kwargs = params.copy()
        print "Trying rf"
        timeoutSecs = 1800
        start = time.time()
        rfView = h2o_cmd.runRFOnly(parseKey=parseKey, rfView=True,
            timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs)
        elapsed = time.time() - start
        print "RF completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        h2o_rf.simpleCheckRFView(None, rfView, **params)
        modelKey = rfView['model_key']

        # RFView (score on test)****************************************
        start = time.time()
        # FIX! 1 on oobe causes stack trace?
        kwargs = {'response_variable': y}
        rfView = h2o_cmd.runRFView(data_key=testKey2, model_key=modelKey, ntree=ntree,
            out_of_bag_error_estimate=0, timeoutSecs=60, pollTimeoutSecs=60,
            noSimpleCheck=False, **kwargs)
        elapsed = time.time() - start
        print "RFView in", elapsed, "secs", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        (classification_error, classErrorPctList, totalScores) = \
            h2o_rf.simpleCheckRFView(None, rfView, **params)
        self.assertAlmostEqual(classification_error, 0.03, delta=0.5,
            msg="Classification error %s differs too much" % classification_error)

        # Predict (on test)****************************************
        start = time.time()
        predict = h2o.nodes[0].generate_predictions(model_key=modelKey,
            data_key=testKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "generate_predictions in", elapsed, "secs", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_B_importFolder_files(self):
    # just do the import folder once
    # importFolderPath = "/home/hduser/hdfs_datasets"
    importFolderPath = "/home/0xdiag/datasets"
    h2i.setupImportFolder(None, importFolderPath)
    timeoutSecs = 900

    # "covtype169x.data",
    # "covtype.13x.shuffle.data",
    # "3G_poker_shuffle"
    # "billion_rows.csv.gz",
    csvFilenameAll = [
        # quick test first
        "covtype.data",
        # then the real thing
        "billion_rows.csv.gz",
    ]
    # csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    h2b.browseTheCloud()

    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        start = time.time()
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            timeoutSecs=500, pollTimeoutSecs=60)
        elapsed = time.time() - start
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']
        print csvFilename, "completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
        print "\n" + csvFilename

        start = time.time()
        # poker and the water.UDP.set3(UDP.java) fail issue..
        # constrain depth to 25
        # RF seems to get memory allocation errors on single machine (16GB dram)
        ### RFview = h2o_cmd.runRFOnly(trees=1, depth=5, parseKey=parseKey, timeoutSecs=timeoutSecs)
        ### h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        # now some GLM
        kwargs = {'x': 0, 'y': 1, 'n_folds': 0, 'case_mode': '=', 'case': 1}
        # one coefficient is checked a little more
        colX = 0

        # L2
        kwargs.update({'alpha': 0, 'lambda': 0})
        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs)

        sys.stdout.write('\n.')
        sys.stdout.flush()
def test_four_billion_rows(self):
    # just do the import folder once
    importFolderPath = "/home/0xdiag/datasets/billions"
    h2i.setupImportFolder(None, importFolderPath)
    timeoutSecs = 1500

    csvFilenameAll = [
        "four_billion_rows.csv",
    ]
    # csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    ### h2b.browseTheCloud()

    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        start = time.time()

        # Parse*********************************
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            timeoutSecs=timeoutSecs, pollTimeoutSecs=60)
        elapsed = time.time() - start
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']
        print csvFilename, "completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)

        # Inspect*********************************
        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
        num_cols = inspect['num_cols']
        num_rows = inspect['num_rows']
        value_size_bytes = inspect['value_size_bytes']
        row_size = inspect['row_size']
        print "\n" + csvFilename, \
            " num_rows:", "{:,}".format(num_rows), \
            " num_cols:", "{:,}".format(num_cols), \
            " value_size_bytes:", "{:,}".format(value_size_bytes), \
            " row_size:", "{:,}".format(row_size)

        expectedRowSize = num_cols * 1  # plus output
        expectedValueSize = expectedRowSize * num_rows
        self.assertEqual(row_size, expectedRowSize,
            msg='row_size %s is not expected num_cols * 1 byte: %s' % \
                (row_size, expectedRowSize))
        self.assertEqual(value_size_bytes, expectedValueSize,
            msg='value_size_bytes %s is not expected row_size * rows: %s' % \
                (value_size_bytes, expectedValueSize))

        summaryResult = h2o_cmd.runSummary(key=parseKey['destination_key'],
            timeoutSecs=timeoutSecs)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

        self.assertEqual(2, num_cols,
            msg="generated %s cols (including output). parsed to %s cols" % (2, num_cols))
        self.assertEqual(4 * 1000000000, num_rows,
            msg="generated %s rows, parsed to %s rows" % (4 * 1000000000, num_rows))

        # KMeans*********************************
        kwargs = {
            'k': 3,
            'initialization': 'Furthest',
            'epsilon': 1e-6,
            'max_iter': 20,
            'cols': None,
            'normalize': 0,
            'destination_key': 'junk.hex',
            'seed': 265211114317615310,
        }
        timeoutSecs = 900
        start = time.time()
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)

        # GLM*********************************
        print "\n" + csvFilename
        kwargs = {
            'x': 0,
            'y': 1,
            'n_folds': 0,
            'case_mode': '=',
            'case': 1,
        }
        # one coefficient is checked a little more
        colX = 0

        # L2
        timeoutSecs = 900
        kwargs.update({'alpha': 0, 'lambda': 0})
        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
        h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs)
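# The two size assertions above are simple arithmetic: 1 byte per column,
# 2 columns (one predictor plus the output), 4 billion rows. A quick
# back-of-envelope check using the same numbers the assertions encode:
def expected_sizes_for_four_billion_rows():
    num_rows = 4 * 1000000000
    num_cols = 2
    expectedRowSize = num_cols * 1                  # 2 bytes per row
    expectedValueSize = expectedRowSize * num_rows  # 8000000000 bytes, ~8 GB
    return (expectedRowSize, expectedValueSize)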
def test_GLM_mnist(self):
    importFolderPath = "/home/0xdiag/datasets/mnist"
    csvFilelist = [
        ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600),
    ]
    # IMPORT**********************************************
    # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list
    importFolderResult = h2i.setupImportFolder(None, importFolderPath)
    ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
    succeededList = importFolderResult['files']
    ### print "succeededList:", h2o.dump_json(succeededList)

    self.assertGreater(len(succeededList), 1, "Should see more than 1 file in the import?")
    # why does this hang? can't look at storeview after import?
    print "\nTrying StoreView after the import folder"
    h2o_cmd.runStoreView(timeoutSecs=30)

    trial = 0
    for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
        trialStart = time.time()

        # PARSE test****************************************
        testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseKey = h2i.parseImportFolderFile(None, testCsvFilename, importFolderPath,
            key2=testKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds', \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseKey['destination_key']

        print "We won't use this pruning of x on test data. See if it prunes the same as the training"
        y = 0  # first column is the response (digit label)
        print "y:", y
        x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300)

        # PARSE train****************************************
        trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseKey = h2i.parseImportFolderFile(None, trainCsvFilename, importFolderPath,
            key2=trainKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds', \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseKey['destination_key']

        # GLM****************************************
        print "This is the pruned x we'll use"
        x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300)
        print "x:", x

        params = {
            'x': x,
            'y': y,
            'case_mode': '=',
            'case': 0,
            'family': 'binomial',
            'lambda': 1.0E-5,
            'alpha': 0.0,
            'max_iter': 5,
            'thresholds': 0.5,
            'n_folds': 1,
            'weight': 1,
            'beta_epsilon': 1.0E-4,
        }

        # one binomial GLM per digit: case c vs the rest
        for c in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
            kwargs = params.copy()
            print "Trying binomial with case:", c
            kwargs['case'] = c

            timeoutSecs = 1800
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs,
                pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "GLM completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
            GLMModel = glm['GLMModel']
            modelKey = GLMModel['model_key']

            start = time.time()
            glmScore = h2o_cmd.runGLMScore(key=testKey2, model_key=modelKey,
                thresholds="0.5", timeoutSecs=60)
            elapsed = time.time() - start
            print "GLMScore in", elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)
def test_parse_libsvm(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # just do the import folder once
    importFolderPath = "/home/0xdiag/datasets/libsvm"

    # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
    # so probably 10x that for covtype200
    csvFilenameList = [
        ("covtype.binary.svm", "cC", 30, 1, 2, True, True),
        ("mnist_train.svm", "cM", 30, 0, 9, False, False),
        # multi-label target like 1,2,5 ..not sure what that means
        # ("tmc2007_train.svm", "cJ", 30, 0, 21.0, False, False),
        # illegal non-ascending cols
        # ("syn_6_1000_10.svm", "cK", 30, -36, 36, True, False),
        # ("syn_0_100_1000.svm", "cL", 30, -36, 36, True, False),
        # fails csvDownload
        ("duke.svm", "cD", 30, -1.000000, 1.000000, False, False),
        ("colon-cancer.svm", "cA", 30, -1.000000, 1.000000, False, False),
        ("news20.svm", "cH", 30, 1, 20, False, False),
        ("connect4.svm", "cB", 30, -1, 1, False, False),
        # too many features? 150K inspect timeout?
        # ("E2006.train.svm", "cE", 30, 1, -7.89957807346873 -0.519409526940154, False, False)
        ("gisette_scale.svm", "cF", 30, -1, 1, False, False),
        ("mushrooms.svm", "cG", 30, 1, 2, False, False),
    ]
    ### csvFilenameList = random.sample(csvFilenameAll,1)
    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    firstDone = False
    for (csvFilename, key2, timeoutSecs, expectedCol0Min, expectedCol0Max,
            enableDownloadReparse, enableSizeChecks) in csvFilenameList:
        # have to import each time, because h2o deletes source after parse
        h2i.setupImportFolder(None, importFolderPath)
        csvPathname = importFolderPath + "/" + csvFilename
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            key2=key2, timeoutSecs=2000)
        print csvPathname, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        # INSPECT******************************************
        start = time.time()
        inspectFirst = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=360)
        print "Inspect:", parseKey['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspectFirst, csvFilename)

        # look at the min/max for the target col (0) and compare to expected for the dataset
        imin = inspectFirst['cols'][0]['min']
        imax = inspectFirst['cols'][0]['max']

        # note: an expected value of 0 is falsy here, so that dataset's check is skipped
        if expectedCol0Min:
            self.assertEqual(imin, expectedCol0Min,
                msg='col %s min %s is not equal to expected min %s' % (0, imin, expectedCol0Min))
        if expectedCol0Max:
            self.assertEqual(imax, expectedCol0Max,
                msg='col %s max %s is not equal to expected max %s' % (0, imax, expectedCol0Max))

        print "\nmin/max for col0:", imin, imax

        # SUMMARY****************************************
        # gives us some reporting on missing values, constant values,
        # to see if we have x specified well
        # figures out everything from parseKey['destination_key']
        # needs y to avoid output column (which can be index or name)
        # assume all the configs have the same y..just check with the first one
        if DO_SUMMARY:
            goodX = h2o_glm.goodXFromColumnInfo(y=0,
                key=parseKey['destination_key'], timeoutSecs=300, noPrint=True)
            summaryResult = h2o_cmd.runSummary(key=key2, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

        if DO_DOWNLOAD_REPARSE and enableDownloadReparse:
            missingValuesListA = h2o_cmd.infoFromInspect(inspectFirst, csvPathname)
            num_colsA = inspectFirst['num_cols']
            num_rowsA = inspectFirst['num_rows']
            row_sizeA = inspectFirst['row_size']
            value_size_bytesA = inspectFirst['value_size_bytes']

            # do a little testing of saving the key as a csv
            csvDownloadPathname = SYNDATASETS_DIR + "/" + csvFilename + "_csvDownload.csv"
            print "Trying csvDownload of", csvDownloadPathname
            h2o.nodes[0].csv_download(key=key2, csvPathname=csvDownloadPathname)

            # remove the original parsed key. source was already removed by h2o
            # don't have to now. we use a new name for key2B
            # h2o.nodes[0].remove_key(key2)
            start = time.time()
            key2B = key2 + "_B"
            parseKeyB = h2o_cmd.parseFile(csvPathname=csvDownloadPathname, key2=key2B)
            print csvDownloadPathname, "download/reparse (B) parse end. Original data from", \
                csvFilename, 'took', time.time() - start, 'seconds'

            inspect = h2o_cmd.runInspect(key=key2B)
            missingValuesListB = h2o_cmd.infoFromInspect(inspect, csvPathname)
            num_colsB = inspect['num_cols']
            num_rowsB = inspect['num_rows']
            row_sizeB = inspect['row_size']
            value_size_bytesB = inspect['value_size_bytes']

            df = h2o_util.JsonDiff(inspectFirst, inspect, with_values=True)
            print "df.difference:", h2o.dump_json(df.difference)

            for i, d in enumerate(df.difference):
                # ignore mismatches in these
                # "variance"
                # "response.time"
                # "key"
                if "variance" in d or "response.time" in d or "key" in d \
                        or "value_size_bytes" in d or "row_size" in d:
                    pass
                else:
                    raise Exception(
                        "testing %s, found unexpected mismatch in df.difference[%d]: %s" %
                        (csvPathname, i, d))

            if DO_SIZE_CHECKS and enableSizeChecks:
                # if we're allowed to do size checks, compare the full json response!
                print "Comparing original inspect to the inspect after parsing the downloaded csv"
                # vice_versa=True
                self.assertGreater(len(df.difference), 29,
                    msg="Want >=30, not %d differences between the two inspect json responses. %s" % \
                        (len(df.difference), h2o.dump_json(df.difference)))

                # this fails because h2o writes out zeroes as 0.0000* which gets loaded as fp even if col is all zeroes
                # only in the case where the libsvm dataset specified vals = 0, which shouldn't happen
                # make the check conditional based on the dataset
                self.assertEqual(row_sizeA, row_sizeB,
                    "row_size mismatches after re-parse of downloadCsv result %d %d" % \
                        (row_sizeA, row_sizeB))
                self.assertEqual(value_size_bytesA, value_size_bytesB,
                    "value_size_bytes mismatches after re-parse of downloadCsv result %d %d" % \
                        (value_size_bytesA, value_size_bytesB))

                print "missingValuesListA:", missingValuesListA
                print "missingValuesListB:", missingValuesListB
                self.assertEqual(missingValuesListA, missingValuesListB,
                    "missingValuesList mismatches after re-parse of downloadCsv result")
                self.assertEqual(num_colsA, num_colsB,
                    "num_cols mismatches after re-parse of downloadCsv result %d %d" % \
                        (num_colsA, num_colsB))
                self.assertEqual(num_rowsA, num_rowsB,
                    "num_rows mismatches after re-parse of downloadCsv result %d %d" % \
                        (num_rowsA, num_rowsB))

        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        h2o.check_sandbox_for_errors()
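# h2o_util.JsonDiff isn't shown in this file; the loop above only relies on
# it exposing a .difference list of strings naming mismatched json paths.
# A toy stand-in with roughly that contract, purely illustrative (the real
# class may behave differently):
def json_diff_paths(a, b, path=""):
    diffs = []
    if isinstance(a, dict) and isinstance(b, dict):
        for k in set(a.keys()) | set(b.keys()):
            diffs += json_diff_paths(a.get(k), b.get(k), path + "." + str(k))
    elif a != b:
        diffs.append("%s: %s != %s" % (path, a, b))
    return diffs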
def test_KMeans_sphere15_180GB(self):
    csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv'
    totalBytes = 183538602156
    if FROM_HDFS:
        importFolderPath = "/datasets/kmeans_big"
        csvPathname = "hdfs://" + importFolderPath + '/' + csvFilename
    else:
        importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
        csvPathname = importFolderPath + '/' + csvFilename

    # FIX! put right values in
    # will there be different expected for random vs the other inits?
    expected = [
        ([0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0], 248846122, 1308149283316.2988),
        ([0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0], 276924291, 1800760152555.98),
        ([0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394], 235089554, 375419158808.3253),
        ([0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0], 166180630, 525423632323.6474),
        ([0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0], 167234179, 1845362026223.1094),
        ([0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985], 195420925, 197941282992.43475),
        ([0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0], 214401768, 11868360232.658035),
        ([0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907], 258853406, 598863991074.3276),
        ([0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0], 190979054, 1505088759456.314),
        ([0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0], 87794427, 1124697008162.3955),
        ([0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028], 78226988, 1151439441529.0215),
        ([0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574], 167273589, 693036940951.0249),
        ([0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539], 148426180, 35942838893.32379),
        ([0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707], 157533313, 88431531357.62982),
        ([0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0], 118361306, 1111537045743.7646),
    ]

    # several alternatives were tried; the last assignment wins
    benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
    benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
    # IOStatus can hang?
    benchmarkLogging = ['cpu', 'disk', 'network']
    benchmarkLogging = []

    for trial in range(6):
        # IMPORT**********************************************
        # since H2O deletes the source key, re-import every iteration.
        if FROM_HDFS:
            importFolderResult = h2i.setupImportHdfs(None, importFolderPath)
        else:
            importFolderResult = h2i.setupImportFolder(None, importFolderPath)

        # PARSE ****************************************
        print "Parse starting: " + csvFilename
        key2 = csvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        timeoutSecs = 2 * 3600
        kwargs = {}
        if FROM_HDFS:
            parseKey = h2i.parseImportHdfsFile(None, csvFilename, importFolderPath,
                key2=key2, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                benchmarkLogging=benchmarkLogging, **kwargs)
        else:
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
                key2=key2, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                benchmarkLogging=benchmarkLogging, **kwargs)

        elapsed = time.time() - start
        fileMBS = (totalBytes / 1e6) / elapsed
        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed)
        print "\n" + l
        h2o.cloudPerfH2O.message(l)

        # KMeans ****************************************
        print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
        kwargs = {
            'k': 15,
            'initialization': 'Furthest',
            'epsilon': 1e-6,
            'cols': None,
            'destination_key': 'junk.hex',
            # reuse the same seed, to get deterministic results
            'seed': 265211114317615310,
        }

        # rotate through the initialization choices
        if (trial % 3) == 0:
            kwargs['initialization'] = 'PlusPlus'
        elif (trial % 3) == 1:
            kwargs['initialization'] = 'Furthest'
        else:
            kwargs['initialization'] = None

        timeoutSecs = 4 * 3600
        params = kwargs
        paramsString = json.dumps(params)

        start = time.time()
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs,
            benchmarkLogging=benchmarkLogging, **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)

        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, "KMeans", "trial " + str(trial),
            csvFilename, elapsed, paramsString)
        print l
        h2o.cloudPerfH2O.message(l)

        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans,
            csvPathname, parseKey, 'd', **kwargs)

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected,
            allowedDelta, allowError=True, trial=trial)
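# compareResultsToExpected is defined in h2o_kmeans; the comment above says
# the allowedDelta entries are multipliers of the expected values, i.e.
# relative tolerances. A minimal sketch of that kind of check, assuming
# exactly that contract (the real helper also has to match clusters to
# expected rows first):
def within_relative_delta(actual, expected, multiplier):
    return abs(actual - expected) <= multiplier * abs(expected)

# e.g. a cluster size compared against 248846122 with the 0.01 multiplier above:
assert within_relative_delta(250000000, 248846122, 0.01)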