def test_parse_rand_utf8(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    print "HACK: reduce rows to 10 for debug"
    tryList = [
        # do two cols to detect bad eol behavior
        (10, 2, 'cA', 120),
        (10, 2, 'cG', 120),
        (10, 2, 'cH', 120),
    ]

    print "What about messages to log (INFO) about unmatched quotes (before eol)"
    # got this ..trying to avoid for now
    # Exception: rjson error in parse: Argument 'source_key' error: Parser setup appears to be broken, got AUTO

    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEED=SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', checkHeader=0,
            hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)
        print "parseResult:", dump_json(parseResult)

        numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
        inspect = h2o_cmd.runInspect(key=parse_key)
        missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)

        assert len(missingList) == 0
        # FIX! check type?
        # print "inspect:", h2o.dump_json(inspect)
        self.assertEqual(numRows, rowCount, msg='Wrong numRows: %s %s' % (numRows, rowCount))
        self.assertEqual(numCols, colCount, msg='Wrong numCols: %s %s' % (numCols, colCount))

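# The tests here call write_syn_dataset() but the helper itself isn't shown; in the
# h2o-2 test suite each test module defines its own version. The sketch below is an
# assumed, illustrative helper (not the original): it writes rowCount x colCount
# random integer cells to csvPathname, seeded per file so a failing dataset can be
# regenerated from the seed printed by the test.
def write_syn_dataset(csvPathname, rowCount, colCount, SEED):
    # hypothetical sketch; the real helpers vary per test (UTF-8 cells, enums, constants, ...)
    r = random.Random(SEED)
    dsf = open(csvPathname, "w+")
    for _ in range(rowCount):
        rowData = ','.join(str(r.randint(0, 9)) for _ in range(colCount))
        dsf.write(rowData + "\n")
    dsf.close()
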
def test_parse_1m_cols(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [(10, 65000, "cH", 30)]
    h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = "syn_" + str(SEEDPERFILE) + "_" + str(rowCount) + "x" + str(colCount) + ".csv"
        csvPathname = SYNDATASETS_DIR + "/" + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        start = time.time()
        print "Summary should work with 65k"
        parseResult = h2i.import_parse(path=csvPathname, schema="put", hex_key=hex_key,
            timeoutSecs=timeoutSecs, doSummary=True)
        print csvFilename, "parse time:", parseResult["response"]["time"]
        print "Parse and summary:", parseResult["destination_key"], "took", time.time() - start, "seconds"

        # We should be able to see the parse result?
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult["destination_key"], timeoutSecs=timeoutSecs)
        print "Inspect:", parseResult["destination_key"], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        print "\n" + csvPathname, \
            " num_rows:", "{:,}".format(inspect["num_rows"]), \
            " num_cols:", "{:,}".format(inspect["num_cols"])

        # should match # of cols in header or ??
        self.assertEqual(inspect["num_cols"], colCount,
            "parse created result with the wrong number of cols %s %s" % (inspect["num_cols"], colCount))
        self.assertEqual(inspect["num_rows"], rowCount,
            "parse created result with the wrong number of rows (header shouldn't count) %s %s" %
            (inspect["num_rows"], rowCount))

        # we should obey max_column_display
        column_limits = [25, 25000, 50000]
        for column_limit in column_limits:
            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"],
                max_column_display=column_limit, timeoutSecs=timeoutSecs)
            self.assertEqual(len(inspect["cols"]), column_limit,
                "inspect obeys max_column_display = " + str(column_limit))
            for r in range(0, len(inspect["rows"])):
                # NB: +1 below because each row includes a row header row: #{row}
                self.assertEqual(len(inspect["rows"][r]), column_limit + 1,
                    "inspect data rows obeys max_column_display = " + str(column_limit))

def test_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    parityPl = h2o.find_file('syn_scripts/parity.pl')

    # two row dataset gets this. Avoiding it for now
    # java.lang.ArrayIndexOutOfBoundsException: 1
    # at hex.rf.Data.sample_fair(Data.java:149)

    # always match the run below!
    print "\nAssuming two row dataset is illegal. avoiding"
    for x in xrange(10, 100, 10):
        shCmdString = "perl " + parityPl + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split())
        # algorithm for creating the path and filename is hardwired in parity.pl.
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    trees = 6
    timeoutSecs = 20
    # always match the gen above!
    # FIX! we fail if min is 3
    for x in xrange(10, 100, 10):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='put')
        h2o_cmd.runRF(parseResult=parseResult, trees=trees, timeoutSecs=timeoutSecs)
        trees += 10
        timeoutSecs += 2

def test_sort_of_prostate_with_row_schmoo(self):
    SEED = random.randint(0, sys.maxint)
    # if you have to force to redo a test
    # SEED =
    random.seed(SEED)
    print "\nUsing random seed:", SEED

    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_prostate.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
    rowData = rand_rowData()
    write_syn_dataset(csvPathname, 1, headerData, rowData)

    print "This is the same format/data file used by test_same_parse, but the non-gzed version"
    print "\nSchmoo the # of rows"
    for trial in range(100):
        rowData = rand_rowData()
        num = random.randint(1, 10096)
        append_syn_dataset(csvPathname, rowData, num)
        start = time.time()

        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        key = csvFilename + "_" + str(trial)
        key2 = csvFilename + "_" + str(trial) + ".hex"
        key = h2o_cmd.parseFile(csvPathname=csvPathname, key=key, key2=key2,
            timeoutSecs=70, pollTimeoutSecs=60)
        print "trial #", trial, "with num rows:", num, "parse end on ", csvFilename, \
            'took', time.time() - start, 'seconds'

        ### h2o_cmd.runInspect(key=key2)
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        h2o.check_sandbox_for_errors()

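# rand_rowData() and append_syn_dataset() are likewise per-module helpers not shown
# here. A minimal, assumed sketch (for illustration only) matching how the prostate
# schmoo tests above use them: one random row matching the 9-column header, appended
# num times. Relies on the module-level random import the tests already use.
def rand_rowData():
    # hypothetical: 9 comma-separated small integers, one per header column
    return ','.join(str(random.randint(0, 9)) for _ in range(9))

def append_syn_dataset(csvPathname, rowData, num=1):
    # hypothetical: append the same row num times to grow the dataset between trials
    dsf = open(csvPathname, "a")
    for _ in range(num):
        dsf.write(rowData + "\n")
    dsf.close()
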
def test_big_sum_fail(self):
    node = h2o.nodes[0]
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvPathname = SYNDATASETS_DIR + '/temp.csv'
    hex_key = 'temp.hex'
    for trial in range(5):
        # what about seed?
        cfResult = h2o.nodes[0].create_frame(key=hex_key,
            binary_ones_fraction=0.02, binary_fraction=0, randomize=1,
            missing_fraction=0, integer_fraction=1, real_range=100,
            has_response=0, response_factors=2, factors=100, cols=1,
            integer_range=100, value=0, categorical_fraction=0, rows=2.5e+08,
            timeoutSecs=300)

        inspect = h2o_cmd.runInspect(key=hex_key)
        h2o_cmd.infoFromInspect(inspect, hex_key)

        if UNNECESSARY:
            # this is just doing a head to R. not critical
            h2e.exec_expr(execExpr="%s = %s" % (hex_key, hex_key))
            h2e.exec_expr(execExpr="Last.value.0 = %s[c(1,2,3,4,5,6),]" % hex_key)
            h2e.exec_expr(execExpr="Last.value.0 = Last.value.0")
            node.csv_download(src_key="Last.value.0", csvPathname=csvPathname)
            node.remove_key("Last.value.0")
            # not sure why this happened
            h2o_cmd.runStoreView(view=10000, offset=0)

        # Fails on this
        h2e.exec_expr(execExpr='Last.value.1 = %s[,1]' % hex_key)

        print "Trial #", trial, "completed"

def test_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    for x in [10000]:
        # Have to split the string out to list for pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # always match the gen above!
    trial = 1
    for x in xrange(1, 10, 1):
        sys.stdout.write('.')
        sys.stdout.flush()

        # just use one file for now
        csvFilename = "parity_128_4_" + str(10000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        # broke out the put separately so we can iterate a test just on the RF
        parseResult = h2i.import_parse(path=csvPathname, schema='put')

        h2o.verboseprint("Trial", trial)
        h2o_cmd.runRF(parseResult=parseResult, trees=237, depth=45, timeoutSecs=480)

        # don't change tree count yet
        ## trees += 10
        ### timeoutSecs += 2
        trial += 1

def test_C_RF_poker100(self):
    parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker100', schema='put')
    h2o_cmd.runRF(parseResult=parseResult, trees=6, timeoutSecs=10)

    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    for x in xrange(11, 100, 10):
        # Have to split the string out to list for pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=30)
        # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    trees = 6
    timeoutSecs = 60
    # always match the gen above!
    # reduce to get intermittent failures to lessen, for now
    for x in xrange(11, 60, 10):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='put')
        h2o_cmd.runRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs)
        trees += 10

def test_factor_with_syn(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # use SEED so the file isn't cached?
    csvFilenameAll = [
        ('syn_1mx8_' + str(SEED) + '.csv', 'cA', 5),
    ]

    ### csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll
    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)
    for (csvFilename, key2, timeoutSecs) in csvFilenameList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random 1mx8 csv"
        write_syn_dataset(csvPathname, 1000000, SEEDPERFILE)

        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=2000)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename

        h2e.exec_zero_list(zeroList)
        # does n+1 so use maxCol 6
        h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=6,
            maxRow=400000, maxTrials=200, timeoutSecs=timeoutSecs)

def test_exec2_cbind_like_R(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    SEEDPERFILE = random.randint(0, sys.maxint)

    rowCount = 30000
    colCount = 150
    timeoutSecs = 60
    hex_key = "df"
    csvPathname = SYNDATASETS_DIR + "/" + "df.csv"
    write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
    parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
        timeoutSecs=3000, retryDelaySecs=2, doSummary=False)

    colCount = 1
    hex_key = "indx"
    csvPathname = SYNDATASETS_DIR + "/" + "indx.csv"
    write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
    parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key,
        timeoutSecs=3000, retryDelaySecs=2, doSummary=False)

    inspect = h2o_cmd.runInspect(key=hex_key)
    print "numRows:", inspect['numRows']
    print "numCols:", inspect['numCols']

    for trial in range(10):
        for execExpr in exprList:
            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300)
            execTime = time.time() - start
            print 'exec took', execTime, 'seconds'

    h2o.check_sandbox_for_errors()

def test_parse_bad_30rows_fvec(self):
    # h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvPathname = SYNDATASETS_DIR + "/bad.data"
    dsf = open(csvPathname, "w+")
    dsf.write(datalines)
    dsf.close()

    for i in range(20):
        # every other one
        single_quotes = 1
        # force header=1 to make it not fail (doesn't deduce correctly)
        parseResult = h2i.import_parse(path=csvPathname, schema="put",
            single_quotes=single_quotes, header=1, hex_key="trial" + str(i) + ".hex")
        inspect = h2o_cmd.runInspect(key=parseResult["destination_key"])
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect["numRows"]), \
            " numCols:", "{:,}".format(inspect["numCols"])

        numRows = inspect["numRows"]
        numCols = inspect["numCols"]

        self.assertEqual(numCols, 4, "Parsed wrong number of cols: %s" % numCols)
        self.assertNotEqual(numRows, 30,
            "Parsed wrong number of rows. Should be 29. Didn't deduce header?: %s" % numRows)
        self.assertEqual(numRows, 29, "Parsed wrong number of rows: %s" % numRows)

def test_parse_many_cols(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (100, 5000, 'cA', 60),
        (100, 6000, 'cB', 60),
        (100, 7000, 'cC', 60),
        (100, 8000, 'cD', 60),
        (100, 8200, 'cE', 60),
        (100, 8500, 'cF', 60),
        (100, 9000, 'cG', 60),
        (100, 10000, 'cI', 60),
        (100, 11000, 'cH', 60),
    ]

    ### h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs)
        print csvFilename, 'parse time:', parseResult['response']['time']
        print "Parse result['destination_key']:", parseResult['destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=60)
        print "\n" + csvFilename

        if not h2o.browse_disable:
            h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            time.sleep(5)

def test_many_cols_01(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (100, 5000, 'cA', 5),
        (100, 10000, 'cI', 5),
    ]

    ### h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=120, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=120)
        print "\n" + csvFilename

        if not h2o.browse_disable:
            h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            time.sleep(5)

def test_sort_of_prostate_with_row_schmoo(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_prostate.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
    rowData = rand_rowData()
    totalRows = 1000000
    write_syn_dataset(csvPathname, totalRows, headerData, rowData)

    print "This is the same format/data file used by test_same_parse, but the non-gzed version"
    print "\nSchmoo the # of rows"
    # used to fail around 50 iterations..python memory problem
    for trial in range(40):
        rowData = rand_rowData()
        num = random.randint(4096, 10096)
        append_syn_dataset(csvPathname, rowData, num)
        totalRows += num
        start = time.time()

        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        hex_key = csvFilename + "_" + str(trial) + ".hex"

        # On EC2 once we get to 30 trials or so, do we see polling hang? GC or spill of heap or ??
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=150, pollTimeoutSecs=150)
        print "trial #", trial, "totalRows:", totalRows, "num:", num, "parse end on ", csvFilename, \
            'took', time.time() - start, 'seconds'

        ### h2o_cmd.runInspect(key=hex_key)
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        h2o.check_sandbox_for_errors()

def test_rf_1ktrees_fvec(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    # just using one file for now
    for x in [500]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # always match the gen above!
    for trial in range(1, 5):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put',
            hex_key=hex_key, timeoutSecs=30)

        h2o.verboseprint("Trial", trial)
        start = time.time()
        h2o_cmd.runRF(parseResult=parseResult, trees=1000, max_depth=2, timeoutSecs=600, retryDelaySecs=3)
        print "RF #", trial, "end on ", csvFilename, 'took', time.time() - start, 'seconds'

    print "Waiting 60 secs for TIME_WAIT sockets to go away"
    time.sleep(60)

def test_rf_float_rand2_fvec(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_prostate.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
    totalRows = 10000
    write_syn_dataset(csvPathname, totalRows, headerData)

    for trial in range(5):
        rowData = rand_rowData()
        num = random.randint(4096, 10096)
        append_syn_dataset(csvPathname, num)
        totalRows += num
        start = time.time()

        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        hex_key = csvFilename + "_" + str(trial) + ".hex"

        # On EC2 once we get to 30 trials or so, do we see polling hang? GC or spill of heap or ??
        kwargs = {'ntrees': 5, 'max_depth': 5}
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
        h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=60, pollTimeoutSecs=60, **kwargs)
        print "trial #", trial, "totalRows:", totalRows, "num:", num, "RF end on ", csvFilename, \
            'took', time.time() - start, 'seconds'

        ### h2o_cmd.runInspect(key=hex_key)
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        h2o.check_sandbox_for_errors()

def test_KMeans_constant_col_fvec(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (100, 11, 'cA', 10),
        (100, 10, 'cB', 10),
        (100, 9, 'cC', 10),
        (100, 8, 'cD', 10),
        (100, 7, 'cE', 10),
        (100, 6, 'cF', 10),
        (100, 5, 'cG', 10),
    ]

    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)
    cnum = 0
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        print "Generate synthetic dataset with first column constant = 0 and see what KMeans does"
        cnum += 1
        csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEED)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=csvFilename + ".hex")
        print "Parse result['destination_key']:", parseResult['destination_key']

        kwargs = {'k': 2, 'initialization': 'Furthest', 'destination_key': 'benign_k.hex', 'max_iter': 25}
        kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs)
        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)

        # check center list (first center) has same number of cols as source data
        self.assertEqual(colCount, len(centers[0]),
            "kmeans first center doesn't have same # of values as dataset row %s %s" % (colCount, len(centers[0])))

def test_many_rows_long_enums(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (1000000, 1, 'cA', 5),
        (1000000, 1, 'cA', 5),
    ]

    # h2b.browseTheCloud()
    cnum = 0
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        cnum += 1
        csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEED)

        SEPARATOR = ord(',')
        # don't force header..we have NAs in the rows, and NAs mess up headers
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=300, header=0, separator=SEPARATOR)
        print "Parse result['destination_key']:", parseResult['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        missingValuesList = h2o_cmd.infoFromInspect(inspect, csvPathname)
        numCols = inspect['numCols']
        numRows = inspect['numRows']
        print "\n" + csvFilename

def test_many_cols_and_values_with_syn(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (100000, 10, 'cA', 30),
        (100, 1000, 'cB', 30),
        # (100, 900, 'cC', 30),
        # (100, 500, 'cD', 30),
        # (100, 100, 'cE', 30),
    ]

    for (rowCount, colCount, key2, timeoutSecs) in tryList:
        for sel in range(48):  # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel)

            selKey2 = key2 + "_" + str(sel)
            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=selKey2, timeoutSecs=timeoutSecs)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

def test_many_cols_with_syn(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (100, 10000, 'cI', 5),
        (100, 5000, 'cA', 5),
        (100, 6000, 'cB', 5),
        (100, 7000, 'cC', 5),
        (100, 8000, 'cD', 5),
        (100, 8200, 'cE', 5),
        (100, 8500, 'cF', 5),
        (100, 9000, 'cG', 5),
        (100, 11000, 'cH', 5),
    ]

    ### h2b.browseTheCloud()
    for (rowCount, colCount, key2, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename

        if not h2o.browse_disable:
            h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            time.sleep(5)

def test_1ktrees_job_cancel_many_fvec(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    # just using one file for now
    for x in [1000]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    hex_key = csvFilename + ".hex"
    parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put',
        hex_key=hex_key, timeoutSecs=30)

    print "kick off jobs, then cancel them"
    for trial in range(1, 5):
        # random 0 or 1 delay
        delay = random.uniform(0, 1)
        time.sleep(delay)

        h2o.verboseprint("Trial", trial)
        start = time.time()
        h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, rfView=False,
            noPoll=True, timeoutSecs=30, retryDelaySecs=0.25)
        print "RF #", trial, "started on ", csvFilename, 'took', time.time() - start, 'seconds'

    ### h2o_jobs.cancelAllJobs(timeoutSecs=10)
    h2o.check_sandbox_for_errors()

    # do one last good one
    rfView = h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, timeoutSecs=600, retryDelaySecs=3)
    (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trial)

def test_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    # just using one file for now
    for x in [1000]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait("parity.pl", shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # always match the gen above!
    for trial in range(1, 3):
        sys.stdout.write(".")
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + "/" + csvFilename
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema="put",
            hex_key=hex_key, timeoutSecs=30)

        h2o.verboseprint("Trial", trial)
        start = time.time()
        h2o_cmd.runRF(parseResult=parseResult, trees=10000, depth=2, timeoutSecs=900, retryDelaySecs=3)
        print "RF #", trial, "end on ", csvFilename, "took", time.time() - start, "seconds"

    print "Waiting 60 secs for TIME_WAIT sockets to go away"
    time.sleep(60)

def test_sort_of_prostate_with_row_schmoo(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_prostate.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
    rowData = "1,0,65,1,2,1,1.4,0,6"
    write_syn_dataset(csvPathname, 99860, headerData, rowData)

    print "This is the same format/data file used by test_same_parse, but the non-gzed version"
    print "\nSchmoo the # of rows"
    print "Updating the key and key2 names for each trial"
    for trial in range(200):
        append_syn_dataset(csvPathname, rowData)

        ### start = time.time()
        # this was useful to cause failures early on. Not needed eventually
        ### key = h2o_cmd.parseFile(csvPathname=h2o.find_file("smalldata/logreg/prostate.csv"))
        ### print "Trial #", trial, "parse end on ", "prostate.csv", 'took', time.time() - start, 'seconds'

        start = time.time()
        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        key = csvFilename + "_" + str(trial)
        key2 = csvFilename + "_" + str(trial) + ".hex"
        key = h2o_cmd.parseFile(csvPathname=csvPathname, key=key, key2=key2)
        print "trial #", trial, "parse end on ", csvFilename, 'took', time.time() - start, 'seconds'

        h2o_cmd.runInspect(key=key2)
        # only used this for debug to look at parse (red last row) on failure
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        h2o.check_sandbox_for_errors()

def test_parse_specific_case3(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    hex_key = "a.hex"

    for (dataset, expNumRows, expNumCols, expNaCnt, expType, unicodeNum) in tryList:
        csvFilename = 'specific_' + str(expNumRows) + "x" + str(expNumCols) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        write_syn_dataset(csvPathname, dataset)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', header=0,
            hex_key=hex_key, timeoutSecs=10, doSummary=False)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=60)

        print "Parsed with special unichr(%s) which is %s:" % (unicodeNum, unichr(unicodeNum))
        print "inspect:", h2o.dump_json(inspect)

        numRows = inspect['numRows']
        self.assertEqual(numRows, expNumRows, msg='Using unichr(0x%x) Wrong numRows: %s Expected: %s' %
            (unicodeNum, numRows, expNumRows))
        numCols = inspect['numCols']
        self.assertEqual(numCols, expNumCols, msg='Using unichr(0x%x) Wrong numCols: %s Expected: %s' %
            (unicodeNum, numCols, expNumCols))

        # this is required for the test setup
        assert(len(expNaCnt) >= expNumCols)
        assert(len(expType) >= expNumCols)

        for k in range(expNumCols):
            naCnt = inspect['cols'][k]['naCnt']
            self.assertEqual(expNaCnt[k], naCnt, msg='Using unichr(0x%x) col: %s naCnt: %d should be: %s' %
                (unicodeNum, k, naCnt, expNaCnt[k]))

            stype = inspect['cols'][k]['type']
            self.assertEqual(expType[k], stype, msg='Using unichr(0x%x) col: %s type: %s should be: %s' %
                (unicodeNum, k, stype, expType[k]))

def test_many_cols_and_types(self):
    SEED = random.randint(0, sys.maxint)
    print "\nUsing random seed:", SEED
    # SEED =
    random.seed(SEED)

    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (100, 5, "cA", 5),
        (1000, 59, "cB", 5),
        (5000, 128, "cC", 5),
        (6000, 507, "cD", 5),
        (9000, 663, "cE", 5),
    ]

    for (rowCount, colCount, key2, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = "syn_%s_%s_%s.csv" % (SEEDPERFILE, rowCount, colCount)
        csvPathname = SYNDATASETS_DIR + "/" + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30)
        print csvFilename, "parse time:", parseKey["response"]["time"]
        print "Parse result['destination_key']:", parseKey["destination_key"]

        inspect = h2o_cmd.runInspect(None, parseKey["destination_key"])
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        print "\n" + csvFilename

def test_many_cols_with_syn(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (100, 11, 'cA', 5),
        (100, 10, 'cB', 5),
        (100, 9, 'cC', 5),
        (100, 8, 'cD', 5),
        (100, 7, 'cE', 5),
        (100, 6, 'cF', 5),
        (100, 5, 'cG', 5),
    ]

    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)
    cnum = 0
    for (rowCount, colCount, key2, timeoutSecs) in tryList:
        cnum += 1
        csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEED)

        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
        print "Parse result['destination_key']:", parseKey['destination_key']

        kwargs = {'k': 2, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'benign_k.hex'}
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
        h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)

def test_kmeans_sphere3(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = 'syn_spheres3_' + str(SEED) + '.csv'
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    write_syn_dataset(csvPathname, 1000000, SEED)

    print "\nStarting", csvFilename
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

    kwargs = {'k': 3, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'spheres3.hex'}
    timeoutSecs = 30
    start = time.time()
    kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
    elapsed = time.time() - start
    print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
        "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)

    centers = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)

    # cluster centers can return in any order
    centersSorted = sorted(centers, key=itemgetter(0))

    self.assertAlmostEqual(centersSorted[0][0], 100, delta=.2)
    self.assertAlmostEqual(centersSorted[1][0], 200, delta=.2)
    self.assertAlmostEqual(centersSorted[2][0], 300, delta=.2)
    self.assertAlmostEqual(centersSorted[0][1], 100, delta=.2)
    self.assertAlmostEqual(centersSorted[1][1], 200, delta=.2)
    self.assertAlmostEqual(centersSorted[2][1], 300, delta=.2)
    self.assertAlmostEqual(centersSorted[0][2], 100, delta=.2)
    self.assertAlmostEqual(centersSorted[1][2], 200, delta=.2)
    self.assertAlmostEqual(centersSorted[2][2], 300, delta=.2)

    show_results(csvPathname, parseKey, model_key, centers, 'd')

def test_parse_bad_30rows_fvec(self):
    # h2b.browseTheCloud()
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvPathname = SYNDATASETS_DIR + '/bad.data'
    dsf = open(csvPathname, "w+")
    dsf.write(datalines)
    dsf.close()

    for i in range(20):
        # every other one
        single_quotes = 1
        parseResult = h2i.import_parse(path=csvPathname, schema='put',
            single_quotes=single_quotes, hex_key="trial" + str(i) + ".hex")
        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect['numRows']), \
            " numCols:", "{:,}".format(inspect['numCols'])

        numRows = inspect['numRows']
        numCols = inspect['numCols']

        self.assertEqual(numCols, 4, "Parsed wrong number of cols: %s" % numCols)
        self.assertNotEqual(numRows, 30,
            "Parsed wrong number of rows. Should be 29. Didn't deduce header?: %s" % numRows)
        self.assertEqual(numRows, 29, "Parsed wrong number of rows: %s" % numRows)

def test_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    # just using one file for now
    for x in [1000]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # always match the gen above!
    for trial in range(1, 5):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        key2 = csvFilename + "_" + str(trial) + ".hex"
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=30)

        h2o.verboseprint("Trial", trial)
        start = time.time()
        h2o_cmd.runRFOnly(parseKey=parseKey, trees=1000, depth=2, timeoutSecs=600, retryDelaySecs=3)
        print "RF #", trial, "end on ", csvFilename, 'took', time.time() - start, 'seconds'

    print "Waiting 60 secs for TIME_WAIT sockets to go away"
    time.sleep(60)

def test_parse_fs_schmoo_fvec(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_prostate.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
    # rowData = "1,0,65,1,2,1,1.4,0,6"
    rowData = "1,0,65,1,2,1,1,0,6"
    totalRows = 99860
    write_syn_dataset(csvPathname, totalRows, headerData, rowData)

    print "This is the same format/data file used by test_same_parse, but the non-gzed version"
    print "\nSchmoo the # of rows"
    print "Updating the key and hex_key names for each trial"
    for trial in range(200):
        append_syn_dataset(csvPathname, rowData)
        totalRows += 1
        start = time.time()

        key = csvFilename + "_" + str(trial)
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
        print "trial #", trial, "totalRows:", totalRows, "parse end on ", \
            csvFilename, 'took', time.time() - start, 'seconds'

        h2o_cmd.runInspect(key=hex_key)
        # only used this for debug to look at parse (red last row) on failure
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        h2o.check_sandbox_for_errors()

def test_D_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    for x in xrange(11, 100, 10):
        # Have to split the string out to list for pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
        # FIX! as long as we're doing a couple, you'd think we wouldn't have to
        # wait for the last one to be gen'ed here before we start the first below.
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=3)
        # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    trees = 6
    timeoutSecs = 20
    # always match the gen above!
    # reduce to get intermittent failures to lessen, for now
    for x in xrange(11, 60, 10):
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        # FIX! TBD do we always have to kick off the run from node 0?
        # what if we do another node?
        # FIX! do we need or want a random delay here?
        h2o_cmd.runRF(trees=trees, timeoutSecs=timeoutSecs, csvPathname=csvPathname)

        trees += 10
        sys.stdout.write('.')
        sys.stdout.flush()

def test_parse_csv_download_fvec(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_prostate.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
    rowData = rand_rowData()
    totalRows = 1000000
    write_syn_dataset(csvPathname, totalRows, headerData, rowData)

    print "This is the same format/data file used by test_same_parse, but the non-gzed version"
    print "\nSchmoo the # of rows"
    # failed around 50 trials..python memory problem
    for trial in range(5):
        rowData = rand_rowData()
        num = random.randint(4096, 10096)
        append_syn_dataset(csvPathname, rowData, num)
        totalRows += num

        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        src_key = csvFilename + "_" + str(trial)
        hex_key = csvFilename + "_" + str(trial) + ".hex"

        start = time.time()
        parseResultA = h2i.import_parse(path=csvPathname, schema='put', src_key=src_key, hex_key=hex_key)
        print "\nA trial #", trial, "totalRows:", totalRows, "parse end on ", \
            csvFilename, 'took', time.time() - start, 'seconds'

        inspect = h2o_cmd.runInspect(key=hex_key)
        missingValuesListA = h2o_cmd.infoFromInspect(inspect, csvPathname)
        numColsA = inspect['numCols']
        numRowsA = inspect['numRows']
        byteSizeA = inspect['byteSize']

        # do a little testing of saving the key as a csv
        csvDownloadPathname = SYNDATASETS_DIR + "/csvDownload.csv"
        h2o.nodes[0].csv_download(src_key=hex_key, csvPathname=csvDownloadPathname)

        # remove the original parsed key. source was already removed by h2o
        h2o.nodes[0].remove_key(hex_key)

        start = time.time()
        parseResultB = h2i.import_parse(path=csvDownloadPathname, schema='put', src_key=src_key, hex_key=hex_key)
        print "B trial #", trial, "totalRows:", totalRows, "parse end on ", \
            csvFilename, 'took', time.time() - start, 'seconds'

        inspect = h2o_cmd.runInspect(key=hex_key)
        missingValuesListB = h2o_cmd.infoFromInspect(inspect, csvPathname)
        numColsB = inspect['numCols']
        numRowsB = inspect['numRows']
        byteSizeB = inspect['byteSize']

        self.assertEqual(missingValuesListA, missingValuesListB,
            "missingValuesList mismatches after re-parse of downloadCsv result")
        self.assertEqual(numColsA, numColsB,
            "numCols mismatches after re-parse of downloadCsv result")
        self.assertEqual(numRowsA, numRowsB,
            "numRows mismatches after re-parse of downloadCsv result")
        # self.assertEqual(byteSizeA, byteSizeB,
        #     "byteSize mismatches after re-parse of downloadCsv result %s %s" % (byteSizeA, byteSizeB))

        h2o.check_sandbox_for_errors()

def test_rf_float_rand_fvec(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_prostate.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
    totalRows = 1000
    colCount = 7
    write_syn_dataset(csvPathname, totalRows, colCount, headerData)

    for trial in range(5):
        # grow the data set
        rowData = rand_rowData(colCount)
        num = random.randint(4096, 10096)
        append_syn_dataset(csvPathname, colCount, num)
        totalRows += num

        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        hex_key = csvFilename + "_" + str(trial) + ".hex"

        # On EC2 once we get to 30 trials or so, do we see polling hang? GC or spill of heap or ??
        ntree = 2
        kwargs = {
            'ntrees': ntree,
            'mtries': None,
            'max_depth': 20,
            'sample_rate': 0.67,
            'destination_key': None,
            'nbins': 1024,
            'seed': 784834182943470027,
        }
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, doSummary=True)
        start = time.time()
        rfView = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=15, pollTimeoutSecs=5, **kwargs)
        print "trial #", trial, "totalRows:", totalRows, "num:", num, "RF end on ", csvFilename, \
            'took', time.time() - start, 'seconds'

        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)

        inspect = h2o_cmd.runInspect(key=hex_key)
        cols = inspect['cols']
        numCols = inspect['numCols']
        for i, c in enumerate(cols):
            if i < (numCols - 1):
                # everything except the last col (output) should be 8 byte float
                colType = c['type']
                self.assertEqual(colType, 'Real', msg="col %d should be type Real: %s" % (i, colType))

        ### h2o_cmd.runInspect(key=hex_key)
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        h2o.check_sandbox_for_errors()

def test_summary2_percentile2(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (500000, 2, 'cD', 300, 0, 9),   # expectedMin/Max must cause 10 values
        (500000, 2, 'cE', 300, 1, 10),  # expectedMin/Max must cause 10 values
        (500000, 2, 'cF', 300, 2, 11),  # expectedMin/Max must cause 10 values
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)
    x = 0
    for (rowCount, colCount, hex_key, timeoutSecs, expectedMin, expectedMax) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        legalValues = {}
        for x in range(expectedMin, expectedMax):
            legalValues[x] = x

        write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename

        summaryResult = h2o_cmd.runSummary(key=hex_key, cols=0, max_ncols=1)
        if h2o.verbose:
            print "summaryResult:", h2o.dump_json(summaryResult)

        summaries = summaryResult['summaries']
        scipyCol = 0
        for column in summaries:
            colname = column['colname']
            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype = stats['type']
            mean = stats['mean']
            sd = stats['sd']
            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']
            pct = stats['pct']
            pctile = stats['pctile']

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            for b in hcnt:
                e = .1 * rowCount
                self.assertAlmostEqual(b, .1 * rowCount, delta=.01 * rowCount,
                    msg="Bins not right. b: %s e: %s" % (b, e))

            print "pctile:", pctile
            print "maxs:", maxs
            self.assertEqual(maxs[0], expectedMax)
            print "mins:", mins
            self.assertEqual(mins[0], expectedMin)

            for v in pctile:
                self.assertTrue(v >= expectedMin,
                    "Percentile value %s should all be >= the min dataset value %s" % (v, expectedMin))
                self.assertTrue(v <= expectedMax,
                    "Percentile value %s should all be <= the max dataset value %s" % (v, expectedMax))

            eV1 = [1.0, 1.0, 1.0, 3.0, 4.0, 5.0, 7.0, 8.0, 9.0, 10.0, 10.0]
            if expectedMin == 1:
                eV = eV1
            elif expectedMin == 0:
                eV = [e - 1 for e in eV1]
            elif expectedMin == 2:
                eV = [e + 1 for e in eV1]
            else:
                raise Exception("Test doesn't have the expected percentileValues for expectedMin: %s" % expectedMin)

            trial += 1

            # if colname!='' and expected[scipyCol]:
            if colname != '':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    skipHeader=True,
                    col=scipyCol,
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                )
            scipyCol += 1

def test_GLM2_mnist(self):
    if not SCIPY_INSTALLED:
        return

    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()

    csvFilelist = [
        (10000, 500, 'cA', 60),
    ]

    trial = 0
    for (rowCount, colCount, hex_key, timeoutSecs) in csvFilelist:
        trialStart = time.time()

        # PARSE test****************************************
        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + "/" + csvFilename
        write_syn_dataset(csvPathname, rowCount, colCount)

        start = time.time()
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds', \
            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)

        # GLM****************************************
        modelKey = 'GLM_model'
        y = colCount
        kwargs = {
            'response': 'C' + str(y + 1),
            'family': 'binomial',
            'lambda': 1e-4,
            'alpha': 0,
            'max_iter': 15,
            'n_folds': 1,
            'beta_epsilon': 1.0E-4,
            'destination_key': modelKey,
        }

        # GLM wants the output col to be strictly 0,1 integer
        execExpr = "aHack=%s; aHack[,%s] = aHack[,%s]==1" % (hex_key, y + 1, y + 1)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
        aHack = {'destination_key': 'aHack'}

        timeoutSecs = 1800
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs)
        elapsed = time.time() - start
        print "GLM completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)

        h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
        modelKey = glm['glm_model']['_key']

        # This seems wrong..what's the format of the cm?
        lambdaMax = glm['glm_model']['lambda_max']
        print "lambdaMax:", lambdaMax

        best_threshold = glm['glm_model']['submodels'][0]['validation']['best_threshold']
        print "best_threshold", best_threshold

        # pick the middle one?
        cm = glm['glm_model']['submodels'][0]['validation']['_cms'][5]['_arr']
        print "cm:", cm
        pctWrong = h2o_gbm.pp_cm_summary(cm)
        # self.assertLess(pctWrong, 9, "Should see less than 9% error (class = 4)")

        print "\nTrain\n==========\n"
        print h2o_gbm.pp_cm(cm)

        # Score *******************************
        # this messes up if you use case_mode/case_vale above
        print "\nPredict\n==========\n"
        predictKey = 'Predict.hex'
        start = time.time()

        predictResult = h2o_cmd.runPredict(data_key='aHack', model_key=modelKey,
            destination_key=predictKey, timeoutSecs=timeoutSecs)

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual='aHack',
            vactual='C' + str(y + 1),
            predict=predictKey,
            vpredict='predict',
        )

        cm = predictCMResult['cm']

        # These will move into the h2o_gbm.py
        pctWrong = h2o_gbm.pp_cm_summary(cm)
        self.assertLess(pctWrong, 50, "Should see less than 50% error")

        print "\nTest\n==========\n"
        print h2o_gbm.pp_cm(cm)

def test_parse_time(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_time.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename

    headerData = None
    colCount = 6
    rowCount = 1000
    write_syn_dataset(csvPathname, rowCount, colCount, headerData)

    for trial in range(20):
        rowData = rand_rowData()
        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        src_key = csvFilename + "_" + str(trial)
        hex_key = csvFilename + "_" + str(trial) + ".hex"

        start = time.time()
        parseResultA = h2i.import_parse(path=csvPathname, schema='put', src_key=src_key, hex_key=hex_key)
        print "\nA trial #", trial, "parse end on ", csvFilename, 'took', time.time() - start, 'seconds'

        inspect = h2o_cmd.runInspect(key=hex_key)
        missingValuesListA = h2o_cmd.infoFromInspect(inspect, csvPathname)
        print "missingValuesListA", missingValuesListA

        num_colsA = inspect['num_cols']
        num_rowsA = inspect['num_rows']
        row_sizeA = inspect['row_size']
        value_size_bytesA = inspect['value_size_bytes']

        self.assertEqual(missingValuesListA, [], "missingValuesList should be empty")
        self.assertEqual(num_colsA, colCount)
        self.assertEqual(num_rowsA, rowCount)

        # do a little testing of saving the key as a csv
        csvDownloadPathname = SYNDATASETS_DIR + "/csvDownload.csv"
        h2o.nodes[0].csv_download(src_key=hex_key, csvPathname=csvDownloadPathname)

        # remove the original parsed key. source was already removed by h2o
        h2o.nodes[0].remove_key(hex_key)

        # interesting. what happens when we do csv download with time data?
        start = time.time()
        parseResultB = h2i.import_parse(path=csvDownloadPathname, schema='put', src_key=src_key, hex_key=hex_key)
        print "B trial #", trial, "parse end on ", csvFilename, 'took', time.time() - start, 'seconds'

        inspect = h2o_cmd.runInspect(key=hex_key)
        missingValuesListB = h2o_cmd.infoFromInspect(inspect, csvPathname)
        print "missingValuesListB", missingValuesListB

        num_colsB = inspect['num_cols']
        num_rowsB = inspect['num_rows']
        row_sizeB = inspect['row_size']
        value_size_bytesB = inspect['value_size_bytes']

        self.assertEqual(missingValuesListA, missingValuesListB,
            "missingValuesList mismatches after re-parse of downloadCsv result")
        self.assertEqual(num_colsA, num_colsB,
            "num_cols mismatches after re-parse of downloadCsv result")
        # H2O adds a header to the csv created. It puts quotes around the col numbers if no header
        # so I guess that's okay. So allow for an extra row here.
        self.assertEqual(num_rowsA, num_rowsB,
            "num_rowsA: %s num_rowsB: %s mismatch after re-parse of downloadCsv result" % (num_rowsA, num_rowsB))

        print "H2O writes the internal format (number) out for time."
        print "So don't do the row_size and value_size comparisons."

        # ==> syn_time.csv <==
        # 31-Oct-49, 25-NOV-10, 08-MAR-44, 23-Nov-34, 19-Feb-96, 23-JUN-30
        # 31-Oct-49, 25-NOV-10, 08-MAR-44, 23-Nov-34, 19-Feb-96, 23-JUN-30

        # ==> csvDownload.csv <==
        # "0","1","2","3","4","5"
        # 2.5219584E12,1.293264E12,2.3437116E12,2.0504736E12,3.9829788E12,1.9110204E12

        if 1 == 0:
            # extra line for column headers?
            self.assertEqual(row_sizeA, row_sizeB,
                "row_size wrong after re-parse of downloadCsv result %d %d" % (row_sizeA, row_sizeB))
            self.assertEqual(value_size_bytesA, value_size_bytesB,
                "value_size_bytes mismatches after re-parse of downloadCsv result %d %d" %
                (value_size_bytesA, value_size_bytesB))

        # FIX! should do some comparison of values?
        # maybe can use exec to checksum the columns and compare column list.
        # or compare to expected values? (what are the expected values for the number for time inside h2o?)

        # FIX! should compare the results of the two parses. The infoFromInspect result?
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        h2o.check_sandbox_for_errors()

def test_kmeans_predict3_fvec(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    timeoutSecs = 600
    predictCsv = 'predict_0.csv'
    actualCsv = 'actual_0.csv'

    if 1 == 1:
        outputClasses = 3
        y = 4  # last col
        response = 'response'
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        trees = 40
        bucket = 'smalldata'
        csvPathname = 'iris/iris2.csv'
        hexKey = 'iris2.csv.hex'
        # Huh...now we apparently need the translate. Used to be:
        # No translate because we're using an Exec to get the data out?, and that loses the encoding?
        # translate = None
        translate = {'setosa': 0.0, 'versicolor': 1.0, 'virginica': 2.0}
        # one wrong will be 0.66667. I guess with random, that can happen?
        expectedPctWrong = 0.7

    elif 1 == 0:
        outputClasses = 6
        y = 54  # last col
        response = 'C55'
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        trees = 6
        # try smaller data set compared to covtype
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.shuffled.10pct.data'
        hexKey = 'covtype.shuffled.10pct.data.hex'
        translate = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7}
        expectedPctWrong = 0.7

    elif 1 == 0:
        outputClasses = 6
        y = 54  # last col
        response = 'C55'
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        trees = 40
        # try smaller data set compared to covtype
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.shuffled.10pct.data'
        hexKey = 'covtype.shuffled.10pct.data.hex'
        # translate = {1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0}
        translate = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7}
        expectedPctWrong = 0.7

    elif 1 == 0:
        outputClasses = 6
        y = 54  # last col
        response = 'C55'
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        trees = 6
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'
        hexKey = 'covtype.data.hex'
        translate = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7}
        expectedPctWrong = 0.7

    else:
        outputClasses = 10
        y = 0  # first col
        response = 'C1'
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        trees = 6
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'mnist/mnist_training.csv.gz'
        hexKey = 'mnist_training.hex'
        translate = {
            '0': 0, '1': 1, '2': 2, '3': 3, '4': 4,
            '5': 5, '6': 6, '7': 7, '8': 8, '9': 9}
        expectedPctWrong = 0.7

    csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv
    csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv
    # for using below in csv reader
    csvFullname = h2i.find_folder_and_filename(bucket, csvPathname, schema='put', returnFullPath=True)

    def predict_and_compare_csvs(model_key, hex_key, predictHexKey, translate=None, y=0):
        # have to slice out col 0 (the output) and feed result to predict
        # cols are 0:784 (1 output plus 784 input features
        # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30)
        dataKey = "P.hex"
        if skipSrcOutputHeader:
            print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer"
            print "hack for now, can't chop out col 0 in Exec currently"
            dataKey = hex_key
        else:
            print "No header in dataset, can't chop out cols, since col numbers are used for names"
            dataKey = hex_key

        # +1 col index because R-like
        # FIX! apparently we lose the enum mapping when we slice out, and then csv download? we just get the number?
        # OH NO..it looks like we actually preserve the enum..it's in the csv downloaded
        # the prediction is the one that doesn't have it, because it's related to clusters, which have no
        # notion of output classes
        h2e.exec_expr(execExpr="Z.hex=" + hex_key + "[," + str(y + 1) + "]", timeoutSecs=30)

        start = time.time()
        predictResult = h2o.nodes[0].generate_predictions(
            model_key=model_key, data_key=hexKey, destination_key=predictHexKey)
        print "generate_predictions end on ", hexKey, " took", time.time() - start, 'seconds'
        print "predictResult:", h2o.dump_json(predictResult)

        h2o.check_sandbox_for_errors()
        inspect = h2o_cmd.runInspect(key=predictHexKey)
        h2o_cmd.infoFromInspect(inspect, 'predict.hex')

        h2o.nodes[0].csv_download(src_key="Z.hex", csvPathname=csvSrcOutputPathname)
        h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname)
        h2o.check_sandbox_for_errors()

        print "Do a check of the original output col against predicted output"
        (rowNum1, originalOutput) = compare_csv_at_one_col(csvSrcOutputPathname,
            msg="Original", colIndex=0, translate=translate, skipHeader=skipSrcOutputHeader)
        (rowNum2, predictOutput) = compare_csv_at_one_col(csvPredictPathname,
            msg="Predicted", colIndex=0, skipHeader=skipPredictHeader)

        # no header on source
        if (rowNum1 - skipSrcOutputHeader) != (rowNum2 - skipPredictHeader):
            raise Exception("original rowNum1: %s - %d not same as downloaded predict: rowNum2: %s - %d" %
                (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader))

        wrong = 0
        for rowNum, (o, p) in enumerate(zip(originalOutput, predictOutput)):
            # if float(o)!=float(p):
            if str(o) != str(p):
                if wrong == 10:
                    print "Not printing any more mismatches\n"
                elif wrong < 10:
                    msg = "Comparing original output col vs predicted. row %s differs. original: %s predicted: %s" % \
                        (rowNum, o, p)
                    print msg
                wrong += 1

        print "\nTotal wrong:", wrong
        print "Total:", len(originalOutput)
        pctWrong = (100.0 * wrong) / len(originalOutput)
        print "wrong/Total * 100 ", pctWrong
        # I looked at what h2o can do for modelling with binomial and it should get better than 25% error?
        # hack..need to fix this
        if 1 == 0:
            if pctWrong > 2.0:
                raise Exception("pctWrong too high. Expect < 2% error because it's reusing training data")
        return pctWrong

    #*****************************************************************************
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
    inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
    numCols = inspect["numCols"]
    numRows = inspect["numRows"]

    seed = random.randint(0, sys.maxint)
    # should pass seed
    # want to ignore the response col? we compare that to predicted
    # if we tell kmeans to ignore a column here, and then use the model on the same dataset to predict
    # does the column get ignored? (this is last col, trickier if first col. (are the centers "right"
    kwargs = {
        'ignored_cols_by_name': response,
        'seed': seed,
        # "seed": 4294494033083512223,
        'k': outputClasses,
        'initialization': 'PlusPlus',
        'destination_key': 'kmeans_model',
        'max_iter': 1000,
    }

    kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=60, **kwargs)

    # this is what the size of each cluster was, when reported by training
    size = kmeans['model']['size']

    # tupleResultList is created like this: ( (centers[i], rows_per_cluster[i], sqr_error_per_cluster[i]) )
    # THIS DOES A PREDICT in it (we used to have to do the predict to get more training result info?)
    (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)

    # the tupleResultList has the size during predict? compare it to the sizes during training
    # I assume they're in the same order.
    size2 = [t[1] for t in tupleResultList]

    if size != size2:
        raise Exception("training cluster sizes: %s are not the same as what we got from predict on same data: %s" %
            (size, size2))

    # hack...hardwire for iris here
    # keep this with sizes sorted
    expectedSizes = [
        [39, 50, 61],
        [38, 50, 62],
    ]
    sortedSize = sorted(size)
    if sortedSize not in expectedSizes:
        raise Exception("I got cluster sizes %s but expected one of these: %s " % (sortedSize, expectedSizes))

    # check center list (first center) has same number of cols as source data
    print "centers:", centers
    # we said to ignore the output so subtract one from expected
    self.assertEqual(numCols - 1, len(centers[0]),
        "kmeans first center doesn't have same # of values as dataset row %s %s" % (numCols - 1, len(centers[0])))

    # FIX! add expected
    # h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)

    error = kmeans['model']['total_within_SS']
    within_cluster_variances = kmeans['model']['within_cluster_variances']
    print "within_cluster_variances:", within_cluster_variances

    print "Use H2O GeneratePredictionsPage with a H2O generated model and the same data key."
    print "Does this work? (feeding in same data key) if you're predicting, "
    print "don't you need one less column (the last is output?)"
    print "WARNING: max_iter set to 8 for benchmark comparisons"
    print "y=", y  # zero-based index matches response col name

    print ""
    print "oh I see why I can't compare predict to actual, in kmeans"
    print "the cluster order doesn't have to match the output class enum order"
    print "so I don't know what cluster, each output class will be (kmeans)"
    print "all I can say is that the prediction distribution should match the original source distribution"
    print "have to figure out what to do"

    predictHexKey = 'predict_0.hex'
    pctWrong = predict_and_compare_csvs(model_key='kmeans_model', hex_key=hexKey,
        predictHexKey=predictHexKey, translate=translate, y=y)

    # we are predicting using training data...so error is really low
    # self.assertAlmostEqual(pctWrong, classification_error, delta=0.2,
    #     msg="predicted pctWrong: %s should be close to training classification error %s" % (pctWrong, classification_error))
    # can be zero if memorized (iris is either 0 or 0.667?)
    # just make delta 0.7 for now
    # HACK ignoring error for now
    if 1 == 0:
        self.assertAlmostEqual(pctWrong, expectedPctWrong, delta=0.7,
            msg="predicted pctWrong: %s should be small because we're predicting with training data" % pctWrong)

def setUpClass(cls):
    h2o.init()
    global SYNDATASETS_DIR
    SYNDATASETS_DIR = h2o.make_syn_dir()
def test_parse_200k_cols_fvec(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (10, 10, 'cA', 200, 200),
        (10, 1000, 'cB', 200, 200),
        (10, 1000, 'cB', 200, 200),
        # we timeout/fail on 500k? stop at 200k
        # (10, 500000, 'cC', 200, 200),
        # (10, 1000000, 'cD', 200, 360),
        # (10, 1100000, 'cE', 60, 100),
        # (10, 1200000, 'cF', 60, 120),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        # import it N times and compare the N hex keys
        REPEAT = 5
        for i in range(REPEAT):
            hex_key_i = hex_key + "_" + str(i)
            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key_i,
                                           timeoutSecs=timeoutSecs, doSummary=False)
            print "Parse:", parseResult['destination_key'], "took", time.time() - start, "seconds"

            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs2)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                " numRows:", "{:,}".format(inspect['numRows']), \
                " numCols:", "{:,}".format(inspect['numCols'])

            # should match # of cols in header or ??
            self.assertEqual(inspect['numCols'], colCount,
                "parse created result with the wrong number of cols %s %s" %
                (inspect['numCols'], colCount))
            self.assertEqual(inspect['numRows'], rowCount,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s" %
                (inspect['numRows'], rowCount))

        # compare each to 0
        for i in range(1, REPEAT):
            hex_key_i = hex_key + "_" + str(i)
            hex_key_0 = hex_key + "_0"
            print "\nComparing %s to %s" % (hex_key_i, hex_key_0)

            if 1 == 0:
                execExpr = "%s[1,]+%s[1,]" % (hex_key_0, hex_key_i)
                resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                execExpr = "%s[,1]+%s[,1]" % (hex_key_0, hex_key_i)
                resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

            execExpr = "%s+%s" % (hex_key_0, hex_key_i)
            resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            execExpr = "%s!=%s" % (hex_key_0, hex_key_i)
            resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            execExpr = "%s==%s" % (hex_key_0, hex_key_i)
            resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            execExpr = "sum(%s==%s)" % (hex_key_0, hex_key_i)
            resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            execExpr = "s=sum(%s==%s)" % (hex_key_0, hex_key_i)
            resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            execExpr = "s=c(1); s=sum(%s==%s)" % (hex_key_0, hex_key_i)
            resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            execExpr = "n=c(1); n=nrow(%s)*ncol(%s)" % (hex_key_0, hex_key_i)
            resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            execExpr = "r=c(1); r=s==n"
            resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            print "result:", result
def test_GLM_convergence_2(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 1, 'cD', 300), # (100, 100, 'cE', 300), # (100, 200, 'cF', 300), # (100, 300, 'cG', 300), # (100, 400, 'cH', 300), # (100, 500, 'cI', 300), ] ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) USEKNOWNFAILURE = False for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_%s_%sx%s.csv' % (SEEDPERFILE, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) if USEKNOWNFAILURE: csvFilename = 'failtoconverge_100x50.csv' csvPathname = 'logreg/' + csvFilename parseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, timeoutSecs=10, schema='put') print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename y = colCount kwargs = { 'max_iter': 40, 'lambda': 1e0, 'alpha': 0.5, 'weight': 1.0, 'link': 'familyDefault', 'n_folds': 0, 'beta_epsilon': 1e-4, 'thresholds': '0:1:0.01', } if USEKNOWNFAILURE: kwargs['y'] = 50 else: kwargs['y'] = y emsg = None for i in range(3): start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print 'glm #', i, 'end on', csvPathname, 'took', time.time( ) - start, 'seconds' # we can pass the warning, without stopping in the test, so we can # redo it in the browser for comparison (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, None, allowFailWarning=True, **kwargs) if 1 == 0: print "\n", "\ncoefficients in col order:" # since we're loading the x50 file all the time..the real colCount # should be 50 (0 to 49) if USEKNOWNFAILURE: showCols = 50 else: showCols = colCount for c in range(showCols): print "%s:\t%s" % (c, coefficients[c]) print "intercept:\t", intercept # gets the failed to converge, here, after we see it in the browser too x = re.compile("[Ff]ailed") if warnings: print "warnings:", warnings for w in warnings: print "w:", w if (re.search(x, w)): # first if emsg is None: emsg = w print w if emsg: break if not h2o.browse_disable: h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(5) h2b.browseJsonHistoryAsUrlLastMatch("GLM") time.sleep(5) # gets the failed to converge, here, after we see it in the browser too if emsg is not None: raise Exception(emsg)
def test_kmeans_sphere5(self): SYNDATASETS_DIR = h2o.make_syn_dir() CLUSTERS = 5 SPHERE_PTS = 10000 csvFilename = 'syn_spheres100.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS) print "\nStarting", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") # try 5 times, to see if all inits by h2o are good for trial in range(5): kwargs = { 'k': CLUSTERS, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'syn_spheres100.hex' } timeoutSecs = 30 start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.',\ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex') ### print h2o.dump_json(kmeans) ### print h2o.dump_json(kmeansResult) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) # cluster centers can return in any order clusters = kmeansResult['KMeansModel']['clusters'] clustersSorted = sorted(clusters, key=itemgetter(0)) ### print clustersSorted print "\nh2o result, centers sorted" print clustersSorted print "\ngenerated centers" print centersList for i, center in enumerate(centersList): a = center b = clustersSorted[i] print "\nexpected:", a print "h2o:", b # h2o result aStr = ",".join(map(str, a)) bStr = ",".join(map(str, b)) iStr = str(i) self.assertAlmostEqual(a[0], b[0], delta=1, msg=aStr + "!=" + bStr + ". Sorted cluster center " + iStr + " x not correct.") self.assertAlmostEqual(a[1], b[1], delta=1, msg=aStr + "!=" + bStr + ". Sorted cluster center " + iStr + " y not correct.") self.assertAlmostEqual(a[2], b[2], delta=1, msg=aStr + "!=" + bStr + ". Sorted cluster center " + iStr + " z not correct.") print "Trial #", trial, "completed"
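# NOTE: illustrative sketch only, not the repo's actual helper. write_spheres_dataset()
# is defined elsewhere in this test file; this hypothetical stand-in just shows the shape
# of data the test above expects: CLUSTERS well-separated point clouds, with the generated
# centers returned sorted by the first coordinate so they line up with h2o's clusters
# sorted by itemgetter(0). SPHERE_GAP and the jitter radius are assumptions.
import random

def write_spheres_dataset_sketch(csvPathname, CLUSTERS, SPHERE_PTS, SPHERE_GAP=100):
    centersList = []
    with open(csvPathname, 'w') as f:
        for c in range(CLUSTERS):
            # put each center far from the others so kmeans can't confuse the spheres
            center = [c * SPHERE_GAP, c * SPHERE_GAP, c * SPHERE_GAP]
            centersList.append(center)
            for _ in range(SPHERE_PTS):
                # jitter each point a little around its center (radius << gap)
                pt = [x + random.uniform(-1, 1) for x in center]
                f.write(",".join(map(str, pt)) + "\n")
    # sorted by x, to match the test's sorted(clusters, key=itemgetter(0))
    return sorted(centersList, key=lambda c: c[0])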
def test_rf_log_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (10000, 100, 'cA', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # CREATE test dataset****************************************************** csvFilename = 'syn_test_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) testParseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, schema='put', timeoutSecs=10) print "Test Parse result['destination_key']:", testParseResult[ 'destination_key'] dataKeyTest = testParseResult['destination_key'] # CREATE train dataset****************************************************** csvFilename = 'syn_train_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) trainParseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, schema='put', timeoutSecs=10) print "Train Parse result['destination_key']:", trainParseResult[ 'destination_key'] dataKeyTrain = trainParseResult['destination_key'] # RF train****************************************************** # adjust timeoutSecs with the number of trees # seems ec2 can be really slow kwargs = paramDict.copy() timeoutSecs = 30 + kwargs['ntrees'] * 20 start = time.time() # do oobe kwargs['response'] = "C" + str(colCount + 1) rfv = h2o_cmd.runRF(parseResult=trainParseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) rf_model = rfv['drf_model'] used_trees = rf_model['N'] data_key = rf_model['_dataKey'] model_key = rf_model['_key'] (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, ntree=used_trees) oobeTrainPctRight = 100.0 - classification_error expectTrainPctRight = 94 self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRight,\ msg="OOBE: pct. right for training not close enough %6.2f %6.2f"% (oobeTrainPctRight, expectTrainPctRight), delta=5) # RF score****************************************************** print "Now score with the 2nd random dataset" rfv = h2o_cmd.runRFView(data_key=dataKeyTest, model_key=model_key, timeoutSecs=timeoutSecs, retryDelaySecs=1) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, ntree=used_trees) self.assertAlmostEqual( classification_error, 5.0, delta=2.0, msg="Classification error %s differs too much" % classification_error) predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) fullScorePctRight = 100.0 - classification_error expectScorePctRight = 94 self.assertAlmostEqual( fullScorePctRight, expectScorePctRight, msg="Full: pct. right for scoring not close enough %6.2f %6.2f" % (fullScorePctRight, expectScorePctRight), delta=5)
def test_PCA_many_cols(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (10000, 10, 'cA', 300), (10000, 50, 'cB', 300), (10000, 100, 'cC', 300), # (10000, 500, 'cH', 300), # (10000, 1000, 'cI', 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: print(rowCount, colCount, hex_key, timeoutSecs) SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) # PARSE **************************************** h2o.beta_features = False #turn off beta_features start = time.time() #h2o.beta_features = False modelKey = 'PCAModelKey' scoreKey = 'PCAScoreKey' # Parse **************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseResult['destination_key'] for h2o" parseResult['destination_key'] = trainKey elapsed = time.time() - start print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) # if you set beta_features here, the fvec translate will happen with the Inspect not the PCA # h2o.beta_features = True inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] # PCA(tolerance iterate)**************************************** h2o.beta_features = True for tolerance in [i / 10.0 for i in range(11)]: params = { 'destination_key': modelKey, 'tolerance': tolerance, 'standardize': 1, } kwargs = params.copy() h2o.beta_features = True PCAResult = {'python_elapsed': 0, 'python_%timeout': 0} start = time.time() h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs) h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=120, retryDelaySecs=2) elapsed = time.time() - start PCAResult['python_elapsed'] = elapsed PCAResult['python_%timeout'] = 1.0 * elapsed / timeoutSecs print "PCA completed in", PCAResult['python_elapsed'], "seconds.", \ "%f pct. of timeout" % (PCAResult['python_%timeout']) pcaView = h2o_cmd.runPCAView(modelKey=modelKey) h2o_pca.simpleCheckPCA(self, pcaView) h2o_pca.resultsCheckPCA(self, pcaView) # Logging to a benchmark file algo = "PCA " + " tolerance=" + str(tolerance) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, PCAResult['python_elapsed']) print l h2o.cloudPerfH2O.message(l) #h2o.beta_features = True pcaInspect = pcaView # errrs from end of list? is that the last tree? 
sdevs = pcaInspect["pca_model"]["sdev"] print "PCA: standard deviations are :", sdevs print print propVars = pcaInspect["pca_model"]["propVar"] print "PCA: Proportions of variance by eigenvector are :", propVars print print #h2o.beta_features=False print print print num_pc = pcaInspect['pca_model']['num_pc'] print "The number of standard deviations obtained: ", num_pc print print print if DO_PCA_SCORE: # just score with same data score_params = { 'destination_key': scoreKey, 'model': modelKey, 'num_pc': num_pc, 'source': hex_key, } kwargs = score_params.copy() pcaScoreResult = h2o.nodes[0].pca_score( timeoutSecs=timeoutSecs, noPoll=True, **kwargs) h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=120, retryDelaySecs=2) print "PCAScore completed in", pcaScoreResult[ 'python_elapsed'], "seconds. On dataset: ", csvPathname print "Elapsed time was ", pcaScoreResult[ 'python_%timeout'], "% of the timeout" # Logging to a benchmark file algo = "PCAScore " + " num_pc=" + str( score_params['num_pc']) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, pcaScoreResult['python_elapsed']) print l h2o.cloudPerfH2O.message(l)
def test_rf_syn_gz_cat(self): SYNDATASETS_DIR = h2o.make_syn_dir() REPL = 3 tryList = [ # summary fails with 100000 cols # (10, 50, 2, 'cA', 600), # pass # (2, 50, 50, 'cA', 600), # (2, 100, 50, 'cA', 600), (REPL, 200, 50, 'cA', 600), (REPL, 300, 50, 'cA', 600), (REPL, 350, 50, 'cA', 600), (REPL, 375, 50, 'cB', 600), # fail (REPL, 500, 300, 'cC', 600), (REPL, 500, 400, 'cD', 600), (REPL, 500, 500, 'cE', 600), (10, 50, 1600, 'cF', 600), (10, 50, 3200, 'cG', 600), (10, 50, 5000, 'cH', 600), # at 6000, it gets connection reset on the parse on ec2 # (6000, 50, 5000, 'cG', 600), # (7000, 50, 5000, 'cH', 600), ] ### h2b.browseTheCloud() paramDict = { 'ntrees': 10, 'destination_key': 'model_keyA', 'max_depth': 10, 'nbins': 100, 'sample_rate': 0.80, } trial = 0 for (FILEREPL, rowCount, colCount, hex_key, timeoutSecs) in tryList: trial += 1 SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename parseResult = make_datasetgz_and_parse(SYNDATASETS_DIR, csvFilename, hex_key, rowCount, colCount, FILEREPL, SEEDPERFILE, timeoutSecs) if DO_RF: paramDict['response'] = 'C' + str(colCount) paramDict['mtries'] = 2 paramDict['seed'] = random.randint(0, sys.maxint) kwargs = paramDict.copy() start = time.time() rfView = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "RF end on ", parseResult['destination_key'], 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView) algo = "RF " l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs. trees: {:d} Error: {:6.2f} \ numRows: {:d} numCols: {:d} byteSize: {:d}' .format( len(h2o.nodes), tryHeap, algo, parseResult['destination_key'], elapsed, kwargs['ntrees'], \ classification_error, parseResult['numRows'], parseResult['numCols'], parseResult['byteSize']) print l h2o.cloudPerfH2O.message(l) print "Trial #", trial, "completed"
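# NOTE: illustrative sketch only, not the repo's actual helper. make_datasetgz_and_parse()
# lives elsewhere in this test file; presumably it writes one random categorical csv,
# gzips it, replicates the .gz FILEREPL times so the parse pattern picks up several
# identical files, and then parses them (the surrounding tests use h2i.import_parse for
# that last step). The column cardinality and the 2-class output column are assumptions.
import gzip
import random
import shutil

def make_datasetgz_sketch(dirname, csvFilename, rowCount, colCount, FILEREPL, SEED):
    random.seed(SEED)
    csvPathname = dirname + '/' + csvFilename
    with open(csvPathname, 'w') as f:
        for _ in range(rowCount):
            # low-cardinality categorical cells plus a 2-class output col at the end
            row = [random.choice(['a', 'b', 'c', 'd']) for _ in range(colCount - 1)]
            row.append(str(random.randint(0, 1)))
            f.write(','.join(row) + '\n')
    gzPathnames = []
    for repl in range(FILEREPL):
        gzPathname = '%s_%d.gz' % (csvPathname, repl)
        with open(csvPathname, 'rb') as fin, gzip.open(gzPathname, 'wb') as fout:
            shutil.copyfileobj(fin, fout)
        gzPathnames.append(gzPathname)
    # the real helper would go on to import/parse these into a single hex key
    return gzPathnames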
def test_summary2_uniform_int_w_NA(self): h2o.beta_features = False SYNDATASETS_DIR = h2o.make_syn_dir() M = 100 tryList = [ # colname, (min, 25th, 50th, 75th, max) (ROWS, 1, 'B.hex', 1, 1000 * M, ('C1', 1.0 * M, 250.0 * M, 500.0 * M, 750.0 * M, 1000.0 * M)), (ROWS, 1, 'B.hex', 1, 1000, ('C1', 1.0, 250.0, 500.0, 750.0, 1000.0)), (ROWS, 1, 'x.hex', 1, 20000, ('C1', 1.0, 5000.0, 10000.0, 15000.0, 20000.0)), (ROWS, 1, 'x.hex', -5000, 0, ('C1', -5000.00, -3750.0, -2500.0, -1250.0, 0)), (ROWS, 1, 'x.hex', -100000, 100000, ('C1', -100000.0, -50000.0, 0, 50000.0, 100000.0)), # (ROWS, 1, 'A.hex', 1, 101, ('C1', 1.0, 26.00, 51.00, 76.00, 101.0)), # (ROWS, 1, 'A.hex', -99, 99, ('C1', -99, -49.0, 0, 49.00, 99)), (ROWS, 1, 'B.hex', 1, 10000, ('C1', 1.0, 2501.0, 5001.0, 7501.0, 10000.0)), (ROWS, 1, 'B.hex', -100, 100, ('C1', -100.0, -50.0, 0.0, 50.0, 100.0)), (ROWS, 1, 'C.hex', 1, 100000, ('C1', 1.0, 25001.0, 50001.0, 75001.0, 100000.0)), # (ROWS, 1, 'C.hex', -101, 101, ('C1', -101, -51, -1, 49.0, 100.0)), ] if not DO_REAL: # only 3 integer values! tryList.append(\ (1000000, 1, 'x.hex', -1, 1, ('C1', -1.0, -1, 0.000, 1, 1.00)) \ ) timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin) / (MAX_QBINS + 0.0)) # add 5% for fp errors? maxDelta = 1.05 * maxDelta # also need to add some variance due to random distribution? # maybe a percentage of the mean distMean = (expectedMax - expectedMin) / 2 maxShift = distMean * .01 maxDelta = maxDelta + maxShift SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) h2o.beta_features = False csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=60, doSummary=False) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["num_rows"] numCols = inspect["num_cols"] h2o.beta_features = True summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals( mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals( sd) zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. 
expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct = [ 0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99 ] pctile = stats['pctile'] h2o_util.assertApproxEqual( pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') h2o_util.assertApproxEqual( pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') h2o_util.assertApproxEqual( pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows / len( hcnt ) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution) # don't check the edge bins self.assertAlmostEqual(b, rowCount / len(hcnt), delta=.01 * rowCount, msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 scipyCol = 0 # don't check if colname is empty..means it's a string and scipy doesn't parse right? if colname != '': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, # h2oQuantilesExact=qresult, ) h2o.nodes[0].remove_all_keys()
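# NOTE: illustrative sketch only, not the repo's actual helper. The write_syn_dataset()
# used by the summary2 test above writes uniform random integers between expectedMin and
# expectedMax; since the test is "..._w_NA", some cells are left empty so h2o records
# them as NAs. The signature mirrors the call above, but the NA mechanism and the 10%
# rate are assumptions for illustration.
import random

def write_uniform_int_with_na_sketch(csvPathname, rowCount, colCount,
                                     expectedMin, expectedMax, SEED, naRate=0.10):
    r = random.Random(SEED)
    with open(csvPathname, 'w') as f:
        for _ in range(rowCount):
            row = []
            for _ in range(colCount):
                if r.random() < naRate:
                    row.append('')  # empty cell -> NA after parse
                else:
                    row.append(str(r.randint(expectedMin, expectedMax)))
            f.write(','.join(row) + '\n')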
def test_parse_rand_enum_compress(self): SYNDATASETS_DIR = h2o.make_syn_dir() if DEBUG: n = 20 else: n = 1000000 # from command line arg -long if h2o.long_test_case: repeat = 1000 scale = 10 # scale up the # of rows tryList = [ (n*scale, 1, 'cI', 300), (n*scale, 1, 'cI', 300), (n*scale, 1, 'cI', 300), ] else: repeat = 1 scale = 1 tryList = [ (n, 3, 'cI', 300), (n, 3, 'cI', 300), (n, 3, 'cI', 300), ] lastcolsHistory = [] enumList = create_enum_list(listSize=ENUMS_NUM) for r in range(repeat): SEED_PER_FILE = random.randint(0, sys.maxint) for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname # same enum list/mapping, but different dataset? start = time.time() lastcols = write_syn_dataset(csvPathname, enumList, rowCount, colCount, scale=1, colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_PER_FILE) elapsed = time.time() - start print "took %s seconds to create %s" % (elapsed, csvPathname) # why are we saving this? lastcolsHistory.append(lastcols) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, header=0, timeoutSecs=30, separator=colSepInt, doSummary=DO_SUMMARY) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) numCols = inspect['numCols'] numRows = inspect['numRows'] h2o_cmd.infoFromInspect(inspect) # Each column should get .10 random NAs per iteration. Within 10%? missingValuesList = h2o_cmd.infoFromInspect(inspect) # print "missingValuesList", missingValuesList # for mv in missingValuesList: # self.assertAlmostEqual(mv, expectedNA, delta=0.1 * mv, # msg='mv %s is not approx. expected %s' % (mv, expectedNA)) self.assertEqual(rowCount, numRows) self.assertEqual(colCount, numCols) (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=DISABLE_ALL_NA)
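# NOTE: illustrative sketch only, not the repo's actual helper. create_enum_list() builds
# the pool of categorical labels that write_syn_dataset() samples from in the test above.
# A minimal stand-in: unique random short strings from a safe alphabet (nothing that
# collides with the column/row separators chosen above). Alphabet and length bounds are
# assumptions.
import random

def create_enum_list_sketch(listSize=10, minLen=2, maxLen=8):
    alphabet = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    enums = set()
    while len(enums) < listSize:
        l = random.randint(minLen, maxLen)
        enums.add(''.join(random.choice(alphabet) for _ in range(l)))
    return list(enums)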
def test_parse_multi_header_rand_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_ints.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename allowedLetters = 'abcdeABCDE01234[]' headerChoices = [] for n in range(500): # max # of cols below is 500 done = False while not done: l = random.randint(1, 64) # random length headers headerName = ''.join( [random.choice(allowedLetters) for _ in range(l)]) # we keep trying if we already have that header name. Has to be unique. done = headerName not in headerChoices headerChoices.append(headerName) tryList = [ (3, 5, 9, 'cA', 60, 0), # (3, 5, 25, 'cA', 60, 0), # (10, 100, 500, 'cA', 60, 0), ] for trial in range(20): (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) = random.choice(tryList) print fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader # FIX! should we add a header to them randomly??? print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR rowxcol = str(rowCount) + 'x' + str(colCount) totalCols = colCount + 1 # 1 extra for output totalDataRows = 0 totalHeaderRows = 0 # random selection of parse param choices # HEADER_HAS_HDR_ROW = random.randint(0,1) HEADER_HAS_HDR_ROW = 1 DATA_HAS_HDR_ROW = random.randint(0, 1) PARSE_PATTERN_INCLUDES_HEADER = random.randint(0, 1) # DATA_FIRST_IS_COMMENT = random.randint(0,1) # HEADER_FIRST_IS_COMMENT = random.randint(0,1) # FIX! doesn't seem to like just comment in the header file DATA_FIRST_IS_COMMENT = 0 HEADER_FIRST_IS_COMMENT = 0 GZIP_DATA = random.randint(0, 1) GZIP_HEADER = random.randint(0, 1) SEP_CHAR_GEN = random.choice(paramsDict['separator']) HEADER_SEP_CHAR_GEN = random.choice(paramsDict['hdr_separator']) if HEADER_SEP_CHAR_GEN == 'same': HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN # don't put a header in a data file with a different separator? if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW: HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN # Hack: if both data and header files have a header, then, just in case # the header and data files should have the same separator # if they don't, make header match data if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW: HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN # New for fvec? if separators are not the same, then the header separator needs to be comma if HEADER_SEP_CHAR_GEN != SEP_CHAR_GEN: HEADER_SEP_CHAR_GEN = ',' # screw it. make them always match HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN if HEADER_SEP_CHAR_GEN in (',', ' '): pass # extra spaces? Don't add any # if random.randint(0,1): # HEADER_SEP_CHAR_GEN = " " + HEADER_SEP_CHAR_GEN # if random.randint(0,1): # HEADER_SEP_CHAR_GEN = HEADER_SEP_CHAR_GEN + " " kwargs = {} for k, v in paramsDict.items(): kwargs[k] = random.choice(v) kwargs['separator'] = SEP_CHAR_GEN # parse doesn't auto-detect tab. 
will autodetect space and comma if SEP_CHAR_GEN == " " or SEP_CHAR_GEN == ",": del kwargs['separator'] else: kwargs['separator'] = ord(SEP_CHAR_GEN) # randomly add leading and trailing white space # we have to do this after we save the single char HEADER_SEP_CHAR_GEN if SEP_CHAR_GEN in (',', ' '): if random.randint(0, 1): SEP_CHAR_GEN = " " + SEP_CHAR_GEN if random.randint(0, 1): SEP_CHAR_GEN = SEP_CHAR_GEN + " " print '\nHEADER_HAS_HDR_ROW:', HEADER_HAS_HDR_ROW print 'DATA_HAS_HDR_ROW:', DATA_HAS_HDR_ROW print 'PARSE_PATTERN_INCLUDES_HEADER', PARSE_PATTERN_INCLUDES_HEADER print 'DATA_FIRST_IS_COMMENT:', DATA_FIRST_IS_COMMENT print 'HEADER_FIRST_IS_COMMENT:', HEADER_FIRST_IS_COMMENT print 'SEP_CHAR_GEN:', "->" + SEP_CHAR_GEN + "<-" print 'HEADER_SEP_CHAR_GEN:', "->" + HEADER_SEP_CHAR_GEN + "<-" print 'GZIP_DATA:', GZIP_DATA print 'GZIP_HEADER:', GZIP_HEADER # they need to both use the same separator (h2o rule) # can't have duplicates hfhList = random.sample(headerChoices, colCount) + ["output"] # UPDATE: always use comma or space for header separator?? it should work no matter what # separator the data uses? headerForHeader = HEADER_SEP_CHAR_GEN.join(hfhList) print "headerForHeader:", headerForHeader # make these different # hfdList = [random.choice(headerChoices) for h in range(colCount)] + ["output"] # FIX! keep them the same for now to avoid some odd cases on what header gets used to RF hfdList = hfhList headerForData = SEP_CHAR_GEN.join(hfdList) # create data files for fileN in range(fileNum): csvFilenameSuffix = str(fileN) + "_" + str(SEED) + "_" + str( trial) + "_" + rowxcol + '_csv' csvFilename = 'syn_data_' + csvFilenameSuffix csvPathname = SYNDATASETS_DIR + '/' + csvFilename rList = rand_rowData(colCount, sepChar=SEP_CHAR_GEN) (headerRowsDone, dataRowsDone) = write_syn_dataset( csvPathname, rowCount, headerString=(headerForData if DATA_HAS_HDR_ROW else None), rList=rList, commentFirst=DATA_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN) totalDataRows += dataRowsDone totalHeaderRows += headerRowsDone if GZIP_DATA: csvPathnamegz = csvPathname + ".gz" print "gzipping to", csvPathnamegz h2o_util.file_gzip(csvPathname, csvPathnamegz) os.rename( csvPathname, SYNDATASETS_DIR + "/not_used_data_" + csvFilenameSuffix) # pattern match should find the right key with csvPathname # create the header file hdrFilenameSuffix = str(SEED) + "_" + str( trial) + "_" + rowxcol + '_csv' hdrFilename = 'syn_header_' + hdrFilenameSuffix hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename # dataRowsWithHeader = 0 # temp hack (headerRowsDone, dataRowsDone) = write_syn_dataset( hdrPathname, dataRowsWithHeader, headerString=(headerForHeader if HEADER_HAS_HDR_ROW else None), rList=rList, commentFirst=HEADER_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN) # only include header file data rows if the parse pattern includes it if PARSE_PATTERN_INCLUDES_HEADER: totalDataRows += dataRowsDone totalHeaderRows += headerRowsDone if GZIP_HEADER: hdrPathnamegz = hdrPathname + ".gz" print "gzipping to", hdrPathnamegz h2o_util.file_gzip(hdrPathname, hdrPathnamegz) os.rename( hdrPathname, SYNDATASETS_DIR + "/not_used_header_" + hdrFilenameSuffix) # pattern match should find the right key with hdrPathnameh # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) hex_key = "syn_dst" + str(trial) + ".hex" # DON"T get redirected to S3! (EC2 hack in config, remember!) # use it at the node level directly (because we gen'ed the files. 
# I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?:w # put them, rather than using import files, so this works if remote h2o is used # and python creates the files locally fileList = os.listdir(SYNDATASETS_DIR) for f in fileList: h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True) h2o_cmd.runStoreView() headerKey = h2i.find_key(hdrFilename) dataKey = h2i.find_key(csvFilename) # use regex. the only files in the dir will be the ones we just created # with *fileN* match print "Header Key =", headerKey # put the right name in if kwargs['header_from_file'] == 'header': # do we need to add the .hex suffix we know h2o will append kwargs['header_from_file'] = headerKey # use one of the data files? elif kwargs['header_from_file'] == 'data': # do we need to add the .hex suffix we know h2o will append kwargs['header_from_file'] = dataKey # if there's no header in the header file, turn off the header_from_file if not HEADER_HAS_HDR_ROW: kwargs['header_from_file'] = None if HEADER_HAS_HDR_ROW and (kwargs['header_from_file'] == headerKey): ignoreForRf = hfhList[0] elif DATA_HAS_HDR_ROW: ignoreForRf = hfdList[0] else: ignoreForRf = None print "If header_from_file= , required to force header=1 for h2o" if kwargs['header_from_file']: kwargs['header'] = 1 # if we have a header in a data file, tell h2o (for now) elif DATA_HAS_HDR_ROW: kwargs['header'] = 1 else: kwargs['header'] = 0 # may have error if h2o doesn't get anything! start = time.time() if PARSE_PATTERN_INCLUDES_HEADER and HEADER_HAS_HDR_ROW: pattern = 'syn_*' + str(trial) + "_" + rowxcol + '*' else: pattern = 'syn_data_*' + str(trial) + "_" + rowxcol + '*' # don't pass to parse kwargs.pop('hdr_separator', None) parseResult = h2i.parse_only(pattern=pattern, hex_key=hex_key, timeoutSecs=timeoutSecs, **kwargs) print "parseResult['destination_key']: " + parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # more reporting: (we can error here if extra col in header, # causes all NA for missing col of data) h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # should match # of cols in header or ?? self.assertEqual(inspect['numCols'], totalCols, \ "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], totalCols)) # do we end up parsing one data rows as a header because of mismatch in gen/param h2oLosesOneData = (headerRowsDone == 0) and (kwargs['header'] == 1) and not DATA_HAS_HDR_ROW # header in data file gets treated as data h2oGainsOneData = (headerRowsDone!=0) and (kwargs['header']==1) and \ DATA_HAS_HDR_ROW and (kwargs['header_from_file'] is not None) h2oGainsOneData = False print "h2oLosesOneData:", h2oLosesOneData print "h2oGainsOneData:", h2oGainsOneData if h2oLosesOneData: totalDataRows -= 1 if h2oGainsOneData: totalDataRows += 1 if 1 == 0: # FIX! 
don't check for now self.assertEqual(inspect['numRows'], totalDataRows, "parse created result with the wrong number of rows h2o %s gen'ed: %s" % \ (inspect['numRows'], totalDataRows)) # put in an ignore param, that will fail unless headers were parsed correctly # doesn't matter if the header got a comment, should see it kwargs = { 'sample': 100, 'depth': 25, 'ntree': 2, 'ignore': ignoreForRf } start = time.time() # h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=10, **kwargs) elapsed = time.time() - start print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100) print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \ 'took', time.time() - start, 'seconds' h2o.check_sandbox_for_errors() h2i.delete_keys_at_all_nodes(pattern='syn_datasets')
def test_GLM2_many_enums(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() n = 200 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 3, 'cF', 300), (n, 4, 'cG', 300), (n, 5, 'cH', 300), (n, 6, 'cI', 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # just randomly pick the row and col cases. colSepCase = random.randint(0, 1) colSepCase = 1 # using the comma is nice to ensure no craziness if (colSepCase == 0): colSepHexString = '01' quoteChars = ",\'\"" # more choices for the unquoted string else: colSepHexString = '2c' # comma quoteChars = "" colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar print "colSepInt", colSepInt rowSepCase = random.randint(0, 1) # using this instead, makes the file, 'row-readable' in an editor if (rowSepCase == 0): rowSepHexString = '0a' # newline else: rowSepHexString = '0d0a' # cr + newline (windows) \r\n rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar, quoteChars=quoteChars) # FIX! does 'separator=' take ints or ?? hex format # looks like it takes the hex string (two chars) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print "Parse result['destination_key']:", parseResult[ 'destination_key'] # We should be able to see the parse result? ### inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) y = colCount kwargs = { 'response': y, 'max_iter': 1, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5 } start = time.time() ### glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds'
def test_parse_utf8_3(self): SYNDATASETS_DIR = h2o.make_syn_dir() if DEBUG: n = 20 else: n = 10000 n = 1000 n = 500 # from command line arg -long if h2o.long_test_case: repeat = 1000 else: repeat = 50 scale = 1 tryList = [ (n, 3, 'cI', 300), (n, 3, 'cI', 300), (n, 3, 'cI', 300), ] for r in range(repeat): for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname # same enum list/mapping, but different dataset? start = time.time() write_syn_dataset(csvPathname, rowCount, colCount, scale=1, colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEEDPERFILE) elapsed = time.time() - start print "took %s seconds to create %s" % (elapsed, csvPathname) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, header=0, timeoutSecs=60, separator=colSepInt, doSummary=DO_SUMMARY) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect( key=parseResult['destination_key']) numCols = inspect['numCols'] numRows = inspect['numRows'] h2o_cmd.infoFromInspect(inspect) # Each column should get .10 random NAs per iteration. Within 10%? missingValuesList = h2o_cmd.infoFromInspect(inspect) # print "missingValuesList", missingValuesList # for mv in missingValuesList: # self.assertAlmostEqual(mv, expectedNA, delta=0.1 * mv, # msg='mv %s is not approx. expected %s' % (mv, expectedNA)) # might have extra rows if numRows != rowCount: raise Exception("Expect numRows %s = rowCount %s because guaranteed not to have extra eols" % \ (numRows, rowCount)) # numCols should be right? self.assertEqual(colCount, numCols) (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)
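# NOTE: illustrative sketch only, not the repo's actual helper. For the utf8 parse tests,
# write_syn_dataset() fills cells with random multi-byte UTF-8 strings. A hypothetical
# cell generator (python 2 style, matching this file): pick code points outside the
# separators the test relies on and outside the surrogate block, then encode to utf-8.
# The code-point range and length bounds are assumptions.
import random

def rand_utf8_cell_sketch(minLen=1, maxLen=6):
    cell = u''
    for _ in range(random.randint(minLen, maxLen)):
        while True:
            cp = random.randint(0x20, 0xFFFD)
            # skip the column/row separators this test uses, and the surrogate range
            if cp in (ord(','), ord('\n'), ord('\r')) or 0xD800 <= cp <= 0xDFFF:
                continue
            break
        cell += unichr(cp)
    return cell.encode('utf-8')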
def test_GBM_many_cols(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() if localhost: tryList = [ (10000, 100, 'cA', 300), ] else: tryList = [ # (10000, 10, 'cB', 300), # (10000, 50, 'cC', 300), (10000, 100, 'cD', 300), (10000, 200, 'cE', 300), (10000, 300, 'cF', 300), (10000, 400, 'cG', 300), (10000, 500, 'cH', 300), (10000, 1000, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' hdrFilename = 'hdr_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] modelKey = 'GBMModelKey' # Parse (train)**************************************** parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) # hack elapsed = time.time() - start print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) inspect = h2o_cmd.runInspect( key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # GBM(train iterate)**************************************** ntrees = 5 prefixList = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'] # for max_depth in [5,10,20,40]: for max_depth in [5, 10, 20]: # PARSE a new header**************************************** print "Creating new header", hdrPathname prefix = prefixList.pop(0) write_syn_header(hdrPathname, rowCount, colCount, prefix) # upload and parse the header to a hex hdr_hex_key = prefix + "_hdr.hex" parseHdrResult = h2i.import_parse( bucket=None, path=hdrPathname, schema='put', header=1, # REQUIRED! otherwise will interpret as enums hex_key=hdr_hex_key, timeoutSecs=timeoutSecs, doSummary=False) # Set Column Names (before autoframe is created) h2o.nodes[0].set_column_names(source=hex_key, copy_from=hdr_hex_key) # GBM print "response col name is changing each iteration: parsing a new header" params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': prefix + "_response", 'ignored_cols_by_name': None, } print "Using these parameters for GBM: ", params kwargs = params.copy() trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. 
On dataset: ", csvPathname # Logging to a benchmark file algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str( max_depth) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, trainElapsed) print l h2o.cloudPerfH2O.message(l) gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1][ '_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrongTrain) fList.append(trainElapsed) # works if you delete the autoframe ### h2o_import.delete_keys_at_all_nodes(pattern='autoframe') # just plot the last one if DO_PLOT: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
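# NOTE: illustrative sketch only, not the repo's actual helper. write_syn_header() in the
# GBM test above only has to emit a single header row whose column names carry the
# per-iteration prefix and end with the '<prefix>_response' name that params['response']
# points at. The exact name pattern, and whether colCount already includes the response
# column, are assumptions; rowCount is unused here since a header is one row.
def write_syn_header_sketch(hdrPathname, rowCount, colCount, prefix):
    cols = ['%s_%d' % (prefix, i) for i in range(colCount - 1)] + [prefix + '_response']
    with open(hdrPathname, 'w') as f:
        f.write(','.join(cols) + '\n')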
def test_GLM2_twovalues(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_twovalues.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename # H2O might not do whitespace stripping on numbers correctly, when , is {SEP} # GLM will auto expand categoricals..so if we have more coefficients than expected # that means it didn't parse right # mix in space/tab combos # just done like this for readability rowDataTrueRaw = \ "<sp>1,\ 0<sp>,\ <tab>65,\ 1<tab>,\ <sp><tab>2,\ 1<sp><tab>,\ <tab><sp>1,\ 4<tab><sp>,\ <tab><tab>1,\ 4<tab><tab>,\ <sp><sp>1,\ 4<sp><sp>" rowDataTrue = re.sub("<sp>"," ", rowDataTrueRaw) rowDataTrue = re.sub("<tab>"," ", rowDataTrue) rowDataFalse = \ "0,\ 1,\ 0,\ -1,\ -2,\ -1,\ -1,\ -4,\ -1,\ -4,\ -1,\ -3" twoValueList = [ # (0,1,0, 12), # (0,1,1, 12), # ('A','B',0, 12), # ('A','B',1, 12), (-1,1,-1, 12), (-1,1,1, 12), (-1e1,1e1,1e1, 12), (-1e1,1e1,-1e1, 12), ] trial = 0 for (outputTrue, outputFalse, case, expectedCoeffNum) in twoValueList: write_syn_dataset(csvPathname, 20, rowDataTrue, rowDataFalse, str(outputTrue), str(outputFalse)) hex_key = csvFilename + "_" + str(trial) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key) # maybe go back to simpler exec here. this was from when Exec failed unless this was used execExpr="A.hex=%s" % hex_key h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (13, 13, case) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) aHack = {'destination_key': 'A.hex'} start = time.time() kwargs = { 'n_folds': 0, 'response': 'C13', 'family': 'binomial', 'alpha': 0.0, 'lambda': 0, 'beta_epsilon': 0.0002 } # default takes 39 iterations? play with alpha/beta print "using outputTrue: %s outputFalse: %s" % (outputTrue, outputFalse) glm = h2o_cmd.runGLM(parseResult=aHack, **kwargs) (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # check that the number of entries in coefficients is right (12 with intercept) coefficients_names = glm['glm_model']['coefficients_names'] print "coefficients_names:", coefficients_names # subtract one for intercept actualCoeffNum = len(glm['glm_model']['submodels'][0]['beta']) - 1 if (actualCoeffNum!=expectedCoeffNum): raise Exception("Should be %s expected coefficients in result. actual: %s" % (expectedCoeffNum, actualCoeffNum)) print "trial #", trial, "glm end on ", csvFilename, 'took', time.time() - start, 'seconds' h2o.check_sandbox_for_errors() trial += 1
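# NOTE: illustrative sketch only, not the repo's actual helper. The write_syn_dataset()
# used by the twovalues test above just repeats the two fixed feature rows and tags each
# with its class label, so GLM sees a perfectly separable two-value problem in the last
# column. Alternating true/false rows is an assumption about the layout.
def write_twovalue_dataset_sketch(csvPathname, rowCount, rowDataTrue, rowDataFalse,
                                  outputTrue, outputFalse):
    with open(csvPathname, 'w') as f:
        for i in range(rowCount):
            if i % 2 == 0:
                f.write(rowDataTrue + ',' + outputTrue + '\n')
            else:
                f.write(rowDataFalse + ',' + outputFalse + '\n')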
def test_NN_twovalues(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_twovalues.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename rowDataTrue = "1, 0, 65, 1, 2, 1, 1, 4, 1, 4, 1, 4" rowDataFalse = "0, 1, 0, -1, -2, -1, -1, -4, -1, -4, -1, -4" twoValueList = [ ('A', 'B', 0, 14), ('A', 'B', 1, 14), (0, 1, 0, 12), (0, 1, 1, 12), (0, 1, 'NaN', 12), (1, 0, 'NaN', 12), (-1, 1, 0, 12), (-1, 1, 1, 12), (-1e1, 1e1, 1e1, 12), (-1e1, 1e1, -1e1, 12), ] trial = 0 for (outputTrue, outputFalse, case, coeffNum) in twoValueList: write_syn_dataset(csvPathname, 20, rowDataTrue, rowDataFalse, str(outputTrue), str(outputFalse)) start = time.time() hex_key = csvFilename + "_" + str(trial) model_key = 'trial_' + str(trial) + '.hex' parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key) print "using outputTrue: %s outputFalse: %s" % (outputTrue, outputFalse) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 kwargs = { 'ignored_cols': None, 'response': 'C' + str(response), 'classification': 1, 'mode': 'SingleThread', 'activation': 'Tanh', #'input_dropout_ratio' : 0.2, 'hidden': '500', 'rate': 0.01, 'rate_annealing': 1e-6, 'momentum_start': 0, 'momentum_ramp': 0, 'momentum_stable': 0, 'l1': 0.0, 'l2': 1e-4, 'seed': 80023842348, 'loss': 'CrossEntropy', #'max_w2' : 15, #'warmup_samples' : 0, 'initial_weight_distribution': 'UniformAdaptive', #'initial_weight_scale' : 0.01, 'epochs': 1.0, 'destination_key': model_key, 'validation': hex_key, } timeoutSecs = 60 start = time.time() h2o.beta_features = True h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "trial #", trial, "NN end on ", csvFilename, ' took', time.time( ) - start, 'seconds' #### Now score using the model, and check the validation error expectedErr = 0.0 relTol = 0.01 kwargs = { 'source': hex_key, 'max_rows': 0, 'response': 'C' + str(response), 'ignored_cols': None, # this is not consistent with ignored_cols_by_name 'classification': 1, 'destination_key': 'score' + str(trial) + '.hex', 'model': model_key } nnScoreResult = h2o_cmd.runNNetScore( key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs) h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs) h2o.check_sandbox_for_errors() trial += 1
def setUpClass(cls):
    global SEED
    SEED = h2o.setup_random_seed()
    h2o.init()
    global SYNDATASETS_DIR
    SYNDATASETS_DIR = h2o.make_syn_dir()
def test_GLM_enums_unbalanced(self): SYNDATASETS_DIR = h2o.make_syn_dir() n = 2000 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 4, 'cF', 300), (n, 8, 'cG', 300), (n, 16, 'cH', 300), (n, 32, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list() # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList, 5) print "Creating random", csvPathname, "for glm model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult[ 'destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) y = colCount kwargs = { 'y': y, 'max_iter': 200, 'family': 'binomial', 'n_folds': 10, 'alpha': 0, 'lambda': 0, 'thresholds': 0.5, # 'case_mode': '=', # 'case': 0, } start = time.time() updateList = [ { 'alpha': 0.5, 'lambda': 1e-4 }, { 'alpha': 0.25, 'lambda': 1e-6 }, { 'alpha': 0.0, 'lambda': 1e-8 }, { 'alpha': 0.5, 'lambda': 0.0 }, { 'alpha': 0.0, 'lambda': 0.0 }, ] # Try each one for updateDict in updateList: print "\n#################################################################" print updateDict kwargs.update(updateDict) glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult[ 'destination_key'], 'took', time.time() - start, 'seconds' GLMModel = glm['GLMModel'] # submodels0 = GLMModel['submodels'][0] iterations = GLMModel['iterations'] modelKey = GLMModel['model_key'] h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) if iterations > 20: raise Exception( "Why take so many iterations: %s in this glm training?" 
% iterations) parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key="score_" + hex_key, timeoutSecs=30, separator=colSepInt) start = time.time() # score with same dataset (will change to recreated dataset with one less enum glmScore = h2o_cmd.runGLMScore( key=parseResult['destination_key'], model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs) print "glm end on ", parseResult[ 'destination_key'], 'took', time.time() - start, 'seconds' ### print h2o.dump_json(glmScore) classErr = glmScore['validation']['classErr'] auc = glmScore['validation']['auc'] err = glmScore['validation']['err'] nullDev = glmScore['validation']['nullDev'] resDev = glmScore['validation']['resDev'] h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs) print "classErr:", classErr print "err:", err print "auc:", auc print "resDev:", resDev print "nullDev:", nullDev if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ( "resDev:\t", validation['resDev']) raise Exception(emsg) # what is reasonable? # self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err) self.assertAlmostEqual( auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc) if math.isnan(err): emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", err) raise Exception(emsg) if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ( "resDev:\t", resDev) raise Exception(emsg) if math.isnan(nullDev): emsg = "Why is this nullDev = 'nan'?? %6s %s" % ( "nullDev:\t", nullDev)
def test_GLM_many_cols_tridist(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (10000, 10, 'cA', 300), (10000, 20, 'cB', 300), (10000, 30, 'cC', 300), (10000, 40, 'cD', 300), (10000, 50, 'cE', 300), (10000, 60, 'cF', 300), (10000, 70, 'cG', 300), (10000, 80, 'cH', 300), (10000, 90, 'cI', 300), (10000, 100, 'cJ', 300), (10000, 200, 'cK', 300), (10000, 300, 'cL', 300), (10000, 400, 'cM', 300), (10000, 500, 'cN', 300), (10000, 600, 'cO', 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, key2, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30) print csvFilename, 'parse time:', parseKey['response']['time'] print "\nParse result['destination_key']:", parseKey[ 'destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename paramDict2 = {} for k in paramDict: paramDict2[k] = paramDict[k][0] y = colCount kwargs = {'y': y} kwargs.update(paramDict2) start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 8, **kwargs) if not h2o.browse_disable: h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(5)
def test_GLM_enums_unbalanced(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()

    n = 2000
    tryList = [
        (n, 1, 'cD', 300),
        (n, 2, 'cE', 300),
        (n, 4, 'cF', 300),
        (n, 8, 'cG', 300),
        (n, 16, 'cH', 300),
        (n, 32, 'cI', 300),
    ]

    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        # using the comma is nice to ensure no craziness
        colSepHexString = '2c' # comma
        colSepChar = colSepHexString.decode('hex')
        colSepInt = int(colSepHexString, base=16)
        print "colSepChar:", colSepChar

        rowSepHexString = '0a' # newline
        rowSepChar = rowSepHexString.decode('hex')
        print "rowSepChar:", rowSepChar

        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

        enumList = create_enum_list(listSize=10)
        # use half of the enums for creating the scoring dataset
        enumListForScore = random.sample(enumList, 5)

        print "Creating random", csvPathname, "for glm2 model building"
        write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE,
            colSepChar=colSepChar, rowSepChar=rowSepChar)

        print "Creating another random", csvScorePathname, "for glm2 scoring with prior model (using enum subset)"
        write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE,
            colSepChar=colSepChar, rowSepChar=rowSepChar)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, separator=colSepInt)
        print "Parse result['destination_key']:", parseResult['destination_key']

        print "\n" + csvFilename
        (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

        testDataKey = "score_" + hex_key
        parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=testDataKey,
            timeoutSecs=30, separator=colSepInt)

        y = colCount
        modelKey = 'glm_model'
        kwargs = {
            'standardize': 0,
            'destination_key': modelKey,
            'response': 'C' + str(y + 1),
            'max_iter': 200,
            'family': 'binomial',
            'n_folds': 0,
            'alpha': 0,
            'lambda': 0,
        }

        start = time.time()

        updateList = [
            {'alpha': 0.5, 'lambda': 1e-4},
            {'alpha': 0.25, 'lambda': 1e-6},
            {'alpha': 0.0, 'lambda': 1e-12},
            {'alpha': 0.5, 'lambda': 1e-12},
            {'alpha': 0.0, 'lambda': 1e-12},
            {'alpha': 0.0, 'lambda': 0},
        ]

        # Try each one
        for updateDict in updateList:
            print "\n#################################################################"
            print updateDict
            kwargs.update(updateDict)
            print "If we poll, we get a message saying it was cancelled by user??"
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs,
                pollTimeoutSecs=180, **kwargs)
            print "glm2 end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'

            glm_model = glm['glm_model']
            _names = glm_model['_names']
            modelKey = glm_model['_key']
            coefficients_names = glm_model['coefficients_names']
            submodels = glm_model['submodels'][0]

            beta = submodels['beta']
            norm_beta = submodels['norm_beta']
            iteration = submodels['iteration']

            validation = submodels['validation']
            auc = validation['auc']
            aic = validation['aic']
            null_deviance = validation['null_deviance']
            residual_deviance = validation['residual_deviance']

            print '_names', _names
            print 'coefficients_names', coefficients_names
            # did beta get shortened? the simple check confirms names/beta/norm_beta are same length
            print 'beta', beta
            print 'iteration', iteration
            print 'auc', auc

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            if iteration > 30:
                raise Exception("Why take so many iterations: %s in this glm2 training?" % iteration)

            # Score **********************************************
            print "Problems with test data having different enums than train? just use train for now"
            testDataKey = hex_key
            h2o_cmd.runScore(dataKey=testDataKey, modelKey=modelKey, vactual=y, vpredict=1, expectedAuc=0.5)
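# Hedged sketch: the enum tests above and below assume a create_enum_list helper that builds
# a list of random categorical string values for the synthetic columns. The alphabet and
# token length used by the real helper are not shown here, so both are assumptions made
# purely for illustration.
import random
import string

def create_enum_list(listSize=10, numChars=8):
    candidateChars = string.ascii_letters
    return [''.join(random.choice(candidateChars) for _ in range(numChars))
            for _ in range(listSize)]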
def test_GLM_enums_unbalanced(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()

    n = 2000
    tryList = [
        (n, 1, 'cD', 300),
        (n, 2, 'cE', 300),
        (n, 4, 'cF', 300),
        (n, 8, 'cG', 300),
        (n, 16, 'cH', 300),
        (n, 32, 'cI', 300),
    ]

    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        # using the comma is nice to ensure no craziness
        colSepHexString = '2c' # comma
        colSepChar = colSepHexString.decode('hex')
        colSepInt = int(colSepHexString, base=16)
        print "colSepChar:", colSepChar

        rowSepHexString = '0a' # newline
        rowSepChar = rowSepHexString.decode('hex')
        print "rowSepChar:", rowSepChar

        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

        enumList = create_enum_list(listSize=10)
        # use half of the enums for creating the scoring dataset
        enumListForScore = random.sample(enumList, 5)

        print "Creating random", csvPathname, "for glm2 model building"
        write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE,
            colSepChar=colSepChar, rowSepChar=rowSepChar)

        print "Creating another random", csvScorePathname, "for glm2 scoring with prior model (using enum subset)"
        write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE,
            colSepChar=colSepChar, rowSepChar=rowSepChar)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, separator=colSepInt)
        print csvFilename, 'parse time:', parseResult['response']['time']
        print "Parse result['destination_key']:", parseResult['destination_key']

        print "\n" + csvFilename
        (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

        testDataKey = "score_" + hex_key
        parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=testDataKey,
            timeoutSecs=30, separator=colSepInt)

        y = colCount
        modelKey = 'glm_model'
        kwargs = {
            'standardize': 0,
            'destination_key': modelKey,
            'response': 'C' + str(y + 1),
            'max_iter': 200,
            'family': 'binomial',
            'n_folds': 0,
            'alpha': 0,
            'lambda': 0,
        }

        start = time.time()

        updateList = [
            {'alpha': 0.5, 'lambda': 1e-4},
            {'alpha': 0.25, 'lambda': 1e-6},
            {'alpha': 0.0, 'lambda': 1e-12},
            {'alpha': 0.5, 'lambda': 1e-12},
            {'alpha': 0.0, 'lambda': 1e-12},
            {'alpha': 0.0, 'lambda': 0},
        ]

        # Try each one
        h2o.beta_features = True
        for updateDict in updateList:
            print "\n#################################################################"
            print updateDict
            kwargs.update(updateDict)
            print "If we poll, we get a message saying it was cancelled by user??"
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs,
                pollTimeoutSecs=180, noPoll=True, **kwargs)
            h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5, errorIfCancelled=True)
            glm = h2o.nodes[0].glm_view(_modelKey=modelKey)
            print "glm2 end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'

            glm_model = glm['glm_model']
            _names = glm_model['_names']
            modelKey = glm_model['_key']
            coefficients_names = glm_model['coefficients_names']
            submodels = glm_model['submodels'][0]

            beta = submodels['beta']
            norm_beta = submodels['norm_beta']
            iteration = submodels['iteration']

            validation = submodels['validation']
            if not validation or 'avg_err' not in validation:
                raise Exception("glm: %s" % h2o.dump_json(glm) + \
                    "\nNo avg_err in validation." + \
                    "\nLikely if you look back, the job was cancelled, so there's no cross validation.")

            avg_err = validation['avg_err']
            auc = validation['auc']
            aic = validation['aic']
            null_deviance = validation['null_deviance']
            residual_deviance = validation['residual_deviance']

            print '_names', _names
            print 'coefficients_names', coefficients_names
            # did beta get shortened? the simple check confirms names/beta/norm_beta are same length
            print 'beta', beta
            print 'iteration', iteration
            print 'avg_err', avg_err
            print 'auc', auc

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            if iteration > 20:
                raise Exception("Why take so many iterations: %s in this glm2 training?" % iteration)

            # Score **********************************************
            print "Problems with test data having different enums than train? just use train for now"
            testDataKey = hex_key
            predictKey = 'Predict.hex'
            start = time.time()

            predictResult = h2o_cmd.runPredict(data_key=testDataKey, model_key=modelKey,
                destination_key=predictKey, timeoutSecs=timeoutSecs)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=testDataKey,
                vactual='C' + str(y),
                predict=predictKey,
                vpredict='predict',
            )

            cm = predictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm)
            self.assertLess(pctWrong, 8, "Should see less than 8 pct error (class = 4): %s" % pctWrong)

            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)

            if 1 == 0:
                # stuff from GLM1
                classErr = glmScore['validation']['classErr']
                auc = glmScore['validation']['auc']
                err = glmScore['validation']['err']
                nullDev = glmScore['validation']['nullDev']
                resDev = glmScore['validation']['resDev']
                h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)
                print "score classErr:", classErr
                print "score err:", err
                print "score auc:", auc
                print "score resDev:", resDev
                print "score nullDev:", nullDev

                if math.isnan(resDev):
                    emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", validation['resDev'])
                    raise Exception(emsg)

                # what is reasonable?
                # self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err)
                self.assertAlmostEqual(auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc)

                if math.isnan(err):
                    emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", err)
                    raise Exception(emsg)

                if math.isnan(resDev):
                    emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", resDev)
                    raise Exception(emsg)

                if math.isnan(nullDev):
                    emsg = "Why is this nullDev = 'nan'?? %6s %s" % ("nullDev:\t", nullDev)
                    raise Exception(emsg)
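# Hedged sketch of the headline number h2o_gbm.pp_cm_summary reports above: percent wrong
# derived from a confusion matrix whose rows are actuals and columns are predictions. The
# real helper also pretty-prints per-class errors and may carry an extra totals row/column;
# only the basic calculation is sketched here as an assumption.
def pct_wrong_from_cm(cm):
    total = 0
    wrong = 0
    for actual, row in enumerate(cm):
        for predicted, count in enumerate(row):
            total += count
            if actual != predicted:
                wrong += count
    return (100.0 * wrong) / total if total else 0.0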
def test_summary2_percentile(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()

    tryList = [
        (100000, 1, 'cD', 300),
        (100000, 2, 'cE', 300),
    ]

    timeoutSecs = 10
    trial = 1
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        print 'Trial:', trial
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        legalValues = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10} # set. http://docs.python.org/2/library/stdtypes.html#set
        expectedMin = min(legalValues)
        expectedMax = max(legalValues)
        expectedUnique = (expectedMax - expectedMin) + 1
        mode = 0.5 # rounding to nearest int will shift us from this for expected mean
        expectedMean = 0.5
        expectedSigma = 0.5
        write_syn_dataset(csvPathname, rowCount, colCount,
            low=expectedMin, high=expectedMax, mode=mode, SEED=SEEDPERFILE)

        csvPathnameFull = h2i.find_folder_and_filename('.', csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=10, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename

        summaryResult = h2o_cmd.runSummary(key=hex_key)
        if h2o.verbose:
            print "summaryResult:", h2o.dump_json(summaryResult)

        summaries = summaryResult['summaries']
        scipyCol = 0
        for column in summaries:
            colname = column['colname']
            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype = stats['type']
            mean = stats['mean']
            sd = stats['sd']
            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']
            pct = stats['pct']
            pctile = stats['pctile']

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            for b in hbrk:
                self.assertIn(int(b), legalValues)
            self.assertEqual(len(hbrk), len(legalValues))

            # self.assertAlmostEqual(hcnt[0], 0.5 * rowCount, delta=.01*rowCount)
            # self.assertAlmostEqual(hcnt[1], 0.5 * rowCount, delta=.01*rowCount)

            print "pctile:", pctile
            print "maxs:", maxs
            # we round to int, so we may introduce up to 0.5 rounding error? compared to "mode" target
            self.assertAlmostEqual(maxs[0], expectedMax, delta=0.01)
            print "mins:", mins
            self.assertAlmostEqual(mins[0], expectedMin, delta=0.01)

            for v in pctile:
                self.assertTrue(v >= expectedMin,
                    "Percentile value %s should all be >= the min dataset value %s" % (v, expectedMin))
                self.assertTrue(v <= expectedMax,
                    "Percentile value %s should all be <= the max dataset value %s" % (v, expectedMax))

            eV1 = [1.0, 1.0, 1.0, 3.0, 4.0, 5.0, 7.0, 8.0, 9.0, 10.0, 10.0]
            if expectedMin == 1:
                eV = eV1
            elif expectedMin == 0:
                eV = [e - 1 for e in eV1]
            elif expectedMin == 2:
                eV = [e + 1 for e in eV1]
            else:
                raise Exception("Test doesn't have the expected percentileValues for expectedMin: %s" % expectedMin)

            if colname != '': # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    skipHeader=True,
                    col=scipyCol,
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                )

            scipyCol += 1
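# Hedged sketch of the reference quantile the comparison above presumably checks against: a
# linear-interpolation percentile computed on a sorted list of the raw column values. The
# real h2o_summ helper may use a different interpolation rule; this is an assumption for
# illustration only.
def percentile_on_sorted_list(sortedValues, quantile):
    assert 0.0 <= quantile <= 1.0
    pos = quantile * (len(sortedValues) - 1)
    lo = int(pos)
    hi = min(lo + 1, len(sortedValues) - 1)
    frac = pos - lo
    return sortedValues[lo] * (1.0 - frac) + sortedValues[hi] * frac

# e.g. the median of the synthetic 0..10 column above should land near the middle of the range:
# print percentile_on_sorted_list(sorted(columnValues), 0.5)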
def test_GLM_convergence_1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (100, 50, 'cD', 300),
        (100, 100, 'cE', 300),
        (100, 200, 'cF', 300),
        (100, 300, 'cG', 300),
        (100, 400, 'cH', 300),
        (100, 500, 'cI', 300),
    ]

    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    USEKNOWNFAILURE = True
    for (rowCount, colCount, key2, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)

        csvFilename = 'syn_%s_%sx%s.csv' % (SEEDPERFILE, rowCount, colCount)
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        if USEKNOWNFAILURE:
            csvFilename = 'failtoconverge_100x50.csv'
            csvPathname = h2o.find_file('smalldata/logreg/' + csvFilename)

        parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename

        y = colCount
        kwargs = {
            'max_iter': 10,
            'weight': 1.0,
            'link': 'familyDefault',
            'n_folds': 2,
            'beta_epsilon': 1e-4,
            'lambda': '1e-8:1e-3:1e2',
            'alpha': '0,0.5,.75',
            'thresholds': '0,1,0.2',
        }

        if USEKNOWNFAILURE:
            kwargs['y'] = 50
        else:
            kwargs['y'] = y

        emsg = None
        for i in range(2):
            start = time.time()
            # get rid of the Jstack polling
            glm = h2o_cmd.runGLMGridOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            print 'glm #', i, 'end on', csvPathname, 'took', time.time() - start, 'seconds'

            # we can pass the warning, without stopping in the test, so we can
            # redo it in the browser for comparison
            warnings = h2o_glm.simpleCheckGLMGrid(self, glm, None, allowFailWarning=True, **kwargs)

            # gets the failed to converge, here, after we see it in the browser too
            x = re.compile("[Ff]ailed")
            if warnings:
                for w in warnings:
                    if (re.search(x, w)):
                        # first
                        if emsg is None:
                            emsg = w
                        print w

            if emsg:
                break

        if not h2o.browse_disable:
            h2b.browseJsonHistoryAsUrlLastMatch("GLMGridProgress")
            time.sleep(5)

        # gets the failed to converge, here, after we see it in the browser too
        if emsg is not None:
            raise Exception(emsg)
def test_parse_bounds_csv_fvec(self):
    print "Random 0/1 for col1. Last has max col = 1, All have zeros for class."
    # h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (1000, 50, 'cC', 300),
        (1000, 999, 'cC', 300),
        (1000, 1000, 'cA', 300),
        # (1000, 100000, 'cB', 300),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = "syn_%s_%s_%s.csv" % (SEEDPERFILE, rowCount, colCount)
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        # dict of col sums for comparison to exec col sums below
        synSumList = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        # PARSE**********************
        parseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, schema='put',
            timeoutSecs=timeoutSecs, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']

        # INSPECT*******************
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs)
        numCols = inspect['numCols']
        numRows = inspect['numRows']
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(numRows), \
            "    numCols:", "{:,}".format(numCols)

        iCols = inspect['cols']
        iStats = []
        for stats in iCols:
            iName = stats['name']
            # just touching to make sure they are there
            iNaCnt = stats['naCnt']
            iMin = float(stats['min'])
            iMax = float(stats['max'])
            iMean = float(stats['mean'])
            iStats.append({
                'name': iName,
                'naCnt': iNaCnt,
                'min': iMin,
                'max': iMax,
                'mean': iMean,
            })

        # SUMMARY********************************
        summaryResult = h2o_cmd.runSummary(key=hex_key, max_ncols=colCount, timeoutSecs=timeoutSecs)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

        self.assertEqual(rowCount, numRows,
            msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))

        columnsList = summaryResult['summaries']
        self.assertEqual(colCount, len(columnsList),
            msg="generated %s cols (including output). summary has %s columns" % (colCount, len(columnsList)))

        c = 0
        for column in columnsList:
            # get info from the inspect col for comparison
            iMin = iStats[c]['min']
            iMax = iStats[c]['max']
            iMean = iStats[c]['mean']
            iNaCnt = iStats[c]['naCnt']
            c += 1

            colname = column['colname']

            stats = column['stats']
            stype = column['type']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hstart = column['hstart']

            smax = stats['maxs']
            smin = stats['mins']
            sd = stats['sd']
            smean = stats['mean']
            # no zeroes if enum, but we're not enum here
            zeros = stats['zeros']

            self.assertEqual(iMin, smin[0], "inspect min %s != summary min %s" % (iMin, smin))
            self.assertEqual(iMax, smax[0], "inspect max %s != summary max %s" % (iMax, smax))
            self.assertEqual(iMean, smean, "inspect mean %s != summary mean %s" % (iMean, smean))
            # no comparison for 'zeros'

            # now, also compare expected values
            if colname == "V1":
                synNa = 0
                # can reverse-engineer the # of zeroes, since data is always 1
                synSum = synSumList[1] # could get the same sum for all ccols
                synZeros = numRows - synSum
                synSigma = 0.50
                synMean = (synSum + 0.0) / numRows
                synMin = [0.0, 1.0]
                synMax = [1.0, 0.0]

            elif colname == "V2":
                synSum = 0
                synSigma = 0
                synMean = 0
                if DO_NAN:
                    synZeros = 0
                    synNa = numRows
                    synMin = []
                    synMax = []
                else:
                    synZeros = numRows
                    synNa = 0
                    synMin = [0.0]
                    synMax = [0.0]

            # a single 1 in the last col
            elif colname == "V" + str(colCount - 1): # h2o puts a "V" prefix
                synNa = 0
                synSum = synSumList[colCount - 1]
                synZeros = numRows - 1
                # stddev.p
                # http://office.microsoft.com/en-us/excel-help/stdev-p-function-HP010335772.aspx
                synMean = 1.0 / numRows # why does this need to be a 1 entry list
                synSigma = math.sqrt(pow((synMean - 1), 2) / numRows)
                print "last col with single 1. synSigma:", synSigma
                synMin = [0.0, 1.0]
                synMax = [1.0, 0.0]

            else:
                synNa = 0
                synSum = 0
                synZeros = numRows
                synSigma = 0.0
                synMean = 0.0
                synMin = [0.0]
                synMax = [0.0]

            if DO_MEAN:
                self.assertAlmostEqual(float(smean), synMean, places=6,
                    msg='col %s mean %s is not equal to generated mean %s' % (colname, smean, synMean))

            # why are min/max one-entry lists in summary result. Oh..it puts N min, N max
            self.assertTrue(smin >= synMin,
                msg='col %s min %s is not >= generated min %s' % (colname, smin, synMin))
            self.assertTrue(smax <= synMax,
                msg='col %s max %s is not <= generated max %s' % (colname, smax, synMax))

            # reverse engineered the number of zeroes, knowing data was always 1 if present?
            if colname == "V65536" or colname == "V65537":
                print "columns around possible zeros mismatch:", h2o.dump_json(column)

            self.assertEqual(zeros, synZeros,
                msg='col %s zeros %s is not equal to generated zeros %s' % (colname, zeros, synZeros))
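# Sanity-check sketch (not part of the original test): for the "single 1 in the last col"
# case above, the population mean is 1/numRows and the exact population stddev is
# sqrt(mean * (1 - mean)); the approximation the test uses, sqrt((mean - 1)**2 / numRows),
# agrees with it to first order for large numRows. The function name is hypothetical.
import math

def expected_single_one_stats(numRows):
    mean = 1.0 / numRows
    exactSigma = math.sqrt(mean * (1.0 - mean))
    approxSigma = math.sqrt((mean - 1.0) ** 2 / numRows)
    return mean, exactSigma, approxSigma

# print expected_single_one_stats(1000)  # the two sigmas differ only in the ~1/numRows term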
def test_rf_predict3_fvec(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    timeoutSecs = 600
    predictHexKey = 'predict_0.hex'
    predictCsv = 'predict_0.csv'
    actualCsv = 'actual_0.csv'

    if 1 == 1:
        y = 4 # last col
        response = 'response'
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        trees = 40
        bucket = 'smalldata'
        csvPathname = 'iris/iris2.csv'
        hexKey = 'iris2.csv.hex'
        # translate = {'setosa': 0.0, 'versicolor': 1.0, 'virginica': 2.0}
        # No translate because we're using an Exec to get the data out?, and that loses the encoding?
        translate = None
        expectedPctWrong = 0.0

    elif 1 == 0:
        y = 54 # last col
        response = 'C55'
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        trees = 6
        # try smaller data set compared to covtype
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.shuffled.10pct.data'
        hexKey = 'covtype.shuffled.10pct.data.hex'
        translate = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7}
        expectedPctWrong = 0.7

    elif 1 == 0:
        y = 54 # last col
        response = 'C55'
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        trees = 40
        # try smaller data set compared to covtype
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.shuffled.10pct.data'
        hexKey = 'covtype.shuffled.10pct.data.hex'
        # translate = {1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0}
        translate = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7}
        expectedPctWrong = 0.7

    elif 1 == 0:
        y = 54 # last col
        response = 'C55'
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        trees = 6
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'
        hexKey = 'covtype.data.hex'
        translate = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7}
        expectedPctWrong = 0.7

    else:
        y = 0 # first col
        response = 'C1'
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        trees = 6
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'mnist/mnist_training.csv.gz'
        hexKey = 'mnist_training.hex'
        translate = { \
            '0': 0, '1': 1, '2': 2, '3': 3, '4': 4, \
            '5': 5, '6': 6, '7': 7, '8': 8, '9': 9 }
        expectedPctWrong = 0.7

    csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv
    csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv
    # for using below in csv reader
    csvFullname = h2i.find_folder_and_filename(bucket, csvPathname, schema='put', returnFullPath=True)

    def predict_and_compare_csvs(model_key, hex_key, translate=None, y=0):
        # have to slice out col 0 (the output) and feed result to predict
        # cols are 0:784 (1 output plus 784 input features
        # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30)

        dataKey = "P.hex"
        h2e.exec_expr(execExpr=dataKey + "=" + hex_key, timeoutSecs=30) # unneeded but interesting
        if skipSrcOutputHeader:
            print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer"
            print "hack for now, can't chop out col 0 in Exec currently"
            dataKey = hex_key
        else:
            print "No header in dataset, can't chop out cols, since col numbers are used for names"
            dataKey = hex_key

        # +1 col index because R-like
        h2e.exec_expr(execExpr="Z.hex=" + hex_key + "[," + str(y + 1) + "]", timeoutSecs=30)

        start = time.time()
        predict = h2o.nodes[0].generate_predictions(model_key=model_key,
            data_key=hexKey, destination_key=predictHexKey)
        print "generate_predictions end on ", hexKey, " took", time.time() - start, 'seconds'
        h2o.check_sandbox_for_errors()
        inspect = h2o_cmd.runInspect(key=predictHexKey)
        h2o_cmd.infoFromInspect(inspect, 'predict.hex')

        h2o.nodes[0].csv_download(src_key="Z.hex", csvPathname=csvSrcOutputPathname)
        h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname)
        h2o.check_sandbox_for_errors()

        print "Do a check of the original output col against predicted output"
        (rowNum1, originalOutput) = compare_csv_at_one_col(csvSrcOutputPathname,
            msg="Original", colIndex=0, translate=translate, skipHeader=skipSrcOutputHeader)
        (rowNum2, predictOutput) = compare_csv_at_one_col(csvPredictPathname,
            msg="Predicted", colIndex=0, skipHeader=skipPredictHeader)

        # no header on source
        if ((rowNum1 - skipSrcOutputHeader) != (rowNum2 - skipPredictHeader)):
            raise Exception("original rowNum1: %s - %d not same as downloaded predict: rowNum2: %s - %d" %
                (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader))

        wrong = 0
        for rowNum, (o, p) in enumerate(zip(originalOutput, predictOutput)):
            # if float(o)!=float(p):
            if str(o) != str(p):
                if wrong == 10:
                    print "Not printing any more mismatches\n"
                elif wrong < 10:
                    msg = "Comparing original output col vs predicted. row %s differs. original: %s predicted: %s" % (rowNum, o, p)
                    print msg
                wrong += 1

        print "\nTotal wrong:", wrong
        print "Total:", len(originalOutput)
        pctWrong = (100.0 * wrong) / len(originalOutput)
        print "wrong/Total * 100 ", pctWrong
        # I looked at what h2o can do for modelling with binomial and it should get better than 25% error?
        if pctWrong > 2.0:
            raise Exception("pctWrong too high. Expect < 2% error because it's reusing training data")
        return pctWrong

    #*****************************************************************************

    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
    kwargs = {
        'destination_key': 'rf_model',
        'response': response,
        'ntrees': trees,
        'classification': 1,
    }

    rfResult = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
    (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)

    print "Use H2O GeneratePredictionsPage with a H2O generated model and the same data key."
    print "Does this work? (feeding in same data key) if you're predicting, "
    print "don't you need one less column (the last is output?)"
    print "WARNING: max_iter set to 8 for benchmark comparisons"
    print "y=", y

    pctWrong = predict_and_compare_csvs(model_key='rf_model', hex_key=hexKey, translate=translate, y=y)

    # we are predicting using training data...so error is really low
    # self.assertAlmostEqual(pctWrong, classification_error, delta=0.2,
    #     msg="predicted pctWrong: %s should be close to training classification error %s" % (pctWrong, classification_error))
    self.assertAlmostEqual(pctWrong, expectedPctWrong, delta=0.2,
        msg="predicted pctWrong: %s should be small because we're predicting with training data" % pctWrong)
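# Hedged sketch of the compare_csv_at_one_col helper the predict test above assumes: it reads
# one column out of a downloaded csv, optionally maps values through `translate`, and returns
# (rowCount, valueList) where rowCount includes any skipped header rows. The real helper's
# handling of quoting and of values missing from `translate` may differ; treat this as an
# illustrative assumption only.
import csv

def compare_csv_at_one_col(csvPathname, msg=None, colIndex=0, translate=None, skipHeader=0):
    output = []
    f = open(csvPathname, 'rb')
    reader = csv.reader(f)
    rowNum = 0
    for row in reader:
        rowNum += 1
        if rowNum <= skipHeader:
            continue
        value = row[colIndex]
        if translate:
            value = translate.get(value, value)
        output.append(value)
    f.close()
    return (rowNum, output)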