def test_nulls_fvec(self):
    # Build a copy of smalldata poker/poker1000 with <NUL> (0x0) bytes spliced
    # in, then parse it and run RF on the result.
    #
    # FIX! the eventual goal is a <NUL> after every byte; for now a single
    # <NUL> is appended after each piece read from the input.
    # Presumably the <NUL>s are thrown away by parse, so this doesn't change
    # chunk boundary stuff (i.e. not an interesting test for RF itself).
    # We could compare the results to a non-munged file with the same algo.
    SYNDATASETS_DIR = h2o.make_syn_dir()

    csvFilename = 'poker1000'
    csvPathname = 'poker/' + csvFilename
    fullPathname = h2i.find_folder_and_filename('smalldata', csvPathname, returnFullPath=True)

    nulFilename = "syn_nul.data"
    nulPathname = SYNDATASETS_DIR + '/' + nulFilename

    # number of input bytes copied before each inserted <NUL>
    nul_interval = 103
    with open(fullPathname, "rb") as in_file:
        with open(nulPathname, "wb") as out_file:
            while True:
                piece = in_file.read(nul_interval)
                # an empty read means end of file; truthiness works whether
                # the binary read hands back str or bytes
                if not piece:
                    break
                withNuls = bytearray(piece)
                withNuls.append(0)
                out_file.write(withNuls)

    for trials in xrange(1, 2):
        trees = 6
        for x in xrange(161, 240, 40):
            # y is only reported; every pass parses the same munged file
            y = 10000 * x
            print("\nTrial: %s , y: %s" % (trials, y))

            # random guess about length of time, varying with more nodes
            timeoutSecs = 20 + 5 * (len(h2o.nodes))
            model_key = csvFilename + "_" + str(trials)

            parseResult = h2i.import_parse(path=nulPathname, schema='put')
            h2o_cmd.runRF(parseResult=parseResult, trees=trees, destination_key=model_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=1)
            sys.stdout.write('.')
            sys.stdout.flush()

            # partial clean, so we can look at tree builds from this run if hang
            h2o.clean_sandbox_stdout_stderr()
def test_B_GenParity1(self):
    # Generate several large row count parity datasets with parity.pl, then
    # run RF repeatedly over each of them.

    # Create a directory for the created dataset files. ok if already exists
    SYNDATASETS_DIR = h2o.make_syn_dir()

    # always match the run below!
    print(
        "\nGenerating some large row count parity datasets in %s "
        "\nmay be a minute.........." % SYNDATASETS_DIR
    )
    for x in xrange(161, 240, 20):
        # more rows!
        y = 10000 * x
        # Build the argv list directly (rather than splitting a command
        # string) so a script path containing whitespace stays one argument.
        shCmd = ["perl", h2o.find_file("syn_scripts/parity.pl"), "128", "4", str(y), "quad"]

        # FIX! as long as we're doing a couple, you'd think we wouldn't have to
        # wait for the last one to be gen'ed here before we start the first below.
        # UPDATE: maybe EC2 takes a long time to spawn a process?
        h2o.spawn_cmd_and_wait('parity.pl', shCmd, timeout=90)
        # the path and filename of the output are hardwired in parity.pl;
        # the run loop below rebuilds the name it expects from y
        sys.stdout.write('.')
        sys.stdout.flush()
    print("\nDatasets generated. Using.")

    # always match the gen above!
    # Six trials (1 through 6) over every generated dataset.
    for trials in xrange(1, 7):
        trees = 6

        for x in xrange(161, 240, 20):
            y = 10000 * x
            print("\nTrial: %s , y: %s" % (trials, y))

            csvFilename = "parity_128_4_" + str(y) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            # FIX! TBD do we always have to kick off the run from node 0?
            # random guess about length of time, varying with more hosts/nodes?
            timeoutSecs = 20 + 5 * (len(h2o.nodes))

            # change the model name each iteration, so they stay in h2o
            model_key = csvFilename + "_" + str(trials)
            h2o_cmd.runRF(trees=trees, model_key=model_key, timeoutSecs=timeoutSecs,
                retryDelaySecs=1, csvPathname=csvPathname)
            sys.stdout.write('.')
            sys.stdout.flush()

            # partial clean, so we can look at tree builds from this run if hang
            h2o.clean_sandbox_stdout_stderr()
def test_B_GenParity1(self):
    # Generate several large row count parity datasets with parity.pl, then
    # parse each one and run RF on it, repeatedly.

    # Create a directory for the created dataset files. ok if already exists
    SYNDATASETS_DIR = h2o.make_syn_dir()

    # always match the run below!
    print(
        "\nGenerating some large row count parity datasets in %s "
        "\nmay be a minute.........." % SYNDATASETS_DIR
    )
    for x in xrange(161, 240, 20):
        # more rows!
        y = 10000 * x
        # Build the argv list directly (rather than splitting a command
        # string) so a script path or dataset directory containing
        # whitespace stays a single argument.
        shCmd = [
            "perl",
            h2o.find_file("syn_scripts/parity.pl"),
            "128",
            "4",
            str(y),
            "quad",
            SYNDATASETS_DIR,
        ]

        # FIX! as long as we're doing a couple, you'd think we wouldn't have to
        # wait for the last one to be gen'ed here before we start the first below.
        # UPDATE: maybe EC2 takes a long time to spawn a process?
        h2o.spawn_cmd_and_wait("parity.pl", shCmd, timeout=90)
        # the path and filename of the output are hardwired in parity.pl;
        # the run loop below rebuilds the name it expects from y
        sys.stdout.write(".")
        sys.stdout.flush()
    print("\nDatasets generated. Using.")

    # always match the gen above!
    # Six trials (1 through 6) over every generated dataset.
    for trials in xrange(1, 7):
        trees = 6

        for x in xrange(161, 240, 20):
            y = 10000 * x
            print("\nTrial: %s , y: %s" % (trials, y))

            csvFilename = "parity_128_4_" + str(y) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename
            # FIX! TBD do we always have to kick off the run from node 0?
            # random guess about length of time, varying with more hosts/nodes?
            timeoutSecs = 20 + 5 * (len(h2o.nodes))

            # change the model name each iteration, so they stay in h2o
            model_key = csvFilename + "_" + str(trials)
            parseResult = h2i.import_parse(path=csvPathname, schema="put")
            h2o_cmd.runRF(
                parseResult=parseResult,
                trees=trees,
                model_key=model_key,
                timeoutSecs=timeoutSecs,
                retryDelaySecs=1,
            )
            sys.stdout.write(".")
            sys.stdout.flush()

            # partial clean, so we can look at tree builds from this run if hang
            h2o.clean_sandbox_stdout_stderr()
def test_file_with_nul_chars_inserted(self):
    # Build a copy of smalldata poker/poker1000 with <NUL> (0x0) bytes spliced
    # in, then parse it and run RF on the result.
    #
    # FIX! the eventual goal is a <NUL> after every byte; for now a single
    # <NUL> is appended after each piece read from the input.
    # Presumably the <NUL>s are thrown away by parse, so this doesn't change
    # chunk boundary stuff (i.e. not an interesting test for RF itself).
    # We could compare the results to a non-munged file with the same algo.
    SYNDATASETS_DIR = h2o.make_syn_dir()

    csvFilename = "poker1000"
    csvPathname = "poker/" + csvFilename
    fullPathname = h2i.find_folder_and_filename("smalldata", csvPathname, returnFullPath=True)

    nulFilename = "syn_nul.data"
    nulPathname = SYNDATASETS_DIR + "/" + nulFilename

    # number of input bytes copied before each inserted <NUL>
    nul_interval = 103
    with open(fullPathname, "rb") as in_file:
        with open(nulPathname, "wb") as out_file:
            while True:
                piece = in_file.read(nul_interval)
                # an empty read means end of file; truthiness works whether
                # the binary read hands back str or bytes
                if not piece:
                    break
                withNuls = bytearray(piece)
                withNuls.append(0)
                out_file.write(withNuls)

    for trials in xrange(1, 2):
        trees = 6
        for x in xrange(161, 240, 40):
            # y is only reported; every pass parses the same munged file
            y = 10000 * x
            print("\nTrial: %s , y: %s" % (trials, y))

            # random guess about length of time, varying with more nodes
            timeoutSecs = 20 + 5 * (len(h2o.nodes))
            model_key = csvFilename + "_" + str(trials)

            parseResult = h2i.import_parse(path=nulPathname, schema="put")
            h2o_cmd.runRF(
                parseResult=parseResult,
                trees=trees,
                model_key=model_key,
                timeoutSecs=timeoutSecs,
                retryDelaySecs=1,
            )
            sys.stdout.write(".")
            sys.stdout.flush()

            # partial clean, so we can look at tree builds from this run if hang
            h2o.clean_sandbox_stdout_stderr()