Example #1
 def notest_RF_poker100(self):
     h2o.beta_features = True
     trees = 6
     timeoutSecs = 20
     csvPathname = 'poker/poker100'
     parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
     h2o_cmd.runRF(parseResult=parseResult, trees=trees, timeoutSecs=timeoutSecs)
Example #2
 def test_RFhhp(self):
     # NAs cause CM to zero..don't run for now
     csvPathname = 'hhp_9_17_12.predict.data.gz'
     parseResult = h2i.import_parse(bucket='smalldata',
                                    path=csvPathname,
                                    schema='put')
     h2o_cmd.runRF(parseResult=parseResult, trees=6, timeoutSecs=30)
Example #3
    def test_rf_float_rand2_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_prostate.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        totalRows = 10000
        write_syn_dataset(csvPathname, totalRows, headerData)

        for trial in range(5):
            rowData = rand_rowData()
            num = random.randint(4096, 10096)
            append_syn_dataset(csvPathname, num)
            totalRows += num
            start = time.time()

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            # On EC2 once we get to 30 trials or so, do we see polling hang? GC or spill of heap or ??
            kwargs = {'ntrees': 5, 'max_depth': 5}
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key)
            h2o_cmd.runRF(parseResult=parseResult,
                          timeoutSecs=60,
                          pollTimeoutSecs=60,
                          **kwargs)
            print "trial #", trial, "totalRows:", totalRows, "num:", num, "RF end on ", csvFilename, \
                'took', time.time() - start, 'seconds'
            ### h2o_cmd.runInspect(key=hex_key)
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()
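Example #3 (and the near-identical Example #49) calls module-level helpers that the snippet does not include. A minimal sketch of what write_syn_dataset, append_syn_dataset, and rand_rowData might look like, assuming rows of small random integers under the 9-column prostate-style header; the value ranges are illustrative, not taken from the original test:

    import random

    def rand_rowData(colCount=8):
        # illustrative: 8 random fields; the writer prepends the ID column
        return ','.join(str(random.randint(0, 9)) for _ in range(colCount))

    def write_syn_dataset(csvPathname, rowCount, headerData):
        dsf = open(csvPathname, 'w')
        dsf.write(headerData + '\n')
        for i in range(rowCount):
            dsf.write(str(i) + ',' + rand_rowData() + '\n')
        dsf.close()

    def append_syn_dataset(csvPathname, num):
        # append without rewriting the header, as the trial loop expects
        dsf = open(csvPathname, 'a')
        for i in range(num):
            dsf.write(str(i) + ',' + rand_rowData() + '\n')
        dsf.close()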
Example #4
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        parityPl = h2o.find_file('syn_scripts/parity.pl')

        # two row dataset gets this. Avoiding it for now
        # java.lang.ArrayIndexOutOfBoundsException: 1
        # at hex.rf.Data.sample_fair(Data.java:149)

        # always match the run below!
        print "\nAssuming two row dataset is illegal. avoiding"

        for x in xrange (10,100,10):
            shCmdString = "perl " + parityPl + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split())
            # algorithm for creating the path and filename is hardwired in parity.pl.
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        trees = 6
        timeoutSecs = 20
        # always match the gen above!
        # FIX! we fail if min is 3
        for x in xrange (10,100,10):
            sys.stdout.write('.')
            sys.stdout.flush()
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            parseResult = h2i.import_parse(path=csvPathname, schema='put')
            h2o_cmd.runRF(parseResult=parseResult, trees=trees, timeoutSecs=timeoutSecs)

            trees += 10
            timeoutSecs += 2
Example #5
    def test_B_randomdata2_1_lineend(self):
        csvPathname = 'datagen1.csv'
        # change lineend, case 1
        # assumed setup: this snippet references SYNDATASETS_DIR without defining it
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvPathname1 = h2i.find_folder_and_filename('smalldata',
                                                    csvPathname,
                                                    returnFullPath=True)
        print "Using datagen1.csv to create", SYNDATASETS_DIR, "/datagen1_crlf.csv with different line ending"
        csvPathname2 = SYNDATASETS_DIR + '/datagen1_crlf.csv'

        infile = open(csvPathname1, 'r')
        outfile = open(csvPathname2, 'w')  # existing file gets erased

        # assume all the test files are unix lineend.
        # I guess there shouldn't be any "in-between" ones
        # okay if they change I guess.
        for line in infile.readlines():
            outfile.write(line.strip("\n") + "\r")
        infile.close()
        outfile.close()

        parseResult = h2i.import_parse(path=csvPathname2,
                                       schema='put',
                                       timeoutSecs=10,
                                       header=0,
                                       separator=44)  # 44 == ord(',')

        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
        numCols = inspect['numCols']
        h2o_cmd.runRF(parseResult=parseResult,
                      trees=1,
                      response='C' + str(numCols),
                      timeoutSecs=20)
Example #6
    def test_B_randomdata2_1_lineend(self):
        csvPathname = 'datagen1.csv'
        # change lineend, case 1
        # assumed setup: this snippet references SYNDATASETS_DIR without defining it
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvPathname1 = h2i.find_folder_and_filename('smalldata', csvPathname, returnFullPath=True)
        print "Using datagen1.csv to create", SYNDATASETS_DIR, "/datagen1_crlf.csv with different line ending"
        csvPathname2 = SYNDATASETS_DIR + '/datagen1_crlf.csv'

        infile = open(csvPathname1, 'r') 
        outfile = open(csvPathname2,'w') # existing file gets erased

        # assume all the test files are unix lineend. 
        # I guess there shouldn't be any "in-between" ones
        # okay if they change I guess.
        for line in infile.readlines():
            outfile.write(line.strip("\n") + "\r")
        infile.close()
        outfile.close()

        parseResult = h2i.import_parse(path=csvPathname2, schema='put',
            timeoutSecs=10, header=0, separator=44)  # 44 == ord(',')

        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
        numCols = inspect['numCols']
        h2o_cmd.runRF(parseResult=parseResult, 
            trees=1, 
            response_variable='C'+str(numCols),
            timeoutSecs=20)
Example #7
    def test_rf_params_rand2_fvec(self):
        h2o.beta_features = True
        csvPathname = "standard/covtype.data"
        hex_key = "covtype.data.hex"
        for trial in range(10):
            # params is mutable. This is default.
            params = {"ntrees": 13, "mtries": 7}
            colX = h2o_rf.pickRandRfParams(paramDict, params)
            if "cols" in params and params["cols"]:
                pass
            else:
                if "ignored_cols_by_name" in params and params["ignored_cols_by_name"]:
                    params["mtries"] = random.randint(1, 53)
                else:
                    params["mtries"] = random.randint(1, 54)

            kwargs = params.copy()
            # adjust timeoutSecs with the number of trees
            timeoutSecs = 30 + ((kwargs["ntrees"] * 80) * max(1, kwargs["mtries"] / 60))
            start = time.time()
            parseResult = h2i.import_parse(
                bucket="home-0xdiag-datasets", path=csvPathname, schema="put", hex_key=hex_key
            )
            h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
            elapsed = time.time() - start
            print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )
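Examples #7, #22, and #45 draw random parameter choices from a module-level paramDict via h2o_rf.pickRandRfParams, which none of the snippets define. A hedged sketch of the shape such a dict plausibly has, where each key maps to a list of candidate values and None means "leave at the default"; the keys and ranges here are assumptions:

    paramDict = {
        # parameter name -> list of candidate values for pickRandRfParams
        'ntrees': [None, 5, 13, 50],
        'max_depth': [None, 5, 20],
        'sample_rate': [None, 0.5, 0.67, 0.9],
        'nbins': [None, 20, 100, 1024],
    }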
Example #8
 def notest_B_RF_iris2(self):
     csvPathname = h2o.find_file('smalldata/iris/iris2.csv')
     h2o_cmd.runRF(trees=6,
                   model_key="iris2",
                   timeoutSecs=10,
                   retryDelaySecs=1,
                   csvPathname=csvPathname)
Example #9
 def test_rf_strata_fail(self):
     csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
     timeoutSecs = 60
     kwargs = {
         'response_variable': 54,
         'ntree': 50,
         'features': '',
         'depth': 2147483647,
         'stat_type': 'ENTROPY',
         'ignore': '',
         'class_weights': '1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0',
         'sampling_strategy': 'RANDOM',
         'strata_samples':
         'undefined=undefined,undefined=undefined,undefined=undefined,undefined=undefined,undefined=undefined,undefined=undefined,undefined=undefined',
         'sample': '67',
         'out_of_bag_error_estimate': 1,
         'model_key': '',
         'bin_limit': 1024,
         'seed': 784834182943470027,
         'parallel': 1,
         'exclusive_split_limit': '',
         'iterative_cm': 1,
         'use_non_local_data': 0,
     }
     h2o_cmd.runRF(timeoutSecs=timeoutSecs,
                   csvPathname=csvPathname,
                   **kwargs)
Example #10
 def test_hex_443(self):
     h2o.beta_features = True
     csvPathname = 'hex-443.parsetmp_1_0_0_0.data'
     parseResult = h2i.import_parse(bucket='smalldata',
                                    path=csvPathname,
                                    schema='put')
     h2o_cmd.runRF(parseResult=parseResult, ntrees=1, timeoutSecs=5)
Example #11
    def test_GenParity1(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        parityPl = h2o.find_file('syn_scripts/parity.pl')

        # two row dataset gets this. Avoiding it for now
        # java.lang.ArrayIndexOutOfBoundsException: 1
        # at hex.rf.Data.sample_fair(Data.java:149)

        # always match the run below!
        print "\nAssuming two row dataset is illegal. avoiding"

        for x in xrange(10, 100, 10):
            shCmdString = "perl " + parityPl + " 128 4 " + str(
                x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split())
            # algorithm for creating the path and filename is hardwired in parity.pl.
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        trees = 6
        timeoutSecs = 20
        # always match the gen above!
        # FIX! we fail if min is 3
        for x in xrange(10, 100, 10):
            sys.stdout.write('.')
            sys.stdout.flush()
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            parseResult = h2i.import_parse(path=csvPathname, schema='put')
            h2o_cmd.runRF(parseResult=parseResult,
                          trees=trees,
                          timeoutSecs=timeoutSecs)

            trees += 10
            timeoutSecs += 2
Example #12
    def test_D_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        for x in xrange (11,100,10):
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
            # FIX! as long as we're doing a couple, you'd think we wouldn't have to 
            # wait for the last one to be gen'ed here before we start the first below.
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),timeout=3)
            # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        trees = 6
        timeoutSecs = 20
        # always match the gen above!
        # reduce to get intermittent failures to lessen, for now
        for x in xrange (11,60,10):
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            # FIX! TBD do we always have to kick off the run from node 0?
            # what if we do another node?
            # FIX! do we need or want a random delay here?
            h2o_cmd.runRF( trees=trees, timeoutSecs=timeoutSecs,
                    csvPathname=csvPathname)
            trees += 10
            sys.stdout.write('.')
            sys.stdout.flush()
Example #13
 def test_A_randomdata2(self):
     print "Using smalldata/datagen1.csv as is"
     csvPathname = h2o.find_file('smalldata/datagen1.csv')
     h2o_cmd.runRF(trees=1,
                   response_variable=2,
                   timeoutSecs=10,
                   csvPathname=csvPathname)
Example #14
 def test_A_randomdata2(self):
     print "Using datagen1.csv as-is"
     csvPathname = 'datagen1.csv'
     # have to give the separator == comma...otherwise H2O can't deduce it on this dataset
     parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put',
         timeoutSecs=10, header=1, separator=44)
     h2o_cmd.runRF(parseResult=parseResult, trees=1, response_variable=2, timeoutSecs=20)
Example #15
    def tryThemAll(self,set,rows):
        for eolCase in range(len(self.eolDict)):
            eol = self.eolDict[eolCase]
            # change tokens must be first
            for tokenCase in range(len(self.tokenChangeDict)):
                newRows1 = self.changeTokens(rows,tokenCase)
                for sepCase in range(len(self.sepChangeDict)):
                    newRows2 = self.changeSep(newRows1,sepCase)
                    csvPathname = SYNDATASETS_DIR + '/parsetmp_' + \
                        str(set) + "_" + \
                        str(eolCase) + "_" + \
                        str(tokenCase) + "_" + \
                        str(sepCase) + \
                        '.data'
                    self.writeRows(csvPathname,newRows2,eol)
                    if "'" in self.tokenChangeDict[tokenCase][0]:
                        single_quotes = 1
                    else:
                        single_quotes = 0
                    parseResult = h2i.import_parse(path=csvPathname, schema='put', single_quotes=single_quotes,
                        noPrint=not h2o.verbose)

                    if DO_RF:
                        h2o_cmd.runRF(parseResult=parseResult, trees=1, timeoutSecs=30, retryDelaySecs=0.1)
                    h2o.verboseprint("Set", set)
                    sys.stdout.write('.')
                    sys.stdout.flush()
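The tryThemAll variants (Examples #15, #24, #27, #29, #30, #38, #47) all iterate over self.eolDict, self.tokenChangeDict, and self.sepChangeDict, which are set up elsewhere in their test classes. A minimal sketch, assuming each dict maps a case index to the line ending, token decoration, or separator being exercised; the exact cases are assumptions:

    eolDict = {
        0: "\n",      # unix
        1: "\r",      # old mac
        2: "\r\n",    # dos/windows
    }
    tokenChangeDict = {
        0: ['', ''],      # tokens unchanged
        1: ["'", "'"],    # single-quoted tokens (triggers single_quotes=1 above)
        2: ['"', '"'],    # double-quoted tokens
    }
    sepChangeDict = {
        0: ',',
        1: ';',
        2: '\t',
    }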
Example #16
 def test_rf_strata_fail(self):
     csvPathname = 'UCI/UCI-large/covtype/covtype.data'
     timeoutSecs = 60
     kwargs = {
         'response_variable': 54,
         'ntree': 50,
         'features': '',
         'depth': 2147483647,
         'stat_type': 'ENTROPY',
         'ignore': '',
         'class_weights': '1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0',
         'sampling_strategy': 'RANDOM',
         'strata_samples': 'undefined=undefined,undefined=undefined,undefined=undefined,undefined=undefined,undefined=undefined,undefined=undefined,undefined=undefined',
         'sample': '67',
         'out_of_bag_error_estimate': 1,
         'model_key': '',
         'bin_limit': 1024,
         'seed': 784834182943470027,
         'parallel': 1,
         'exclusive_split_limit': '', 
         'iterative_cm': 1,
         'use_non_local_data': 0,
     }
     parseResult = h2i.import_parse(bucket='datasets', path=csvPathname, schema='put')
     h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
Example #17
 def test_A_c1_fvec(self):
     h2o.beta_features = True
     parseResult = h2i.import_parse(bucket='smalldata',
                                    path='iris/iris2.csv',
                                    schema='put',
                                    timeoutSecs=60)
     h2o_cmd.runRF(parseResult=parseResult, trees=6, timeoutSecs=60)
Example #18
    def test_D_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # always match the run below!
        for x in xrange(11, 100, 10):
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file(
                "syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
            h2o.spawn_cmd_and_wait('parity.pl',
                                   shCmdString.split(),
                                   timeout=30)
            # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        trees = 6
        timeoutSecs = 20
        # always match the gen above!
        # reduce to get intermittent failures to lessen, for now
        for x in xrange(11, 60, 10):
            sys.stdout.write('.')
            sys.stdout.flush()
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            h2o_cmd.runRF(trees=trees,
                          timeoutSecs=timeoutSecs,
                          csvPathname=csvPathname)
            trees += 10
Example #19
 def test_poker_xlsx(self):
     # maybe can get stuck during polling for parse progress?
     # break it out for pollTimeoutSecs
     parseResult = h2i.import_parse(
         bucket="datasets", path="poker/poker-hand-testing.xlsx", schema="put", timeoutSecs=120, pollTimeoutSecs=60
     )
     h2o_cmd.runRF(None, parseResult=parseResult, timeoutSecs=120)
Example #20
    def test_rf_big_rand_tree_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        rowCount = 5000
        colCount = 1000
        write_syn_dataset(csvPathname, rowCount, colCount)

        for trial in range (1):
            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            src_key = csvFilename + "_" + str(trial)
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            seed = random.randint(0,sys.maxint)
            # some cols can be dropped due to constant 0 or 1. make sure data set has all 0's and all 1's above
            # to guarantee no dropped cols!
            # kwargs = {'ntree': 3, 'depth': 50, 'seed': seed}
            # out of memory/GC errors with the above. reduce depth
            kwargs = {'ntrees': 3, 'max_depth': 20, 'seed': seed}
            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=90)
            h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=600, pollTimeoutSecs=180, **kwargs)
            print "trial #", trial, "rowCount:", rowCount, "colCount:", colCount, "RF end on ", csvFilename, \
                'took', time.time() - start, 'seconds'

            inspect = h2o_cmd.runInspect(key=hex_key)
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            cols = inspect['cols']
            numCols = inspect['numCols']
            for i,c in enumerate(cols):
                colType = c['type']
                self.assertEqual(colType, 'Int', msg="col %d should be type Int: %s" % (i, colType))

            h2o.check_sandbox_for_errors()
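Example #20's write_syn_dataset is also not shown. Its own comment says the generated data must contain all-0 and all-1 rows so no column gets dropped as constant; a sketch under that assumption:

    import random

    def write_syn_dataset(csvPathname, rowCount, colCount):
        dsf = open(csvPathname, 'w')
        # guarantee every column sees both values, so none is constant
        dsf.write(','.join(['0'] * colCount) + '\n')
        dsf.write(','.join(['1'] * colCount) + '\n')
        for i in range(rowCount - 2):
            dsf.write(','.join(str(random.randint(0, 1)) for _ in range(colCount)) + '\n')
        dsf.close()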
Example #21
 def test_D_GenParity1(self):
     trees = 50
     h2o_cmd.runRF(
         trees=trees,
         timeoutSecs=15,
         csvPathname=h2o.find_file('smalldata/parity_128_4_100_quad.data'),
         noise=('StoreView', None))
Example #22
 def test_rf_params_rand2(self):
     # for determinism, I guess we should spit out the seed?
     # random.seed(SEED)
     SEED = random.randint(0, sys.maxint)
     # if you have to force to redo a test
     # SEED =
     random.seed(SEED)
     print "\nUsing random seed:", SEED
     csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
     for trial in range(10):
         # params is mutable. This is default.
         params = {'ntree': 13, 'parallel': 1, 'features': 7}
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         timeoutSecs = 30 + (
             (kwargs['ntree'] * 20) * max(1, kwargs['features'] / 15) *
             (kwargs['parallel'] and 1 or 3))
         start = time.time()
         h2o_cmd.runRF(timeoutSecs=timeoutSecs,
                       retryDelaySecs=1,
                       csvPathname=csvPathname,
                       **kwargs)
         elapsed = time.time() - start
         print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
             (elapsed * 100) / timeoutSecs)
Example #23
    def test_C_RF_poker100(self):
        parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker100', schema='put')
        h2o_cmd.runRF(parseResult=parseResult, trees=6, timeoutSecs=10)

        SYNDATASETS_DIR = h2o.make_syn_dir()
        # always match the run below!
        for x in xrange (11,100,10):
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),timeout=30)
            # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        trees = 6
        timeoutSecs = 60
        # always match the gen above!
        # reduce to get intermittent failures to lessen, for now
        for x in xrange (11,60,10):
            sys.stdout.write('.')
            sys.stdout.flush()
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            parseResult = h2i.import_parse(path=csvPathname, schema='put')
            h2o_cmd.runRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs)
            trees += 10
Example #24
    def tryThemAll(self, set, rows, enumsOnly=False):
        for eolCase in range(len(self.eolDict)):
            eol = self.eolDict[eolCase]
            # change tokens must be first
            # use the enums-only token set when requested
            if enumsOnly:
                tcd = self.tokenChangeDictEnumsOnly
            else:
                tcd = self.tokenChangeDict

            for tokenCase in range(len(tcd)):
                newRows1 = self.changeTokens(rows, tokenCase, tcd)
                for sepCase in range(len(self.sepChangeDict)):
                    newRows2 = self.changeSep(newRows1,sepCase)
                    csvPathname = SYNDATASETS_DIR + '/parsetmp_' + \
                        str(set) + "_" + \
                        str(eolCase) + "_" + \
                        str(tokenCase) + "_" + \
                        str(sepCase) + \
                        '.data'
                    self.writeRows(csvPathname,newRows2,eol)
                    if "'" in tcd[tokenCase][0]:
                        singleQuotes = 1
                    else:
                        singleQuotes = 0
                    parseResult = h2i.import_parse(path=csvPathname, schema='local', singleQuotes=singleQuotes,
                        noPrint=not h2o_args.verbose, retryDelaySecs=0.1, 
                        doSummary=DO_SUMMARY, intermediateResults=DO_INTERMEDIATE_RESULTS)

                    if DO_RF:
                        h2o_cmd.runRF(parseResult=parseResult, trees=1,
                            timeoutSecs=10, retryDelaySecs=0.1, noPrint=True, print_params=True)
                    verboseprint("Set", set)
                    h2o.check_sandbox_for_errors()
                    sys.stdout.write('.')
                    sys.stdout.flush()
Example #25
    def test_badchars(self):
        print "badchars.csv has some 0x0 (<NUL>) characters."
        print "They were created by a dd that filled out to buffer boundary with <NUL>"
        print "They are visible using vim/vi"

        csvPathname = h2o.find_file('smalldata/badchars.csv')
        h2o_cmd.runRF(trees=50, timeoutSecs=10, csvPathname=csvPathname)
Example #26
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = (
                "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
            )
            h2o.spawn_cmd_and_wait("parity.pl", shCmdString.split(), 4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        # always match the gen above!
        for trial in range(1, 3):
            sys.stdout.write(".")
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            parseResult = h2o_cmd.parseResult = h2i.import_parse(
                path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30
            )

            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult, trees=10000, depth=2, timeoutSecs=900, retryDelaySecs=3)
            print "RF #", trial, "end on ", csvFilename, "took", time.time() - start, "seconds"

        print "Waiting 60 secs for TIME_WAIT sockets to go away"
        time.sleep(60)
Example #27
    def tryThemAll(self,set,rows):
        for eolCase in range(len(self.eolDict)):
            eol = self.eolDict[eolCase]
            # change tokens must be first
            for tokenCase in range(len(self.tokenChangeDict)):
                newRows1 = self.changeTokens(rows,tokenCase)
                for sepCase in range(len(self.sepChangeDict)):
                    newRows2 = self.changeSep(newRows1,sepCase)
                    csvPathname = SYNDATASETS_DIR + '/parsetmp_' + \
                        str(set) + "_" + \
                        str(eolCase) + "_" + \
                        str(tokenCase) + "_" + \
                        str(sepCase) + \
                        '.data'
                    self.writeRows(csvPathname,newRows2,eol)
                    parseResult = h2i.import_parse(path=csvPathname, schema='local', noPrint=not h2o.verbose)
                    inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
                    print "\n" + csvPathname, \
                        "    num_rows:", "{:,}".format(inspect['num_rows']), \
                        "    num_cols:", "{:,}".format(inspect['num_cols'])
                    num_rows = inspect['num_rows']
                    num_cols = inspect['num_cols']
                    self.assertEqual(num_cols, 4, "Parsed wrong number of cols: %s" % num_cols)
                    self.assertEqual(num_rows, 29, "Parsed wrong number of rows: %s" % num_rows)

                    h2o_cmd.runRF(parseResult=parseResult, trees=1, 
                        timeoutSecs=10, retryDelaySecs=1.0, noPrint=True)
                    h2o.verboseprint("Set", set)
                    h2o.check_sandbox_for_errors()
                    sys.stdout.write('.')
                    sys.stdout.flush()
Example #28
    def test_rf_1ktrees_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [500]:
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        # always match the gen above!
        for trial in range (1,5):
            sys.stdout.write('.')
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)

            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult, trees=1000, max_depth=2, timeoutSecs=600, retryDelaySecs=3)
            print "RF #", trial,  "end on ", csvFilename, 'took', time.time() - start, 'seconds'

        print "Waiting 60 secs for TIME_WAIT sockets to go away"
        time.sleep(60)
Example #29
    def tryThemAll(self, set, rows, enumsOnly=False):
        for eolCase in range(len(self.eolDict)):
            eol = self.eolDict[eolCase]
            # change tokens must be first
            # use the enums-only token set when requested
            if enumsOnly:
                tcd = self.tokenChangeDictEnumsOnly
            else:
                tcd = self.tokenChangeDict

            for tokenCase in range(len(tcd)):
                newRows1 = self.changeTokens(rows, tokenCase, tcd)
                for sepCase in range(len(self.sepChangeDict)):
                    newRows2 = self.changeSep(newRows1,sepCase)
                    csvPathname = SYNDATASETS_DIR + '/parsetmp_' + \
                        str(set) + "_" + \
                        str(eolCase) + "_" + \
                        str(tokenCase) + "_" + \
                        str(sepCase) + \
                        '.data'
                    self.writeRows(csvPathname,newRows2,eol)
                    parseResult = h2i.import_parse(path=csvPathname, schema='put', noPrint=not h2o.verbose)
                    h2o_cmd.runRF(parseResult=parseResult, trees=1,
                        timeoutSecs=10, retryDelaySecs=0.1, noPrint=True, print_params=False)
                    h2o.verboseprint("Set", set)
                    h2o.check_sandbox_for_errors()
                    sys.stdout.write('.')
                    sys.stdout.flush()
Example #30
    def tryThemAll(self, set, rows):
        for eolCase in range(len(self.eolDict)):
            eol = self.eolDict[eolCase]
            # change tokens must be first
            for tokenCase in range(len(self.tokenChangeDict)):
                newRows1 = self.changeTokens(rows, tokenCase)
                for sepCase in range(len(self.sepChangeDict)):
                    newRows2 = self.changeSep(newRows1, sepCase)
                    csvPathname = SYNDATASETS_DIR + '/parsetmp_' + \
                        str(set) + "_" + \
                        str(eolCase) + "_" + \
                        str(tokenCase) + "_" + \
                        str(sepCase) + \
                        '.data'
                    self.writeRows(csvPathname, newRows2, eol)
                    if "'" in self.tokenChangeDict[tokenCase]:
                        single_quotes = 1
                    else:
                        single_quotes = 0
                    parseResult = h2i.import_parse(path=csvPathname,
                                                   schema='put',
                                                   single_quotes=single_quotes,
                                                   noPrint=not h2o.verbose)

                    h2o_cmd.runRF(parseResult=parseResult,
                                  trees=1,
                                  timeoutSecs=30,
                                  retryDelaySecs=0.1)
                    h2o.verboseprint("Set", set)
                    sys.stdout.write('.')
                    sys.stdout.flush()
Example #31
    def test_1ktrees_job_cancel_many(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad"
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"  
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        hex_key = csvFilename + ".hex"
        parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)

        print "Kick off twenty, then cancel them all..there's a timeout on the wait after cancelling"
        for trial in range (1,20):
            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult, trees=trial, depth=50, rfView=False, noPoll=True,
                timeoutSecs=600, retryDelaySecs=3)
            print "RF #", trial,  "started on ", csvFilename, 'took', time.time() - start, 'seconds'


        h2o.check_sandbox_for_errors()
        h2o_jobs.cancelAllJobs(timeoutSecs=10)
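Example #31 dispatches each RF build with noPoll=True (fire and forget) and then cancels everything. If you instead want the dispatched jobs to finish, the pattern from Example #34 applies; a sketch, assuming the models were given keys matching the 'RF_model' pattern (Example #31 sets no model key, so that pattern is an assumption):

    # dispatch without polling, then block until all matching jobs complete
    h2o_cmd.runRF(parseResult=parseResult, trees=10, rfView=False, noPoll=True,
        timeoutSecs=600, retryDelaySecs=3)
    h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
    h2o.check_sandbox_for_errors()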
Example #32
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # always match the run below!
        for x in [10000]:
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad"
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        # always match the gen above!
        trial = 1
        for x in xrange (1,10,1):
            sys.stdout.write('.')
            sys.stdout.flush()

            # just use one file for now
            csvFilename = "parity_128_4_" + str(10000) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            # broke out the put separately so we can iterate a test just on the RF
            parseResult = h2i.import_parse(path=csvPathname, schema='put')

            h2o.verboseprint("Trial", trial)
            h2o_cmd.runRF(parseResult=parseResult, trees=237, depth=45, timeoutSecs=480)

            # don't change tree count yet
            ## trees += 10
            ### timeoutSecs += 2
            trial += 1
Example #33
    def test_cs_test(self):
        parseResult = h2i.import_parse(bucket='smalldata', path='kaggle/creditsample-training.csv.gz', schema='put')
        h2o_cmd.runRF(parseResult=parseResult, ntrees=5, max_depth=100, timeoutSecs=500,
            response='SeriousDlqin2yrs')
        # h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        time.sleep(5)
Example #34
    def test_rf_big1_nopoll_fvec(self):
        h2o.beta_features = True
        csvFilename = 'hhp_107_01.data.gz'
        hex_key = csvFilename + ".hex"
        
        print "\n" + csvFilename

        parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, 
            hex_key=hex_key, timeoutSecs=30, schema='put')
        rfViewInitial = []
        # dispatch multiple jobs back to back
        for jobDispatch in range(3):
            start = time.time()
            kwargs = {}
            if OVERWRITE_RF_MODEL:
                print "Since we're overwriting here, we have to wait for each to complete noPoll=False"
                model_key = 'RF_model'
            else:
                model_key = 'RF_model' + str(jobDispatch)
            kwargs['ntrees'] = 1

            if OVERWRITE_RF_MODEL:
                print "Change the number of trees, while keeping the rf model key name the same"
                print "Checks that we correctly overwrite previous rf model"
                kwargs['ntrees'] += 1

            kwargs['seed'] = random.randint(0, sys.maxint)

            # FIX! what model keys do these get?
            randomNode = h2o.nodes[random.randint(0,len(h2o.nodes)-1)]
            rfView = h2o_cmd.runRF(node=randomNode, parseResult=parseResult, destination_key=model_key,
                timeoutSecs=300, noPoll=False if OVERWRITE_RF_MODEL else True, **kwargs)
            # save the dispatch response; the loop below iterates rfViewInitial
            rfViewInitial.append(rfView)
            print "rf job dispatch end on ", csvFilename, 'took', time.time() - start, 'seconds'
            print "\njobDispatch #", jobDispatch

        h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

        # we saved the initial responses above.
        # if we do another poll they should be done now, and better to get it that
        # way rather than from inspect (to match what simpleCheckGLM expects)
        first = None
        print "rfViewInitial", rfViewInitial
        for rfView in rfViewInitial:
            print "Checking completed job:", rfView
            print "rfView", h2o.dump_json(rfView)
            data_key = rfView['_dataKey']
            model_key = rfView['_key']
            ntree = rfView['ntree']
            print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)"
            # allow it to poll to complete
            rfViewResult = h2o_cmd.runRFView(None, data_key, model_key, ntree=ntree, timeoutSecs=60, noPoll=False)
            if first is None: # we'll use this to compare the others
                first = rfViewResult.copy()
                firstModelKey = model_key
                print "first", h2o.dump_json(first)
            else:
                print "Comparing", model_key, "to", firstModelKey
                df = h2o_util.JsonDiff(rfViewResult, first, vice_versa=True, with_values=True)

                print "df.difference:", h2o.dump_json(df.difference)
Example #35
    def test_D_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        for x in xrange(50, 200, 10):
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file(
                "syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
            # FIX! as long as we're doing a couple, you'd think we wouldn't have to
            # wait for the last one to be gen'ed here before we start the first below.
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=3)
            # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        # bump this up too if you do?
        # always match the gen above!
        ### for x in xrange (50,200,10):
        for x in xrange(50, 200, 10):
            sys.stdout.write('.')
            sys.stdout.flush()
            csvFilename = "parity_128_4_" + "100" + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            h2o_cmd.runRF(csvPathname=csvPathname,
                          trees=100,
                          timeoutSecs=5,
                          retryDelaySecs=0.1)
Example #36
    def test_D_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        for x in xrange(11, 100, 10):
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file(
                "syn_scripts/parity.pl") + " 128 4 " + str(
                    x) + " quad " + SYNDATASETS_DIR
            # FIX! as long as we're doing a couple, you'd think we wouldn't have to
            # wait for the last one to be gen'ed here before we start the first below.
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=3)
            # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        trees = 6
        timeoutSecs = 20
        # always match the gen above!
        # reduce to get intermittent failures to lessen, for now
        for x in xrange(11, 60, 10):
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            parseResult = h2i.import_parse(path=csvPathname, schema='put')
            h2o_cmd.runRF(parseResult=parseResult,
                          ntrees=trees,
                          timeoutSecs=timeoutSecs)
            trees += 10
Example #37
 def test_RFhhp(self):
     csvPathnamegz = h2o.find_file('smalldata/hhp_107_01.data.gz')
     print "\nRF start on ", csvPathnamegz, "this will probably take a minute.."
     start = time.time()
     h2o_cmd.runRF(csvPathname=csvPathnamegz, trees=23,
             timeoutSecs=120, retryDelaySecs=10)
     print "RF end on ", csvPathnamegz, 'took', time.time() - start, 'seconds'
Example #38
 def tryThemAll(self, set, rows):
     for eolCase in range(len(self.eolDict)):
         eol = self.eolDict[eolCase]
         # change tokens must be first
         for tokenCase in range(len(self.tokenChangeDict)):
             newRows1 = self.changeTokens(rows, tokenCase)
             for sepCase in range(len(self.sepChangeDict)):
                 newRows2 = self.changeSep(newRows1, sepCase)
                 csvPathname = SYNDATASETS_DIR + '/parsetmp_' + \
                     str(set) + "_" + \
                     str(eolCase) + "_" + \
                     str(tokenCase) + "_" + \
                     str(sepCase) + \
                     '.data'
                 self.writeRows(csvPathname, newRows2, eol)
                 h2o_cmd.runRF(trees=1,
                               csvPathname=csvPathname,
                               timeoutSecs=10,
                               retryDelaySecs=0.1,
                               noPrint=True,
                               print_params=False)
                 h2o.verboseprint("Set", set)
                 h2o.check_sandbox_for_errors()
                 sys.stdout.write('.')
                 sys.stdout.flush()
Example #39
    def test_D_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        for x in xrange(11, 100, 10):
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file(
                "syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
            # FIX! as long as we're doing a couple, you'd think we wouldn't have to
            # wait for the last one to be gen'ed here before we start the first below.
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=3)
            # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        trees = 6
        timeoutSecs = 20
        # always match the gen above!
        # reduce to get intermittent failures to lessen, for now
        for x in xrange(11, 60, 10):
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            # FIX! TBD do we always have to kick off the run from node 0?
            # what if we do another node?
            # FIX! do we need or want a random delay here?
            h2o_cmd.runRF(trees=trees,
                          timeoutSecs=timeoutSecs,
                          csvPathname=csvPathname)
            trees += 10
            sys.stdout.write('.')
            sys.stdout.flush()
Example #40
    def test_1ktrees_job_cancel_many_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"  
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        hex_key = csvFilename + ".hex"
        parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)

        print "kick off jobs, then cancel them"
        for trial in range (1,5):
            # random 0 or 1 delay
            delay = random.uniform(0,1)
            time.sleep(delay)

            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, rfView=False, noPoll=True, timeoutSecs=30, retryDelaySecs=0.25)
            print "RF #", trial,  "started on ", csvFilename, 'took', time.time() - start, 'seconds'
            ### h2o_jobs.cancelAllJobs(timeoutSecs=10)
            h2o.check_sandbox_for_errors()

        # do one last good one
        rfView = h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, timeoutSecs=600, retryDelaySecs=3)
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trial)
Example #41
 def notest_RF_iris2(self):
     trees = 6
     timeoutSecs = 20
     csvPathname = h2o.find_file('smalldata/iris/iris2.csv')
     h2o_cmd.runRF(trees=trees,
                   timeoutSecs=timeoutSecs,
                   csvPathname=csvPathname)
Example #42
    def test_rf3_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # always match the run below!
        for x in [10000]:
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        # always match the gen above!
        trial = 1
        for x in range (1):
            sys.stdout.write('.')
            sys.stdout.flush()

            # just use one file for now
            csvFilename = "parity_128_4_" + str(10000) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            # broke out the put separately so we can iterate a test just on the RF
            parseResult = h2i.import_parse(path=csvPathname, schema='put', pollTimeoutSecs=60, timeoutSecs=60)

            h2o.verboseprint("Trial", trial)
            h2o_cmd.runRF(parseResult=parseResult, trees=237, max_depth=45, timeoutSecs=480)

            # don't change tree count yet
            ## trees += 10
            ### timeoutSecs += 2
            trial += 1
Example #43
 def notest_RF_poker100(self):
     trees = 6
     timeoutSecs = 20
     csvPathname = h2o.find_file('smalldata/poker/poker100')
     h2o_cmd.runRF(trees=trees,
                   timeoutSecs=timeoutSecs,
                   csvPathname=csvPathname)
Example #44
 def test_B_c1_fvec(self):
     print "Since the python is not necessarily run as user=0xcust..., can't use a  schema='put' here"
     print "Want to be able to run python as jenkins"
     print "I guess for big 0xcust files, we don't need schema='put'"
     print "For files that we want to put (for testing put), we can get non-private files"
     parseResult = h2i.import_parse(bucket='0xcustomer-datasets', path='c1/iris2.csv', schema='local', timeoutSecs=60)
     h2o_cmd.runRF(parseResult=parseResult, trees=6, timeoutSecs=60)
Example #45
    def test_rf_params_rand2_ncaa(self):
        csvPathname = 'ncaa/Players.csv'
        for trial in range(4):
            # params is mutable. This is default.
            params = {'ntree': 13, 'features': 4}
            colX = h2o_rf.pickRandRfParams(paramDict, params)
            kwargs = params.copy()
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            timeoutSecs = 30 + (
                (kwargs['ntree'] * 20) * max(1, kwargs['features'] / 15))

            # hack to NA the header (duplicate header names)
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='put',
                                           header=0)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult,
                          timeoutSecs=timeoutSecs,
                          retryDelaySecs=1,
                          **kwargs)
            elapsed = time.time() - start
            print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs)
Example #46
 def test_badchars(self):
     print "badchars.csv has some 0x0 (<NUL>) characters."
     print "They were created by a dd that filled out to buffer boundary with <NUL>"
     print "They are visible using vim/vi"
     
     csvPathname = h2o.find_file('smalldata/badchars.csv')
     h2o_cmd.runRF(trees=50, timeoutSecs=10, csvPathname=csvPathname)
Example #47
 def tryThemAll(self, set, rows):
     for eolCase in range(len(self.eolDict)):
         eol = self.eolDict[eolCase]
         # change tokens must be first
         for tokenCase in range(len(self.tokenChangeDict)):
             newRows1 = self.changeTokens(rows, tokenCase)
             for sepCase in range(len(self.sepChangeDict)):
                 (newSep, newRows2) = self.changeSep(newRows1, sepCase)
                 csvPathname = SYNDATASETS_DIR + '/parsetmp_' + \
                     str(set) + "_" + \
                     str(eolCase) + "_" + \
                     str(tokenCase) + "_" + \
                     str(sepCase) + \
                     '.data'
                 self.writeRows(csvPathname, newRows2, eol)
                 # give h2o the separator, to be nice. (integerized)
                 parseResult = h2i.import_parse(path=csvPathname,
                                                schema='put',
                                                separator=ord(newSep),
                                                noPrint=not h2o.verbose)
                 h2o_cmd.runRF(parseResult=parseResult,
                               trees=1,
                               response_variable='C1',
                               timeoutSecs=10,
                               retryDelaySecs=0.1,
                               noPrint=True)
                 h2o.verboseprint("Set", set)
                 h2o.check_sandbox_for_errors()
                 sys.stdout.write('.')
                 sys.stdout.flush()
Example #48
    def test_C_RF_poker100(self):
        parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker100', schema='put')
        h2o_cmd.runRF(parseResult=parseResult, trees=6, timeoutSecs=10)

        SYNDATASETS_DIR = h2o.make_syn_dir()
        # always match the run below!
        for x in xrange (11,100,10):
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),timeout=30)
            # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        trees = 6
        timeoutSecs = 60
        # always match the gen above!
        # reduce to get intermittent failures to lessen, for now
        for x in xrange (11,60,10):
            sys.stdout.write('.')
            sys.stdout.flush()
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            parseResult = h2i.import_parse(path=csvPathname, schema='put')
            h2o_cmd.runRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs)
            trees += 10
Example #49
    def test_rf_float_rand2_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_prostate.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        totalRows = 10000
        write_syn_dataset(csvPathname, totalRows, headerData)

        for trial in range (5):
            rowData = rand_rowData()
            num = random.randint(4096, 10096)
            append_syn_dataset(csvPathname, num)
            totalRows += num
            start = time.time()

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            # On EC2 once we get to 30 trials or so, do we see polling hang? GC or spill of heap or ??
            kwargs = {'ntrees': 5, 'max_depth': 5}
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
            h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=60, pollTimeoutSecs=60, **kwargs)
            print "trial #", trial, "totalRows:", totalRows, "num:", num, "RF end on ", csvFilename, \
                'took', time.time() - start, 'seconds'
            ### h2o_cmd.runInspect(key=hex_key)
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()
Example #50
 def test_poker_xlsx(self):
     parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                    path='xls/poker-hand-testing.xlsx',
                                    schema='put',
                                    timeoutSecs=120,
                                    pollTimeoutSecs=60)
     h2o_cmd.runRF(None, parseResult=parseResult, timeoutSecs=120)
Example #51
 def notest_C_RF_poker100(self):
     # RFview consumes cycles. Only retry once a second, to avoid slowing things down
     csvPathname = h2o.find_file('smalldata/poker/poker100')
     h2o_cmd.runRF(trees=6,
                   model_key="poker100",
                   timeoutSecs=10,
                   retryDelaySecs=1,
                   csvPathname=csvPathname)
Example #52
 def test_cs_training(self):
     h2o_cmd.runRF(trees=100,
                   depth=100,
                   csvPathname=h2o.find_file(
                       'smalldata/kaggle/creditsample-training.csv.gz'),
                   timeoutSecs=300,
                   response_variable=1)
     h2b.browseJsonHistoryAsUrlLastMatch("RFView")
Example #53
 def test_stedo_testing_data(self):
     csvPathname = h2o.find_file('smalldata/stego/stego_training.data')
     # Prediction class is the second column => class=1
     h2o_cmd.runRF(trees=50,
                   timeoutSecs=30,
                   csvPathname=csvPathname,
                   response_variable=1,
                   out_of_bag_error_estimate=1)
Example #54
 def test_badchars(self):
     print "badchars.csv has some 0x0 (<NUL>) characters."
     print "They were created by a dd that filled out to buffer boundary with <NUL>"
     print "They are visible using vim/vi"
     
     csvPathname = 'badchars.csv'
     parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
     h2o_cmd.runRF(parseResult=parseResult, trees=50, timeoutSecs=10)
Example #55
    def test_tree_view(self):
        parseResult = h2i.import_parse(
            bucket="smalldata", path="poker/poker1000", hex_key="poker1000.hex", schema="put"
        )
        h2o_cmd.runRF(parseResult=parseResult, trees=50, model_key="model0", timeoutSecs=10)

        for n in range(1):
            a = h2o_cmd.runRFTreeView(n=n, data_key="poker1000.hex", model_key="model0", timeoutSecs=10)
            print(h2o.dump_json(a))
Example #56
    def test_RFhhp(self):
        csvPathname = 'hhp.cut3.214.data.gz'

        print "RF start on ", csvPathname, "this will probably take 1 minute.."
        start = time.time()
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
        h2o_cmd.runRF(parseResult=parseResult, trees=10,
                timeoutSecs=400, retryDelaySecs=15)
        print "RF end on ", csvPathname, 'took', time.time() - start, 'seconds'
Example #57
    def test_rf_200x4_fvec(self):
        csvPathname = 'hhp.cut3.214.data.gz'

        print "RF start on ", csvPathname
        start = time.time()
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
        h2o_cmd.runRF(parseResult=parseResult, ntrees=3,
                timeoutSecs=1200, retryDelaySecs=15)
        print "RF end on ", csvPathname, 'took', time.time() - start, 'seconds'
Example #58
 def test_rf_params_rand1_fvec(self):
     csvPathname = 'poker/poker1000'
     for trial in range(10):
         # params is mutable; it's a module-level default not shown in this
         # snippet (see the sketch after this example)
         kwargs = params.copy()
         timeoutSecs = 180
         parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=timeoutSecs)
         h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
         print "Trial #", trial, "completed"