Example #1
    def test_sort_of_prostate_with_row_schmoo(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_prostate.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        totalRows = 10000
        write_syn_dataset(csvPathname, totalRows, headerData)

        for trial in range (5):
            rowData = rand_rowData()
            num = random.randint(4096, 10096)
            append_syn_dataset(csvPathname, num)
            totalRows += num
            start = time.time()

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            # On EC2 once we get to 30 trials or so, do we see polling hang? GC or spill of heap or ??
            kwargs = {'ntree': 5, 'depth': 5}
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
            h2o_cmd.runRFOnly(parseResult=parseResult, hex_key=hex_key, 
                timeoutSecs=10, pollTimeoutSecs=5, **kwargs)
            print "trial #", trial, "totalRows:", totalRows, "num:", num, "RF end on ", csvFilename, \
                'took', time.time() - start, 'seconds'
            ### h2o_cmd.runInspect(key=hex_key)
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()
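Example #1 calls write_syn_dataset, append_syn_dataset, and rand_rowData, which the listing does not include. A minimal sketch of what they might look like, assuming small random integers per column (the row layout is a guess, not the original helpers; Example #27 passes an extra rList argument, so a default parameter covers both call shapes):

import random

def rand_rowData(colCount=8):
    # hypothetical: small random integers for the columns after the row ID
    return [random.randint(0, 7) for c in range(colCount)]

def write_syn_dataset(csvPathname, rowCount, headerData, rList=None):
    # Example #27 passes a pre-built rList; Example #1 does not
    dsf = open(csvPathname, "w")
    dsf.write(headerData + "\n")
    for i in range(rowCount):
        rowData = rList if rList is not None else rand_rowData()
        dsf.write(str(i) + "," + ",".join(map(str, rowData)) + "\n")
    dsf.close()

def append_syn_dataset(csvPathname, num):
    # append num more random rows to an existing dataset
    with open(csvPathname, "a") as dsf:
        for i in range(num):
            dsf.write(str(i) + "," + ",".join(map(str, rand_rowData())) + "\n")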
Example #2
 def test_rf_strata_fail(self):
     csvPathname ='UCI/UCI-large/covtype/covtype.data'
     timeoutSecs = 60
     kwargs = {
         'response_variable': 54,
         'ntree': 50,
         'features': '',
         'depth': 2147483647,
         'stat_type': 'ENTROPY',
         'ignore': '',
         'class_weights': '1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0',
         'sampling_strategy': 'RANDOM',
         'strata_samples': 'undefined=undefined,undefined=undefined,undefined=undefined,undefined=undefined,undefined=undefined,undefined=undefined,undefined=undefined',
         'sample': '67',
         'out_of_bag_error_estimate': 1,
         'model_key': '',
         'bin_limit': 1024,
         'seed': 784834182943470027,
         'parallel': 1,
         'exclusive_split_limit': '', 
         'iterative_cm': 1,
         'use_non_local_data': 0,
     }
     parseResult = h2i.import_parse(bucket='datasets', path=csvPathname, schema='put')
     h2o_cmd.runRFOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
Example #3
File: test_rf1.py Project: selcukgun/h2o
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        parityPl = h2o.find_file('syn_scripts/parity.pl')

# two row dataset gets this. Avoiding it for now
# java.lang.ArrayIndexOutOfBoundsException: 1
# at hex.rf.Data.sample_fair(Data.java:149)

        # always match the run below!
        print "\nAssuming two row dataset is illegal. avoiding"

        for x in xrange (10,100,10):
            shCmdString = "perl " + parityPl + " 128 4 "+ str(x) + " quad"
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split())
            # algorithm for creating the path and filename is hardwired in parity.pl.
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        trees = 6
        timeoutSecs = 20
        # always match the gen above!
        # FIX! we fail if min is 3
        for x in xrange (10,100,10):
            sys.stdout.write('.')
            sys.stdout.flush()
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            parseResult = h2i.import_parse(path=csvPathname, schema='put')
            h2o_cmd.runRFOnly(parseResult=parseResult, trees=trees, timeoutSecs=timeoutSecs)

            trees += 10
            timeoutSecs += 2
Example #4
File: test_rf3.py Project: NidhiMehta/h2o
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # always match the run below!
        for x in [10000]:
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad"
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            # the algorithm for creating the path and filename is hardwired in parity.pl
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        # always match the gen above!
        trial = 1
        for x in xrange (1,10,1):
            sys.stdout.write('.')
            sys.stdout.flush()

            # just use one file for now
            csvFilename = "parity_128_4_" + str(10000) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            # broke out the put separately so we can iterate a test just on the RF
            parseKey = h2o_cmd.parseFile(None, csvPathname)

            h2o.verboseprint("Trial", trial)
            h2o_cmd.runRFOnly(parseKey=parseKey, trees=237, depth=45, timeoutSecs=120)

            # don't change tree count yet
            ## trees += 10
            ### timeoutSecs += 2
            trial += 1
Example #5
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad"
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        # always match the gen above!
        for trial in range (1,3):
            sys.stdout.write('.')
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)

            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRFOnly(parseResult=parseResult, trees=10000, depth=2, timeoutSecs=600, retryDelaySecs=3)
            print "RF #", trial,  "end on ", csvFilename, 'took', time.time() - start, 'seconds'

        print "Waiting 60 secs for TIME_WAIT sockets to go away"
        time.sleep(60)
Example #6
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad"
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        # always match the gen above!
        for trial in xrange (1,5,1):
            sys.stdout.write('.')
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            # broke out the put separately so we can iterate a test just on the RF
            key = h2o.nodes[0].put_file(csvPathname)
            parseKey = h2o.nodes[0].parse(key, key + "_" + str(trial) + ".hex")

            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRFOnly(parseKey=parseKey, trees=1000, depth=2, timeoutSecs=600, retryDelaySecs=3)
            print "RF #", trial,  "end on ", csvFilename, 'took', time.time() - start, 'seconds'

        print "Waiting 60 secs for TIME_WAIT sockets to go away"
        time.sleep(60)
Example #7
 def test_A_randomdata2(self):
     print "Using datagen1.csv as-is"
     csvPathname = 'datagen1.csv'
     # have to give the separator == comma...otherwise H2O can't deduce it on this dataset
     parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put',
         timeoutSecs=10, header=1, separator=44)
     h2o_cmd.runRFOnly(parseResult=parseResult, trees=1, response_variable=2, timeoutSecs=20)
Example #8
 def test_badchars(self):
     print "badchars.csv has some 0x0 (<NUL>) characters."
     print "They were created by a dd that filled out to buffer boundary with <NUL>"
     print "They are visible using vim/vi"
     
     csvPathname = 'badchars.csv'
     parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
     h2o_cmd.runRFOnly(parseResult=parseResult, trees=50, timeoutSecs=10)
Example #9
    def test_RFhhp(self):
        csvPathname = 'hhp.cut3.214.data.gz'

        print "RF start on ", csvPathname, "this will probably take 1 minute.."
        start = time.time()
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
        h2o_cmd.runRFOnly(parseResult=parseResult, trees=200,
                timeoutSecs=400, retryDelaySecs=15)
        print "RF end on ", csvPathnamegz, 'took', time.time() - start, 'seconds'
Example #10
    def test_tree_view_wrong_model(self):
        csvPathname = 'poker/poker1000'
        hex_key = csvPathname + ".hex"
        # tree view failed with poker1000, passed with iris
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10)
        h2o_cmd.runRFOnly(parseResult=parseResult, trees=1, model_key="model0", timeoutSecs=10)

        for n in range(1):
            # Give it the wrong model_key name. This caused a stack trace
            a = h2o_cmd.runRFTreeView(n=n, 
                data_key=hex_key, model_key="wrong_model_name", timeoutSecs=10, ignoreH2oError=True)
Example #11
 def test_loop_random_param_covtype(self):
     csvPathname = 'UCI/UCI-large/covtype/covtype.data'
     for trial in range(10):
         # params is mutable. These are the defaults.
         params = {'ntree': 13, 'parallel': 1, 'features': 7}
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15) * (kwargs['parallel'] and 1 or 3))
         parseResult = h2i.import_parse(bucket='datasets', path=csvPathname, schema='put')
         h2o_cmd.runRFOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
         print "Trial #", trial, "completed"
Example #12
    def test_RFhhp(self):
        csvPathname = 'hhp_107_01.data.gz'

        print "\nRF start on ", csvPathname, "this will probably take a minute.."
        start = time.time()
        kwargs = {
            'class_weights': '0=1,1=10',
        }

        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
        h2o_cmd.runRFOnly(parseResult=parseResult, trees=100, timeoutSecs=120, retryDelaySecs=10, **kwargs)
        print "RF end on ", csvPathname, 'took', time.time() - start, 'seconds'
Example #13
 def test_loop_random_param_poker1000(self):
     csvPathname = 'poker/poker1000'
     for trial in range(20):
         # params is mutable. These are the defaults.
         params = {'ntree': 19, 'parallel': 1}
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         timeoutSecs = 30 + kwargs['ntree'] * 10 * (kwargs['parallel'] and 1 or 5)
         parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=timeoutSecs)
         h2o_cmd.runRFOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
         print "Trial #", trial, "completed"
Example #14
 def test_rf_params_rand2(self):
     csvPathname = 'UCI/UCI-large/covtype/covtype.data'
     for trial in range(10):
         # params is mutable. These are the defaults.
         params = {'ntree': 13, 'parallel': 1, 'features': 7}
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15) * (kwargs['parallel'] and 1 or 3))
         start = time.time()
         parseResult = h2i.import_parse(bucket='datasets', path=csvPathname, schema='put')
         h2o_cmd.runRFOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
         elapsed = time.time()-start
         print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
Example #15
File: test_tnc3.py Project: selcukgun/h2o
    def test_tnc3_ignore(self):
        csvPathname = 'tnc3.csv'
        print "\n" + csvPathname
        hex_key = "tnc3.hex"
        ### h2b.browseTheCloud()

        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, schema='put', 
            timeoutSecs=10, header=1)
        print "Parse result['Key']:", parseResult['destination_key']
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(10)

        if 1==1:
            lenNodes = len(h2o.nodes)
            colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, hex_key, maxCol=10,
                incrementingResult=False, timeoutSecs=10)
            print "\ncolResultList after num swap", colResultList

        if (1==1):
            print "\nWe're not CM data getting back from RFView.json that we can check!. so look at the browser"
            print 'The good case with ignore="boat,body"'
            rfv = h2o_cmd.runRFOnly(parseResult=parseResult, trees=5, timeoutSecs=10, ignore="boat,body")

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        #******************
        if 1==0:
            colResultList = h2e.exec_expr_list_across_cols(lenNodes, charExprList, hex_key, maxCol=10,
                incrementingResult=False, timeoutSecs=10)
            print "\ncolResultList after char swap", colResultList

        if 1==1:
            print "\nNow the bad case (no ignore)"
            rfv = h2o_cmd.runRFOnly(parseResult=parseResult, trees=5, timeoutSecs=10)

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        if not h2o.browse_disable:
            ### print "\n <ctrl-C> to quit sleeping here"
            ### time.sleep(1500)
            pass
Example #16
File: test_nulls.py Project: selcukgun/h2o
    def test_file_with_nul_chars_inserted(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # we're going to insert <NUL> (0x0) in between every byte!
        # and then use it. move to a large file. I suppose
        # we could compare the results to a non-munged file with the same algo
        # I suppose the <NUL> are thrown away by parse, so doesn't change
        # chunk boundary stuff. (i.e. not interesting test for RF)
        csvFilename = 'poker1000'
        csvPathname = 'poker/' + csvFilename
        fullPathname = h2i.find_folder_and_filename('smalldata', csvPathname, returnFullPath=True)

        nulFilename = "syn_nul.data"
        nulPathname = SYNDATASETS_DIR + '/' + nulFilename

        piece_size = 103 # read odd-sized pieces; a <NUL> gets appended after each one

        with open(fullPathname, "rb") as in_file:
            with open(nulPathname, "wb") as out_file:
                while True:
                    piece = in_file.read(piece_size)
                    if piece == "":
                        break # end of file

                    # we could just extend piece?
                    # append a <NUL> after the piece
                    withNuls = bytearray(piece)
                    # FIX! we'll eventually stick a <NUL> after every byte!
                    withNuls.extend(bytearray.fromhex('00'))
                    out_file.write(withNuls)


        for trials in xrange(1,2):
            trees = 6
            for x in xrange (161,240,40):
                y = 10000 * x
                print "\nTrial:", trials, ", y:", y

                timeoutSecs = 20 + 5*(len(h2o.nodes))
                model_key = csvFilename + "_" + str(trials)

                parseResult = h2i.import_parse(path=nulPathname, schema='put')
                h2o_cmd.runRFOnly(parseResult=parseResult, trees=trees, model_key=model_key, timeoutSecs=timeoutSecs, retryDelaySecs=1)
                sys.stdout.write('.')
                sys.stdout.flush()

                # partial clean, so we can look at tree builds from this run if hang
                h2o.clean_sandbox_stdout_stderr()
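The FIX! comment in Example #16 says the eventual goal is a <NUL> after every byte rather than after every piece. A hypothetical standalone helper for that per-byte interleaving (not part of the original test; file names are placeholders):

def interleave_nuls(inPathname, outPathname, piece_size=4096):
    # emit every input byte followed by a 0x0 <NUL> byte
    with open(inPathname, "rb") as in_file, open(outPathname, "wb") as out_file:
        while True:
            piece = in_file.read(piece_size)
            if not piece:
                break  # end of file
            withNuls = bytearray()
            for b in bytearray(piece):
                withNuls.append(b)
                withNuls.append(0x0)
            out_file.write(withNuls)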
Example #17
    def test_RF_1000trees(self):
        # NAs cause CM to zero..don't run for now
        ### csvPathnamegz = h2o.find_file('smalldata/hhp_9_17_12.predict.100rows.data.gz')
        s3bucket = self.s3_default_bucket()
        s3dataset = "covtype20x.data.gz"
        s3dataset = "covtype.data"
        s3dataset = "covtype200x.data.gz"
        s3dataset = "covtype50x.data"
        s3dataset = "covtype100x.data"
        s3dataset = "covtype.20k.data"

        s3dataset = "covtype.data"

        start = time.time()
        parseKey = h2o_cmd.parseS3File(bucket=s3bucket, filename=s3dataset, timeoutSecs=14800)
        print "Parsing took {0}".format(time.time() - start)

        start = time.time()
        rf_train = h2o_cmd.runRFOnly(
            parseKey=parseKey,
            ntree=100,
            timeoutSecs=14800,
            bin_limit=20000,
            out_of_bag_error_estimate=1,
            gini=0,
            depth=100,
            exclusive_split_limit=0,
        )
        print "Computation took {0} sec".format(time.time() - start)
        print h2o_rf.pp_rf_result(rf_train)
Example #18
    def test_from_import(self):
        timeoutSecs = 500
        csvFilenameAll = [
            "covtype.data",
            "covtype20x.data",
            ]

        # pop open a browser on the cloud
        # h2b.browseTheCloud()

        for csvFilename in csvFilenameAll:
            # creates csvFilename.hex from file in importFolder dir 
            hex_key = csvFilename + '.hex'
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="standard/" + csvFilename, schema='put',
                hex_key=hex_key, timeoutSecs=500)
            if not h2o.beta_features:
                print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])

            if not h2o.beta_features:
                RFview = h2o_cmd.runRFOnly(trees=1,depth=25,parseResult=parseResult, timeoutSecs=timeoutSecs)

            ## h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            ## time.sleep(10)

            # just to make sure we test this
            h2i.delete_keys_at_all_nodes(pattern=hex_key)
Example #19
File: test_hdfs.py Project: NidhiMehta/h2o
    def test_B_hdfs_files(self):
        print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
        print "\nYou can try running as hduser/hduser if fail"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameAll = [
            "TEST-poker1000.csv",
            "leads.csv",
            "and-testing.data",
            "arcene2_train.both",
            "arcene_train.both",
            # these can't RF ..output classes not integer?
            # "bestbuy_test.csv",
            # "bestbuy_train.csv",
            "covtype.data",
            "covtype.4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype.13x.data",
            "covtype.13x.shuffle.data",
            # "covtype.169x.data",
            # "prostate_2g.csv",
            # "prostate_long.csv.gz",
            "prostate_long_1G.csv",
            "hhp.unbalanced.012.1x11.data.gz",
            "hhp.unbalanced.012.data.gz",
            "hhp.unbalanced.data.gz",
            "hhp2.os.noisy.0_1.data",
            "hhp2.os.noisy.9_4.data",
            "hhp_9_14_12.data",
            # "poker_c1s1_testing_refresh.csv",
            # "3G_poker_shuffle",
            # "billion_rows.csv.gz",
            # "poker-hand.1244M.shuffled311M.full.txt",
        ]

        # pick 8 randomly!
        if (1==0):
            csvFilenameList = random.sample(csvFilenameAll,8)
        # Alternatively: do the list in order! Note the order is easy to hard
        else:
            csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        h2b.browseTheCloud()

        timeoutSecs = 200
        # save the first, for all comparisons, to avoid slow drift with each iteration
        firstglm = {}
        h2i.setupImportHdfs()
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir 
            print "Loading", csvFilename, 'from HDFS'
            parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename, path='/datasets', timeoutSecs=1000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "parse result:", parseKey['destination_key']

            print "\n" + csvFilename
            start = time.time()
            RFview = h2o_cmd.runRFOnly(trees=1,parseKey=parseKey,timeoutSecs=2000)
Example #20
    def test_rf_params_rand2(self):
        csvPathname = 'UCI/UCI-large/covtype/covtype.data'
        kwargs = {
            'response_variable': 54, 
            'features': 7, 
            'sampling_strategy': 'STRATIFIED_LOCAL', 
            'out_of_bag_error_estimate': 1, 
            'strata_samples': '1=10,2=99,3=99,4=99,5=99,6=99,7=99', 
            'bin_limit': None, 
            'seed': '11111', 
            'model_key': '012345', 
            'ntree': 13, 
            'parallel': 1
        }
        for trial in range(2):

            # adjust timeoutSecs with the number of trees
            timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15) * (kwargs['parallel'] and 1 or 3))
            start = time.time()
            parseResult = h2i.import_parse(bucket='datasets', path=csvPathname, schema='put')
            rfv = h2o_cmd.runRFOnly(parseResult=parseResult, 
                timeoutSecs=timeoutSecs, retryDelaySecs=1, csvPathname=csvPathname, **kwargs)
            elapsed = time.time()-start

            cm = rfv['confusion_matrix']
            classification_error = cm['classification_error']
            rows_skipped = cm['rows_skipped']

            # just want to catch the nan case when all rows are skipped
            self.assertLess(rows_skipped, 581012)
            self.assertLess(classification_error, 100) # error if nan
            print "Trial #", trial, "completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
Example #21
    def test_putfile_a5m(self):
        timeoutSecs = 500
        csvFilenameList = [
            # use different names for each parse 
            # doesn't fail if gzipped?
            ("a5m.csv", 'A', None),
            ("a5m.csv", 'B', None),
            ("a5m.csv", 'C', None),
            ]
        # pop open a browser on the cloud
        h2b.browseTheCloud()

        for (csvFilename, key, trees) in csvFilenameList:
            csvPathname = csvFilename

            # creates csvFilename and csvFilename.hex  keys
            parseResult = h2i.import_parse(path=csvPathname, schema='put', timeoutSecs=500)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])

            print "\n" + csvFilename
            start = time.time()
            # constrain depth to 25
            if trees is not None:
                RFview = h2o_cmd.runRFOnly(trees=trees,depth=25,parseResult=parseResult,
                    timeoutSecs=timeoutSecs)

            h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            # wait in case it recomputes it
            time.sleep(10)

            sys.stdout.write('.')
            sys.stdout.flush() 
Example #22
    def test_B_hdfs_files(self):
        print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
        print "\nYou can try running as hduser/hduser if fail"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameList = [
            "airlines_88_08_100lines.csv",
        ]

        h2b.browseTheCloud()

        timeoutSecs = 200
        # save the first, for all comparisons, to avoid slow drift with each iteration
        firstglm = {}
        h2i.setupImportHdfs()
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir 
            print "Loading", csvFilename, 'from HDFS'
            parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename, path='/datasets', timeoutSecs=1000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "parse result:", parseKey['destination_key']

            print "\n" + csvFilename
            start = time.time()
            RFview = h2o_cmd.runRFOnly(trees=1,parseKey=parseKey,timeoutSecs=2000)
Example #23
    def test_rf_allyears2k_oobe(self):
        importFolderPath = '/home/0xdiag/datasets'
        csvFilename = 'allyears2k.csv'
        csvPathname = importFolderPath + "/" + csvFilename
        h2i.setupImportFolder(None, importFolderPath)
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=60)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        for trial in range(10):
            kwargs = paramDict
            timeoutSecs = 30 + kwargs['ntree'] * 2

            start = time.time()
            # randomize the node
            node = h2o.nodes[random.randint(0,len(h2o.nodes)-1)]
            rfView = h2o_cmd.runRFOnly(node=node, parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            classification_error = rfView['confusion_matrix']['classification_error']
            rows_skipped = rfView['confusion_matrix']['rows_skipped']
            mtry = rfView['mtry']
            mtry_nodes = rfView['mtry_nodes']
            print "mtry:", mtry
            print "mtry_nodes:", mtry_nodes
            self.assertEqual(classification_error, 0, "Should have zero oobe error")
            self.assertEqual(rows_skipped, 39, "Should have exactly 39 rows skipped")

            print "Trial #", trial, "completed"
Example #24
    def test_B_putfile_files(self):
        timeoutSecs = 500

        #    "covtype169x.data",
        #    "covtype.13x.shuffle.data",
        #    "3G_poker_shuffle"
        #    "covtype20x.data", 
        #    "billion_rows.csv.gz",
        csvFilenameList = [
            ("covtype.data", 'UCI/UCI-large/covtype/covtype.data', 1),
            ]
        # pop open a browser on the cloud
        h2b.browseTheCloud()

        for (csvFilename, csvPathname, trees) in csvFilenameList:
            parseResult = h2i.import_parse(bucket='datasets', path=csvPathname, timeoutSecs=500, schema='put')
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']
            # We should be able to see the parse result?
            inspect2 = h2o_cmd.runInspect(key=parseResult['destination_key'])

            print "\n" + csvFilename
            start = time.time()
            # constrain depth to 25
            if trees is not None:
                RFview = h2o_cmd.runRFOnly(trees=trees,depth=25,parseResult=parseResult,
                    timeoutSecs=timeoutSecs)

            sys.stdout.write('.')
            sys.stdout.flush() 
Example #25
 def test_rf_params_rand2(self):
     csvPathname = 'space_shuttle_damage.csv'
     for trial in range(10):
         # params is mutable. These are the defaults.
         params = {
             'sample': 80,
             'stat_type': 'ENTROPY',
             'class_weights': 'yes=1000',
             'parallel': 1, 
             'response_variable': 'damage', 
             'ignore': 'flight',
             'ntree': 25,
             'out_of_bag_error_estimate': 1,
         }
         print "params:", params 
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         print "params:", params 
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         # seems ec2 can be really slow
         timeoutSecs = 30 + 15 * (kwargs['parallel'] and 6 or 10)
         start = time.time()
         parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
         rfView = h2o_cmd.runRFOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
         elapsed = time.time()-start
         # just to get the list of per class errors
         (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, noPrint=True)
         print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs), "\n"
         # why does this vary between 22 and 23
         self.assertAlmostEqual(totalScores,23,delta=1) # class 1 is 'yes'
         self.assertLess(classErrorPctList[0],95) # class 0 is 'no'
         self.assertLess(classErrorPctList[1],29) # class 1 is 'yes'
         self.assertLess(classification_error,61)
Example #26
    def test_F_no_mc_loop(self):
        print "\nwith flatfile, with multicast disabled, and RF, 5 trials"
        allAcceptIptables()
        multicastDropReceiveIptables()
        showIptables()

        for x in range(1, 6): # 5 trials, matching the print above
            h2o_hosts.build_cloud_with_hosts(nodes_per_host, use_flatfile=True)
            parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker1000', schema='put')
            h2o_cmd.runRFOnly(parseResult=parseResult, trees=50, timeoutSecs=10)
            h2o.tear_down_cloud()
            h2o.verboseprint("Waiting", nodes_per_host,
                "seconds to avoid OS sticky port problem")
            time.sleep(nodes_per_host)
            print "Trial", x
            sys.stdout.write('.')
            sys.stdout.flush()
Example #27
    def test_rf_sample(self):
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED = 
        random.seed(SEED)
        print "\nUsing random seed:", SEED

        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_ints.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"

        print "just going to see if rf is doing the sampling right for one tree on 100000 rows"
        rList = rand_rowData()
        totalRows = 10000
        write_syn_dataset(csvPathname, totalRows, headerData, rList)

        for trial in range (2):
            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            key = csvFilename + "_" + str(trial)
            key2 = csvFilename + "_" + str(trial) + ".hex"

            start = time.time()
            timeoutSecs = 30
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key=key, key2=key2, 
                timeoutSecs=timeoutSecs, pollTimeoutSecs=30, header=1)
            print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'

            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            kwargs = {'sample': 75, 'depth': 25, 'ntree': 1}
            start = time.time()
            rfv = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=30, **kwargs)
            elapsed = time.time() - start
            print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            print "trial #", trial, "totalRows:", totalRows, "parse end on ", csvFilename, \
                'took', time.time() - start, 'seconds'

            cm = rfv['confusion_matrix']
            rows_skipped = cm['rows_skipped']

            # the sample is what we trained on. The CM for one tree is what's left
            # it's not perfectly accurate..allow +-2
            # NEW: after the # of trees is big enough, all the data is used, so we really can't compare
            # any more
            sample = kwargs['sample']
            rowsUsed = sample * totalRows/100
            rowsNotUsed = totalRows - rowsUsed

            ## print "Allowing delta of 0-2"
            ## print "predicted CM rows (rowsNotUsed):", rowsNotUsed, "actually:", totalRows - rows_skipped, "rows_skipped:", rows_skipped
            ## self.assertAlmostEqual(rowsNotUsed, totalRows - rows_skipped, delta=2)

            h2o.check_sandbox_for_errors()
Example #28
    def test_parse_bounds_libsvm(self):
        # just do the import folder once

        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        csvFilenameList = [
            ("mnist_train.svm", "cM", 30, 1),
            # FIX! fails KMeansScore
            # not integer output
            # ("colon-cancer.svm",   "cA", 30, 1),
            ("connect4.svm",       "cB", 30, 1),
            ("syn_6_1000_10.svm",  "cK", 30, 1),
            ("syn_0_100_1000.svm", "cL", 30, 1),
            ("mushrooms.svm",      "cG", 30, 1),
            ("duke.svm",           "cD", 30, 1),
            # too many features? 150K inspect timeout?
            # ("E2006.train.svm",    "cE", 30, 1),
            ("gisette_scale.svm",  "cF", 30, 1),
            ("news20.svm",         "cH", 30, 1),

            ("tmc2007_train.svm",  "cJ", 30, 1),
            ("covtype.binary.svm", "cC", 30, 1),
            # normal csv
        ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        # h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvFilename, hex_key, timeoutSecs, resultMult) in csvFilenameList:
            # have to import each time, because h2o deletes source after parse
            bucket = "home-0xdiag-datasets"
            csvPathname = "libsvm/" + csvFilename

            # PARSE******************************************
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=2000)
            print csvPathname, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvFilename)

            # RF******************************************
            kwargs = {
                'ntree': 6,
                'response_variable': 0,
            }

            timeoutSecs = 600
            start = time.time()
            rf = h2o_cmd.runRFOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "rf end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
Example #29
    def test_B_randomdata2_1_lineend(self):
        print "Using smalldata/datagen1.csv to create", SYNDATASETS_DIR, "/datagen1.csv with different line ending" 
        # change lineend, case 1
        csvPathname1 = h2o.find_file('smalldata/datagen1.csv')
        csvPathname2 = SYNDATASETS_DIR + '/datagen1_crlf.csv'
        infile = open(csvPathname1, 'r') 
        outfile = open(csvPathname2,'w') # existing file gets erased

        # assume all the test files are unix lineend. 
        # I guess there shouldn't be any "in-between" ones
        # okay if they change I guess.
        for line in infile.readlines():
            outfile.write(line.strip("\n") + "\r")
        infile.close()
        outfile.close()

        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname2, timeoutSecs=10, header=1, separator=44)
        h2o_cmd.runRFOnly(parseKey=parseKey, trees=1, response_variable=2, timeoutSecs=10, csvPathname=csvPathname2)
Example #30
    def test_rf_covtype_fvec(self):
        importFolderPath = "/home/0xdiag/datasets/standard"
        csvFilename = 'covtype.data'
        csvPathname = importFolderPath + "/" + csvFilename
        key2 = csvFilename + ".hex"
        h2i.setupImportFolder(None, importFolderPath)

        print "\nUsing header=0 on the normal covtype.data"
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2,
            header=0, timeoutSecs=180)

        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

        rfViewInitial = []
        for jobDispatch in range(1):
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            kwargs = paramDict.copy()
            timeoutSecs = 30 + kwargs['ntree'] * 20
            start = time.time()
            # do oobe
            kwargs['out_of_bag_error_estimate'] = 1
            kwargs['model_key'] = "model_" + str(jobDispatch)
            
            # don't poll for fvec 
            rfResult = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, noPoll=True, rfView=False, **kwargs)
            elapsed = time.time() - start
            print "RF dispatch end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            print h2o.dump_json(rfResult)
            # FIX! are these already in there?
            rfView = {}
            rfView['data_key'] = key2
            rfView['model_key'] = kwargs['model_key']
            rfView['ntree'] = kwargs['ntree']
            rfViewInitial.append(rfView)

            print "rf job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "\njobDispatch #", jobDispatch

            h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=180, pollTimeoutSecs=120, retryDelaySecs=5)


        # we saved the initial response
        # if we do another poll they should be done now, and it's better to get the results
        # that way rather than from inspect (to match what simpleCheckGLM expects)
        print "rfViewInitial", rfViewInitial
        for rfView in rfViewInitial:
            print "Checking completed job:", rfView
            print "rfView", h2o.dump_json(rfView)
            data_key = rfView['data_key']
            model_key = rfView['model_key']
            ntree = rfView['ntree']
            # allow it to poll to complete
            rfViewResult = h2o_cmd.runRFView(None, data_key, model_key, ntree=ntree, timeoutSecs=60, noPoll=False)
Example #31
    def test_RF_mnist_reals(self):
        importFolderPath = "/home/0xdiag/datasets/mnist"
        csvFilelist = [
            # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz",    600),
            # ("a.csv", "b.csv", 60),
            # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz",    600),
            ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600),
        ]
        # IMPORT**********************************************
        # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list
        importFolderResult = h2i.setupImportFolder(None, importFolderPath)
        ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
        if 'files' in importFolderResult:
            succeededList = importFolderResult['files']
        else:
            succeededList = importFolderResult['succeeded']

        ### print "succeededList:", h2o.dump_json(succeededList)

        self.assertGreater(len(succeededList), 1,
                           "Should see more than 1 file in the import?")
        # why does this hang? can't look at storeview after import?
        print "\nTrying StoreView after the import folder"
        h2o_cmd.runStoreView(timeoutSecs=30)

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None,
                                                 testCsvFilename,
                                                 importFolderPath,
                                                 key2=testKey2,
                                                 timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0  # first column is pixel value
            print "y:"
            x = h2o_glm.goodXFromColumnInfo(y,
                                            key=parseKey['destination_key'],
                                            timeoutSecs=300)

            # PARSE train****************************************
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None,
                                                 trainCsvFilename,
                                                 importFolderPath,
                                                 key2=trainKey2,
                                                 timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            # RF+RFView (train)****************************************
            print "This is the 'ignore=' we'll use"
            ignore_x = h2o_glm.goodXFromColumnInfo(
                y,
                key=parseKey['destination_key'],
                timeoutSecs=300,
                forRF=True)
            ntree = 10
            params = {
                'response_variable': 0,
                'ignore': ignore_x,
                'ntree': ntree,
                'iterative_cm': 1,
                'out_of_bag_error_estimate': 1,
                # 'data_key': 'mnist_reals_training.csv.hex',
                'features': 28,  # fixed, because we ignore some cols, which would change the sqrt(cols) calc
                'exclusive_split_limit': None,
                'depth': 2147483647,
                'stat_type': 'ENTROPY',
                'sampling_strategy': 'RANDOM',
                'sample': 67,
                # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77',
                'model_key': 'RF_model',
                'bin_limit': 1024,
                'seed': 784834182943470027,
                'parallel': 1,
                'use_non_local_data': 0,
                'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0',
            }

            kwargs = params.copy()
            print "Trying rf"
            timeoutSecs = 1800
            start = time.time()
            rfView = h2o_cmd.runRFOnly(parseKey=parseKey, rfView=True,
                timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs)
            elapsed = time.time() - start
            print "RF completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o_rf.simpleCheckRFView(None, rfView, **params)
            modelKey = rfView['model_key']

            # RFView (score on test)****************************************
            start = time.time()
            # FIX! 1 on oobe causes stack trace?
            kwargs = {'response_variable': y}
            rfView = h2o_cmd.runRFView(data_key=testKey2,
                                       model_key=modelKey,
                                       ntree=ntree,
                                       out_of_bag_error_estimate=0,
                                       timeoutSecs=60,
                                       pollTimeoutSecs=60,
                                       noSimpleCheck=False,
                                       **kwargs)
            elapsed = time.time() - start
            print "RFView in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params)
            self.assertAlmostEqual(
                classification_error,
                0.03,
                delta=0.5,
                msg="Classification error %s differs too much" %
                classification_error)
            # Predict (on test)****************************************
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(
                model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "generate_predictions in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
Example #32
    def test_RF_mnist_both(self):
        importFolderPath = "/home/0xdiag/datasets/mnist"
        csvFilelist = [
            # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027),
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None,
             '*mnist*gz'),
            # to see results a 2nd time
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None,
             '*mnist*gz'),
        ]
        # IMPORT**********************************************
        # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list
        importFolderResult = h2i.setupImportFolder(None, importFolderPath)
        ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
        succeededList = importFolderResult['files']
        ### print "succeededList:", h2o.dump_json(succeededList)

        self.assertGreater(len(succeededList), 1,
                           "Should see more than 1 file in the import?")
        # why does this hang? can't look at storeview after import?
        print "\nTrying StoreView after the import folder"
        h2o_cmd.runStoreView(timeoutSecs=30)

        trial = 0
        allDelta = []
        for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed,
             parsePattern) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None,
                                                 testCsvFilename,
                                                 importFolderPath,
                                                 key2=testKey2,
                                                 timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0  # first column is pixel value
            print "y:"
            x = h2o_glm.goodXFromColumnInfo(y,
                                            key=parseKey['destination_key'],
                                            timeoutSecs=300)

            # PARSE train****************************************
            print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training"
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None,
                                                 parsePattern,
                                                 importFolderPath,
                                                 key2=trainKey2,
                                                 timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            # RF+RFView (train)****************************************
            print "This is the 'ignore=' we'll use"
            ignore_x = h2o_glm.goodXFromColumnInfo(
                y,
                key=parseKey['destination_key'],
                timeoutSecs=300,
                forRF=True)
            ntree = 10
            params = {
                'response_variable': 0,
                'ignore': ignore_x,
                'ntree': ntree,
                'iterative_cm': 1,
                'out_of_bag_error_estimate': 1,
                # 'data_key': 'mnist_training.csv.hex',
                'features': 28,  # fixed, because we ignore some cols, which would change the sqrt(cols) calc
                'exclusive_split_limit': None,
                'depth': 2147483647,
                'stat_type': 'ENTROPY',
                'sampling_strategy': 'RANDOM',
                'sample': 67,
                # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77',
                'model_key': 'RF_model',
                'bin_limit': 1024,
                # 'seed': 784834182943470027,
                'parallel': 1,
                'use_non_local_data': 0,
                'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0',
            }

            if rfSeed is None:
                params['seed'] = random.randint(0, sys.maxint)
            else:
                params['seed'] = rfSeed
            print "RF seed:", params['seed']

            kwargs = params.copy()
            print "Trying rf"
            timeoutSecs = 1800
            start = time.time()
            rfView = h2o_cmd.runRFOnly(parseKey=parseKey, rfView=False,
                timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs)
            elapsed = time.time() - start
            print "RF completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o_rf.simpleCheckRFView(None, rfView, **params)
            modelKey = rfView['model_key']

            # RFView (score on test)****************************************
            start = time.time()
            # FIX! 1 on oobe causes stack trace?
            kwargs = {'response_variable': y}
            rfView = h2o_cmd.runRFView(data_key=testKey2,
                                       model_key=modelKey,
                                       ntree=ntree,
                                       out_of_bag_error_estimate=0,
                                       timeoutSecs=60,
                                       pollTimeoutSecs=60,
                                       noSimpleCheck=False,
                                       **kwargs)
            elapsed = time.time() - start
            print "RFView in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params)
            print "classification error is expected to be low because we included the test data in with the training!"
            self.assertAlmostEqual(
                classification_error,
                0.0004,
                delta=0.0003,
                msg="Classification error %s differs too much" %
                classification_error)

            leaves = rfView['trees']['leaves']
            # Expected values are from this case:
            # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027),
            leavesExpected = {'min': 4996, 'mean': 5064.1, 'max': 5148}
            for l in leaves:
                # self.assertAlmostEqual(leaves[l], leavesExpected[l], delta=10, msg="leaves %s %s %s differs too much" % (l, leaves[l], leavesExpected[l]))
                delta = ((leaves[l] - leavesExpected[l]) / float(leaves[l])) * 100
                d = "seed: %s leaves %s %s %s pct. different %s" % (
                    params['seed'], l, leaves[l], leavesExpected[l], delta)
                print d
                allDelta.append(d)

            depth = rfView['trees']['depth']
            depthExpected = {'min': 21, 'mean': 23.8, 'max': 25}
            for l in depth:
                # self.assertAlmostEqual(depth[l], depthExpected[l], delta=1, msg="depth %s %s %s differs too much" % (l, depth[l], depthExpected[l]))
                delta = ((depth[l] - depthExpected[l]) / float(depth[l])) * 100  # divide by depth, not leaves
                d = "seed: %s depth %s %s %s pct. different %s" % (
                    params['seed'], l, depth[l], depthExpected[l], delta)
                print d
                allDelta.append(d)

            # Predict (on test)****************************************
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(
                model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "generate_predictions in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # Done *******************************************************
        print "\nShowing the results again from all the trials, to see variance"

        for d in allDelta:
            print d
Example #33
    def test_rf_syn_gz_cat(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # summary fails with 100000 cols
            (10, 50, 5000, 'cA', 600),
            (50, 50, 5000, 'cB', 600),
            (100, 50, 5000, 'cC', 600),
            (500, 50, 5000, 'cD', 600),
            (1000, 50, 5000, 'cE', 600),
            (5000, 50, 5000, 'cF', 600),
            (6000, 50, 5000, 'cF', 600),
            (7000, 50, 5000, 'cF', 600),
            (8000, 50, 5000, 'cF', 600),
            (9000, 50, 5000, 'cF', 600),
            (10000, 50, 5000, 'cF', 600),
        ]

        ### h2b.browseTheCloud()

        paramDict = {
            'class_weight': None,
            'ntree': 10,
            'model_key': 'model_keyA',
            'out_of_bag_error_estimate': 1,
            'stat_type': 'GINI',
            'depth': 2147483647,
            'bin_limit': 10000,
            'parallel': 1,
            'sample': 80,
            'exclusive_split_limit': 0,
        }

        trial = 0
        for (FILEREPL, rowCount, colCount, key2, timeoutSecs) in tryList:
            trial += 1

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            parseKey = make_datasetgz_and_parse(SYNDATASETS_DIR, csvFilename,
                                                key2, rowCount, colCount,
                                                FILEREPL, SEEDPERFILE,
                                                timeoutSecs)

            paramDict['response_variable'] = colCount - 1
            paramDict['features'] = 9
            paramDict['seed'] = random.randint(0, sys.maxint)
            kwargs = paramDict.copy()

            start = time.time()
            rfView = h2o_cmd.runRFOnly(parseKey=parseKey,
                                       timeoutSecs=timeoutSecs,
                                       **kwargs)
            elapsed = time.time() - start
            print "RF end on ", parseKey['python_source_key'], 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            classification_error = rfView['confusion_matrix'][
                'classification_error']
            ### self.assertLess(classification_error, 0.7, "Should have full classification error <0.7")

            algo = "RF "
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs. trees: {:d} Error: {:6.2f} num_rows: {:d} num_cols: {:d} value_size_bytes: {:d}'.format(
                len(h2o.nodes), tryHeap, algo, parseKey['python_source_key'], elapsed, kwargs['ntree'],
                classification_error, parseKey['num_rows'], parseKey['num_cols'], parseKey['value_size_bytes'])
            print l
            h2o.cloudPerfH2O.message(l)

            print "Trial #", trial, "completed"
Example #34
    def rf_covtype_train_oobe(self, csvFilename, checkExpectedResults=True):
        # the expected results are only for the shuffled version
        # since getting 10% samples etc of the smallish dataset will vary between 
        # shuffled and non-shuffled datasets
        importFolderPath = "/home/0xdiag/datasets/standard"
        csvPathname = importFolderPath + "/" + csvFilename
        key2 = csvFilename + ".hex"
        h2i.setupImportFolder(None, importFolderPath)

        print "\nUsing header=0 on", csvFilename
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2,
            header=0, timeoutSecs=180)

        inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
        print "\n" + csvPathname, \
            "    num_rows:", "{:,}".format(inspect['num_rows']), \
            "    num_cols:", "{:,}".format(inspect['num_cols'])

        # how many rows for each pct?
        num_rows = inspect['num_rows']
        pct10 = int(num_rows * .1)
        rowsForPct = [i * pct10 for i in range(0,11)]
        # this can be slightly less than 10%
        last10 = num_rows - rowsForPct[9]
        rowsForPct[10] = num_rows
        # use mod below for picking "rows-to-do" in case we do more than 9 trials
        # use 10 if 0 just to see (we copy 10 to 0 below)
        rowsForPct[0] = rowsForPct[10]

        # 0 isn't used
        expectTrainPctRightList = [0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79]
        expectScorePctRightList = [0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78]

        print "Creating the key of the last 10% data, for scoring"
        dataKeyTest = "rTest"
        # start at 90% rows + 1
        execExpr = dataKeyTest + " = slice(" + key2 + "," + str(rowsForPct[9]+1) + ")"
        h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTest, timeoutSecs=10)

        # keep the 0 entry empty
        actualTrainPctRightList = [0]
        actualScorePctRightList = [0]
        
        # don't use the smaller samples..bad error rates, plus for sorted covtype, you can get just one class!
        for trial in range(8,9):
            # always slice from the beginning
            rowsToUse = rowsForPct[trial%10] 
            resultKey = "r_" + csvFilename + "_" + str(trial)
            execExpr = resultKey + " = slice(" + key2 + ",1," + str(rowsToUse) + ")"
            h2o_exec.exec_expr(None, execExpr, resultKey=resultKey, timeoutSecs=10)
            # hack so the RF will use the sliced result
            # FIX! don't use the sliced bit..use the whole data for rf training below
            ### parseKey['destination_key'] = resultKey

            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            kwargs = paramDict.copy()
            timeoutSecs = 30 + kwargs['ntree'] * 20
            # do oobe
            kwargs['out_of_bag_error_estimate'] = 1
            kwargs['model_key'] = "model_" + csvFilename + "_" + str(trial)
            # kwargs['model_key'] = "model"
            # double check the rows/cols
            inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
            h2o_cmd.infoFromInspect(inspect, "going into RF")
            
            start = time.time()
            rfv = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            oobeTrainPctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
            if checkExpectedResults:
                self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
                    msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
                        ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=ALLOWED_DELTA)
            actualTrainPctRightList.append(oobeTrainPctRight)

            print "Now score on the last 10%. Note this is silly if we trained on 100% of the data"
            print "Or sorted by output class, so that the last 10% is the last few classes"
            # pop the stuff from kwargs that were passing as params
            model_key = rfv['model_key']
            kwargs.pop('model_key',None)

            data_key = rfv['data_key']
            kwargs.pop('data_key',None)

            ntree = rfv['ntree']
            kwargs.pop('ntree',None)

            kwargs['iterative_cm'] = 1
            kwargs['no_confusion_matrix'] = 0

            # do full scoring
            kwargs['out_of_bag_error_estimate'] = 0
            # double check the rows/cols
            inspect = h2o_cmd.runInspect(key=dataKeyTest)
            h2o_cmd.infoFromInspect(inspect, "dataKeyTest")

            rfvScoring = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree,
                timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)

            h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)

            fullScorePctRight = 100 * (1.0 - rfvScoring['confusion_matrix']['classification_error'])

            if checkExpectedResults:
                self.assertAlmostEqual(fullScorePctRight,expectScorePctRightList[trial],
                    msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
                        ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=ALLOWED_DELTA)
            actualScorePctRightList.append(fullScorePctRight)

            print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/num_rows), "pct. of all rows"

        actualDelta = [abs(a-b) for a,b in zip(expectTrainPctRightList, actualTrainPctRightList)]
        niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
        print "maybe should update with actual. Remove single quotes"  
        print "actualTrainPctRightList =", niceFp
        niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
        print "actualDelta =", niceFp

        actualDelta = [abs(a-b) for a,b in zip(expectScorePctRightList, actualScorePctRightList)]
        niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
        print "maybe should update with actual. Remove single quotes"  
        print "actualScorePctRightList =", niceFp
        niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
        print "actualDelta =", niceFp

        # return the last rfv done during training
        return rfv
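rf_covtype_train_oobe reads a module-level paramDict and ALLOWED_DELTA that this excerpt doesn't show. A plausible sketch of that context, with hypothetical values loosely inferred from the other covtype examples here (the 30 + ntree * 20 timeout and the "this was with 10 trees" note in a later example suggest a small tree count):

# assumed module-level context for rf_covtype_train_oobe (hypothetical values)
ALLOWED_DELTA = 0.2           # tolerance, in percentage points, for the assertAlmostEqual checks
paramDict = {
    'response_variable': 54,  # covtype's class column
    'ntree': 10,
    'parallel': 1,
    'bin_limit': 1024,
    'stat_type': 'ENTROPY',
}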
Example #35
    def test_rf_big1_nopoll(self):
        csvFilename = 'hhp_107_01.data.gz'
        csvPathname = h2o.find_file("smalldata/" + csvFilename)
        key2 = csvFilename + ".hex"

        print "\n" + csvPathname

        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                     key2=key2,
                                     timeoutSecs=15)
        firstRfView = None
        # dispatch multiple jobs back to back
        for jobDispatch in range(3):
            start = time.time()
            kwargs = {}
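            # OVERWRITE_RF_MODEL is assumed to be a module-level toggle (not shown in this excerpt);
            # when true, every dispatch reuses one model key, so each job's result overwrites the last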
            if OVERWRITE_RF_MODEL:
                print "Since we're overwriting here, we have to wait for each to complete nopoll=False"
                model_key = 'RF_model'
            else:
                model_key = 'RF_model' + str(jobDispatch)

            print "Change the number of trees, while keeping the rf model key name the same"
            print "Checks that we correctly overwrite previous rf model"
            if OVERWRITE_RF_MODEL:
                kwargs['ntree'] = 7 + jobDispatch
            else:
                kwargs['ntree'] = 7
                # don't change the seed if we're overwriting the model. It should get
                # different results just from changing the tree count
                kwargs['seed'] = random.randint(0, sys.maxint)

            # FIX! what model keys do these get?
            randomNode = h2o.nodes[random.randint(0, len(h2o.nodes) - 1)]
            h2o_cmd.runRFOnly(node=randomNode,
                              parseKey=parseKey,
                              model_key=model_key,
                              timeoutSecs=300,
                              noPoll=True,
                              **kwargs)
            # FIX! are these already in there?
            rfView = {}
            rfView['data_key'] = key2
            rfView['model_key'] = model_key
            rfView['ntree'] = kwargs['ntree']

            print "rf job dispatch end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            print "\njobDispatch #", jobDispatch

            # we're going to compare rf results to previous as we go along (so we save the RFView results)
            h2o_jobs.pollWaitJobs(pattern='RF_model',
                                  timeoutSecs=300,
                                  pollTimeoutSecs=10,
                                  retryDelaySecs=5)

            # In this test we're waiting after each one, so we can save the RFView results for comparison to future
            print "Checking completed job:", rfView
            print "rfView", h2o.dump_json(rfView)
            data_key = rfView['data_key']
            model_key = rfView['model_key']
            ntree = rfView['ntree']
            # a = h2o.nodes[0].random_forest_view(data_key, model_key, noPoll=True)
            print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)"
            # allow it to poll to complete
            rfViewResult = h2o_cmd.runRFView(None,
                                             data_key,
                                             model_key,
                                             ntree=ntree,
                                             timeoutSecs=60,
                                             noPoll=False)
            if firstRfView is None:  # we'll use this to compare the others
                firstRfView = rfViewResult.copy()
                firstModelKey = model_key
                print "firstRfView", h2o.dump_json(firstRfView)
            else:
                print "Comparing", model_key, "to", firstModelKey
                df = h2o_util.JsonDiff(rfViewResult,
                                       firstRfView,
                                       vice_versa=True,
                                       with_values=True)
                print "df.difference:", h2o.dump_json(df.difference)
                self.assertGreater(len(df.difference), 29,
                    msg="Want >= 30, not %d differences between the two rfView json responses. %s" % \
                        (len(df.difference), h2o.dump_json(df.difference)))
Example #36
    def test_B_hdfs_files(self):
        print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
        print "\nYou can try running as hduser/hduser if fail"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameAll = [
            "TEST-poker1000.csv",
            "leads.csv",
            "and-testing.data",
            "arcene2_train.both",
            "arcene_train.both",
            # these can't RF ..output classes not integer?
            # "bestbuy_test.csv",
            # "bestbuy_train.csv",
            "covtype.data",
            "covtype.4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype.13x.data",
            "covtype.13x.shuffle.data",
            # "covtype.169x.data",
            # "prostate_2g.csv",
            # "prostate_long.csv.gz",
            "prostate_long_1G.csv",
            "hhp.unbalanced.012.1x11.data.gz",
            "hhp.unbalanced.012.data.gz",
            "hhp.unbalanced.data.gz",
            "hhp2.os.noisy.0_1.data",
            "hhp2.os.noisy.9_4.data",
            "hhp_9_14_12.data",
            # "poker_c1s1_testing_refresh.csv",
            # "3G_poker_shuffle",
            # "billion_rows.csv.gz",
            # "poker-hand.1244M.shuffled311M.full.txt",
        ]

        # pick 8 randomly!
        if (1 == 0):
            csvFilenameList = random.sample(csvFilenameAll, 8)
        # Alternatively: do the list in order! Note the order is easy to hard
        else:
            csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        h2b.browseTheCloud()

        timeoutSecs = 200
        # save the first, for all comparisons, to avoid slow drift with each iteration
        firstglm = {}
        h2i.setupImportHdfs()
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir
            print "Loading", csvFilename, 'from HDFS'
            parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename,
                                               path='/datasets',
                                               timeoutSecs=1000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "parse result:", parseKey['destination_key']

            print "\n" + csvFilename
            start = time.time()
            RFview = h2o_cmd.runRFOnly(trees=1,
                                       parseKey=parseKey,
                                       timeoutSecs=2000)
            h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            # wait in case it recomputes it
            time.sleep(10)

            sys.stdout.write('.')
            sys.stdout.flush()
Example #37
    def test_rfview_score(self):
        csvPathnameTrain = h2o.find_dataset(
            'UCI/UCI-large/covtype/covtype.data')
        print "Train with:", csvPathnameTrain
        parseKeyTrain = h2o_cmd.parseFile(csvPathname=csvPathnameTrain,
                                          key2="covtype.hex",
                                          timeoutSecs=15)
        dataKeyTrain = parseKeyTrain['destination_key']

        csvPathnameTest = h2o.find_dataset(
            'UCI/UCI-large/covtype/covtype.data')
        print "Test with:", csvPathnameTest
        parseKeyTest = h2o_cmd.parseFile(csvPathname=csvPathnameTest,
                                         key2="covtype.hex",
                                         timeoutSecs=15)
        dataKeyTest = parseKeyTest['destination_key']

        for trial in range(5):
            # params is mutable. This is default.
            params = {
                'ntree': 13,
                'parallel': 1,
                'out_of_bag_error_estimate': 0
            }
            colX = h2o_rf.pickRandRfParams(paramDict, params)
            kwargs = params.copy()
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            timeoutSecs = 30 + kwargs['ntree'] * 10 * (kwargs['parallel'] and 1
                                                       or 5)
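            # Python 2 "cond and A or B" ternary: the per-tree budget is 10s when parallel, 50s otherwise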
            rfv = h2o_cmd.runRFOnly(parseKey=parseKeyTrain,
                                    timeoutSecs=timeoutSecs,
                                    retryDelaySecs=1,
                                    **kwargs)

            ### print "rf response:", h2o.dump_json(rfv)

            model_key = rfv['model_key']
            # pop the stuff from kwargs that were passing as params
            kwargs.pop('model_key', None)

            data_key = rfv['data_key']
            kwargs.pop('data_key', None)

            ntree = rfv['ntree']
            kwargs.pop('ntree', None)
            # scoring
            # RFView.html?
            # dataKeyTest=a5m.hex&
            # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628&
            # response_variable=1&
            # ntree=50&
            # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0&
            # out_of_bag_error_estimate=1&
            h2o_cmd.runRFView(None,
                              dataKeyTest,
                              model_key,
                              ntree,
                              timeoutSecs,
                              retryDelaySecs=1,
                              print_params=True,
                              **kwargs)
            # new web page for predict? throw it in here for now
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=dataKeyTest)
            elapsed = time.time() - start
            print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

            kwargs['iterative_cm'] = 0
            h2o_cmd.runRFView(None,
                              dataKeyTest,
                              model_key,
                              ntree,
                              timeoutSecs,
                              retryDelaySecs=1,
                              print_params=True,
                              **kwargs)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=dataKeyTest)
            elapsed = time.time() - start
            print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

            kwargs['iterative_cm'] = 1
            h2o_cmd.runRFView(None,
                              dataKeyTest,
                              model_key,
                              ntree,
                              timeoutSecs,
                              retryDelaySecs=1,
                              print_params=True,
                              **kwargs)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=dataKeyTest)
            elapsed = time.time() - start
            print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

            kwargs['iterative_cm'] = 1
            kwargs['class_weights'] = '1=1,2=2,3=3,4=4,5=5,6=6,7=7'
            h2o_cmd.runRFView(None,
                              dataKeyTest,
                              model_key,
                              ntree,
                              timeoutSecs,
                              retryDelaySecs=1,
                              print_params=True,
                              **kwargs)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=dataKeyTest)
            elapsed = time.time() - start
            print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

            print "Trial #", trial, "completed"
Example #38
    def test_rf_covtype_train_oobe2(self):
        print "\nUse randomBitVector and filter to separate the dataset randomly"
        importFolderPath = "/home/0xdiag/datasets"
        csvFilename = 'covtype.data'
        csvPathname = importFolderPath + "/" + csvFilename
        key2 = csvFilename + ".hex"

        print "\nUsing header=0 on the normal covtype.data"
        # don't import it, just so we don't have all the key names cluttering the view all
        # in the browser
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                     key2=key2,
                                     header=0,
                                     timeoutSecs=100)

        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvPathname, \
            "    num_rows:", "{:,}".format(inspect['num_rows']), \
            "    num_cols:", "{:,}".format(inspect['num_cols'])

        # how many rows for each pct?
        num_rows = inspect['num_rows']
        pct10 = int(num_rows * .1)
        rowsForPct = [i * pct10 for i in range(0, 11)]
        # this can be slightly less than 10%
        last10 = num_rows - rowsForPct[9]
        rowsForPct[10] = last10
        # use mod below for picking "rows-to-do" in case we do more than 9 trials
        # use 10 if 0 just to see (we copy 10 to 0 below)
        rowsForPct[0] = rowsForPct[10]

        # this was with 10 trees
        # expectTrainPctRightList = [0, 85.27, 88.45, 89.99, 91.11, 91.96, 92.51, 93.03, 93.45, 93.78]
        # expectScorePctRightList = [0, 89.10, 91,90, 93.26, 94.25, 94.74, 95.10, 95.42, 95.72, 95.92]

        # 0 isn't used
        expectTrainPctRightList = [
            0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79
        ]
        expectScorePctRightList = [
            0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78
        ]

        print "Creating the key of the last 10% data, for scoring"
        dataKeyTest = "rTest"
        dataKeyTrain = "rTrain"
        # start at 90% rows + 1
        # randomBitVector(size,selected)
        # randomFilter(srcFrame,rows,seed)
        # filter(srcFrame,bitVect)

        # odd. output is byte, all other exec outputs are 8 byte? (at least the ones below?)
        execExpr = "rbv=randomBitVector(" + str(num_rows) + "," + str(
            last10) + ",12345)"
        h2o_exec.exec_expr(None, execExpr, resultKey="rbv", timeoutSecs=10)

        # complement the bit vector
        execExpr = "not_rbv=colSwap(rbv,0,rbv[0]==0?1:0)"
        h2o_exec.exec_expr(None, execExpr, resultKey="not_rbv", timeoutSecs=10)

        execExpr = dataKeyTest + "=filter(" + key2 + ",rbv)"
        h2o_exec.exec_expr(None,
                           execExpr,
                           resultKey=dataKeyTest,
                           timeoutSecs=10)

        execExpr = dataKeyTrain + "=filter(" + key2 + ",not_rbv)"
        h2o_exec.exec_expr(None,
                           execExpr,
                           resultKey=dataKeyTrain,
                           timeoutSecs=10)

        ### time.sleep(3600)
        # keep the 0 entry empty
        actualTrainPctRightList = [0]
        actualScorePctRightList = [0]

        for trial in range(1, 10):
            # always slice from the beginning
            rowsToUse = rowsForPct[trial % 10]
            resultKey = "r" + str(trial)
            execExpr = resultKey + "=slice(" + dataKeyTrain + ",1," + str(
                rowsToUse) + ")"
            execExpr = resultKey + "=slice(" + dataKeyTrain + ",1)"
            h2o_exec.exec_expr(None,
                               execExpr,
                               resultKey=resultKey,
                               timeoutSecs=10)
            parseKey['destination_key'] = resultKey

            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            kwargs = paramDict.copy()
            timeoutSecs = 30 + kwargs['ntree'] * 20
            start = time.time()
            # do oobe
            kwargs['out_of_bag_error_estimate'] = 1
            kwargs['model_key'] = "model_" + str(trial)

            rfv = h2o_cmd.runRFOnly(parseKey=parseKey,
                                    timeoutSecs=timeoutSecs,
                                    **kwargs)
            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            oobeTrainPctRight = 100 * (
                1.0 - rfv['confusion_matrix']['classification_error'])
            self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
                msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
                    ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]),
                delta=0.2)
            actualTrainPctRightList.append(oobeTrainPctRight)

            print "Now score on the last 10%"
            # pop the stuff from kwargs that were passing as params
            model_key = rfv['model_key']
            kwargs.pop('model_key', None)

            data_key = rfv['data_key']
            kwargs.pop('data_key', None)

            ntree = rfv['ntree']
            kwargs.pop('ntree', None)
            # scoring
            # RFView.html?
            # dataKeyTest=a5m.hex&
            # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628&
            # response_variable=1&
            # ntree=50&
            # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0&
            # out_of_bag_error_estimate=1&
            # no_confusion_matrix=1&
            # clear_confusion_matrix=1
            ### dataKeyTest = data_key
            kwargs['clear_confusion_matrix'] = 1
            kwargs['no_confusion_matrix'] = 0
            # do full scoring
            kwargs['out_of_bag_error_estimate'] = 0
            rfv = h2o_cmd.runRFView(None,
                                    dataKeyTest,
                                    model_key,
                                    ntree,
                                    timeoutSecs,
                                    retryDelaySecs=1,
                                    print_params=True,
                                    **kwargs)

            fullScorePctRight = 100 * (
                1.0 - rfv['confusion_matrix']['classification_error'])
            self.assertAlmostEqual(fullScorePctRight,expectScorePctRightList[trial],
                msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
                    ((trial*10), fullScorePctRight, expectScorePctRightList[trial]),
                delta=0.2)
            actualScorePctRightList.append(fullScorePctRight)

            print "Trial #", trial, "completed", "using %6.2f" % (
                rowsToUse * 100.0 / num_rows), "pct. of all rows"

        actualDelta = [
            abs(a - b)
            for a, b in zip(expectTrainPctRightList, actualTrainPctRightList)
        ]
        niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
        print "maybe should update with actual. Remove single quotes"
        print "expectTrainPctRightList =", niceFp
        niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
        print "actualDelta =", niceFp

        actualDelta = [
            abs(a - b)
            for a, b in zip(expectScorePctRightList, actualScorePctRightList)
        ]
        niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
        print "maybe should update with actual. Remove single quotes"
        print "expectScorePctRightList =", niceFp
        niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
        print "actualDelta =", niceFp
Example #39
    def test_rf_covtype_train_oobe3(self):
        print "\nUse randomFilter to sample the dataset randomly. then slice it"
        importFolderPath = "/home/0xdiag/datasets/standard"
        csvFilename = 'covtype.data'
        csvPathname = importFolderPath + "/" + csvFilename
        key2 = csvFilename + ".hex"

        h2i.setupImportFolder(None, importFolderPath)
        print "\nUsing header=0 on the normal covtype.data"
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2,
            header=0, timeoutSecs=100)

        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvPathname, \
            "    num_rows:", "{:,}".format(inspect['num_rows']), \
            "    num_cols:", "{:,}".format(inspect['num_cols'])

        # how many rows for each pct?
        num_rows = inspect['num_rows']
        pct10 = int(num_rows * .1)
        rowsForPct = [i * pct10 for i in range(0,11)]
        # this can be slightly less than 10%
        last10 = num_rows - rowsForPct[9]
        rowsForPct[10] = num_rows
        # use mod below for picking "rows-to-do" in case we do more than 9 trials
        # use 10 if 0 just to see (we copy 10 to 0 below)
        rowsForPct[0] = rowsForPct[10]

        expectTrainPctRightList = [0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79]
        expectScorePctRightList = [0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78]

        print "Creating the key of the last 10% data, for scoring"
        dataKeyTest = "rTest"
        dataKeyTrain = "rTrain"

        # FIX! too many digits (10) in the 2nd param seems to cause a stack trace
        execExpr = dataKeyTest + "=randomFilter(" + key2 + "," + str(pct10) + ",12345)"
        h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTest, timeoutSecs=10)

        execExpr = dataKeyTrain + "=randomFilter(" + key2 + "," + str(rowsForPct[9]) + ",12345)"
        h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTrain, timeoutSecs=10)
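        # note: both randomFilter calls use the same seed (12345), so the test sample may
        # overlap the training sample, unlike the disjoint randomBitVector split in the previous example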

        # keep the 0 entry empty
        actualTrainPctRightList = [0]
        actualScorePctRightList = [0]
        
        for trial in range(1,10):
            # always slice from the beginning
            rowsToUse = rowsForPct[trial%10] 
            resultKey = "r" + str(trial)
            execExpr = resultKey + "=slice(" + dataKeyTrain + ",1," + str(rowsToUse) + ")"
            # execExpr = resultKey + "=slice(" + dataKeyTrain + ",1)"
            h2o_exec.exec_expr(None, execExpr, resultKey=resultKey, timeoutSecs=10)
            parseKey['destination_key'] = resultKey

            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            kwargs = paramDict.copy()
            timeoutSecs = 30 + kwargs['ntree'] * 20
            start = time.time()
            # do oobe
            kwargs['out_of_bag_error_estimate'] = 1
            kwargs['model_key'] = "model_" + str(trial)
            
            rfv = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            oobeTrainPctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
            self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
                msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
                    ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=0.2)
            actualTrainPctRightList.append(oobeTrainPctRight)

            print "Now score on the last 10%"
            # pop the stuff from kwargs that were passing as params
            model_key = rfv['model_key']
            kwargs.pop('model_key',None)

            data_key = rfv['data_key']
            kwargs.pop('data_key',None)

            ntree = rfv['ntree']
            kwargs.pop('ntree',None)
            kwargs['iterative_cm'] = 1
            # do full scoring
            kwargs['out_of_bag_error_estimate'] = 0
            rfv = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree,
                timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)

            h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)

            fullScorePctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
            self.assertAlmostEqual(fullScorePctRight,expectScorePctRightList[trial],
                msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
                    ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=0.2)
            actualScorePctRightList.append(fullScorePctRight)

            print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/num_rows), "pct. of all rows"

        actualDelta = [abs(a-b) for a,b in zip(expectTrainPctRightList, actualTrainPctRightList)]
        niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
        print "actualTrainPctRightList =", niceFp
        niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
        print "actualDelta =", niceFp

        actualDelta = [abs(a-b) for a,b in zip(expectScorePctRightList, actualScorePctRightList)]
        niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
        print "maybe should update with actual. Remove single quotes"  
        print "actualScorePctRightList =", niceFp
        niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
        print "actualDelta =", niceFp