Example #1
    def test_rf_1ktrees_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [500]:
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        # always match the gen above!
        for trial in range (1,5):
            sys.stdout.write('.')
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)

            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult, trees=1000, max_depth=2, timeoutSecs=600, retryDelaySecs=3)
            print "RF #", trial,  "end on ", csvFilename, 'took', time.time() - start, 'seconds'

        print "Waiting 60 secs for TIME_WAIT sockets to go away"
        time.sleep(60)
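
Note: these snippets are all test methods of unittest.TestCase subclasses from the h2o-2 Python test harness; each assumes module-level imports and a cloud built in setUpClass. A minimal scaffold, as an assumption sketch (the class name, the sys.path lines, and the build_cloud arguments are assumptions and vary per test file):

    import unittest, sys, time, random
    sys.path.extend(['.', '..', 'py'])
    import h2o, h2o_cmd, h2o_jobs, h2o_rf
    import h2o_import as h2i

    class Basic(unittest.TestCase):
        @classmethod
        def setUpClass(cls):
            # assumption: a local one-node cloud; hosted tests build differently
            h2o.build_cloud(node_count=1)

        @classmethod
        def tearDownClass(cls):
            h2o.tear_down_cloud()

        # paste any test_* method from the examples below here

    if __name__ == '__main__':
        h2o.unit_main()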
Example #2
    def test_C_RF_poker100(self):
        parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker100', schema='put')
        h2o_cmd.runRF(parseResult=parseResult, trees=6, timeoutSecs=10)

        SYNDATASETS_DIR = h2o.make_syn_dir()
        # always match the run below!
        for x in xrange (11,100,10):
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),timeout=30)
            # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        trees = 6
        timeoutSecs = 60
        # always match the gen above!
        # reduce to get intermittent failures to lessen, for now
        for x in xrange (11,60,10):
            sys.stdout.write('.')
            sys.stdout.flush()
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            parseResult = h2i.import_parse(path=csvPathname, schema='put')
            h2o_cmd.runRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs)
            trees += 10
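
The "always match the gen above!" comments exist because every example rebuilds the generated filename by hand. The naming is hardwired in parity.pl; a small helper that mirrors it (the helper itself is hypothetical, the pattern is taken verbatim from these examples) would keep the gen and run loops in sync:

    def parity_csv_filename(rows, bits=128, k=4):
        # mirrors the "parity_<bits>_<k>_<rows>_quad.data" names parity.pl produces
        return "parity_%s_%s_%s_quad.data" % (bits, k, rows)

    # e.g. parity_csv_filename(1000) -> "parity_128_4_1000_quad.data"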
Example #3
File: test_10ktrees.py  Project: nadya1/h2o
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad"
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        # always match the gen above!
        for trial in xrange (1,3,1):
            sys.stdout.write('.')
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            # broke out the put separately so we can iterate a test just on the RF
            key = h2o.nodes[0].put_file(csvPathname)
            parseKey = h2o.nodes[0].parse(key, key + "_" + str(trial) + ".hex")

            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRFOnly(parseKey=parseKey, trees=10000, depth=2, timeoutSecs=600, retryDelaySecs=3)
            print "RF #", trial,  "end on ", csvFilename, 'took', time.time() - start, 'seconds'
Example #4
    def test_D_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # always match the run below!
        for x in xrange(11, 100, 10):
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file(
                "syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
            h2o.spawn_cmd_and_wait('parity.pl',
                                   shCmdString.split(),
                                   timeout=30)
            # the algorithm for creating the path and filename is hardwired in parity.pl, i.e.:
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        trees = 6
        timeoutSecs = 20
        # always match the gen above!
        # reduce to get intermittent failures to lessen, for now
        for x in xrange(11, 60, 10):
            sys.stdout.write('.')
            sys.stdout.flush()
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            h2o_cmd.runRF(trees=trees,
                          timeoutSecs=timeoutSecs,
                          csvPathname=csvPathname)
            trees += 10
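
About the recurring "Have to split the string out to list for pipe" comments: splitting a shell string on whitespace breaks if the script path or SYNDATASETS_DIR contains spaces. A sketch of the same call with an explicit argv list, assuming spawn_cmd_and_wait accepts any argument list (it already receives a list from split() in these examples):

    parityPl = h2o.find_file("syn_scripts/parity.pl")
    # same command, with no whitespace-splitting pitfalls
    shCmd = ["perl", parityPl, "128", "4", str(x), "quad", SYNDATASETS_DIR]
    h2o.spawn_cmd_and_wait("parity.pl", shCmd, timeout=30)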
Example #5
File: test_flatfile.py  Project: segahm/h2o
    def test_D_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        for x in xrange(11, 100, 10):
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file(
                "syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
            # FIX! as long as we're doing a couple, you'd think we wouldn't have to
            # wait for the last one to be gen'ed here before we start the first below.
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=3)
            # the algorithm for creating the path and filename is hardwired in parity.pl, i.e.:
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        trees = 6
        timeoutSecs = 20
        # always match the gen above!
        # reduce to get intermittent failures to lessen, for now
        for x in xrange(11, 60, 10):
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            # FIX! TBD do we always have to kick off the run from node 0?
            # what if we do another node?
            # FIX! do we need or want a random delay here?
            h2o_cmd.runRF(trees=trees,
                          timeoutSecs=timeoutSecs,
                          csvPathname=csvPathname)
            trees += 10
            sys.stdout.write('.')
            sys.stdout.flush()
Example #6
    def test_rf3_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # always match the run below!
        for x in [10000]:
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        # always match the gen above!
        trial = 1
        for x in range (1):
            sys.stdout.write('.')
            sys.stdout.flush()

            # just use one file for now
            csvFilename = "parity_128_4_" + str(10000) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            # broke out the put separately so we can iterate a test just on the RF
            parseResult = h2i.import_parse(path=csvPathname, schema='put', pollTimeoutSecs=60, timeoutSecs=60)

            h2o.verboseprint("Trial", trial)
            h2o_cmd.runRF(parseResult=parseResult, trees=237, max_depth=45, timeoutSecs=480)

            # don't change tree count yet
            ## trees += 10
            ### timeoutSecs += 2
            trial += 1
Example #7
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # always match the run below!
        for x in [10000]:
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file(
                "syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=4)
            # the algorithm for creating the path and filename is hardwired in parity.pl, i.e.:
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        # always match the gen above!
        trial = 1
        for x in xrange(1, 10, 1):
            sys.stdout.write('.')
            sys.stdout.flush()

            # just use one file for now
            csvFilename = "parity_128_4_" + str(10000) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            # broke out the put separately so we can iterate a test just on the RF
            parseKey = h2o_cmd.parseFile(None, csvPathname)

            h2o.verboseprint("Trial", trial)
            h2o_cmd.runRFOnly(parseKey=parseKey,
                              trees=237,
                              depth=45,
                              timeoutSecs=120)

            # don't change tree count yet
            ## trees += 10
            ### timeoutSecs += 2
            trial += 1
Example #8
    def test_D_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        for x in xrange(11, 100, 10):
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file(
                "syn_scripts/parity.pl") + " 128 4 " + str(
                    x) + " quad " + SYNDATASETS_DIR
            # FIX! as long as we're doing a couple, you'd think we wouldn't have to
            # wait for the last one to be gen'ed here before we start the first below.
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=3)
            # the algorithm for creating the path and filename is hardwired in parity.pl, i.e.:
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        trees = 6
        timeoutSecs = 20
        # always match the gen above!
        # reduce to get intermittent failures to lessen, for now
        for x in xrange(11, 60, 10):
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            parseResult = h2i.import_parse(path=csvPathname, schema='put')
            h2o_cmd.runRF(parseResult=parseResult,
                          ntrees=trees,
                          timeoutSecs=timeoutSecs)
            trees += 10
Example #9
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad"
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        # always match the gen above!
        for trial in range (1,5):
            sys.stdout.write('.')
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            key2 = csvFilename + "_" + str(trial) + ".hex"
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=30)

            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRFOnly(parseKey=parseKey, trees=1000, depth=2, timeoutSecs=600, retryDelaySecs=3)
            print "RF #", trial,  "end on ", csvFilename, 'took', time.time() - start, 'seconds'

        print "Waiting 60 secs for TIME_WAIT sockets to go away"
        time.sleep(60)
Example #10
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # always match the run below!
        for x in [10000]:
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad"
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        # always match the gen above!
        trial = 1
        for x in xrange (1,10,1):
            sys.stdout.write('.')
            sys.stdout.flush()

            # just use one file for now
            csvFilename = "parity_128_4_" + str(10000) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            # broke out the put separately so we can iterate a test just on the RF
            parseResult = h2i.import_parse(path=csvPathname, schema='put')

            h2o.verboseprint("Trial", trial)
            h2o_cmd.runRF(parseResult=parseResult, trees=237, depth=45, timeoutSecs=480)

            # don't change tree count yet
            ## trees += 10
            ### timeoutSecs += 2
            trial += 1
Example #11
    def test_D_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        for x in xrange (11,100,10):
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
            # FIX! as long as we're doing a couple, you'd think we wouldn't have to 
            # wait for the last one to be gen'ed here before we start the first below.
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=3)
            # the algorithm for creating the path and filename is hardwired in parity.pl, i.e.:
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        trees = 6
        timeoutSecs = 20
        # always match the gen above!
        # reduce to get intermittent failures to lessen, for now
        for x in xrange (11,60,10):
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            # FIX! TBD do we always have to kick off the run from node 0?
            # what if we do another node?
            # FIX! do we need or want a random delay here?
            h2o_cmd.runRF(trees=trees, timeoutSecs=timeoutSecs,
                          csvPathname=csvPathname)
            trees += 10
            sys.stdout.write('.')
            sys.stdout.flush()
Example #12
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = (
                "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
            )
            h2o.spawn_cmd_and_wait("parity.pl", shCmdString.split(), 4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        # always match the gen above!
        for trial in range(1, 3):
            sys.stdout.write(".")
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            parseResult = h2o_cmd.parseResult = h2i.import_parse(
                path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30
            )

            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult, trees=10000, depth=2, timeoutSecs=900, retryDelaySecs=3)
            print "RF #", trial, "end on ", csvFilename, "took", time.time() - start, "seconds"

        print "Waiting 60 secs for TIME_WAIT sockets to go away"
        time.sleep(60)
Example #13
    def test_D_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        for x in xrange(50, 200, 10):
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file(
                "syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
            # FIX! as long as we're doing a couple, you'd think we wouldn't have to
            # wait for the last one to be gen'ed here before we start the first below.
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=3)
            # the algorithm for creating the path and filename is hardwired in parity.pl, i.e.:
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        # bump this up too if you do?
        # always match the gen above!
        ### for x in xrange (50,200,10):
        for x in xrange(50, 200, 10):
            sys.stdout.write('.')
            sys.stdout.flush()
            csvFilename = "parity_128_4_" + "100" + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            h2o_cmd.runRF(csvPathname=csvPathname,
                          trees=100,
                          timeoutSecs=5,
                          retryDelaySecs=0.1)
Example #14
    def test_1ktrees_job_cancel_many_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"  
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        hex_key = csvFilename + ".hex"
        parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)

        print "kick off jobs, then cancel them"
        for trial in range (1,5):
            # random 0 or 1 delay
            delay = random.uniform(0,1)
            time.sleep(delay)

            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, rfView=False, noPoll=True, timeoutSecs=30, retryDelaySecs=0.25)
            print "RF #", trial,  "started on ", csvFilename, 'took', time.time() - start, 'seconds'
            ### h2o_jobs.cancelAllJobs(timeoutSecs=10)
            h2o.check_sandbox_for_errors()

        # do one last good one
        rfView = h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, timeoutSecs=600, retryDelaySecs=3)
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trial)
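
simpleCheckRFView returns the overall classification error, a per-class error list, and the total score count; the example above unpacks them but never checks them. A hedged follow-up assertion one might add inside the test method (the 50% threshold is an arbitrary assumption for this parity dataset, not part of the original test):

    # hypothetical sanity check, not in the original test
    self.assertLess(classification_error, 50.0,
                    "final RF classification error too high: %s" % classification_error)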
Example #15
    def test_C_RF_poker100(self):
        parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker100', schema='put')
        h2o_cmd.runRF(parseResult=parseResult, trees=6, timeoutSecs=10)

        SYNDATASETS_DIR = h2o.make_syn_dir()
        # always match the run below!
        for x in xrange (11,100,10):
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),timeout=30)
            # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        trees = 6
        timeoutSecs = 60
        # always match the gen above!
        # reduce to get intermittent failures to lessen, for now
        for x in xrange (11,60,10):
            sys.stdout.write('.')
            sys.stdout.flush()
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            parseResult = h2i.import_parse(path=csvPathname, schema='put')
            h2o_cmd.runRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs)
            trees += 10
Example #16
    def test_1ktrees_job_cancel_many(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad"
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"  
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        hex_key = csvFilename + ".hex"
        parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)

        print "Kick off twenty, then cancel them all..there's a timeout on the wait after cancelling"
        for trial in range (1,20):
            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult, trees=trial, depth=50, rfView=False, noPoll=True,
                timeoutSecs=600, retryDelaySecs=3)
            print "RF #", trial,  "started on ", csvFilename, 'took', time.time() - start, 'seconds'


        h2o.check_sandbox_for_errors()
        h2o_jobs.cancelAllJobs(timeoutSecs=10)
Example #17
File: test_rf1.py  Project: georgekola/h2o
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        parityPl = h2o.find_file('syn_scripts/parity.pl')

        # two row dataset gets this. Avoiding it for now
        # java.lang.ArrayIndexOutOfBoundsException: 1
        # at hex.rf.Data.sample_fair(Data.java:149)

        # always match the run below!
        print "\nAssuming two row dataset is illegal. avoiding"

        for x in xrange (10,100,10):
            shCmdString = "perl " + parityPl + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split())
            # algorithm for creating the path and filename is hardwired in parity.pl.
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        trees = 6
        timeoutSecs = 20
        # always match the gen above!
        # FIX! we fail if min is 3
        for x in xrange (10,100,10):
            sys.stdout.write('.')
            sys.stdout.flush()
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            parseResult = h2i.import_parse(path=csvPathname, schema='put')
            h2o_cmd.runRF(parseResult=parseResult, trees=trees, timeoutSecs=timeoutSecs)

            trees += 10
            timeoutSecs += 2
Example #18
    def test_GenParity1(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        parityPl = h2o.find_file('syn_scripts/parity.pl')

        # two row dataset gets this. Avoiding it for now
        # java.lang.ArrayIndexOutOfBoundsException: 1
        # at hex.rf.Data.sample_fair(Data.java:149)

        # always match the run below!
        print "\nAssuming two row dataset is illegal. avoiding"

        for x in xrange(10, 100, 10):
            shCmdString = "perl " + parityPl + " 128 4 " + str(
                x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split())
            # algorithm for creating the path and filename is hardwired in parity.pl.
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        trees = 6
        timeoutSecs = 20
        # always match the gen above!
        # FIX! we fail if min is 3
        for x in xrange(10, 100, 10):
            sys.stdout.write('.')
            sys.stdout.flush()
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            parseResult = h2i.import_parse(path=csvPathname, schema='put')
            h2o_cmd.runRF(parseResult=parseResult,
                          trees=trees,
                          timeoutSecs=timeoutSecs)

            trees += 10
            timeoutSecs += 2
Example #19
    def test_rf_1ktrees_job_cancel_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file(
                "syn_scripts/parity.pl") + " 128 4 " + str(
                    x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        # always match the gen above!
        for trial in range(1, 5):
            sys.stdout.write('.')
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            parseResult = h2o_cmd.parseResult = h2i.import_parse(
                path=csvPathname,
                schema='put',
                hex_key=hex_key,
                timeoutSecs=30)

            h2o.verboseprint("Trial", trial)
            start = time.time()
            rfv = h2o_cmd.runRF(parseResult=parseResult,
                                trees=1000,
                                max_depth=2,
                                rfView=False,
                                timeoutSecs=600,
                                retryDelaySecs=3)
            print "RF #", trial, "started on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'
            rf_model = rfv['drf_model']
            used_trees = rf_model['N']
            data_key = rf_model['_dataKey']
            model_key = rf_model['_key']

            print "model_key:", model_key

            # FIX! need to get more intelligent here
            a = h2o.nodes[0].jobs_admin()
            print "jobs_admin():", h2o.dump_json(a)
            # this is the wrong key to cancel with
            # "destination_key": "pytest_model",
            print "cancelling with a bad key"
            b = h2o.nodes[0].jobs_cancel(key=model_key)
            print "jobs_cancel():", h2o.dump_json(b)
Example #20
    def test_B_GenParity1(self):
        # Create a directory for the created dataset files. ok if already exists
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # always match the run below!
        print "\nGenerating some large row count parity datasets in", SYNDATASETS_DIR,
        print "\nmay be a minute.........."
        for x in xrange(161, 240, 20):
            # more rows!
            y = 10000 * x
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file(
                "syn_scripts/parity.pl") + " 128 4 " + str(y) + " quad"
            # FIX! as long as we're doing a couple, you'd think we wouldn't have to
            # wait for the last one to be gen'ed here before we start the first below.
            # UPDATE: maybe EC2 takes a long time to spawn a process?
            h2o.spawn_cmd_and_wait('parity.pl',
                                   shCmdString.split(),
                                   timeout=90)
            # the algorithm for creating the path and filename is hardwired in parity.pl, i.e.:
            csvFilename = "parity_128_4_" + str(y) + "_quad.data"
            sys.stdout.write('.')
            sys.stdout.flush()
        print "\nDatasets generated. Using."

        # always match the gen above!
        # Let's try it twice!
        for trials in xrange(1, 7):
            # prime
            trees = 6

            for x in xrange(161, 240, 20):
                y = 10000 * x
                print "\nTrial:", trials, ", y:", y

                csvFilename = "parity_128_4_" + str(y) + "_quad.data"
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                # FIX! TBD do we always have to kick off the run from node 0?
                # random guess about length of time, varying with more hosts/nodes?
                timeoutSecs = 20 + 5 * (len(h2o.nodes))

                # change the model name each iteration, so they stay in h2o
                model_key = csvFilename + "_" + str(trials)
                h2o_cmd.runRF(trees=trees,
                              model_key=model_key,
                              timeoutSecs=timeoutSecs,
                              retryDelaySecs=1,
                              csvPathname=csvPathname)
                sys.stdout.write('.')
                sys.stdout.flush()

                # partial clean, so we can look at tree builds from this run if hang
                h2o.clean_sandbox_stdout_stderr()
Example #21
    def test_1ktrees_job_cancel_many_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file(
                "syn_scripts/parity.pl") + " 128 4 " + str(
                    x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        hex_key = csvFilename + ".hex"
        parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname,
                                                             schema='put',
                                                             hex_key=hex_key,
                                                             timeoutSecs=30)

        print "kick off jobs, then cancel them"
        for trial in range(1, 50):
            # random 0 or 1 delay
            delay = random.uniform(0, 1)
            time.sleep(delay)

            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult,
                          trees=trial,
                          max_depth=50,
                          rfView=False,
                          noPoll=True,
                          timeoutSecs=30,
                          retryDelaySecs=0.25)
            print "RF #", trial, "started on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'
            ### h2o_jobs.cancelAllJobs(timeoutSecs=10)
            h2o.check_sandbox_for_errors()

        # do one last good one
        rfView = h2o_cmd.runRF(parseResult=parseResult,
                               trees=trial,
                               max_depth=50,
                               rfView=False,
                               noPoll=False,
                               timeoutSecs=600,
                               retryDelaySecs=3)
        (classification_error, classErrorPctList,
         totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trial)
Example #22
    def test_B_GenParity1(self):
        # Create a directory for the created dataset files. ok if already exists
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # always match the run below!
        print "\nGenerating some large row count parity datasets in", SYNDATASETS_DIR,
        print "\nmay be a minute.........."
        for x in xrange(161, 240, 20):
            # more rows!
            y = 10000 * x
            # Have to split the string out to list for pipe
            shCmdString = (
                "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(y) + " quad " + SYNDATASETS_DIR
            )
            # FIX! as long as we're doing a couple, you'd think we wouldn't have to
            # wait for the last one to be gen'ed here before we start the first below.
            # UPDATE: maybe EC2 takes a long time to spawn a process?
            h2o.spawn_cmd_and_wait("parity.pl", shCmdString.split(), timeout=90)
            # the algorithm for creating the path and filename is hardwired in parity.pl, i.e.:
            csvFilename = "parity_128_4_" + str(y) + "_quad.data"
            sys.stdout.write(".")
            sys.stdout.flush()
        print "\nDatasets generated. Using."

        # always match the gen above!
        # Let's try it twice!
        for trials in xrange(1, 7):
            # prime
            trees = 6

            for x in xrange(161, 240, 20):
                y = 10000 * x
                print "\nTrial:", trials, ", y:", y

                csvFilename = "parity_128_4_" + str(y) + "_quad.data"
                csvPathname = SYNDATASETS_DIR + "/" + csvFilename
                # FIX! TBD do we always have to kick off the run from node 0?
                # random guess about length of time, varying with more hosts/nodes?
                timeoutSecs = 20 + 5 * (len(h2o.nodes))

                # change the model name each iteration, so they stay in h2o
                model_key = csvFilename + "_" + str(trials)
                parseResult = h2i.import_parse(path=csvPathname, schema="put")
                h2o_cmd.runRF(
                    parseResult=parseResult, trees=trees, model_key=model_key, timeoutSecs=timeoutSecs, retryDelaySecs=1
                )
                sys.stdout.write(".")
                sys.stdout.flush()

                # partial clean, so we can look at tree builds from this run if hang
                h2o.clean_sandbox_stdout_stderr()
Example #23
    def test_rf_1ktrees_job_cancel_3_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file(
                "syn_scripts/parity.pl") + " 128 4 " + str(
                    x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        # always match the gen above!
        for trial in range(1, 20):
            sys.stdout.write('.')
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            parseResult = h2o_cmd.parseResult = h2i.import_parse(
                path=csvPathname,
                schema='put',
                hex_key=hex_key,
                timeoutSecs=30)

            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult,
                          trees=trial,
                          max_depth=2,
                          rfView=False,
                          timeoutSecs=600,
                          retryDelaySecs=3)
            print "RF #", trial, "started on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'

            # FIX! need to get more intelligent here
            time.sleep(1)
            a = h2o.nodes[0].jobs_admin()
            print "jobs_admin():", h2o.dump_json(a)
            # "destination_key": "pytest_model",
            # FIX! using 'key': 'pytest_model' with no time delay causes a failure
            time.sleep(1)
            jobsList = a['jobs']
            for j in jobsList:
                b = h2o.nodes[0].jobs_cancel(key=j['key'])
                print "jobs_cancel():", h2o.dump_json(b)
Example #24
    def test_rf_1ktrees_job_cancel_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        # always match the gen above!
        for trial in range (1,5):
            sys.stdout.write('.')
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)

            h2o.verboseprint("Trial", trial)
            start = time.time()

            # without rfview, do we get the 'first' rf json?
            rfv = h2o_cmd.runRF(parseResult=parseResult, trees=1000, max_depth=2, rfView=False,
                timeoutSecs=600, retryDelaySecs=3)
            print "RF #", trial,  "started on ", csvFilename, 'took', time.time() - start, 'seconds'
            # rf_model = rfv['drf_model']
            # data_key = rf_model['_dataKey']
            # model_key = rf_model['_key']
            data_key = rfv['source']['_key']
            model_key = rfv['destination_key']

            print "model_key:", model_key

            # FIX! need to get more intelligent here
            a = h2o.nodes[0].jobs_admin()
            print "jobs_admin():", h2o.dump_json(a)
            # this is the wrong key to cancel with
            # "destination_key": "pytest_model", 
            print "cancelling with a bad key"
            b = h2o.nodes[0].jobs_cancel(key=model_key)
            print "jobs_cancel():", h2o.dump_json(b)
Example #25
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file(
                "syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        # always match the gen above!
        for trial in xrange(1, 20, 1):
            sys.stdout.write('.')
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            # broke out the put separately so we can iterate a test just on the RF
            key = h2o.nodes[0].put_file(csvPathname)
            parseKey = h2o.nodes[0].parse(key, key + "_" + str(trial) + ".hex")

            h2o.verboseprint("Trial", trial)
            start = time.time()
            # rfview=False used to inhibit the rfview completion
            h2o_cmd.runRFOnly(parseKey=parseKey,
                              trees=trial,
                              depth=2,
                              rfview=False,
                              timeoutSecs=600,
                              retryDelaySecs=3)
            print "RF #", trial, "started on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'

            # FIX! need to get more intelligent here
            time.sleep(1)
            a = h2o.nodes[0].jobs_admin()
            print "jobs_admin():", h2o.dump_json(a)
            # "destination_key": "pytest_model",
            # FIX! using 'key': 'pytest_model' with no time delay causes a failure
            time.sleep(1)
            jobsList = a['jobs']
            for j in jobsList:
                b = h2o.nodes[0].jobs_cancel(key=j['key'])
                print "jobs_cancel():", h2o.dump_json(b)
Example #26
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file(
                "syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        # always match the gen above!
        for trial in range(1, 5):
            sys.stdout.write('.')
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            key2 = csvFilename + "_" + str(trial) + ".hex"
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                         key2=key2,
                                         timeoutSecs=30)

            h2o.verboseprint("Trial", trial)
            start = time.time()
            rfResult = h2o_cmd.runRFOnly(parseKey=parseKey,
                                         trees=1000,
                                         depth=2,
                                         rfView=False,
                                         timeoutSecs=600,
                                         retryDelaySecs=3)
            print "RF #", trial, "started on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'
            model_key = rfResult['model_key']
            print "model_key:", model_key

            # FIX! need to get more intelligent here
            a = h2o.nodes[0].jobs_admin()
            print "jobs_admin():", h2o.dump_json(a)
            # this is the wrong key to cancel with
            # "destination_key": "pytest_model",
            print "cancelling with a bad key"
            b = h2o.nodes[0].jobs_cancel(key=model_key)
            print "jobs_cancel():", h2o.dump_json(b)
Example #27
    def test_D_GenParity1(self):
        # Create a directory for the created dataset files. ok if already exists
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # always match the run below!
        print "\nGenerating some large row count parity datasets in", SYNDATASETS_DIR,
        print "\nmay be a minute.........."
        for x in xrange (161,240,20):
            # more rows!
            y = 10000 * x
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(y) + " quad"
            # FIX! as long as we're doing a couple, you'd think we wouldn't have to 
            # wait for the last one to be gen'ed here before we start the first below.
            # large row counts. need more time
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=90)
            # the algorithm for creating the path and filename is hardwired in parity.pl, i.e.:
            csvFilename = "parity_128_4_" + str(y) + "_quad.data"
            sys.stdout.write('.')
            sys.stdout.flush()
        print "\nDatasets generated. Using."

        # always match the gen above!
        # Let's try it twice!
        for trials in xrange(1,7):
            # prime
            trees = 6

            for x in xrange (161,240,20):
                y = 10000 * x
                print "\nTrial:", trials, ", y:", y

                csvFilename = "parity_128_4_" + str(y) + "_quad.data"  
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                # FIX! TBD do we always have to kick off the run from node 0?
                # random guess about length of time, varying with more hosts/nodes?
                timeoutSecs = 30 + trees*(len(h2o.nodes))

                # change the model name each iteration, so they stay in h2o
                model_key = csvFilename + "_" + str(trials)
                h2o_cmd.runRF(trees=trees, model_key=model_key, timeoutSecs=timeoutSecs, 
                    retryDelaySecs=1, csvPathname=csvPathname)
                sys.stdout.write('.')
                sys.stdout.flush()
Example #28
    def test_D_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # always match the run below!
        for x in xrange (11,100,10):
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad"
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),timeout=3)
            # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        trees = 6
        timeoutSecs = 20
        # always match the gen above!
        # reduce to get intermittent failures to lessen, for now
        for x in xrange (11,60,10):
            sys.stdout.write('.')
            sys.stdout.flush()
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            h2o_cmd.runRF(trees=trees, timeoutSecs=timeoutSecs, csvPathname=csvPathname)
            trees += 10
Example #29
    def test_rf_10ktrees_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file(
                "syn_scripts/parity.pl") + " 128 4 " + str(
                    x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        # always match the gen above!
        for trial in range(1, 3):
            sys.stdout.write('.')
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            parseResult = h2o_cmd.parseResult = h2i.import_parse(
                path=csvPathname,
                schema='put',
                hex_key=hex_key,
                timeoutSecs=30)

            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult,
                          trees=10000,
                          max_depth=2,
                          timeoutSecs=900,
                          retryDelaySecs=3)
            print "RF #", trial, "end on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'

        print "Waiting 60 secs for TIME_WAIT sockets to go away"
        time.sleep(60)
Example #30
    def test_rf_1ktrees_job_cancel_3_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        # always match the gen above!
        for trial in range (1,20):
            sys.stdout.write('.')
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)

            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=2, rfView=False,
                timeoutSecs=600, retryDelaySecs=3)
            print "RF #", trial,  "started on ", csvFilename, 'took', time.time() - start, 'seconds'

            # FIX! need to get more intelligent here
            time.sleep(1)
            a = h2o.nodes[0].jobs_admin()
            print "jobs_admin():", h2o.dump_json(a)
            # "destination_key": "pytest_model", 
            # FIX! using 'key': 'pytest_model' with no time delay causes a failure
            time.sleep(1)
            jobsList = a['jobs']
            for j in jobsList:
                b = h2o.nodes[0].jobs_cancel(key=j['key'])
                print "jobs_cancel():", h2o.dump_json(b)
Example #31
    def test_D_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        for x in xrange(50, 200, 10):
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
            # FIX! as long as we're doing a couple, you'd think we wouldn't have to
            # wait for the last one to be gen'ed here before we start the first below.
            h2o.spawn_cmd_and_wait("parity.pl", shCmdString.split(), timeout=3)
            # the algorithm for creating the path and filename is hardwired in parity.pl, i.e.:
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        # bump this up too if you do?
        # always match the gen above!
        ### for x in xrange (50,200,10):
        for x in xrange(50, 200, 10):
            sys.stdout.write(".")
            sys.stdout.flush()
            csvFilename = "parity_128_4_" + "100" + "_quad.data"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename
            h2o_cmd.runRF(csvPathname=csvPathname, trees=100, timeoutSecs=5, retryDelaySecs=0.1)
Example #32
    def test_D_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        for x in xrange (11,100,10):
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
            # FIX! as long as we're doing a couple, you'd think we wouldn't have to 
            # wait for the last one to be gen'ed here before we start the first below.
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=3)
            # the algorithm for creating the path and filename is hardwired in parity.pl, i.e.:
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        trees = 6
        timeoutSecs = 20
        # always match the gen above!
        # reduce to get intermittent failures to lessen, for now
        for x in xrange (11,60,10):
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            parseResult = h2i.import_parse(path=csvPathname, schema='put')
            h2o_cmd.runRF(parseResult=parseResult, trees=trees, timeoutSecs=timeoutSecs)
            trees += 10
Example #33
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad"
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        # always match the gen above!
        for trial in range (1,5):
            sys.stdout.write('.')
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            key2 = csvFilename + "_" + str(trial) + ".hex"
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=30)

            h2o.verboseprint("Trial", trial)
            start = time.time()
            rfResult = h2o_cmd.runRFOnly(parseKey=parseKey, trees=1000, depth=2, rfView=False,
                timeoutSecs=600, retryDelaySecs=3)
            print "RF #", trial,  "started on ", csvFilename, 'took', time.time() - start, 'seconds'
            model_key = rfResult['model_key']
            print "model_key:", model_key

            # FIX! need to get more intelligent here
            a = h2o.nodes[0].jobs_admin()
            print "jobs_admin():", h2o.dump_json(a)
            # this is the wrong key to cancel with
            # "destination_key": "pytest_model", 
            print "cancelling with a bad key"
            b = h2o.nodes[0].jobs_cancel(key=model_key)
            print "jobs_cancel():", h2o.dump_json(b)
Example #34
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
            h2o.spawn_cmd_and_wait("parity.pl", shCmdString.split(), 4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        # always match the gen above!
        for trial in xrange(1, 5, 1):
            sys.stdout.write(".")
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            # broke out the put separately so we can iterate a test just on the RF
            key = h2o.nodes[0].put_file(csvPathname)
            parseKey = h2o.nodes[0].parse(key, key + "_" + str(trial) + ".hex")

            h2o.verboseprint("Trial", trial)
            start = time.time()
            # rfview=False used to inhibit the rfview completion
            h2o_cmd.runRFOnly(parseKey=parseKey, trees=1000, depth=2, rfview=False, timeoutSecs=600, retryDelaySecs=3)
            print "RF #", trial, "started on ", csvFilename, "took", time.time() - start, "seconds"

            # FIX! need to get more intelligent here
            a = h2o.nodes[0].jobs_admin()
            print "jobs_admin():", h2o.dump_json(a)
            # this is the wrong key to cancel with
            # "destination_key": "pytest_model",
            print "cancelling with a bad key"
            b = h2o.nodes[0].jobs_cancel(key="pytest_model")
            print "jobs_cancel():", h2o.dump_json(b)
Example #35
    def test_rf_parity_cmp(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [50000]:
            shCmdString = "perl " + h2o.find_file(
                "syn_scripts/parity.pl") + " 128 4 " + str(
                    x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        def doBoth():
            h2o.verboseprint("Trial", trial)
            start = time.time()
            # make sure ntrees and max_depth are the same for both
            rfView = h2o_cmd.runRF(parseResult=parseResult,
                                   ntrees=ntrees,
                                   max_depth=40,
                                   response=response,
                                   timeoutSecs=600,
                                   retryDelaySecs=3)
            elapsed1 = time.time() - start
            (totalError1, classErrorPctList1,
             totalScores1) = h2o_rf.simpleCheckRFView(rfv=rfView)

            start = time.time()  # reset the timer so elapsed2 measures SpeeDRF alone
            rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                                        ntrees=ntrees,
                                        max_depth=40,
                                        response=response,
                                        timeoutSecs=600,
                                        retryDelaySecs=3)
            elapsed2 = time.time() - start
            (totalError2, classErrorPctList2,
             totalScores2) = h2o_rf.simpleCheckRFView(rfv=rfView)

            print "Checking that results are similar (within 20%)"
            print "DRF2 then SpeeDRF"
            print "per-class variance is large..basically we can't check very well for this dataset"
            for i, (j,
                    k) in enumerate(zip(classErrorPctList1,
                                        classErrorPctList2)):
                print "classErrorPctList[%s]:i %s %s" % (i, j, k)
                # self.assertAlmostEqual(classErrorPctList1[i], classErrorPctList2[i],
                #    delta=1 * classErrorPctList2[i], msg="Comparing RF class %s errors for DRF2 and SpeeDRF" % i)

            print "totalError: %s %s" % (totalError1, totalError2)
            self.assertAlmostEqual(
                totalError1,
                totalError2,
                delta=.2 * totalError2,
                msg="Comparing RF total error for DRF2 and SpeeDRF")
            print "elapsed: %s %s" % (elapsed1, elapsed2)
            self.assertAlmostEqual(
                elapsed1,
                elapsed2,
                delta=.5 * elapsed2,
                msg="Comparing RF times for DRF2 and SpeeDRF")

        # always match the gen above!
        for trial in range(1):
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            parseResult = h2o_cmd.parseResult = h2i.import_parse(
                path=csvPathname,
                schema='put',
                hex_key=hex_key,
                timeoutSecs=30,
                doSummary=False)

            inspect = h2o_cmd.runInspect(key=hex_key)
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            response = "C" + str(numCols)
            ntrees = 30

            doBoth()
            print "*****************************"
            print "end # %s RF compare" % trial,
            print "*****************************"

            print "Now change all cols to enums"
            for e in range(numCols):
                enumResult = h2o.nodes[0].to_enum(src_key=hex_key,
                                                  column_index=(e + 1))

            doBoth()
            print "*********************************"
            print "end # %s RF compare, with enums #" % trial,
            print "*********************************"
Example #36
    def test_rf_parity_cmp(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [50000]:
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        def doBoth():
            h2o.verboseprint("Trial", trial)
            start = time.time()
            # make sure ntrees and max_depth are the same for both
            rfView = h2o_cmd.runRF(parseResult=parseResult, ntrees=ntrees, max_depth=40, response=response,
                timeoutSecs=600, retryDelaySecs=3)
            elapsed1 = time.time() - start
            (totalError1, classErrorPctList1, totalScores1) = h2o_rf.simpleCheckRFView(rfv=rfView)

            start = time.time()  # reset the timer so elapsed2 measures SpeeDRF alone
            rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult, ntrees=ntrees, max_depth=40, response=response,
                timeoutSecs=600, retryDelaySecs=3)
            elapsed2 = time.time() - start
            (totalError2, classErrorPctList2, totalScores2) = h2o_rf.simpleCheckRFView(rfv=rfView)

            print "Checking that results are similar (within 20%)"
            print "DRF2 then SpeeDRF"
            print "per-class variance is large..basically we can't check very well for this dataset"
            for i, (j,k) in enumerate(zip(classErrorPctList1, classErrorPctList2)):
                print "classErrorPctList[%s]:i %s %s" % (i, j, k)
                # self.assertAlmostEqual(classErrorPctList1[i], classErrorPctList2[i], 
                #    delta=1 * classErrorPctList2[i], msg="Comparing RF class %s errors for DRF2 and SpeeDRF" % i)

            print "totalError: %s %s" % (totalError1, totalError2)
            self.assertAlmostEqual(totalError1, totalError2, delta=.2 * totalError2, msg="Comparing RF total error for DRF2 and SpeeDRF")
            print "elapsed: %s %s" % (elapsed1, elapsed2)
            self.assertAlmostEqual(elapsed1, elapsed2, delta=.5 * elapsed2, msg="Comparing RF times for DRF2 and SpeeDRF")

        # always match the gen above!
        for trial in range (1):
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)

            inspect = h2o_cmd.runInspect(key=hex_key)
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            response = "C" + str(numCols)
            ntrees = 30

            doBoth()
            print "*****************************"
            print "end # %s RF compare" % trial, 
            print "*****************************"

            print "Now change all cols to enums"
            for e in range(numCols):
                enumResult = h2o.nodes[0].to_enum(src_key=hex_key, column_index=(e+1))

            doBoth()
            print "*********************************"
            print "end # %s RF compare, with enums #" % trial, 
            print "*********************************"