def test_rf_params_rand2(self):
     csvPathname = 'space_shuttle_damage.csv'
     for trial in range(10):
         # params is mutable. This is default.
         params = {
             'sample': 80,
             'stat_type': 'ENTROPY',
             'class_weights': 'yes=1000',
             'ntree': 50, 
             'response_variable': 'damage', 
             'ignore': 'flight',
             'ntree': 25,
             'out_of_bag_error_estimate': 1,
         }
         print "params:", params 
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         print "params:", params 
         kwargs = params.copy()
         timeoutSecs = 180
         start = time.time()
         parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
         rfView = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
         elapsed = time.time()-start
         # just to get the list of per class errors
         (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, noPrint=True)
         print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs), "\n"
         # why does this vary between 22 and 23
         self.assertAlmostEqual(totalScores,23,delta=1) # class 1 is 'yes'
         self.assertLess(classErrorPctList[0],95) # class 0 is 'no'
         self.assertLess(classErrorPctList[1],29) # class 1 is 'yes'
         self.assertLess(classification_error,61)
示例#2
0
    def test_rf_params_rand2_fvec(self):
        h2o.beta_features = True
        csvPathname = "standard/covtype.data"
        hex_key = "covtype.data.hex"
        for trial in range(10):
            # params is mutable. This is default.
            params = {"ntrees": 13, "mtries": 7}
            colX = h2o_rf.pickRandRfParams(paramDict, params)
            if "cols" in params and params["cols"]:
                pass
            else:
                if "ignored_cols_by_name" in params and params["ignored_cols_by_name"]:
                    params["mtries"] = random.randint(1, 53)
                else:
                    params["mtries"] = random.randint(1, 54)

            kwargs = params.copy()
            # adjust timeoutSecs with the number of trees
            timeoutSecs = 30 + ((kwargs["ntrees"] * 80) * max(1, kwargs["mtries"] / 60))
            start = time.time()
            parseResult = h2i.import_parse(
                bucket="home-0xdiag-datasets", path=csvPathname, schema="put", hex_key=hex_key
            )
            h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
            elapsed = time.time() - start
            print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )
    def test_rf_params_rand2_ncaa(self):
        csvPathname = 'ncaa/Players.csv'
        for trial in range(4):
            # params is mutable. This is default.
            params = {'ntree': 13, 'features': 4}
            colX = h2o_rf.pickRandRfParams(paramDict, params)
            kwargs = params.copy()
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            timeoutSecs = 30 + (
                (kwargs['ntree'] * 20) * max(1, kwargs['features'] / 15))

            # hack to NA the header (duplicate header names)
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='put',
                                           header=0)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult,
                          timeoutSecs=timeoutSecs,
                          retryDelaySecs=1,
                          **kwargs)
            elapsed = time.time() - start
            print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs)
示例#4
0
 def test_rf_params_rand2(self):
     csvPathname = h2o.find_file('smalldata/space_shuttle_damage.csv')
     for trial in range(10):
         # params is mutable. This is default.
         params = {
             'sample': 80,
             'stat_type': 'ENTROPY',
             'class_weights': 'yes=1000',
             'ntree': 50, 
             'parallel': 1, 
             'response_variable': 'damage', 
             'ignore': 'flight',
             'ntree': 25,
             'out_of_bag_error_estimate': 1,
         }
         print "params:", params 
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         print "params:", params 
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         # seems ec2 can be really slow
         timeoutSecs = 30 + 15 * (kwargs['parallel'] and 6 or 10)
         start = time.time()
         rfView = h2o_cmd.runRF(timeoutSecs=timeoutSecs, retryDelaySecs=1, csvPathname=csvPathname, **kwargs)
         elapsed = time.time()-start
         # just to get the list of per class errors
         (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, noPrint=True)
         print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs), "\n"
         # why does this vary between 22 and 23
         self.assertAlmostEqual(totalScores,23,delta=1) # class 1 is 'yes'
         self.assertLess(classErrorPctList[0],95) # class 0 is 'no'
         self.assertLess(classErrorPctList[1],29) # class 1 is 'yes'
         self.assertLess(classification_error,61)
示例#5
0
 def test_rf_params_rand2(self):
     # for determinism, I guess we should spit out the seed?
     # random.seed(SEED)
     SEED = random.randint(0, sys.maxint)
     # if you have to force to redo a test
     # SEED =
     random.seed(SEED)
     print "\nUsing random seed:", SEED
     csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
     for trial in range(10):
         # params is mutable. This is default.
         params = {'ntree': 13, 'parallel': 1, 'features': 7}
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         timeoutSecs = 30 + (
             (kwargs['ntree'] * 20) * max(1, kwargs['features'] / 15) *
             (kwargs['parallel'] and 1 or 3))
         start = time.time()
         h2o_cmd.runRF(timeoutSecs=timeoutSecs,
                       retryDelaySecs=1,
                       csvPathname=csvPathname,
                       **kwargs)
         elapsed = time.time() - start
         print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
             (elapsed * 100) / timeoutSecs)
 def test_rf_params_rand2_7066883810153380318(self):
     csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
     for trial in range(10):
         # params is mutable. This is default.
         params = {'ntree': 23, 'parallel': 1, 'features': 7}
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15) * (kwargs['parallel'] and 1 or 3))
         h2o_cmd.runRF(timeoutSecs=timeoutSecs, csvPathname=csvPathname, **kwargs)
         print "Trial #", trial, "completed"
示例#7
0
 def test_loop_random_param_covtype(self):
     csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
     for trial in range(10):
         # params is mutable. This is default.
         params = {'ntree': 13, 'parallel': 1, 'features': 7}
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15) * (kwargs['parallel'] and 1 or 3))
         h2o_cmd.runRF(timeoutSecs=timeoutSecs, csvPathname=csvPathname, **kwargs)
         print "Trial #", trial, "completed"
示例#8
0
    def test_loop_random_param_poker1000(self):
        csvPathname = h2o.find_file('smalldata/poker/poker1000')
        for trial in range(20):
            # params is mutable. This is default.
            params = {'ntree': 19, 'parallel': 1}
            colX = h2o_rf.pickRandRfParams(paramDict, params)
            kwargs = params.copy()
            # adjust timeoutSecs with the number of trees
            timeoutSecs = 30 + kwargs['ntree'] * 10 * (kwargs['parallel'] and 1 or 5)

            h2o_cmd.runRF(timeoutSecs=timeoutSecs, csvPathname=csvPathname, **kwargs)
            print "Trial #", trial, "completed"
 def test_loop_random_param_covtype(self):
     csvPathname = 'UCI/UCI-large/covtype/covtype.data'
     for trial in range(10):
         # params is mutable. This is default.
         params = {'ntree': 13, 'parallel': 1, 'features': 7}
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15) * (kwargs['parallel'] and 1 or 3))
         parseResult = h2i.import_parse(bucket='datasets', path=csvPathname, schema='put')
         h2o_cmd.runRFOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
         print "Trial #", trial, "completed"
示例#10
0
 def test_loop_random_param_poker1000(self):
     csvPathname = 'poker/poker1000'
     for trial in range(20):
         # params is mutable. This is default.
         params = {'ntree': 19, 'parallel': 1}
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         timeoutSecs = 30 + kwargs['ntree'] * 10 * (kwargs['parallel'] and 1 or 5)
         h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=timeoutSecs, **kwargs)
         h2o_cmd.runRFOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
         print "Trial #", trial, "completed"
示例#11
0
 def test_rf_params_rand2(self):
     csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
     for trial in range(10):
         # params is mutable. This is default.
         params = {'ntree': 13, 'parallel': 1, 'features': 7}
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15) * (kwargs['parallel'] and 1 or 3))
         start = time.time()
         h2o_cmd.runRF(timeoutSecs=timeoutSecs, retryDelaySecs=1, csvPathname=csvPathname, **kwargs)
         elapsed = time.time()-start
         print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
示例#12
0
 def test_rf_params_rand1(self):
     csvPathname = "poker/poker1000"
     for trial in range(10):
         # params is mutable. This is default.
         params = {"ntree": 63, "use_non_local_data": 1}
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         print kwargs
         # slower if parallel=0
         timeoutSecs = 30 + kwargs["ntree"] * 6
         parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, schema="put", timeoutSecs=timeoutSecs)
         h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
         print "Trial #", trial, "completed"
示例#13
0
 def test_rf_params_rand1_fvec(self):
     csvPathname = 'poker/poker1000'
     params = {'ntrees': 2}
     for trial in range(10):
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         print kwargs
         # slower if parallel=0
         timeoutSecs = 30 + kwargs['ntrees'] * 6
         parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key='poker1000.hex', schema='put', 
             timeoutSecs=timeoutSecs)
         h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
         print "Trial #", trial, "completed"
示例#14
0
 def test_rf_params_rand1(self):
     csvPathname = 'poker/poker1000'
     for trial in range(10):
         # params is mutable. This is default.
         params = {'ntree': 63, 'parallel': 1, 'use_non_local_data': 1}
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         print kwargs
         # slower if parallel=0
         timeoutSecs = 30 + kwargs['ntree'] * 6 * (kwargs['parallel'] and 1 or 5)
         parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=timeoutSecs)
         h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
         print "Trial #", trial, "completed"
 def test_rf_params_rand2(self):
     csvPathname = 'standard/covtype.data'
     for trial in range(10):
         # params is mutable. This is default.
         params = {'ntree': 13, 'parallel': 1, 'features': 7}
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15) * (kwargs['parallel'] and 1 or 3))
         start = time.time()
         parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put')
         h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
         elapsed = time.time()-start
         print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
示例#16
0
    def test_rf_params_rand2(self):
        csvPathname = h2o.find_file('/home/0xdiag/datasets/ncaa/Players.csv')
        for trial in range(10):
            # params is mutable. This is default.
            params = {'ntree': 13, 'parallel': 1, 'features': 4}
            colX = h2o_rf.pickRandRfParams(paramDict, params)
            kwargs = params.copy()
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15) * (kwargs['parallel'] and 1 or 3))

            start = time.time()
            h2o_cmd.runRF(timeoutSecs=timeoutSecs, retryDelaySecs=1, csvPathname=csvPathname, **kwargs)
            elapsed = time.time()-start
            print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
示例#17
0
 def test_rf_params_rand1_fvec(self):
     h2o.beta_features = True
     csvPathname = 'poker/poker1000'
     for trial in range(10):
         # params is mutable. This is default.
         params = {'ntrees': 63}
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         print kwargs
         # slower if parallel=0
         timeoutSecs = 30 + kwargs['ntrees'] * 30
         parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=timeoutSecs)
         h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
         print "Trial #", trial, "completed"
示例#18
0
 def test_rf_params_rand1_fvec(self):
     h2o.beta_features = True
     csvPathname = 'poker/poker1000'
     params = {'ntrees': 2}
     for trial in range(10):
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         print kwargs
         # slower if parallel=0
         timeoutSecs = 30 + kwargs['ntrees'] * 6
         parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key='poker1000.hex', schema='put', 
             timeoutSecs=timeoutSecs)
         h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
         print "Trial #", trial, "completed"
 def test_loop_random_param_covtype(self):
     csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
     # for determinism, I guess we should spit out the seed?
     ##### SEED = random.randint(0, sys.maxint)
     # if you have to force to redo a test
     SEED = 4201285065147091758
     random.seed(SEED)
     print "\nUsing random seed:", SEED
     for trial in range(10):
         # params is mutable. This is default.
         params = {'ntree': 13, 'parallel': 1, 'features': 7}
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15) * (kwargs['parallel'] and 1 or 3))
         h2o_cmd.runRF(timeoutSecs=timeoutSecs, csvPathname=csvPathname, **kwargs)
         print "Trial #", trial, "completed"
 def test_loop_random_param_poker1000(self):
     csvPathname = 'poker/poker1000'
     for trial in range(20):
         # params is mutable. This is default.
         params = {'ntree': 19}
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         timeoutSecs = 30 + kwargs['ntree'] * 10
         parseResult = h2i.import_parse(bucket='smalldata',
                                        path=csvPathname,
                                        schema='put',
                                        timeoutSecs=timeoutSecs)
         h2o_cmd.runRF(parseResult=parseResult,
                       timeoutSecs=timeoutSecs,
                       **kwargs)
         print "Trial #", trial, "completed"
    def test_rf_params_rand2_ncaa(self):
        csvPathname = 'ncaa/Players.csv'
        for trial in range(4):
            # params is mutable. This is default.
            params = {'ntree': 13, 'features': 4}
            colX = h2o_rf.pickRandRfParams(paramDict, params)
            kwargs = params.copy()
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15))

            # hack to NA the header (duplicate header names)
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', header=0)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
            elapsed = time.time()-start
            print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
示例#22
0
    def test_rf_params_rand2(self):
        csvPathname = "ncaa/Players.csv"
        for trial in range(4):
            # params is mutable. This is default.
            params = {"ntree": 13, "features": 4}
            colX = h2o_rf.pickRandRfParams(paramDict, params)
            kwargs = params.copy()
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            timeoutSecs = 30 + ((kwargs["ntree"] * 20) * max(1, kwargs["features"] / 15))

            parseResult = h2i.import_parse(bucket="home-0xdiag-datasets", path=csvPathname, schema="put")
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
            elapsed = time.time() - start
            print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )
示例#23
0
 def test_rf_params_rand2(self):
     # for determinism, I guess we should spit out the seed?
     # random.seed(SEED)
     SEED = random.randint(0, sys.maxint)
     # if you have to force to redo a test
     # SEED = 
     random.seed(SEED)
     print "\nUsing random seed:", SEED
     csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
     for trial in range(20):
         # params is mutable. This is default.
         params = {'ntree': 13, 'parallel': 1}
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         # seems ec2 can be really slow
         timeoutSecs = 30 + 15 * (kwargs['parallel'] and 5 or 10)
         h2o_cmd.runRF(timeoutSecs=timeoutSecs, retryDelaySecs=1, csvPathname=csvPathname, **kwargs)
         print "Trial #", trial, "completed"
    def test_rf_params_rand2_7066883810153380318(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')

        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        # SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        SEED = 7066883810153380318
        random.seed(SEED)
        print "\nUsing random seed:", SEED
        for trial in range(10):
            # params is mutable. This is default.
            params = {'ntree': 23, 'parallel': 1, 'features': 7}
            colX = h2o_rf.pickRandRfParams(paramDict, params)
            kwargs = params.copy()
            # adjust timeoutSecs with the number of trees
            timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15) * (kwargs['parallel'] and 1 or 3))
            h2o_cmd.runRF(timeoutSecs=timeoutSecs, csvPathname=csvPathname, **kwargs)
            print "Trial #", trial, "completed"
    def test_rf_params_rand2_7066883810153380318(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')

        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        # SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        SEED = 7066883810153380318
        random.seed(SEED)
        print "\nUsing random seed:", SEED
        for trial in range(20):
            # params is mutable. This is default.
            params = {'ntree': 23, 'parallel': 1}
            colX = h2o_rf.pickRandRfParams(paramDict, params)
            kwargs = params.copy()
            # adjust timeoutSecs with the number of trees
            timeoutSecs = 30 + kwargs['ntree'] * 10 *  (kwargs['parallel'] and 1 or 3)
            h2o_cmd.runRF(timeoutSecs=timeoutSecs, csvPathname=csvPathname, **kwargs)
            print "Trial #", trial, "completed"
 def test_rf_params_rand1(self):
     csvPathname = 'poker/poker1000'
     for trial in range(10):
         # params is mutable. This is default.
         params = {'ntree': 63, 'use_non_local_data': 1}
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         print kwargs
         # slower if parallel=0
         timeoutSecs = 30 + kwargs['ntree'] * 6
         parseResult = h2i.import_parse(bucket='smalldata',
                                        path=csvPathname,
                                        schema='put',
                                        timeoutSecs=timeoutSecs)
         h2o_cmd.runRF(parseResult=parseResult,
                       timeoutSecs=timeoutSecs,
                       **kwargs)
         print "Trial #", trial, "completed"
示例#27
0
    def test_loop_random_param_poker1000(self):
        csvPathname = h2o.find_file('smalldata/poker/poker1000')
        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED = 
        random.seed(SEED)
        print "\nUsing random seed:", SEED
        for trial in range(20):
            # params is mutable. This is default.
            params = {'ntree': 19, 'parallel': 1}
            colX = h2o_rf.pickRandRfParams(paramDict, params)
            kwargs = params.copy()
            # adjust timeoutSecs with the number of trees
            timeoutSecs = 30 + kwargs['ntree'] * 10 * (kwargs['parallel'] and 1 or 5)

            h2o_cmd.runRF(timeoutSecs=timeoutSecs, csvPathname=csvPathname, **kwargs)
            print "Trial #", trial, "completed"
示例#28
0
 def test_rf_params_rand2(self):
     # for determinism, I guess we should spit out the seed?
     # random.seed(SEED)
     SEED = random.randint(0, sys.maxint)
     # if you have to force to redo a test
     # SEED = 
     random.seed(SEED)
     print "\nUsing random seed:", SEED
     csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
     for trial in range(10):
         # params is mutable. This is default.
         params = {'ntree': 13, 'parallel': 1, 'features': 7}
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15) * (kwargs['parallel'] and 1 or 3))
         start = time.time()
         h2o_cmd.runRF(timeoutSecs=timeoutSecs, retryDelaySecs=1, csvPathname=csvPathname, **kwargs)
         elapsed = time.time()-start
         print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
示例#29
0
 def test_rf_params_rand2(self):
     csvPathname = 'space_shuttle_damage.csv'
     for trial in range(10):
         # params is mutable. This is default.
         params = {
             'sample': 80,
             'stat_type': 'ENTROPY',
             'class_weights': 'yes=1000',
             'ntree': 50,
             'response_variable': 'damage',
             'ignore': 'flight',
             'ntree': 25,
             'out_of_bag_error_estimate': 1,
         }
         print "params:", params
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         print "params:", params
         kwargs = params.copy()
         timeoutSecs = 180
         start = time.time()
         parseResult = h2i.import_parse(bucket='smalldata',
                                        path=csvPathname,
                                        schema='put')
         rfView = h2o_cmd.runRF(parseResult=parseResult,
                                timeoutSecs=timeoutSecs,
                                retryDelaySecs=1,
                                **kwargs)
         elapsed = time.time() - start
         # just to get the list of per class errors
         (classification_error, classErrorPctList,
          totalScores) = h2o_rf.simpleCheckRFView(None,
                                                  rfView,
                                                  noPrint=True)
         print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
             (elapsed * 100) / timeoutSecs), "\n"
         # why does this vary between 22 and 23
         self.assertAlmostEqual(totalScores, 23,
                                delta=1)  # class 1 is 'yes'
         self.assertLess(classErrorPctList[0], 95)  # class 0 is 'no'
         self.assertLess(classErrorPctList[1], 29)  # class 1 is 'yes'
         self.assertLess(classification_error, 61)
示例#30
0
 def test_rf_params_rand2(self):
     # for determinism, I guess we should spit out the seed?
     # random.seed(SEED)
     SEED = random.randint(0, sys.maxint)
     # if you have to force to redo a test
     # SEED = 
     random.seed(SEED)
     print "\nUsing random seed:", SEED
     csvPathname = h2o.find_file('smalldata/space_shuttle_damage.csv')
     for trial in range(10):
         # params is mutable. This is default.
         params = {
             'sample': 80,
             'gini': 0,
             'class_weights': 'yes=1000',
             'ntree': 50, 
             'parallel': 1, 
             'response_variable': 'damage', 
             'ignore': 'flight',
             'ntree': 25,
             'out_of_bag_error_estimate': 1,
         }
         print "params:", params 
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         print "params:", params 
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         # seems ec2 can be really slow
         timeoutSecs = 30 + 15 * (kwargs['parallel'] and 6 or 10)
         start = time.time()
         rfView = h2o_cmd.runRF(timeoutSecs=timeoutSecs, retryDelaySecs=1, csvPathname=csvPathname, **kwargs)
         elapsed = time.time()-start
         # just to get the list of per class errors
         (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, noprint=True)
         print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs), "\n"
         self.assertEqual(totalScores,23) # class 1 is 'yes'
         self.assertLess(classErrorPctList[0],82) # class 0 is 'no'
         self.assertLess(classErrorPctList[1],29) # class 1 is 'yes'
         self.assertLess(classification_error,61)
示例#31
0
    def test_loop_random_param_poker1000(self):
        csvPathname = h2o.find_file('smalldata/poker/poker1000')
        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED =
        random.seed(SEED)
        print "\nUsing random seed:", SEED
        for trial in range(20):
            # params is mutable. This is default.
            params = {'ntree': 19, 'parallel': 1}
            colX = h2o_rf.pickRandRfParams(paramDict, params)
            kwargs = params.copy()
            # adjust timeoutSecs with the number of trees
            timeoutSecs = 30 + kwargs['ntree'] * 10 * (kwargs['parallel'] and 1
                                                       or 5)

            h2o_cmd.runRF(timeoutSecs=timeoutSecs,
                          csvPathname=csvPathname,
                          **kwargs)
            print "Trial #", trial, "completed"
示例#32
0
    def test_rf_params_rand2_fvec(self):
        h2o.beta_features = True
        csvPathname = 'standard/covtype.data'
        hex_key = 'covtype.data.hex'
        for trial in range(2):
            # params is mutable. This is default.
            params = {
                'ntrees': 13,
                'mtries': 7,
                'balance_classes': 0,
                'importance': 0
            }
            colX = h2o_rf.pickRandRfParams(paramDict, params)
            if 'cols' in params and params['cols']:
                pass
            else:
                if 'ignored_cols_by_name' in params and params[
                        'ignored_cols_by_name']:
                    params['mtries'] = random.randint(1, 53)
                else:
                    params['mtries'] = random.randint(1, 54)

            kwargs = params.copy()
            # adjust timeoutSecs with the number of trees
            timeoutSecs = 30 + (
                (kwargs['ntrees'] * 80) * max(1, kwargs['mtries'] / 60))
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key)
            h2o_cmd.runRF(parseResult=parseResult,
                          timeoutSecs=timeoutSecs,
                          retryDelaySecs=1,
                          **kwargs)
            elapsed = time.time() - start
            print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs)
示例#33
0
 def test_rf_params_rand2(self):
     h2o.beta_features = True
     csvPathname = 'standard/covtype.data'
     for trial in range(10):
         # params is mutable. This is default.
         params = {'ntrees': 13, 'mtries': 7}
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         if 'cols' in params and params['cols']:
             pass
         else:
             if 'ignored_cols_by_name' in params and params['ignored_cols_by_name']:
                 params['mtries'] = random.randint(1,53)
             else:
                 params['mtries'] = random.randint(1,54)
             
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         timeoutSecs = 30 + ((kwargs['ntrees']*80) * max(1,kwargs['mtries']/60) )
         start = time.time()
         parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put')
         h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
         elapsed = time.time()-start
         print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
示例#34
0
    def test_rfview_score(self):
        csvPathnameTrain = 'UCI/UCI-large/covtype/covtype.data'
        print "Train with:", csvPathnameTrain
        parseResultTrain = h2i.import_parse(bucket='datasets', path=csvPathnameTrain, schema='put', 
            hex_key="covtype.hex", timeoutSecs=15)
        dataKeyTrain = parseResultTrain['destination_key']

        csvPathnameTest = 'UCI/UCI-large/covtype/covtype.data'
        print "Test with:", csvPathnameTest
        parseResultTest = h2i.import_parse(bucket='datasets', path=csvPathnameTest, schema='put', 
            hex_key="covtype.hex", timeoutSecs=15)
        dataKeyTest = parseResultTest['destination_key']

        for trial in range(5):
            # params is mutable. This is default.
            params = {'ntree': 13, 'parallel': 1, 'out_of_bag_error_estimate': 0}
            colX = h2o_rf.pickRandRfParams(paramDict, params)
            kwargs = params.copy()
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            timeoutSecs = 30 + kwargs['ntree'] * 10 * (kwargs['parallel'] and 1 or 5)
            rfv = h2o_cmd.runRF(parseResult=parseResultTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
            ### print "rf response:", h2o.dump_json(rfv)

            model_key = rfv['model_key']
            # pop the stuff from kwargs that were passing as params
            kwargs.pop('model_key',None)

            data_key = rfv['data_key']
            kwargs.pop('data_key',None)

            ntree = rfv['ntree']
            kwargs.pop('ntree',None)
            # scoring
            # RFView.html?
            # dataKeyTest=a5m.hex&
            # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628&
            # response_variable=1&
            # ntree=50&
            # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0&
            # out_of_bag_error_estimate=1&
            rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, 
                timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)
            # new web page for predict? throw it in here for now

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
            # don't check error if stratified
            if 'sampling_strategy' in kwargs and kwargs['sampling_strategy'] != 'STRATIFIED_LOCAL':
                check_err = True
            else:
                check_err = False

            if check_err:
                self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)

            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
            elapsed = time.time() - start
            print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

            kwargs['iterative_cm'] = 0
            rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree,
                timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)
            # FIX! should update this expected classification error
            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
            # don't check error if stratified
            if check_err:
                self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
            elapsed = time.time() - start
            print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

            kwargs['iterative_cm'] = 1
            rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, 
                timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)
            # FIX! should update this expected classification error
            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
            # don't check error if stratified
            if check_err:
                self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
            elapsed = time.time() - start
            print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

            kwargs['iterative_cm'] = 1
            kwargs['class_weights'] = '1=1,2=2,3=3,4=4,5=5,6=6,7=7'
            rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree,
                timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)
            # FIX! should update this expected classification error
            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
            # don't check error if stratified
            if check_err:
                self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
            elapsed = time.time() - start
            print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

            print "Trial #", trial, "completed"
示例#35
0
    def test_rf_covtype20x(self):
        importFolderPath = 'standard'

        csvFilenameTrain = 'covtype20x.data'
        csvPathname = importFolderPath + "/" + csvFilenameTrain
        hex_key = 'covtype20x.data.A.hex'
        parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500)
        print csvFilenameTrain, 'parse time:', parseResultTrain['response']['time']
        inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
        dataKeyTrain = parseResultTrain['destination_key']
        print "Parse end", dataKeyTrain

        # have to re import since source key is gone
        # we could just copy the key, but sometimes we change the test/train data  to covtype.data
        csvFilenameTest = 'covtype20x.data'
        csvPathname = importFolderPath + "/" + csvFilenameTest
        hex_key = 'covtype20x.data.B.hex'
        parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500)
        print csvFilenameTest, 'parse time:', parseResultTest['response']['time']
        print "Parse result['destination_key']:", parseResultTest['destination_key']
        inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
        dataKeyTest = parseResultTest['destination_key']
        dataKeyTest2 = 'covtype20x.data.C.hex'

        print "Parse end", dataKeyTest
        
        # make a 3rd key so the predict is uncached too!
        execExpr = dataKeyTest2 + "=" + dataKeyTest
        resultExec = h2o_cmd.runExec(expression=execExpr, timeoutSecs=15)

        # train
        # this does RFView to understand when RF completes, so the time reported for RFView here, should be 
        # considered the "first RFView" times..subsequent have some caching?. 
        # unless the no_confusion_matrix works

        # params is mutable. This is default.
        print "RF with no_confusion_matrix=1, so we can 'time' the RFView separately after job completion?"
        params = {
            'ntree': 6, 
            'parallel': 1, 
            'out_of_bag_error_estimate': 0, 
# Causes rest api illegal argument error.
#            'no_confusion_matrix': 1,
            'model_key': 'RF_model'
        }

        colX = h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = params.copy()
        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        timeoutSecs = 30 + kwargs['ntree'] * 60 * (kwargs['parallel'] and 1 or 5)

        start = time.time()
        rfv = h2o_cmd.runRF(parseResult=parseResultTrain,
            timeoutSecs=timeoutSecs, retryDelaySecs=1, noPoll=True, **kwargs)
        print "rf job dispatch end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'
        ### print "rf response:", h2o.dump_json(rfv)


        start = time.time()
        h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=300, pollTimeoutSecs=500, retryDelaySecs=5)
        print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

        print "\nRFView start after job completion"
        model_key = kwargs['model_key']
        ntree = kwargs['ntree']
        start = time.time()
        h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntree, timeoutSecs)
        print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

        for trial in range(3):
            # scoring
            start = time.time()
            rfView = h2o_cmd.runRFView(None, dataKeyTest, 
                model_key, ntree, timeoutSecs, out_of_bag_error_estimate=0, retryDelaySecs=1)
            print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

            # FIX! should update this expected classification error
            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
            self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest2)
            print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

            print "Trial #", trial, "completed"
示例#36
0
    def test_rfview_score(self):
        csvPathnameTrain = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        print "Train with:", csvPathnameTrain
        parseKeyTrain = h2o_cmd.parseFile(csvPathname=csvPathnameTrain, key2="covtype.hex", timeoutSecs=15)
        dataKeyTrain = parseKeyTrain['destination_key']

        csvPathnameTest = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        print "Test with:", csvPathnameTest
        parseKeyTest = h2o_cmd.parseFile(csvPathname=csvPathnameTrain, key2="covtype.hex", timeoutSecs=15)
        dataKeyTest = parseKeyTest['destination_key']

        for trial in range(5):
            # params is mutable. This is default.
            params = {'ntree': 13, 'parallel': 1, 'out_of_bag_error_estimate': 0}
            colX = h2o_rf.pickRandRfParams(paramDict, params)
            kwargs = params.copy()
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            timeoutSecs = 30 + kwargs['ntree'] * 10 * (kwargs['parallel'] and 1 or 5)
            rfv = h2o_cmd.runRFOnly(parseKey=parseKeyTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
    
            ### print "rf response:", h2o.dump_json(rfv)

            model_key = rfv['model_key']
            # pop the stuff from kwargs that were passing as params
            kwargs.pop('model_key',None)

            data_key = rfv['data_key']
            kwargs.pop('data_key',None)

            ntree = rfv['ntree']
            kwargs.pop('ntree',None)
            # scoring
            # RFView.html?
            # dataKeyTest=a5m.hex&
            # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628&
            # response_variable=1&
            # ntree=50&
            # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0&
            # out_of_bag_error_estimate=1&
            h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, 
                timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)
            # new web page for predict? throw it in here for now
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
            elapsed = time.time() - start
            print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

            kwargs['iterative_cm'] = 0
            h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree,
                timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
            elapsed = time.time() - start
            print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

            kwargs['iterative_cm'] = 1
            h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, 
                timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
            elapsed = time.time() - start
            print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

            kwargs['iterative_cm'] = 1
            kwargs['class_weights'] = '1=1,2=2,3=3,4=4,5=5,6=6,7=7'
            h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree,
                timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
            elapsed = time.time() - start
            print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

            print "Trial #", trial, "completed"
示例#37
0
    def test_rf_covtype20x_fvec(self):
        h2o.beta_features = True
        importFolderPath = 'standard'

        if DO_SMALL:
            csvFilenameTrain = 'covtype.data'
            hex_key = 'covtype1x.data.A.hex'
        else:
            csvFilenameTrain = 'covtype20x.data'
            hex_key = 'covtype20x.data.A.hex'

        csvPathname = importFolderPath + "/" + csvFilenameTrain
        parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets',
                                            path=csvPathname,
                                            hex_key=hex_key,
                                            timeoutSecs=500)
        inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
        dataKeyTrain = parseResultTrain['destination_key']
        print "Parse end", dataKeyTrain

        # have to re import since source key is gone
        # we could just copy the key, but sometimes we change the test/train data  to covtype.data
        if DO_SMALL:
            csvFilenameTest = 'covtype.data'
            hex_key = 'covtype1x.data.B.hex'
            dataKeyTest2 = 'covtype1x.data.C.hex'
        else:
            csvFilenameTest = 'covtype20x.data'
            hex_key = 'covtype20x.data.B.hex'
            dataKeyTest2 = 'covtype20x.data.C.hex'

        csvPathname = importFolderPath + "/" + csvFilenameTest
        parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           hex_key=hex_key,
                                           timeoutSecs=500)
        print "Parse result['destination_key']:", parseResultTest[
            'destination_key']
        inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
        dataKeyTest = parseResultTest['destination_key']
        print "Parse end", dataKeyTest

        # make a 3rd key so the predict is uncached too!
        execExpr = dataKeyTest2 + "=" + dataKeyTest
        kwargs = {'str': execExpr, 'timeoutSecs': 15}
        resultExec = h2o_cmd.runExec(**kwargs)

        # train
        # this does RFView to understand when RF completes, so the time reported for RFView here, should be
        # considered the "first RFView" times..subsequent have some caching?.
        # unless the no_confusion_matrix works

        # params is mutable. This is default.
        paramDict = drf2ParamDict
        params = {'ntrees': 20, 'destination_key': 'RF_model'}

        colX = h2o_rf.pickRandRfParams(paramDict, params)

        kwargs = params.copy()
        timeoutSecs = 30 + kwargs['ntrees'] * 60

        start = time.time()
        rf = h2o_cmd.runRF(parseResult=parseResultTrain,
                           timeoutSecs=timeoutSecs,
                           retryDelaySecs=1,
                           **kwargs)
        print "rf job end on ", dataKeyTrain, 'took', time.time(
        ) - start, 'seconds'

        print "\nRFView start after job completion"
        model_key = kwargs['destination_key']
        ntree = kwargs['ntrees']

        start = time.time()
        # this does the RFModel view for v2. but only model_key is used. Data doesn't matter? (nor ntree)
        h2o_cmd.runRFView(None,
                          dataKeyTrain,
                          model_key,
                          ntree=ntree,
                          timeoutSecs=timeoutSecs)
        print "First rfview end on ", dataKeyTrain, 'took', time.time(
        ) - start, 'seconds'

        for trial in range(1):
            # scoring
            start = time.time()
            rfView = h2o_cmd.runRFView(None,
                                       dataKeyTest,
                                       model_key,
                                       ntree=ntree,
                                       timeoutSecs=timeoutSecs,
                                       out_of_bag_error_estimate=0,
                                       retryDelaySecs=1)
            print "rfview", trial, "end on ", dataKeyTest, 'took', time.time(
            ) - start, 'seconds.'

            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
            self.assertAlmostEqual(
                classification_error,
                50,
                delta=50,
                msg="Classification error %s differs too much" %
                classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=dataKeyTest2)
            print "predict", trial, "end on ", dataKeyTest, 'took', time.time(
            ) - start, 'seconds.'

            parseKey = parseResultTrain['destination_key']
            rfModelKey = rfView['drf_model']['_key']
            predictKey = 'Predict.hex'
            start = time.time()

            predictResult = h2o_cmd.runPredict(data_key=parseKey,
                                               model_key=rfModelKey,
                                               destination_key=predictKey,
                                               timeoutSecs=timeoutSecs)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=parseKey,
                vactual='C55',
                predict=predictKey,
                vpredict='predict',
            )

            cm = predictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm)
            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)

            print "Trial #", trial, "completed"
示例#38
0
    def test_rf_change_data_key(self):
        importFolderPath = '/home/0xdiag/datasets/standard'
        importFolderResult = h2i.setupImportFolder(None, importFolderPath)

        csvFilenameTrain = 'covtype.data'
        parseKeyTrain = h2i.parseImportFolderFile(None, csvFilenameTrain, importFolderPath, timeoutSecs=500)
        print csvFilenameTrain, 'parse time:', parseKeyTrain['response']['time']
        inspect = h2o_cmd.runInspect(key=parseKeyTrain['destination_key'])
        dataKeyTrain = parseKeyTrain['destination_key']
        print "Parse end", dataKeyTrain

        # we could train on covtype, and then use covtype20x for test? or vice versa
        # parseKey = parseKey
        # dataKeyTest = dataKeyTrain
        csvFilenameTest = 'covtype20x.data'
        parseKeyTest = h2i.parseImportFolderFile(None, csvFilenameTest, importFolderPath, timeoutSecs=500)
        print csvFilenameTest, 'parse time:', parseKeyTest['response']['time']
        print "Parse result['destination_key']:", parseKeyTest['destination_key']
        inspect = h2o_cmd.runInspect(key=parseKeyTest['destination_key'])
        dataKeyTest = parseKeyTest['destination_key']

        print "Parse end", dataKeyTest

        # train
        # this does RFView to understand when RF completes, so the time reported for RFView here, should be 
        # considered the "first RFView" times..subsequent have some caching?. 
        # unless the no_confusion_matrix works

        # params is mutable. This is default.
        print "RF with no_confusion_matrix=1, so we can 'time' the RFView separately after job completion?"
        params = {
            'ntree': 6, 
            'parallel': 1, 
            'out_of_bag_error_estimate': 0, 
            'no_confusion_matrix': 1,
            'model_key': 'RF_model'
        }

        colX = h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = params.copy()
        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        timeoutSecs = 30 + kwargs['ntree'] * 60 * (kwargs['parallel'] and 1 or 5)

        start = time.time()
        rfv = h2o_cmd.runRFOnly(parseKey=parseKeyTrain,
            timeoutSecs=timeoutSecs, retryDelaySecs=1, noPoll=True, **kwargs)
        print "rf job dispatch end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'
        ### print "rf response:", h2o.dump_json(rfv)


        start = time.time()
        h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=180, pollTimeoutSecs=120, retryDelaySecs=5)
        print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

        print "\nRFView start after job completion"
        model_key = kwargs['model_key']
        ntree = kwargs['ntree']
        start = time.time()
        h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntree, timeoutSecs)
        print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

        for trial in range(3):
            # scoring
            start = time.time()
            h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, out_of_bag_error_estimate=1, retryDelaySecs=1)
            print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
            print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

            print "Trial #", trial, "completed"
示例#39
0
    def test_rf_covtype20x_fvec(self):
        h2o.beta_features = True
        importFolderPath = 'standard'

        if DO_SMALL:
            csvFilenameTrain = 'covtype.data'
            hex_key = 'covtype1x.data.A.hex'
        else:
            csvFilenameTrain = 'covtype20x.data'
            hex_key = 'covtype20x.data.A.hex'

        csvPathname = importFolderPath + "/" + csvFilenameTrain
        parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500)
        inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
        dataKeyTrain = parseResultTrain['destination_key']
        print "Parse end", dataKeyTrain

        # have to re import since source key is gone
        # we could just copy the key, but sometimes we change the test/train data  to covtype.data
        if DO_SMALL:
            csvFilenameTest = 'covtype.data'
            hex_key = 'covtype1x.data.B.hex'
            dataKeyTest2 = 'covtype1x.data.C.hex'
        else:
            csvFilenameTest = 'covtype20x.data'
            hex_key = 'covtype20x.data.B.hex'
            dataKeyTest2 = 'covtype20x.data.C.hex'

        csvPathname = importFolderPath + "/" + csvFilenameTest
        parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500)
        print "Parse result['destination_key']:", parseResultTest['destination_key']
        inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
        dataKeyTest = parseResultTest['destination_key']
        print "Parse end", dataKeyTest

        # make a 3rd key so the predict is uncached too!
        execExpr = dataKeyTest2 + "=" + dataKeyTest
        if h2o.beta_features:
            kwargs = {'str': execExpr, 'timeoutSecs': 15}
        else:
            kwargs = {'expression': execExpr, 'timeoutSecs': 15}

        resultExec = h2o_cmd.runExec(**kwargs)

        # train
        # this does RFView to understand when RF completes, so the time reported for RFView here, should be 
        # considered the "first RFView" times..subsequent have some caching?. 
        # unless the no_confusion_matrix works

        # params is mutable. This is default.
        if h2o.beta_features:
            paramDict = drf2ParamDict
            params = {
                'ntrees': 20, 
                'destination_key': 'RF_model'
            }
        else:
            paramDict = drf1ParamDict
            params = {
                'ntree': 20, 
                'out_of_bag_error_estimate': 1, 
                'model_key': 'RF_model'
            }

        colX = h2o_rf.pickRandRfParams(paramDict, params)

        kwargs = params.copy()
        if h2o.beta_features:
            timeoutSecs = 30 + kwargs['ntrees'] * 60
        else:
            timeoutSecs = 30 + kwargs['ntree'] * 60 

        start = time.time()
        rf = h2o_cmd.runRF(parseResult=parseResultTrain,
            timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
        print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

        print "\nRFView start after job completion"
        if h2o.beta_features:
            model_key = kwargs['destination_key']
            ntree = kwargs['ntrees']
        else:
            model_key = kwargs['model_key']
            ntree = kwargs['ntree']

        start = time.time()
        # this does the RFModel view for v2. but only model_key is used. Data doesn't matter? (nor ntree)
        h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntree=ntree, timeoutSecs=timeoutSecs)
        print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

        for trial in range(1):
            # scoring
            start = time.time()
            rfView = h2o_cmd.runRFView(None, dataKeyTest, 
                model_key, ntree=ntree, timeoutSecs=timeoutSecs, out_of_bag_error_estimate=0, retryDelaySecs=1)
            print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
            self.assertAlmostEqual(classification_error, 50, delta=50, 
                msg="Classification error %s differs too much" % classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest2)
            print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

            parseKey = parseResultTrain['destination_key']
            rfModelKey  = rfView['drf_model']['_key']
            predictKey = 'Predict.hex'
            start = time.time()

            predictResult = h2o_cmd.runPredict(
                data_key=parseKey,
                model_key=rfModelKey,
                destination_key=predictKey,
                timeoutSecs=timeoutSecs)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=parseKey,
                vactual='C54',
                predict=predictKey,
                vpredict='predict',
                )

            cm = predictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm);
            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)

            print "Trial #", trial, "completed"
示例#40
0
    def test_rfview_score(self):
        csvPathnameTrain = h2o.find_dataset(
            'UCI/UCI-large/covtype/covtype.data')
        print "Train with:", csvPathnameTrain
        parseKeyTrain = h2o_cmd.parseFile(csvPathname=csvPathnameTrain,
                                          key2="covtype.hex",
                                          timeoutSecs=15)
        dataKeyTrain = parseKeyTrain['destination_key']

        csvPathnameTest = h2o.find_dataset(
            'UCI/UCI-large/covtype/covtype.data')
        print "Test with:", csvPathnameTest
        parseKeyTest = h2o_cmd.parseFile(csvPathname=csvPathnameTrain,
                                         key2="covtype.hex",
                                         timeoutSecs=15)
        dataKeyTest = parseKeyTest['destination_key']

        for trial in range(5):
            # params is mutable. This is default.
            params = {
                'ntree': 13,
                'parallel': 1,
                'out_of_bag_error_estimate': 0
            }
            colX = h2o_rf.pickRandRfParams(paramDict, params)
            kwargs = params.copy()
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            timeoutSecs = 30 + kwargs['ntree'] * 10 * (kwargs['parallel'] and 1
                                                       or 5)
            rfv = h2o_cmd.runRFOnly(parseKey=parseKeyTrain,
                                    timeoutSecs=timeoutSecs,
                                    retryDelaySecs=1,
                                    **kwargs)

            ### print "rf response:", h2o.dump_json(rfv)

            model_key = rfv['model_key']
            # pop the stuff from kwargs that were passing as params
            kwargs.pop('model_key', None)

            data_key = rfv['data_key']
            kwargs.pop('data_key', None)

            ntree = rfv['ntree']
            kwargs.pop('ntree', None)
            # scoring
            # RFView.html?
            # dataKeyTest=a5m.hex&
            # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628&
            # response_variable=1&
            # ntree=50&
            # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0&
            # out_of_bag_error_estimate=1&
            h2o_cmd.runRFView(None,
                              dataKeyTest,
                              model_key,
                              ntree,
                              timeoutSecs,
                              retryDelaySecs=1,
                              print_params=True,
                              **kwargs)
            # new web page for predict? throw it in here for now
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=dataKeyTest)
            elapsed = time.time() - start
            print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

            kwargs['iterative_cm'] = 0
            h2o_cmd.runRFView(None,
                              dataKeyTest,
                              model_key,
                              ntree,
                              timeoutSecs,
                              retryDelaySecs=1,
                              print_params=True,
                              **kwargs)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=dataKeyTest)
            elapsed = time.time() - start
            print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

            kwargs['iterative_cm'] = 1
            h2o_cmd.runRFView(None,
                              dataKeyTest,
                              model_key,
                              ntree,
                              timeoutSecs,
                              retryDelaySecs=1,
                              print_params=True,
                              **kwargs)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=dataKeyTest)
            elapsed = time.time() - start
            print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

            kwargs['iterative_cm'] = 1
            kwargs['class_weights'] = '1=1,2=2,3=3,4=4,5=5,6=6,7=7'
            h2o_cmd.runRFView(None,
                              dataKeyTest,
                              model_key,
                              ntree,
                              timeoutSecs,
                              retryDelaySecs=1,
                              print_params=True,
                              **kwargs)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=dataKeyTest)
            elapsed = time.time() - start
            print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

            print "Trial #", trial, "completed"
示例#41
0
    def test_rfview_score(self):
        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED =
        random.seed(SEED)
        print "\nUsing random seed:", SEED

        csvPathnameTrain = h2o.find_file('smalldata/covtype/covtype.20k.data')
        print "Train with:", csvPathnameTrain
        parseKeyTrain = h2o_cmd.parseFile(csvPathname=csvPathnameTrain,
                                          key2="covtype.20k.hex",
                                          timeoutSecs=10)
        dataKeyTrain = parseKeyTrain['destination_key']

        csvPathnameTest = h2o.find_dataset(
            'UCI/UCI-large/covtype/covtype.data')
        print "Test with:", csvPathnameTest
        parseKeyTest = h2o_cmd.parseFile(csvPathname=csvPathnameTrain,
                                         key2="covtype.hex",
                                         timeoutSecs=10)
        dataKeyTest = parseKeyTest['destination_key']

        for trial in range(5):
            # params is mutable. This is default.
            params = {
                'ntree': 13,
                'parallel': 1,
                'out_of_bag_error_estimate': 0
            }
            colX = h2o_rf.pickRandRfParams(paramDict, params)
            kwargs = params.copy()
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            timeoutSecs = 30 + 15 * (kwargs['parallel'] and 5 or 10)
            rfv = h2o_cmd.runRFOnly(parseKey=parseKeyTrain,
                                    timeoutSecs=timeoutSecs,
                                    retryDelaySecs=1,
                                    **kwargs)

            ### print "rf response:", h2o.dump_json(rfv)

            model_key = rfv['model_key']
            # pop the stuff from kwargs that were passing as params
            kwargs.pop('model_key', None)

            data_key = rfv['data_key']
            kwargs.pop('data_key', None)

            ntree = rfv['ntree']
            kwargs.pop('ntree', None)
            # scoring
            # RFView.html?
            # dataKeyTest=a5m.hex&
            # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628&
            # response_variable=1&
            # ntree=50&
            # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0&
            # out_of_bag_error_estimate=1&
            # no_confusion_matrix=1&
            # clear_confusion_matrix=1
            h2o_cmd.runRFView(None,
                              dataKeyTest,
                              model_key,
                              ntree,
                              timeoutSecs,
                              retryDelaySecs=1,
                              print_params=True,
                              **kwargs)

            kwargs['no_confusion_matrix'] = 0
            kwargs['clear_confusion_matrix'] = 0
            h2o_cmd.runRFView(None,
                              dataKeyTest,
                              model_key,
                              ntree,
                              timeoutSecs,
                              retryDelaySecs=1,
                              print_params=True,
                              **kwargs)

            kwargs['no_confusion_matrix'] = 0
            kwargs['clear_confusion_matrix'] = 1
            h2o_cmd.runRFView(None,
                              dataKeyTest,
                              model_key,
                              ntree,
                              timeoutSecs,
                              retryDelaySecs=1,
                              print_params=True,
                              **kwargs)

            kwargs['no_confusion_matrix'] = 1
            kwargs['clear_confusion_matrix'] = 0
            h2o_cmd.runRFView(None,
                              dataKeyTest,
                              model_key,
                              ntree,
                              timeoutSecs,
                              retryDelaySecs=1,
                              print_params=True,
                              **kwargs)

            kwargs['no_confusion_matrix'] = 1
            kwargs['clear_confusion_matrix'] = 1
            h2o_cmd.runRFView(None,
                              dataKeyTest,
                              model_key,
                              ntree,
                              timeoutSecs,
                              retryDelaySecs=1,
                              print_params=True,
                              **kwargs)

            kwargs['no_confusion_matrix'] = 0
            kwargs['clear_confusion_matrix'] = 0
            kwargs['class_weights'] = '1=1,2=2,3=3,4=4,5=5,6=6,7=7'
            h2o_cmd.runRFView(None,
                              dataKeyTest,
                              model_key,
                              ntree,
                              timeoutSecs,
                              retryDelaySecs=1,
                              print_params=True,
                              **kwargs)

            print "Trial #", trial, "completed"
    def test_rf_change_data_key_fvec(self):
        importFolderPath = 'standard'

        csvFilenameTrain = 'covtype.data'
        csvPathname = importFolderPath + "/" + csvFilenameTrain
        parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets',
                                            path=csvPathname,
                                            timeoutSecs=500)
        inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
        dataKeyTrain = parseResultTrain['destination_key']
        print "Parse end", dataKeyTrain

        # we could train on covtype, and then use covtype20x for test? or vice versa
        # parseResult = parseResult
        # dataKeyTest = dataKeyTrain
        csvFilenameTest = 'covtype20x.data'
        csvPathname = importFolderPath + "/" + csvFilenameTest
        parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           timeoutSecs=500)
        print "Parse result['destination_key']:", parseResultTest[
            'destination_key']
        inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
        dataKeyTest = parseResultTest['destination_key']

        print "Parse end", dataKeyTest

        # train
        # this does RFView to understand when RF completes, so the time reported for RFView here, should be
        # considered the "first RFView" times..subsequent have some caching?.
        # unless the no_confusion_matrix works

        # params is mutable. This is default.
        params = {'ntrees': 2, 'destination_key': 'RF_model'}

        colX = h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = params.copy()
        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow

        timeoutSecs = 100
        start = time.time()
        rfv = h2o_cmd.runRF(parseResult=parseResultTrain,
                            timeoutSecs=timeoutSecs,
                            retryDelaySecs=1,
                            noPoll=True,
                            **kwargs)
        print "rf job dispatch end on ", dataKeyTrain, 'took', time.time(
        ) - start, 'seconds'
        ### print "rf response:", h2o.dump_json(rfv)

        start = time.time()
        h2o_jobs.pollWaitJobs(pattern='RF_model',
                              timeoutSecs=360,
                              pollTimeoutSecs=120,
                              retryDelaySecs=5)
        print "rf job end on ", dataKeyTrain, 'took', time.time(
        ) - start, 'seconds'

        print "\nRFView start after job completion"
        model_key = kwargs['destination_key']
        ntrees = kwargs['ntrees']
        start = time.time()
        h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntrees, timeoutSecs)
        print "First rfview end on ", dataKeyTrain, 'took', time.time(
        ) - start, 'seconds'

        for trial in range(3):
            # scoring
            start = time.time()
            rfView = h2o_cmd.runRFView(None,
                                       dataKeyTest,
                                       model_key,
                                       ntrees,
                                       timeoutSecs,
                                       out_of_bag_error_estimate=1,
                                       retryDelaySecs=1)
            print "rfview", trial, "end on ", dataKeyTest, 'took', time.time(
            ) - start, 'seconds.'

            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntrees)
            # FIX! should update this expected classification error
            # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=dataKeyTest)
            print "predict", trial, "end on ", dataKeyTest, 'took', time.time(
            ) - start, 'seconds.'

            print "Trial #", trial, "completed"
示例#43
0
    def test_rf_covtype20x(self):
        importFolderPath = '/home/0xdiag/datasets/standard'

        importFolderResult = h2i.setupImportFolder(None, importFolderPath)
        csvFilenameTrain = 'covtype20x.data'
        key2 = 'covtype20x.data.A.hex'
        parseKeyTrain = h2i.parseImportFolderFile(None, csvFilenameTrain, importFolderPath, key2=key2, timeoutSecs=500)
        print csvFilenameTrain, 'parse time:', parseKeyTrain['response']['time']
        inspect = h2o_cmd.runInspect(key=parseKeyTrain['destination_key'])
        dataKeyTrain = parseKeyTrain['destination_key']
        print "Parse end", dataKeyTrain

        # have to re import since source key is gone
        # we could just copy the key, but sometimes we change the test/train data  to covtype.data
        importFolderResult = h2i.setupImportFolder(None, importFolderPath)
        csvFilenameTest = 'covtype20x.data'
        key2 = 'covtype20x.data.B.hex'
        parseKeyTest = h2i.parseImportFolderFile(None, csvFilenameTest, importFolderPath, key2=key2, timeoutSecs=500)
        print csvFilenameTest, 'parse time:', parseKeyTest['response']['time']
        print "Parse result['destination_key']:", parseKeyTest['destination_key']
        inspect = h2o_cmd.runInspect(key=parseKeyTest['destination_key'])
        dataKeyTest = parseKeyTest['destination_key']
        dataKeyTest2 = 'covtype20x.data.C.hex'

        print "Parse end", dataKeyTest
        
        # make a 3rd key so the predict is uncached too!
        execExpr = dataKeyTest2 + "=" + dataKeyTest
        resultExec = h2o_cmd.runExecOnly(expression=execExpr, timeoutSecs=15)

        # train
        # this does RFView to understand when RF completes, so the time reported for RFView here, should be 
        # considered the "first RFView" times..subsequent have some caching?. 
        # unless the no_confusion_matrix works

        # params is mutable. This is default.
        print "RF with no_confusion_matrix=1, so we can 'time' the RFView separately after job completion?"
        params = {
            'ntree': 6, 
            'parallel': 1, 
            'out_of_bag_error_estimate': 0, 
            'no_confusion_matrix': 1,
            'model_key': 'RF_model'
        }

        colX = h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = params.copy()
        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        timeoutSecs = 30 + kwargs['ntree'] * 60 * (kwargs['parallel'] and 1 or 5)

        start = time.time()
        rfv = h2o_cmd.runRFOnly(parseKey=parseKeyTrain,
            timeoutSecs=timeoutSecs, retryDelaySecs=1, noPoll=True, **kwargs)
        print "rf job dispatch end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'
        ### print "rf response:", h2o.dump_json(rfv)


        start = time.time()
        h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=180, pollTimeoutSecs=500, retryDelaySecs=5)
        print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

        print "\nRFView start after job completion"
        model_key = kwargs['model_key']
        ntree = kwargs['ntree']
        start = time.time()
        h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntree, timeoutSecs)
        print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

        for trial in range(3):
            # scoring
            start = time.time()
            h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, out_of_bag_error_estimate=0, retryDelaySecs=1)
            print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest2)
            print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

            print "Trial #", trial, "completed"
    def test_rf_change_data_key_fvec(self):
        h2o.beta_features = True
        importFolderPath = 'standard'

        csvFilenameTrain = 'covtype.data'
        csvPathname = importFolderPath + "/" + csvFilenameTrain
        parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500)
        inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
        dataKeyTrain = parseResultTrain['destination_key']
        print "Parse end", dataKeyTrain

        # we could train on covtype, and then use covtype20x for test? or vice versa
        # parseResult = parseResult
        # dataKeyTest = dataKeyTrain
        csvFilenameTest = 'covtype20x.data'
        csvPathname = importFolderPath + "/" + csvFilenameTest
        parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500)
        print "Parse result['destination_key']:", parseResultTest['destination_key']
        inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
        dataKeyTest = parseResultTest['destination_key']

        print "Parse end", dataKeyTest

        # train
        # this does RFView to understand when RF completes, so the time reported for RFView here, should be 
        # considered the "first RFView" times..subsequent have some caching?. 
        # unless the no_confusion_matrix works

        # params is mutable. This is default.
        params = {
            'ntrees': 6, 
            'destination_key': 'RF_model'
        }

        colX = h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = params.copy()
        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        timeoutSecs = 30 + kwargs['ntrees'] * 60 

        start = time.time()
        rfv = h2o_cmd.runRF(parseResult=parseResultTrain,
            timeoutSecs=timeoutSecs, retryDelaySecs=1, noPoll=True, **kwargs)
        print "rf job dispatch end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'
        ### print "rf response:", h2o.dump_json(rfv)


        start = time.time()
        h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=180, pollTimeoutSecs=120, retryDelaySecs=5)
        print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

        print "\nRFView start after job completion"
        model_key = kwargs['destination_key']
        ntrees = kwargs['ntrees']
        start = time.time()
        h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntrees, timeoutSecs)
        print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

        for trial in range(3):
            # scoring
            start = time.time()
            rfView = h2o_cmd.runRFView(None, dataKeyTest, 
                model_key, ntrees, timeoutSecs, out_of_bag_error_estimate=1, retryDelaySecs=1)
            print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntrees)
            # FIX! should update this expected classification error
            # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
            print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

            print "Trial #", trial, "completed"
示例#45
0
    def test_rfview_score(self):
        csvPathnameTrain = 'standard/covtype.data'
        print "Train with:", csvPathnameTrain
        parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets',
                                            path=csvPathnameTrain,
                                            schema='put',
                                            hex_key="covtype.hex",
                                            timeoutSecs=15)
        dataKeyTrain = parseResultTrain['destination_key']

        csvPathnameTest = 'standard/covtype.data'
        print "Test with:", csvPathnameTest
        parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathnameTest,
                                           schema='put',
                                           hex_key="covtype.hex",
                                           timeoutSecs=15)
        dataKeyTest = parseResultTest['destination_key']

        for trial in range(5):
            # params is mutable. This is default.
            params = {'ntree': 13, 'out_of_bag_error_estimate': 0}
            colX = h2o_rf.pickRandRfParams(paramDict, params)
            kwargs = params.copy()
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            timeoutSecs = 30 + kwargs['ntree'] * 10
            rfv = h2o_cmd.runRF(parseResult=parseResultTrain,
                                timeoutSecs=timeoutSecs,
                                retryDelaySecs=1,
                                **kwargs)
            ### print "rf response:", h2o.dump_json(rfv)

            model_key = rfv['model_key']
            # pop the stuff from kwargs that were passing as params
            kwargs.pop('model_key', None)

            data_key = rfv['data_key']
            kwargs.pop('data_key', None)

            ntree = rfv['ntree']
            kwargs.pop('ntree', None)
            # scoring
            # RFView.html?
            # dataKeyTest=a5m.hex&
            # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628&
            # response_variable=1&
            # ntree=50&
            # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0&
            # out_of_bag_error_estimate=1&
            rfView = h2o_cmd.runRFView(None,
                                       dataKeyTest,
                                       model_key,
                                       ntree,
                                       timeoutSecs,
                                       retryDelaySecs=1,
                                       **kwargs)
            # new web page for predict? throw it in here for now

            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
            # don't check error if stratified
            if 'sampling_strategy' in kwargs and kwargs[
                    'sampling_strategy'] != 'STRATIFIED_LOCAL':
                check_err = True
            else:
                check_err = False

            if check_err:
                self.assertAlmostEqual(
                    classification_error,
                    0.03,
                    delta=0.5,
                    msg="Classification error %s differs too much" %
                    classification_error)

            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=dataKeyTest)
            elapsed = time.time() - start
            print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

            kwargs['iterative_cm'] = 0
            rfView = h2o_cmd.runRFView(None,
                                       dataKeyTest,
                                       model_key,
                                       ntree,
                                       timeoutSecs,
                                       retryDelaySecs=1,
                                       print_params=True,
                                       **kwargs)
            # FIX! should update this expected classification error
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
            # don't check error if stratified
            if check_err:
                self.assertAlmostEqual(
                    classification_error,
                    0.03,
                    delta=0.5,
                    msg="Classification error %s differs too much" %
                    classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=dataKeyTest)
            elapsed = time.time() - start
            print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

            kwargs['iterative_cm'] = 1
            rfView = h2o_cmd.runRFView(None,
                                       dataKeyTest,
                                       model_key,
                                       ntree,
                                       timeoutSecs,
                                       retryDelaySecs=1,
                                       print_params=True,
                                       **kwargs)
            # FIX! should update this expected classification error
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
            # don't check error if stratified
            if check_err:
                self.assertAlmostEqual(
                    classification_error,
                    0.03,
                    delta=0.5,
                    msg="Classification error %s differs too much" %
                    classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=dataKeyTest)
            elapsed = time.time() - start
            print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

            kwargs['iterative_cm'] = 1
            kwargs['class_weights'] = '1=1,2=2,3=3,4=4,5=5,6=6,7=7'
            rfView = h2o_cmd.runRFView(None,
                                       dataKeyTest,
                                       model_key,
                                       ntree,
                                       timeoutSecs,
                                       retryDelaySecs=1,
                                       print_params=True,
                                       **kwargs)
            # FIX! should update this expected classification error
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
            # don't check error if stratified
            if check_err:
                self.assertAlmostEqual(
                    classification_error,
                    0.03,
                    delta=0.5,
                    msg="Classification error %s differs too much" %
                    classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=dataKeyTest)
            elapsed = time.time() - start
            print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

            print "Trial #", trial, "completed"
示例#46
0
    def test_rf_params_rand2(self):
        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED = 
        random.seed(SEED)
        print "\nUsing random seed:", SEED

        csvPathnameTrain = h2o.find_file('smalldata/covtype/covtype.20k.data')
        print "Train with:", csvPathnameTrain
        parseKeyTrain = h2o_cmd.parseFile(csvPathname=csvPathnameTrain, key2="covtype.20k.hex", timeoutSecs=10)
        dataKeyTrain = parseKeyTrain['destination_key']

        csvPathnameTest = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        print "Test with:", csvPathnameTest
        parseKeyTest = h2o_cmd.parseFile(csvPathname=csvPathnameTrain, key2="covtype.hex", timeoutSecs=10)
        dataKeyTest = parseKeyTest['destination_key']

        for trial in range(5):
            # params is mutable. This is default.
            params = {'ntree': 13, 'parallel': 1, 'out_of_bag_error_estimate': 0}
            colX = h2o_rf.pickRandRfParams(paramDict, params)
            kwargs = params.copy()
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            timeoutSecs = 30 + 15 * (kwargs['parallel'] and 5 or 10)
            rfv = h2o_cmd.runRFOnly(parseKey=parseKeyTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
    
            ### print "rf response:", h2o.dump_json(rfv)

            model_key = rfv['model_key']
            # pop the stuff from kwargs that were passing as params
            kwargs.pop('model_key',None)

            data_key = rfv['data_key']
            kwargs.pop('data_key',None)

            ntree = rfv['ntree']
            kwargs.pop('ntree',None)
            # scoring
            # RFView.html?
            # dataKeyTest=a5m.hex&
            # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628&
            # response_variable=1&
            # ntree=50&
            # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0&
            # out_of_bag_error_estimate=1&
            # no_confusion_matrix=1&
            # clear_confusion_matrix=1
            h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, 
                timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)

            kwargs['no_confusion_matrix'] = 0
            kwargs['clear_confusion_matrix'] = 0
            h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree,
                timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)

            kwargs['no_confusion_matrix'] = 0
            kwargs['clear_confusion_matrix'] = 1
            h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, 
                timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)

            kwargs['no_confusion_matrix'] = 1
            kwargs['clear_confusion_matrix'] = 0
            h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree,
                timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)

            kwargs['no_confusion_matrix'] = 1
            kwargs['clear_confusion_matrix'] = 1
            h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, 
                timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)

            kwargs['no_confusion_matrix'] = 0
            kwargs['clear_confusion_matrix'] = 0
            kwargs['class_weights'] = '1=1,2=2,3=3,4=4,5=5,6=6,7=7'
            h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree,
                timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)

            print "Trial #", trial, "completed"