Example #1
    def test_rf_big1_nopoll(self):
        csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz")
        print "\n" + csvPathname

        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15)
        rfViewInitial = []
        # dispatch multiple jobs back to back
        for jobDispatch in range(1):
            start = time.time()
            kwargs = {}
            # FIX! what model keys do these get?
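            # noPoll=True just dispatches the RF job and returns immediately;
            # completion is checked later with pollWaitJobs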
            rfView = h2o_cmd.runRFOnly(parseKey=parseKey, model_key="RF_model"+str(jobDispatch),\
                timeoutSecs=300, noPoll=True, **kwargs)
            rfViewInitial.append(rfView)
            print "rf job dispatch end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            print "\njobDispatch #", jobDispatch

        h2o_jobs.pollWaitJobs(pattern='RF_model',
                              timeoutSecs=30,
                              pollTimeoutSecs=120,
                              retryDelaySecs=5)

        # we saved the initial responses
        # if we do another poll they should be done now, and it's better to get the result that
        # way rather than from inspect (to match what simpleCheckRFView expects)
        for rfView in rfViewInitial:
            print "Checking completed job, with no polling:", rfView
            a = h2o.nodes[0].poll_url(rfView['response'], noPoll=True)
            h2o_rf.simpleCheckRFView(None, a)
Example #2
        def doBoth():
            h2o.verboseprint("Trial", trial)
            start = time.time()
            # make sure ntrees and max_depth are the same for both
            rfView = h2o_cmd.runRF(parseResult=parseResult, ntrees=ntrees, max_depth=40, response=response,
                timeoutSecs=600, retryDelaySecs=3)
            elapsed1 = time.time() - start
            (totalError1, classErrorPctList1, totalScores1) = h2o_rf.simpleCheckRFView(rfv=rfView)

            # reset the timer so elapsed2 measures only the SpeeDRF run
            start = time.time()
            rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult, ntrees=ntrees, max_depth=40, response=response,
                timeoutSecs=600, retryDelaySecs=3)
            elapsed2 = time.time() - start
            (totalError2, classErrorPctList2, totalScores2) = h2o_rf.simpleCheckRFView(rfv=rfView)

            print "Checking that results are similar (within 20%)"
            print "DRF2 then SpeeDRF"
            print "per-class variance is large..basically we can't check very well for this dataset"
            for i, (j,k) in enumerate(zip(classErrorPctList1, classErrorPctList2)):
                print "classErrorPctList[%s]:i %s %s" % (i, j, k)
                # self.assertAlmostEqual(classErrorPctList1[i], classErrorPctList2[i], 
                #    delta=1 * classErrorPctList2[i], msg="Comparing RF class %s errors for DRF2 and SpeeDRF" % i)

            print "totalError: %s %s" % (totalError1, totalError2)
            self.assertAlmostEqual(totalError1, totalError2, delta=.2 * totalError2, msg="Comparing RF total error for DRF2 and SpeeDRF")
            print "elapsed: %s %s" % (elapsed1, elapsed2)
            self.assertAlmostEqual(elapsed1, elapsed2, delta=.5 * elapsed2, msg="Comparing RF times for DRF2 and SpeeDRF")
Example #3
    def test_rf_covtype_train_oobe_fvec(self):
        h2o.beta_features = True
        print "\nRun test iterations/compare with covtype.data"
        rfv1 = self.rf_covtype_train_oobe('covtype.data', checkExpectedResults=False)
        (ce1, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv1)
        # since we created a binomial output class..look at the error rate for class 1
        ce1pct1 = classErrorPctList[1]

        print "\nRun test iterations/compare with covtype.shuffled.data"
        rfv2 = self.rf_covtype_train_oobe('covtype.shuffled.data', checkExpectedResults=True)
        (ce2, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv2)
        ce2pct1 = classErrorPctList[1]

        print "\nRun test iterations/compare with covtype.sorted.data"
        rfv3 = self.rf_covtype_train_oobe('covtype.sorted.data', checkExpectedResults=False)
        (ce3, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv3)
        ce3pct1 = classErrorPctList[1]

        print "rfv3, from covtype.sorted.data"
        print "\nJsonDiff covtype.data rfv, to covtype.sorted.data rfv"
        df = h2o_util.JsonDiff(rfv1, rfv3, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        self.assertAlmostEqual(ce1, ce2, delta=0.5, msg="classification error %s isn't close to that when shuffled %s" % (ce1, ce2))
        self.assertAlmostEqual(ce1, ce3, delta=0.5, msg="classification error %s isn't close to that when sorted %s" % (ce1, ce3))

        self.assertAlmostEqual(ce1pct1, ce2pct1, delta=0.5, msg="classErrorPctList[1] %s isn't close to that when shuffled %s" % (ce1pct1, ce2pct1))
        self.assertAlmostEqual(ce1pct1, ce3pct1, delta=0.5, msg="classErrorPctList[1] %s isn't close to that when sorted %s" % (ce1pct1, ce3pct1))
Example #4
    def test_RF(self):
        h2o.beta_features = True
        paramsTrainRF = { 
            'seed': '1234567890',
            # if I use 100, and just one tree, I should get same results for sorted/shuffled?
            # i.e. the bagging always sees everything. Means oobe will be messed up
            # so will specify validation = the 10pct holdout data (could reuse the training data?)
            'sample_rate': 1.0,
            'ntrees': 3, 
            'max_depth': 300,
            'nbins': 200,
            'timeoutSecs': 600,
            'response': 'C55',
        }

        paramsScoreRF = {
            'vactual': 'C55',
            'timeoutSecs': 600,
        }

        # 90% data
        trainKey1 = self.loadData(trainDS1)
        scoreKey1 = self.loadData(scoreDS1)
        kwargs   = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, scoreKey1, **kwargs)
        (classification_error1, classErrorPctList1, totalScores1) = h2o_rf.simpleCheckRFView(rfv=trainResult1)
        self.assertEqual(4.29, classification_error1)
        self.assertEqual([4.17, 2.98, 4.09, 14.91, 21.12, 15.38, 5.22], classErrorPctList1)
        self.assertEqual(58101, totalScores1)

        kwargs   = paramsScoreRF.copy()
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)

        # 10% data
        trainKey2 = self.loadData(trainDS2)
        scoreKey2 = self.loadData(scoreDS2)
        kwargs   = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, scoreKey2, **kwargs)
        (classification_error2, classErrorPctList2, totalScores2) = h2o_rf.simpleCheckRFView(rfv=trainResult2)
        self.assertEqual(4.29, classification_error2)
        self.assertEqual([4.17, 2.98, 4.09, 14.91, 21.12, 15.38, 5.22], classErrorPctList2)
        self.assertEqual(58101, totalScores2)

        kwargs   = paramsScoreRF.copy()
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)

      
        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        # should only be two diffs
        if len(df.difference) > 2:
            raise Exception ("Too many diffs in JsonDiff sorted vs non-sorted %s" % len(df.difference))
Example #5
File: h2o_cmd.py  Project: ytham/h2o
def runRFView(node=None, data_key=None, model_key=None, ntree=None, 
    timeoutSecs=15, retryDelaySecs=2, 
    noise=None, noPoll=False, noPrint=False, **kwargs):
    if not node: node = h2o.nodes[0]

    def test(n, tries=None):
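        # polling predicate handed to node.stabilize() below: returns True once the
        # RFView reports status 'done', sanity-checking the progress counts along the way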
        rfView = n.random_forest_view(data_key, model_key, timeoutSecs, noise=noise, **kwargs)
        status = rfView['response']['status']
        numberBuilt = rfView['trees']['number_built']

        if status == 'done': 
            if numberBuilt!=ntree: 
                raise Exception("RFview done but number_built!=ntree: %s %s", 
                    numberBuilt, ntree)
            return True
        if status != 'poll': raise Exception('Unexpected status: ' + status)

        progress = rfView['response']['progress']
        progressTotal = rfView['response']['progress_total']

        # want to double check all this because it's new
        # and we had problems with races/doneness before
        errorInResponse = \
            numberBuilt<0 or ntree<0 or numberBuilt>ntree or \
            progress<0 or progressTotal<0 or progress>progressTotal or \
            ntree!=rfView['ntree']
            ## progressTotal!=ntree or
            # rfView better always agree with what RF ntree was

        if errorInResponse:
            raise Exception("\nBad values in response during RFView polling.\n" + 
                "progress: %s, progressTotal: %s, ntree: %s, numberBuilt: %s, status: %s" % \
                (progress, progressTotal, ntree, numberBuilt, status))

        # don't print the useless first poll.
        # UPDATE: don't look for 'done'; look for 'not poll'. Checking only for 'done' was missing completion.
        if (status=='poll'):
            if numberBuilt==0:
                h2o.verboseprint(".")
            else:
                h2o.verboseprint("\nRFView polling #", tries,
                    "Status: %s. %s trees done of %s desired" % (status, numberBuilt, ntree))

        return (status!='poll')

    if noPoll:
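        # caller only wanted the request dispatched; skip stabilize() polling and the extra view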
        return None

    node.stabilize(
            test,
            'random forest reporting %d trees' % ntree,
            timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs)

    # kind of wasteful re-read, but maybe good for testing
    rfView = node.random_forest_view(data_key, model_key, timeoutSecs, noise=noise, **kwargs)
    if not kwargs.setdefault('no_confusion_matrix', False):
        h2f.simpleCheckRFView(node, rfView, noPrint=noPrint)
    return rfView
Example #6
File: h2o_cmd.py  Project: zhuyuecai/h2o
def runRFScore(node=None, data_key=None, model_key=None, ntree=None, 
    timeoutSecs=15, retryDelaySecs=2, doSimpleCheck=True, noPrint=False, **kwargs):
    if not node: node = h2o.nodes[0]

    # kind of wasteful re-read, but maybe good for testing
    rfView = node.random_forest_score(data_key, model_key, timeoutSecs, **kwargs)
    if doSimpleCheck:
        h2f.simpleCheckRFView(node, rfView, noPrint=noPrint)
    return rfView
Example #7
File: h2o_cmd.py  Project: samujjwal/h2o
def runRFScore(node=None, data_key=None, model_key=None, ntree=None, 
    timeoutSecs=15, retryDelaySecs=2, doSimpleCheck=True, noPrint=False, **kwargs):
    if not node: node = h2o.nodes[0]

    # kind of wasteful re-read, but maybe good for testing
    rfView = node.random_forest_score(data_key, model_key, timeoutSecs, **kwargs)
    if doSimpleCheck:
        h2f.simpleCheckRFView(node, rfView, noPrint=noPrint)
    return rfView
Example #8
    def test_rf_covtype_train_oobe_fvec(self):
        h2o.beta_features = True
        print "\nRun test iterations/compare with covtype.data"
        rfv1 = self.rf_covtype_train_oobe('covtype.data',
                                          checkExpectedResults=False)
        (ce1, classErrorPctList,
         totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv1)
        # since we created a binomial output class..look at the error rate for class 1
        ce1pct1 = classErrorPctList[1]

        print "\nRun test iterations/compare with covtype.shuffled.data"
        rfv2 = self.rf_covtype_train_oobe('covtype.shuffled.data',
                                          checkExpectedResults=True)
        (ce2, classErrorPctList,
         totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv2)
        ce2pct1 = classErrorPctList[1]

        print "\nRun test iterations/compare with covtype.sorted.data"
        rfv3 = self.rf_covtype_train_oobe('covtype.sorted.data',
                                          checkExpectedResults=False)
        (ce3, classErrorPctList,
         totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv3)
        ce3pct1 = classErrorPctList[1]

        print "rfv3, from covtype.sorted.data"
        print "\nJsonDiff covtype.data rfv, to covtype.sorted.data rfv"
        print "rfv1:", h2o.dump_json(rfv1)
        print "rfv3:", h2o.dump_json(rfv3)
        # df = h2o_util.JsonDiff(rfv1, rfv3, with_values=True)
        df = h2o_util.JsonDiff(rfv1, rfv3)
        print "df.difference:", h2o.dump_json(df.difference)

        self.assertAlmostEqual(
            ce1,
            ce2,
            delta=0.5,
            msg="classification error %s isn't close to that when sorted %s" %
            (ce1, ce2))
        self.assertAlmostEqual(
            ce1,
            ce3,
            delta=0.5,
            msg="classification error %s isn't close to that when sorted %s" %
            (ce1, ce3))

        self.assertAlmostEqual(
            ce1pct1,
            ce2pct1,
            delta=1.0,
            msg="classErrorPctList[1] %s isn't close to that when sorted %s" %
            (ce1pct1, ce2pct1))
        self.assertAlmostEqual(
            ce1pct1,
            ce3pct1,
            delta=1.0,
            msg="classErrorPctList[1] %s isn't close to that when sorted %s" %
            (ce1pct1, ce3pct1))
Example #9
    def test_rf_covtype_train_full_fvec(self):
        h2o.beta_features = True
        csvFilename = 'covtype.data'
        csvPathname = 'standard/' + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=csvFilename + ".hex", 
            timeoutSecs=180)

        for trial in range(1):
            # params is mutable. This is default.
            kwargs = paramDict
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            timeoutSecs = kwargs['ntrees'] * 60
            start = time.time()
            print "Note train.csv is used for both train and validation"
            rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs)
            h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, retryDelaySecs=5)
            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            job_key = rfv['job_key']
            model_key = rfv['destination_key']
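            # the job has finished (pollStatsWhileBusy returned), so fetch the completed model view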
            rfv = h2o_cmd.runRFView(data_key=parseResult['destination_key'], 
                model_key=model_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=1, print_params=True)

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv)
            self.assertLess(classification_error, 3, "train.csv should have full classification error: %s < 3" % classification_error)

            print "Trial #", trial, "completed"
Example #10
    def test_rf_multinomial_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_multinomial.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        totalRows = 400
        colCount = 7

        for trial in range (5):
            write_syn_dataset(csvPathname, totalRows, colCount, headerData)
            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hexKey = csvFilename + "_" + str(trial) + ".hex"
            ntree = 2
            kwargs = {
                'ntrees': ntree,
                'mtries': None,
                'max_depth': 20,
                'sample_rate': 0.67,
                'destination_key': None,
                'nbins': 1024,
                'seed': 784834182943470027,
            }
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hexKey, doSummary=True)

            start = time.time()
            rfView = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=15, pollTimeoutSecs=5, **kwargs)
            print "trial #", trial, 'took', time.time() - start, 'seconds'
            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)

            modelKey = rfView['drf_model']['_key']
            h2o_cmd.runScore(dataKey=parseResult['destination_key'], modelKey=modelKey, 
                vactual=colCount+1, vpredict=1, expectedAuc=0.5, doAUC=False)

            h2b.browseJsonHistoryAsUrlLastMatch("RF")
Example #11
 def test_rf_params_rand2(self):
     csvPathname = 'space_shuttle_damage.csv'
     for trial in range(10):
         # params is mutable. This is default.
         params = {
             'sample': 80,
             'stat_type': 'ENTROPY',
             'class_weights': 'yes=1000',
             'response_variable': 'damage', 
             'ignore': 'flight',
             'ntree': 25,
             'out_of_bag_error_estimate': 1,
         }
         print "params:", params 
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         print "params:", params 
         kwargs = params.copy()
         timeoutSecs = 180
         start = time.time()
         parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
         rfView = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
         elapsed = time.time()-start
         # just to get the list of per class errors
         (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, noPrint=True)
         print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs), "\n"
         # why does this vary between 22 and 23
         self.assertAlmostEqual(totalScores,23,delta=1) # class 1 is 'yes'
         self.assertLess(classErrorPctList[0],95) # class 0 is 'no'
         self.assertLess(classErrorPctList[1],29) # class 1 is 'yes'
         self.assertLess(classification_error,61)
Example #12
    def test_rf_covtype_train_full_fvec(self):
        h2o.beta_features = True
        csvFilename = 'covtype.data'
        csvPathname = 'standard/' + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=csvFilename + ".hex", 
            timeoutSecs=180)

        for trial in range(1):
            # params is mutable. This is default.
            kwargs = paramDict
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            timeoutSecs = kwargs['ntrees'] * 60
            start = time.time()
            print "Note train.csv is used for both train and validation"
            rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs)
            h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, retryDelaySecs=5)
            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            job_key = rfv['job_key']
            model_key = rfv['destination_key']
            rfv = h2o_cmd.runRFView(data_key=parseResult['destination_key'], 
                model_key=model_key, timeoutSecs=timeoutSecs, retryDelaySecs=1)

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv)
            # hmm..just using defaults above in RF?
            self.assertLess(classification_error, 4.8, "train.csv should have full classification error: %s < 4.8" % classification_error)

            print "Trial #", trial, "completed"
Example #13
    def test_from_import_fvec(self):
        csvFilenameAll = [
            ("covtype.data", 500),
            # ("covtype20x.data", 1000),
            ]

        for (csvFilename, timeoutSecs) in csvFilenameAll:
            # creates csvFilename.hex from file in importFolder dir 
            hex_key = csvFilename + '.hex'
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="standard/" + csvFilename, schema='local',
                hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], verbose=True)
            h2o_cmd.infoFromInspect(inspect, parseResult['destination_key'])

            summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'])
            # h2o_cmd.infoFromSummary(summaryResult)

            trees = 2
            start = time.time()
            rfView = h2o_cmd.runRF(trees=trees, max_depth=20, balance_classes=0, importance=1, parseResult=parseResult, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trees)

            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:.2f} secs. \
                trees: {:} classification_error: {:} classErrorPct: {:} totalScores: {:}' .format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'DRF2', csvFilename, elapsed, 
                    trees, classification_error, classErrorPctList, totalScores)
            print "\n"+l
            h2o.cloudPerfH2O.message(l)

            # just to make sure we test this
            h2i.delete_keys_at_all_nodes(pattern=hex_key)
Example #14
    def test_1ktrees_job_cancel_many_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"  
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        hex_key = csvFilename + ".hex"
        parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)

        print "kick off jobs, then cancel them"
        for trial in range (1,5):
            # random 0 or 1 delay
            delay = random.uniform(0,1)
            time.sleep(delay)

            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, rfView=False, noPoll=True, timeoutSecs=30, retryDelaySecs=0.25)
            print "RF #", trial,  "started on ", csvFilename, 'took', time.time() - start, 'seconds'
            ### h2o_jobs.cancelAllJobs(timeoutSecs=10)
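            # with cancelAllJobs commented out, this loop only dispatches jobs and checks the sandbox for errors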
            h2o.check_sandbox_for_errors()

        # do one last good one
        rfView = h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, timeoutSecs=600, retryDelaySecs=3)
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trial)
Example #15
 def test_rf_params_rand2(self):
     csvPathname = h2o.find_file('smalldata/space_shuttle_damage.csv')
     for trial in range(10):
         # params is mutable. This is default.
         params = {
             'sample': 80,
             'stat_type': 'ENTROPY',
             'class_weights': 'yes=1000',
             'parallel': 1, 
             'response_variable': 'damage', 
             'ignore': 'flight',
             'ntree': 25,
             'out_of_bag_error_estimate': 1,
         }
         print "params:", params 
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         print "params:", params 
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         # seems ec2 can be really slow
         timeoutSecs = 30 + 15 * (kwargs['parallel'] and 6 or 10)
         start = time.time()
         rfView = h2o_cmd.runRF(timeoutSecs=timeoutSecs, retryDelaySecs=1, csvPathname=csvPathname, **kwargs)
         elapsed = time.time()-start
         # just to get the list of per class errors
         (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, noPrint=True)
         print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs), "\n"
         # why does this vary between 22 and 23
         self.assertAlmostEqual(totalScores,23,delta=1) # class 1 is 'yes'
         self.assertLess(classErrorPctList[0],95) # class 0 is 'no'
         self.assertLess(classErrorPctList[1],29) # class 1 is 'yes'
         self.assertLess(classification_error,61)
Example #16
    def test_rf_float_bigexp_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_prostate.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        totalRows = 1000
        colCount = 7
        write_syn_dataset(csvPathname, totalRows, colCount, headerData)

        for trial in range(5):
            # grow the data set
            rowData = rand_rowData(colCount)
            num = random.randint(4096, 10096)
            append_syn_dataset(csvPathname, colCount, num)
            totalRows += num

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            ntree = 2
            kwargs = {
                'ntrees': ntree,
                'mtries': None,
                'max_depth': 20,
                'sample_rate': 0.67,
                'destination_key': None,
                'nbins': 1024,
                'seed': 784834182943470027,
            }
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           doSummary=True)

            start = time.time()
            rfView = h2o_cmd.runRF(parseResult=parseResult,
                                   timeoutSecs=15,
                                   pollTimeoutSecs=5,
                                   **kwargs)
            print "trial #", trial, "totalRows:", totalRows, "num:", num, "RF end on ", csvFilename, \
                'took', time.time() - start, 'seconds'
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)

            inspect = h2o_cmd.runInspect(key=hex_key)
            cols = inspect['cols']
            numCols = inspect['numCols']
            for i, c in enumerate(cols):
                # everything except the last col (output) should be 8 byte float
                if i < (numCols - 1):
                    colType = c['type']
                    self.assertEqual(colType, 'Real',
                                     msg="col %d should be type Real: %s" % (i, colType))

            h2o.check_sandbox_for_errors()
Example #17
        def doBoth():
            h2o.verboseprint("Trial", trial)
            start = time.time()
            # make sure ntrees and max_depth are the same for both
            rfView = h2o_cmd.runRF(parseResult=parseResult,
                                   ntrees=ntrees,
                                   max_depth=40,
                                   response=response,
                                   timeoutSecs=600,
                                   retryDelaySecs=3)
            elapsed1 = time.time() - start
            (totalError1, classErrorPctList1,
             totalScores1) = h2o_rf.simpleCheckRFView(rfv=rfView)

            # reset the timer so elapsed2 measures only the SpeeDRF run
            start = time.time()
            rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                                        ntrees=ntrees,
                                        max_depth=40,
                                        response=response,
                                        timeoutSecs=600,
                                        retryDelaySecs=3)
            elapsed2 = time.time() - start
            (totalError2, classErrorPctList2,
             totalScores2) = h2o_rf.simpleCheckRFView(rfv=rfView)

            print "Checking that results are similar (within 20%)"
            print "DRF2 then SpeeDRF"
            print "per-class variance is large..basically we can't check very well for this dataset"
            for i, (j, k) in enumerate(zip(classErrorPctList1, classErrorPctList2)):
                print "classErrorPctList[%s]: %s %s" % (i, j, k)
                # self.assertAlmostEqual(classErrorPctList1[i], classErrorPctList2[i],
                #    delta=1 * classErrorPctList2[i], msg="Comparing RF class %s errors for DRF2 and SpeeDRF" % i)

            print "totalError: %s %s" % (totalError1, totalError2)
            self.assertAlmostEqual(
                totalError1,
                totalError2,
                delta=.2 * totalError2,
                msg="Comparing RF total error for DRF2 and SpeeDRF")
            print "elapsed: %s %s" % (elapsed1, elapsed2)
            self.assertAlmostEqual(
                elapsed1,
                elapsed2,
                delta=.5 * elapsed2,
                msg="Comparing RF times for DRF2 and SpeeDRF")
Example #18
    def test_rf_float_bigexp_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_prostate.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        totalRows = 1000
        colCount = 7
        write_syn_dataset(csvPathname, totalRows, colCount, headerData)

        for trial in range (5):
            # grow the data set
            rowData = rand_rowData(colCount)
            num = random.randint(4096, 10096)
            append_syn_dataset(csvPathname, colCount, num)
            totalRows += num

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            # On EC2 once we get to 30 trials or so, do we see polling hang? GC or spill of heap or ??
            ntree = 2
            kwargs = {
                'ntrees': ntree,
                'mtries': None,
                'max_depth': 20,
                'sample_rate': 0.67,
                'destination_key': None,
                'nbins': 1024,
                'seed': 784834182943470027,
            }
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, doSummary=True)
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            numCols = inspect['numCols']

            start = time.time()
            rfView = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=15, pollTimeoutSecs=5, **kwargs)
            print "trial #", trial, "totalRows:", totalRows, "num:", num, "RF end on ", csvFilename, \
                'took', time.time() - start, 'seconds'
            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)

            # cm0 = rfView['drf_model']['cms'][0]['_arr']
            # print cm0
            # self.assertEqual(len(cm0), numCols,
            #     msg="%s cols in cm, means rf must have ignored some cols. I created data with %s cols" % (len(cm0), numCols-1))

            inspect = h2o_cmd.runInspect(key=hex_key)
            cols = inspect['cols']
            numCols = inspect['numCols']
            for i,c in enumerate(cols):
                if i < (numCols-1): # everything except the last col (output) should be 8 byte float
                    colType = c['type']
                    self.assertEqual(colType, 'Real', msg="col %d should be type Real: %s" % (i, colType))

            ### h2o_cmd.runInspect(key=hex_key)
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()
Example #19
    def test_rf_hhp_2a_fvec(self):
        h2o.beta_features = True
        csvFilenameList = {
            'hhp.cut3.214.data.gz',
            }

        for csvFilename in csvFilenameList:
            csvPathname = csvFilename
            print "RF start on ", csvPathname
            dataKeyTrain = 'rTrain.hex'
            start = time.time()
            parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=dataKeyTrain, schema='put',
                timeoutSecs=120)            
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numCols = inspect['numCols']

            # we want the last col. Should be values 0 to 14. 14 most rare

            # from the cut3 set
            #   84777 0
            #   13392 1
            #    6546 2
            #    5716 3
            #    4210 4
            #    3168 5
            #    2009 6
            #    1744 7
            #    1287 8
            #    1150 9
            #    1133 10
            #     780 11
            #     806 12
            #     700 13
            #     345 14
            #    3488 15

            execExpr = "%s[,%s] = %s[,%s]==14" % (dataKeyTrain, numCols, dataKeyTrain, numCols)
            h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTrain, timeoutSecs=10)
            inspect = h2o_cmd.runInspect(key=dataKeyTrain)
            h2o_cmd.infoFromInspect(inspect, "going into RF")
            execResult = {'destination_key': dataKeyTrain}


            kwargs = {
                'ntrees': 20,
                'max_depth': 20,
                'nbins': 50,
            }
            rfView = h2o_cmd.runRF(parseResult=execResult, timeoutSecs=900, retryDelaySecs=10, **kwargs)
            print "RF end on ", csvPathname, 'took', time.time() - start, 'seconds'
            (error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)
Example #20
    def test_rf_big1_nopoll(self):
        csvFilename = 'hhp_107_01.data.gz'
        csvPathname = h2o.find_file("smalldata/" + csvFilename)
        key2 = csvFilename + ".hex"
        
        print "\n" + csvPathname

        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=15)
        rfViewInitial = []
        rfView = {}
        # dispatch multiple jobs back to back
        for jobDispatch in range(25):
            start = time.time()
            kwargs = {}
            model_key = 'RF_model' + str(jobDispatch)
            # FIX! what model keys do these get?
            randomNode = h2o.nodes[random.randint(0,len(h2o.nodes)-1)]
            h2o_cmd.runRFOnly(node=randomNode, parseKey=parseKey, model_key=model_key, timeoutSecs=300, noPoll=True, **kwargs)

            print "rfView:", h2o.dump_json(rfView)
            # FIX! are these already in there?
            rfView['data_key'] = key2
            rfView['model_key'] = model_key
            rfViewInitial.append(rfView)
            print "rf job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "\njobDispatch #", jobDispatch

        h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=180, pollTimeoutSecs=120, retryDelaySecs=5)

        # we saved the initial responses
        # if we do another poll they should be done now, and it's better to get the result that
        # way rather than from inspect (to match what simpleCheckRFView expects)
        for rfView in rfViewInitial:
            print "Checking completed job, with no polling:", rfView
            print "rfView", h2o.dump_json(rfView)
            data_key = rfView['data_key']
            model_key = rfView['model_key']
            a = h2o.nodes[0].random_forest_view(data_key, model_key, noPoll=True)
            h2o_rf.simpleCheckRFView(None, a)
Example #21
    def test_rf_airlines_2013_fvec(self):
        h2o.beta_features = True
        h2b.browseTheCloud()

        csvFilename = 'year2013.csv'
        hex_key = 'year2013.hex'
        importFolderPath = 'airlines'
        csvPathname = importFolderPath + "/" + csvFilename
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       schema='local',
                                       hex_key=hex_key,
                                       timeoutSecs=900,
                                       doSummary=False)
        parse_time = time.time() - start
        print "parse took {0} sec".format(parse_time)
        start = time.time()

        start = time.time()
        # noise=['JStack','cpu','disk'])
        h2o_cmd.runSummary(key=hex_key, timeoutSecs=200)
        elapsed = time.time() - start
        print "summary took {0} sec".format(elapsed)

        trees = 10
        paramsTrainRF = {
            'ntrees': trees,
            'max_depth': 20,
            'nbins': 200,
            'ignored_cols_by_name':
            'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed',
            'timeoutSecs': 14800,
        }
        kwargs = paramsTrainRF.copy()
        start = time.time()
        rfView = h2o_cmd.runRF(parseResult=parseResult, **kwargs)
        elapsed = time.time() - start
        (classification_error, classErrorPctList,
         totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)

        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:.2f} secs. \
            trees: {:} classification_error: {:} classErrorPct: {:} totalScores: {:}'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'DRF2', csvFilename,
            elapsed, trees, classification_error, classErrorPctList,
            totalScores)
        print "\n" + l
        h2o.cloudPerfH2O.message(l)

        # just to make sure we test this
        h2i.delete_keys_at_all_nodes(pattern=hex_key)
Example #22
    def test_1ktrees_job_cancel_many_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file(
                "syn_scripts/parity.pl") + " 128 4 " + str(
                    x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        hex_key = csvFilename + ".hex"
        parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname,
                                                             schema='put',
                                                             hex_key=hex_key,
                                                             timeoutSecs=30)

        print "kick off jobs, then cancel them"
        for trial in range(1, 50):
            # random 0 or 1 delay
            delay = random.uniform(0, 1)
            time.sleep(delay)

            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult,
                          trees=trial,
                          max_depth=50,
                          rfView=False,
                          noPoll=True,
                          timeoutSecs=30,
                          retryDelaySecs=0.25)
            print "RF #", trial, "started on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'
            ### h2o_jobs.cancelAllJobs(timeoutSecs=10)
            h2o.check_sandbox_for_errors()

        # do one last good one
        rfView = h2o_cmd.runRF(parseResult=parseResult,
                               trees=trial,
                               max_depth=50,
                               rfView=False,
                               noPoll=False,
                               timeoutSecs=600,
                               retryDelaySecs=3)
        (classification_error, classErrorPctList,
         totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trial)
Example #23
    def test_rf_float_bigexp_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_prostate.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        totalRows = 1000
        colCount = 7
        write_syn_dataset(csvPathname, totalRows, colCount, headerData)

        for trial in range (5):
            # grow the data set
            rowData = rand_rowData(colCount)
            num = random.randint(4096, 10096)
            append_syn_dataset(csvPathname, colCount, num)
            totalRows += num

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            ntree = 2
            kwargs = {
                'response': 'AGE',
                'ntrees': ntree,
                'mtries': None,
                'max_depth': 20,
                'sample_rate': 0.67,
                'destination_key': None,
                'nbins': 1024,
                'seed': 784834182943470027,
            }
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, doSummary=True)

            start = time.time()
            rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=15, pollTimeoutSecs=15, **kwargs)
            print "trial #", trial, "totalRows:", totalRows, "num:", num, "RF end on ", csvFilename, \
                'took', time.time() - start, 'seconds'
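            # simpleCheckRFView expects a 'drf_model' entry, so rename the SpeeDRF model key first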
            rfView["drf_model"] = rfView.pop("speedrf_model")
            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, ntree=ntree)

            inspect = h2o_cmd.runInspect(key=hex_key)
            cols = inspect['cols']
            #num_cols = inspect['num_cols']
            #for i,c in enumerate(cols):
            #    if i < (num_cols-1): # everything except the last col (output) should be 8 byte float
            #        colType = c['type']
            #        self.assertEqual(colType, 'float', msg="col %d should be type Real: %s" % (i, colType))
        
            h2o.check_sandbox_for_errors()
Example #24
    def test_rf_multinomial_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_multinomial.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        totalRows = 400
        colCount = 7

        for trial in range(5):
            write_syn_dataset(csvPathname, totalRows, colCount, headerData)
            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hexKey = csvFilename + "_" + str(trial) + ".hex"
            ntree = 2
            kwargs = {
                'ntrees': ntree,
                'mtries': None,
                'max_depth': 20,
                'sample_rate': 0.67,
                'destination_key': None,
                'nbins': 1024,
                'seed': 784834182943470027,
            }
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hexKey,
                                           doSummary=True)

            start = time.time()
            rfView = h2o_cmd.runRF(parseResult=parseResult,
                                   timeoutSecs=15,
                                   pollTimeoutSecs=5,
                                   **kwargs)
            print "trial #", trial, 'took', time.time() - start, 'seconds'
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)

            modelKey = rfView['drf_model']['_key']
            h2o_cmd.runScore(dataKey=parseResult['destination_key'],
                             modelKey=modelKey,
                             vactual=colCount + 1,
                             vpredict=1,
                             expectedAuc=0.5,
                             doAUC=False)

            h2b.browseJsonHistoryAsUrlLastMatch("RF")
Example #25
    def test_rf_airlines_2013_fvec(self):
        h2o.beta_features = True
        h2b.browseTheCloud()


        csvFilename = 'year2013.csv'
        hex_key = 'year2013.hex'
        importFolderPath = 'airlines'
        csvPathname = importFolderPath + "/" + csvFilename
        start      = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', 
            path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=900, doSummary=False)
        parse_time = time.time() - start 
        print "parse took {0} sec".format(parse_time)
        start      = time.time()
        
        start = time.time()
        # noise=['JStack','cpu','disk'])
        h2o_cmd.runSummary(key=hex_key, timeoutSecs=200)
        elapsed = time.time() - start 
        print "summary took {0} sec".format(elapsed)

        trees = 10
        paramsTrainRF = { 
            'ntrees': trees, 
            'max_depth': 20,
            'nbins': 200,
            'ignored_cols_by_name': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed',
            'timeoutSecs': 14800,
            }
        kwargs   = paramsTrainRF.copy()
        start      = time.time()
        rfView = h2o_cmd.runRF(parseResult=parseResult, **kwargs)
        elapsed = time.time() - start
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)

        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:.2f} secs. \
            trees: {:} classification_error: {:} classErrorPct: {:} totalScores: {:}' .format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'DRF2', csvFilename, elapsed,
                trees, classification_error, classErrorPctList, totalScores)
        print "\n"+l
        h2o.cloudPerfH2O.message(l)

        # just to make sure we test this
        h2i.delete_keys_at_all_nodes(pattern=hex_key)
Example #26
 def test_rf_params_rand2(self):
     csvPathname = 'space_shuttle_damage.csv'
     for trial in range(10):
         # params is mutable. This is default.
         params = {
             'sample': 80,
             'stat_type': 'ENTROPY',
             'class_weights': 'yes=1000',
             'response_variable': 'damage',
             'ignore': 'flight',
             'ntree': 25,
             'out_of_bag_error_estimate': 1,
         }
         print "params:", params
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         print "params:", params
         kwargs = params.copy()
         timeoutSecs = 180
         start = time.time()
         parseResult = h2i.import_parse(bucket='smalldata',
                                        path=csvPathname,
                                        schema='put')
         rfView = h2o_cmd.runRF(parseResult=parseResult,
                                timeoutSecs=timeoutSecs,
                                retryDelaySecs=1,
                                **kwargs)
         elapsed = time.time() - start
         # just to get the list of per class errors
         (classification_error, classErrorPctList,
          totalScores) = h2o_rf.simpleCheckRFView(None,
                                                  rfView,
                                                  noPrint=True)
         print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
             (elapsed * 100) / timeoutSecs), "\n"
         # why does this vary between 22 and 23
         self.assertAlmostEqual(totalScores, 23,
                                delta=1)  # class 1 is 'yes'
         self.assertLess(classErrorPctList[0], 95)  # class 0 is 'no'
         self.assertLess(classErrorPctList[1], 29)  # class 1 is 'yes'
         self.assertLess(classification_error, 61)
Example #27
 def test_rf_params_rand2(self):
     # for determinism, I guess we should spit out the seed?
     # random.seed(SEED)
     SEED = random.randint(0, sys.maxint)
     # if you have to force to redo a test
     # SEED = 
     random.seed(SEED)
     print "\nUsing random seed:", SEED
     csvPathname = h2o.find_file('smalldata/space_shuttle_damage.csv')
     for trial in range(10):
         # params is mutable. This is default.
         params = {
             'sample': 80,
             'gini': 0,
             'class_weights': 'yes=1000',
             'parallel': 1, 
             'response_variable': 'damage', 
             'ignore': 'flight',
             'ntree': 25,
             'out_of_bag_error_estimate': 1,
         }
         print "params:", params 
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         print "params:", params 
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         # seems ec2 can be really slow
         timeoutSecs = 30 + 15 * (kwargs['parallel'] and 6 or 10)
         start = time.time()
         rfView = h2o_cmd.runRF(timeoutSecs=timeoutSecs, retryDelaySecs=1, csvPathname=csvPathname, **kwargs)
         elapsed = time.time()-start
         # just to get the list of per class errors
         (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, noPrint=True)
         print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs), "\n"
         self.assertEqual(totalScores,23) # class 1 is 'yes'
         self.assertLess(classErrorPctList[0],82) # class 0 is 'no'
         self.assertLess(classErrorPctList[1],29) # class 1 is 'yes'
         self.assertLess(classification_error,61)
Example #28
    def test_rf_covtype_train_full(self):
        h2o.beta_features = True
        csvFilename = 'train.csv'
        csvPathname = 'bench/covtype/h2o/' + csvFilename
        parseResult = h2i.import_parse(bucket='datasets', path=csvPathname, schema='put', hex_key=csvFilename + ".hex", 
            header=1, timeoutSecs=180)

        for trial in range(1):
            # params is mutable. This is default.
            kwargs = paramDict
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            timeoutSecs = 30 + kwargs['ntrees'] * 20
            start = time.time()
            rfView = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)
            self.assertLess(classification_error, 0.02, "train.csv should have full classification error <0.02")

            print "Trial #", trial, "completed"
Example #29
    def test_rf_covtype_train_full_fvec(self):
        h2o.beta_features = True
        csvFilename = "covtype.data"
        csvPathname = "standard/" + csvFilename
        parseResult = h2i.import_parse(
            bucket="home-0xdiag-datasets", path=csvPathname, schema="put", hex_key=csvFilename + ".hex", timeoutSecs=180
        )

        for trial in range(1):
            # params is mutable. This is default.
            kwargs = paramDict
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            timeoutSecs = kwargs["ntrees"] * 60
            start = time.time()
            print "Note train.csv is used for both train and validation"
            rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs)
            h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, retryDelaySecs=5)
            elapsed = time.time() - start
            print "RF end on ", csvPathname, "took", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100
            )

            job_key = rfv["job_key"]
            model_key = rfv["destination_key"]
            rfv = h2o_cmd.runRFView(
                data_key=parseResult["destination_key"], model_key=model_key, timeoutSecs=timeoutSecs, retryDelaySecs=1
            )

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv)
            # hmm..just using defaults above in RF?
            self.assertLess(
                classification_error,
                4.8,
                "train.csv should have full classification error: %s < 4.8" % classification_error,
            )

            print "Trial #", trial, "completed"
Example #30
    def test_rf_covtype20x_fvec(self):
        h2o.beta_features = True
        importFolderPath = 'standard'

        if DO_SMALL:
            csvFilenameTrain = 'covtype.data'
            hex_key = 'covtype1x.data.A.hex'
        else:
            csvFilenameTrain = 'covtype20x.data'
            hex_key = 'covtype20x.data.A.hex'

        csvPathname = importFolderPath + "/" + csvFilenameTrain
        parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets',
                                            path=csvPathname,
                                            hex_key=hex_key,
                                            timeoutSecs=500)
        inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
        dataKeyTrain = parseResultTrain['destination_key']
        print "Parse end", dataKeyTrain

        # have to re import since source key is gone
        # we could just copy the key, but sometimes we change the test/train data  to covtype.data
        if DO_SMALL:
            csvFilenameTest = 'covtype.data'
            hex_key = 'covtype1x.data.B.hex'
            dataKeyTest2 = 'covtype1x.data.C.hex'
        else:
            csvFilenameTest = 'covtype20x.data'
            hex_key = 'covtype20x.data.B.hex'
            dataKeyTest2 = 'covtype20x.data.C.hex'

        csvPathname = importFolderPath + "/" + csvFilenameTest
        parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           hex_key=hex_key,
                                           timeoutSecs=500)
        print "Parse result['destination_key']:", parseResultTest[
            'destination_key']
        inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
        dataKeyTest = parseResultTest['destination_key']
        print "Parse end", dataKeyTest

        # make a 3rd key so the predict is uncached too!
        execExpr = dataKeyTest2 + "=" + dataKeyTest
        kwargs = {'str': execExpr, 'timeoutSecs': 15}
        resultExec = h2o_cmd.runExec(**kwargs)

        # train
        # this does RFView to understand when RF completes, so the time reported for RFView here, should be
        # considered the "first RFView" times..subsequent have some caching?.
        # unless the no_confusion_matrix works

        # params is mutable. This is default.
        paramDict = drf2ParamDict
        params = {'ntrees': 20, 'destination_key': 'RF_model'}

        colX = h2o_rf.pickRandRfParams(paramDict, params)
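        # params now carries the randomly chosen values layered over the defaults above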

        kwargs = params.copy()
        timeoutSecs = 30 + kwargs['ntrees'] * 60

        start = time.time()
        rf = h2o_cmd.runRF(parseResult=parseResultTrain,
                           timeoutSecs=timeoutSecs,
                           retryDelaySecs=1,
                           **kwargs)
        print "rf job end on ", dataKeyTrain, 'took', time.time(
        ) - start, 'seconds'

        print "\nRFView start after job completion"
        model_key = kwargs['destination_key']
        ntree = kwargs['ntrees']

        start = time.time()
        # this does the RFModel view for v2. but only model_key is used. Data doesn't matter? (nor ntree)
        h2o_cmd.runRFView(None,
                          dataKeyTrain,
                          model_key,
                          ntree=ntree,
                          timeoutSecs=timeoutSecs)
        print "First rfview end on ", dataKeyTrain, 'took', time.time(
        ) - start, 'seconds'

        for trial in range(1):
            # scoring
            start = time.time()
            rfView = h2o_cmd.runRFView(None,
                                       dataKeyTest,
                                       model_key,
                                       ntree=ntree,
                                       timeoutSecs=timeoutSecs,
                                       out_of_bag_error_estimate=0,
                                       retryDelaySecs=1)
            print "rfview", trial, "end on ", dataKeyTest, 'took', time.time(
            ) - start, 'seconds.'

            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
            self.assertAlmostEqual(
                classification_error,
                50,
                delta=50,
                msg="Classification error %s differs too much" %
                classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=dataKeyTest2)
            print "predict", trial, "end on ", dataKeyTest, 'took', time.time(
            ) - start, 'seconds.'

            parseKey = parseResultTrain['destination_key']
            rfModelKey = rfView['drf_model']['_key']
            predictKey = 'Predict.hex'
            start = time.time()

            predictResult = h2o_cmd.runPredict(data_key=parseKey,
                                               model_key=rfModelKey,
                                               destination_key=predictKey,
                                               timeoutSecs=timeoutSecs)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=parseKey,
                vactual='C55',
                predict=predictKey,
                vpredict='predict',
            )

            cm = predictCMResult['cm']

            # These helpers will move into h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm)
            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)

            print "Trial #", trial, "completed"
Example #31
    def test_RF(self):
        h2o.beta_features = True
        paramsTrainRF = { 
            'seed': '1234567890',
            # if I use 100, and just one tree, I should get same results for sorted/shuffled?
            # i.e. the bagging always sees everything. Means oobe will be messed up
            # so will specify validation = the 10pct holdout data (could reuse the training data?)
            'sample_rate': 1.0,
            'ntrees': 3, 
            'max_depth': 300,
            'nbins': 200,
            'timeoutSecs': 600,
            'response': 'C55',
        }

        paramsScoreRF = {
            'vactual': 'C55',
            'timeoutSecs': 600,
        }

        # 90% data
        trainKey1 = self.loadData(trainDS1)
        scoreKey1 = self.loadData(scoreDS1)
        kwargs   = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, scoreKey1, **kwargs)
        (classification_error1, classErrorPctList1, totalScores1) = h2o_rf.simpleCheckRFView(rfv=trainResult1)
        # self.assertEqual(4.29, classification_error1)
        # self.assertEqual([4.17, 2.98, 4.09, 14.91, 21.12, 15.38, 5.22], classErrorPctList1)
        # with new RNG 9/26/14
        self.assertEqual(4.4, classification_error1)
        self.assertEqual([3.71, 3.56, 4.32, 18.55, 21.22, 13.51, 5.82], classErrorPctList1)
        self.assertEqual(58101, totalScores1)

        kwargs   = paramsScoreRF.copy()
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)

        # 10% data
        trainKey2 = self.loadData(trainDS2)
        scoreKey2 = self.loadData(scoreDS2)
        kwargs   = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, scoreKey2, **kwargs)
        (classification_error2, classErrorPctList2, totalScores2) = h2o_rf.simpleCheckRFView(rfv=trainResult2)
        # self.assertEqual(4.29, classification_error2)
        # self.assertEqual([4.17, 2.98, 4.09, 14.91, 21.12, 15.38, 5.22], classErrorPctList2)
        # with new RNG 9/26/14
        self.assertEqual(4.4, classification_error2)
        self.assertEqual([3.71, 3.56, 4.32, 18.55, 21.22, 13.51, 5.82], classErrorPctList2)
        self.assertEqual(58101, totalScores2)

        kwargs   = paramsScoreRF.copy()
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)

      
        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        # should only be two diffs
        if len(df.difference) > 2:
            raise Exception ("Too many diffs in JsonDiff sorted vs non-sorted %s" % len(df.difference))
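h2o_util.JsonDiff is used above only for its .difference list. A minimal sketch of that idea, assuming a recursive comparison of two nested dicts (the real helper is richer and records values when with_values=True):

def json_diff(a, b, path=""):
    # collect dotted paths where two nested dicts/values disagree
    diffs = []
    if isinstance(a, dict) and isinstance(b, dict):
        for k in sorted(set(a.keys()) | set(b.keys())):
            diffs += json_diff(a.get(k), b.get(k), path + "." + str(k))
    elif a != b:
        diffs.append("%s: %s != %s" % (path, a, b))
    return diffs

print json_diff({'x': 1, 'y': {'z': 2}}, {'x': 1, 'y': {'z': 3}})   # ['.y.z: 2 != 3']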
Example #32
File: test_RF_mnist.py  Project: bikle/h2o
    def test_RF_mnist(self):
        importFolderPath = "/home/0xdiag/datasets/mnist"
        csvFilelist = [
            # ("mnist_testing.csv.gz", "mnist_testing.csv.gz",    600), 
            # ("a.csv", "b.csv", 60),
            # ("mnist_testing.csv.gz", "mnist_testing.csv.gz",    600), 
            ("mnist_training.csv.gz", "mnist_testing.csv.gz",    600), 
        ]
        # IMPORT**********************************************
        # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list
        importFolderResult = h2i.setupImportFolder(None, importFolderPath)
        ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
        if 'files' in importFolderResult:
            succeededList = importFolderResult['files']
        else:
            succeededList = importFolderResult['succeeded']

        ### print "succeededList:", h2o.dump_json(succeededList)

        self.assertGreater(len(succeededList), 1, "Should see more than 1 file in the import")
        # why does this hang? can't look at storeview after import?
        print "\nTrying StoreView after the import folder"
        h2o_cmd.runStoreView(timeoutSecs=30)

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, testCsvFilename, importFolderPath,
                key2=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0 # first column is pixel value
            print "y:"
            x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300)

            # PARSE train****************************************
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, trainCsvFilename, importFolderPath,
                key2=trainKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            # RF+RFView (train)****************************************
            print "This is the 'ignore=' we'll use"
            ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300, forRF=True)
            ntree = 10
            params = {
                'response_variable': 0,
                'ignore': ignore_x, 
                'ntree': ntree,
                'iterative_cm': 1,
                'out_of_bag_error_estimate': 1,
                # 'data_key='mnist_training.csv.hex'
                'features': 28, # fix because we ignore some cols, which will change the sqrt(cols) calc?
                'exclusive_split_limit': None,
                'depth': 2147483647,
                'stat_type': 'ENTROPY',
                'sampling_strategy': 'RANDOM',
                'sample': 67,
                # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77',
                'model_key': 'RF_model',
                'bin_limit': 1024,
                'seed': 784834182943470027,
                'parallel': 1,
                'use_non_local_data': 0,
                'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0',
                }

            kwargs = params.copy()
            timeoutSecs = 1800
            start = time.time()
            rfView = h2o_cmd.runRFOnly(parseKey=parseKey, rfView=True,
                timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs)
            elapsed = time.time() - start
            print "RF completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o_rf.simpleCheckRFView(None, rfView, **params)
            modelKey = rfView['model_key']

            # RFView (score on test)****************************************
            start = time.time()
            # FIX! 1 on oobe causes stack trace?
            kwargs = {'response_variable': y}
            rfView = h2o_cmd.runRFView(data_key=testKey2, model_key=modelKey, ntree=ntree, out_of_bag_error_estimate=0, 
                timeoutSecs=60, pollTimeoutSecs=60, noSimpleCheck=False, **kwargs)
            elapsed = time.time() - start
            print "RFView in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params)
            self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)

            # Predict (on test)****************************************
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "generate_predictions in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
Example #33
    def test_rf_enums_mappings_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 3000
        tryList = [
            # (n, 1, 'cD', 300), 
            # (n, 2, 'cE', 300), 
            # (n, 3, 'cF', 300), 
            # (n, 4, 'cG', 300), 
            # (n, 5, 'cH', 300), 
            # (n, 6, 'cI', 300), 
            (n, 3, 'cI', 300), 
            (n, 3, 'cI', 300), 
            (n, 3, 'cI', 300), 
            ]

        # SEED_FOR_TRAIN = random.randint(0, sys.maxint)
        SEED_FOR_TRAIN = 1234567890
        SEED_FOR_SCORE = 9876543210
        errorHistory = []
        enumHistory = []
        lastcolsTrainHistory = []
        lastcolsScoreHistory = []

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            enumList = create_enum_list(listSize=ENUMS)
            # reverse the list
            enumList.reverse()

            # using the comma is nice to ensure no craziness
            colSepHexString = '2c' # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a' # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            # use same enum List
            enumListForScore = enumList

            print "Creating random", csvPathname, "for rf model building"
            lastcols = write_syn_dataset(csvPathname, enumList, rowCount, colCount, 
                colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_FOR_TRAIN)

            lastcolsTrainHistory.append(lastcols)

            print "Creating random", csvScorePathname, "for rf scoring with prior model (using same enum list)"
            # same enum list/mapping, but different dataset?
            lastcols = write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, 
                colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_FOR_SCORE)
            lastcolsScoreHistory.append(lastcols)

            scoreDataKey = "score_" + hex_key
            parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=scoreDataKey, 
                timeoutSecs=30, separator=colSepInt)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
                timeoutSecs=30, separator=colSepInt)
            print "Parse result['destination_key']:", parseResult['destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            modelKey = 'enums'
            # limit depth and number of trees to accentuate the issue with categorical split decisions

            if SPEEDRF:
                kwargs = {
                    'destination_key': modelKey,
                    'response': y,
                    'num_trees': 1,
                    'max_depth': 100,
                    'oobee': 1,
                    'seed': 123456789,
                }
            else:
                kwargs = {
                    'destination_key': modelKey,
                    'response': y,
                    'classification': 1,
                    'ntrees': 1,
                    'max_depth': 100,
                    'min_rows': 1,
                    'validation': scoreDataKey,
                    'seed': 123456789,
                }

            for r in range(4):
                start = time.time()
                
                if SPEEDRF:
                    rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, 
                        timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
                else:
                    rfResult = h2o_cmd.runRF(parseResult=parseResult, 
                        timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
                
                print "rf end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
                # print h2o.dump_json(rfResult)
                (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)
                h2o_cmd.runScore(dataKey=scoreDataKey, modelKey=modelKey, vactual=y, vpredict=1, doAUC=not MULTINOMIAL) # , expectedAuc=0.5)
                
                errorHistory.append(classification_error)
                enumHistory.append(enumList)

            print "error from all runs on this dataset (with different enum mappings)"
            print errorHistory
            for e in enumHistory:
                print e

            print "last row from all train datasets, as integer"
            for l in lastcolsTrainHistory:
                print l
            print "last row from all score datasets, as integer"
            for l in lastcolsScoreHistory:
                print l
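create_enum_list and write_syn_dataset are defined elsewhere in this test file. A rough sketch of what create_enum_list is assumed to produce, i.e. one random string per enum level (names and lengths here are illustrative only):

import random
import string

def create_enum_list_sketch(listSize=4, strLength=8):
    # build listSize random lowercase strings to use as enum levels
    return ["".join(random.choice(string.ascii_lowercase) for _ in range(strLength))
            for _ in range(listSize)]

print create_enum_list_sketch(listSize=3)   # e.g. ['qwhxkman', 'rjzupfde', 'tkoybsli']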
Example #34
    def rf_covtype_train_oobe(self, csvFilename, checkExpectedResults=True):
        # the expected results are only for the shuffled version
        # since getting 10% samples etc of the smallish dataset will vary between 
        # shuffled and non-shuffled datasets
        importFolderPath = "standard"
        csvPathname = importFolderPath + "/" + csvFilename
        hex_key = csvFilename + ".hex"
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, 
            hex_key=hex_key, timeoutSecs=180)
        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        numCols = inspect['numCols']
        numRows = inspect['numRows']
        pct10 = int(numRows * .1)
        rowsForPct = [i * pct10 for i in range(0,11)]
        # this can be slightly less than 10%
        last10 = numRows - rowsForPct[9]
        rowsForPct[10] = numRows
        # use mod below for picking "rows-to-do" in case we do more than 9 trials
        # use entry 10 if trial%10 is 0, just to see (we copy 10 to 0 just below)
        rowsForPct[0] = rowsForPct[10]

        # 0 isn't used
        expectTrainPctRightList = [0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79]
        expectScorePctRightList = [0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78]

        # keep the 0 entry empty
        actualTrainPctRightList = [0]
        actualScorePctRightList = [0]
        
        trial = 0
        for rowPct in [0.9]:
            trial += 1
            # Not using this now (did use it for slicing)
            rowsToUse = rowsForPct[trial%10] 
            resultKey = "r_" + csvFilename + "_" + str(trial)
            
            # just do random split for now
            dataKeyTrain = 'rTrain.hex'
            dataKeyTest = 'rTest.hex'
            createTestTrain(hex_key, dataKeyTrain, dataKeyTest, percent=0.90, outputClass=4, numCols=numCols)
            sliceResult = {'destination_key': dataKeyTrain}

            # adjust timeoutSecs with the number of trees
            kwargs = paramDict.copy()
            kwargs['destination_key'] = "model_" + csvFilename + "_" + str(trial)
            timeoutSecs = 30 + kwargs['ntrees'] * 20
            start = time.time()
            rfv = h2o_cmd.runRF(parseResult=sliceResult, timeoutSecs=timeoutSecs, **kwargs)

            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            (error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv)
            # oobeTrainPctRight = 100 * (1.0 - error)
            oobeTrainPctRight = 100 - error
            if checkExpectedResults:
                self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
                    msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
                        ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=ALLOWED_DELTA)
            actualTrainPctRightList.append(oobeTrainPctRight)

            print "Now score on the last 10%. Note this is silly if we trained on 100% of the data"
            print "Or sorted by output class, so that the last 10% is the last few classes"
            rf_model = rfv['drf_model']
            used_trees = rf_model['N']
            data_key = rf_model['_dataKey']
            model_key = rf_model['_selfKey']

            rfvScoring = h2o_cmd.runRFView(None, dataKeyTest, model_key, used_trees,
                timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)
            (error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfvScoring)
            fullScorePctRight = 100 - error

            h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)

            if checkExpectedResults:
                self.assertAlmostEqual(fullScorePctRight,expectScorePctRightList[trial],
                    msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
                        ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=ALLOWED_DELTA)
            actualScorePctRightList.append(fullScorePctRight)

            print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/numRows), "pct. of all rows"

        actualDelta = [abs(a-b) for a,b in zip(expectTrainPctRightList, actualTrainPctRightList)]
        niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
        print "maybe should update with actual. Remove single quotes"  
        print "actualTrainPctRightList =", niceFp
        niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
        print "actualDelta =", niceFp

        actualDelta = [abs(a-b) for a,b in zip(expectScorePctRightList, actualScorePctRightList)]
        niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
        print "maybe should update with actual. Remove single quotes"  
        print "actualScorePctRightList =", niceFp
        niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
        print "actualDelta =", niceFp

        # return the last rfv done during training
        return rfv
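createTestTrain (defined elsewhere in this file) builds the rTrain.hex/rTest.hex split inside H2O. As a conceptual, pure-Python stand-in only, a sketch of a 90/10 random row split (the real helper operates on H2O keys and also rewrites the output class):

import random

def split_rows_sketch(rows, percent=0.90, seed=1234):
    # randomly route each row to train (percent) or test (1 - percent)
    rng = random.Random(seed)
    train, test = [], []
    for row in rows:
        (train if rng.random() < percent else test).append(row)
    return train, test

train, test = split_rows_sketch(range(100))
print len(train), len(test)   # roughly 90 and 10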
Example #35
    def test_RF_mnist_reals(self):
        importFolderPath = "/home/0xdiag/datasets/mnist"
        csvFilelist = [
            # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz",    600),
            # ("a.csv", "b.csv", 60),
            # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz",    600),
            ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600),
        ]
        # IMPORT**********************************************
        # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list
        importFolderResult = h2i.setupImportFolder(None, importFolderPath)
        ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
        succeededList = importFolderResult['files']
        ### print "succeededList:", h2o.dump_json(succeededList)

        self.assertGreater(len(succeededList), 1,
                           "Should see more than 1 file in the import")
        # why does this hang? can't look at storeview after import?
        print "\nTrying StoreView after the import folder"
        h2o_cmd.runStoreView(timeoutSecs=30)

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None,
                                                 testCsvFilename,
                                                 importFolderPath,
                                                 key2=testKey2,
                                                 timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0  # first column is pixel value
            print "y:"
            x = h2o_glm.goodXFromColumnInfo(y,
                                            key=parseKey['destination_key'],
                                            timeoutSecs=300)

            # PARSE train****************************************
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None,
                                                 trainCsvFilename,
                                                 importFolderPath,
                                                 key2=trainKey2,
                                                 timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            # RF+RFView (train)****************************************
            print "This is the 'ignore=' we'll use"
            ignore_x = h2o_glm.goodXFromColumnInfo(
                y,
                key=parseKey['destination_key'],
                timeoutSecs=300,
                forRF=True)
            ntree = 100
            params = {
                'response_variable': 0,
                'ignore': ignore_x,
                'ntree': ntree,
                'iterative_cm': 1,
                'out_of_bag_error_estimate': 1,
                # 'data_key='mnist_reals_training.csv.hex'
                'features': 28,  # fix because we ignore some cols, which will change the sqrt(cols) calc?
                'exclusive_split_limit': None,
                'depth': 2147483647,
                'stat_type': 'ENTROPY',
                'sampling_strategy': 'RANDOM',
                'sample': 67,
                # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77',
                'model_key': 'RF_model',
                'bin_limit': 1024,
                'seed': 784834182943470027,
                'parallel': 1,
                'use_non_local_data': 0,
                'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0',
            }

            kwargs = params.copy()
            print "Trying rf"
            timeoutSecs = 1800
            start = time.time()
            rfView = h2o_cmd.runRFOnly(parseKey=parseKey,
                                       rfView=False,
                                       timeoutSecs=timeoutSecs,
                                       pollTimeoutSecs=60,
                                       retryDelaySecs=2,
                                       **kwargs)
            elapsed = time.time() - start
            print "RF completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o_rf.simpleCheckRFView(None, rfView, **params)
            modelKey = rfView['model_key']

            # RFView (score on test)****************************************
            start = time.time()
            # FIX! 1 on oobe causes stack trace?
            kwargs = {'response_variable': y}
            rfView = h2o_cmd.runRFView(data_key=testKey2,
                                       model_key=modelKey,
                                       ntree=ntree,
                                       out_of_bag_error_estimate=0,
                                       timeoutSecs=60,
                                       pollTimeoutSecs=60,
                                       noSimpleCheck=False,
                                       **kwargs)
            elapsed = time.time() - start
            print "RFView in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params)
            self.assertAlmostEqual(
                classification_error,
                0.03,
                delta=0.5,
                msg="Classification error %s differs too much" %
                classification_error)
            # Predict (on test)****************************************
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(
                model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "generate_predictions in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
Example #36
    def test_rf_log_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (10000, 100, 'cA', 300),
        ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            # CREATE test dataset******************************************************
            csvFilename = 'syn_test_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            testParseResult = h2i.import_parse(path=csvPathname,
                                               hex_key=hex_key + '_test',
                                               schema='put',
                                               timeoutSecs=10)
            print "Test Parse result['destination_key']:", testParseResult[
                'destination_key']
            dataKeyTest = testParseResult['destination_key']

            # CREATE train dataset******************************************************
            csvFilename = 'syn_train_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            trainParseResult = h2i.import_parse(path=csvPathname,
                                                hex_key=hex_key + '_train',
                                                schema='put',
                                                timeoutSecs=10)
            print "Train Parse result['destination_key']:", trainParseResult[
                'destination_key']
            dataKeyTrain = trainParseResult['destination_key']

            # RF train******************************************************
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            kwargs = paramDict.copy()
            timeoutSecs = 30 + kwargs['ntrees'] * 20
            start = time.time()
            # do oobe
            kwargs['response'] = "C" + str(colCount + 1)

            rfv = h2o_cmd.runRF(parseResult=trainParseResult,
                                timeoutSecs=timeoutSecs,
                                **kwargs)
            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            rf_model = rfv['drf_model']
            used_trees = rf_model['N']
            data_key = rf_model['_dataKey']
            model_key = rf_model['_key']

            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, ntree=used_trees)
            oobeTrainPctRight = 100.0 - classification_error
            expectTrainPctRight = 94
            self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRight,\
                msg="OOBE: pct. right for training not close enough %6.2f %6.2f"% (oobeTrainPctRight, expectTrainPctRight), delta=5)

            # RF score******************************************************
            print "Now score with the 2nd random dataset"
            rfv = h2o_cmd.runRFView(data_key=dataKeyTest,
                                    model_key=model_key,
                                    timeoutSecs=timeoutSecs,
                                    retryDelaySecs=1)

            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, ntree=used_trees)
            self.assertTrue(classification_error <= 5.0,
                            msg="Classification error %s too big" %
                            classification_error)
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=dataKeyTest)

            fullScorePctRight = 100.0 - classification_error
            expectScorePctRight = 94
            # allow the same 5% slack as the training check above
            self.assertGreaterEqual(
                fullScorePctRight,
                expectScorePctRight - 5,
                msg="Full: pct. right for scoring not close enough %6.2f %6.2f"
                % (fullScorePctRight, expectScorePctRight))
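write_syn_dataset is defined elsewhere in this test file. A hypothetical sketch of a generator like it, writing rowCount rows of random features plus a binary response in the last column (the real generator for this test may derive the response differently):

import random

def write_syn_dataset_sketch(csvPathname, rowCount, colCount, SEED):
    r = random.Random(SEED)
    f = open(csvPathname, 'w')
    for _ in range(rowCount):
        row = [r.random() for _ in range(colCount)]
        # simple, learnable response: 1 when the feature sum is above its expected value
        response = 1 if sum(row) > colCount / 2.0 else 0
        f.write(",".join(["%f" % v for v in row] + [str(response)]) + "\n")
    f.close()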
Example #37
    def test_export_import(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        h2o.beta_features = True # fvec
        importFolderPath = "standard"

        # Parse Train ******************************************************
        csvTrainFilename = 'covtype.shuffled.90pct.data'
        csvTrainPathname = importFolderPath + "/" + csvTrainFilename
        trainKey = csvTrainFilename + ".hex"
        parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTrainPathname, hex_key=trainKey,
            timeoutSecs=180, doSummary=False)
        inspect = h2o_cmd.runInspect(None, trainKey)

        # Parse Test ******************************************************
        csvTestFilename = 'covtype.shuffled.10pct.data'
        csvTestPathname = importFolderPath + "/" + csvTestFilename
        testKey = csvTestFilename + ".hex"
        parseTestResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTestPathname, hex_key=testKey,
            timeoutSecs=180)
        inspect = h2o_cmd.runInspect(None, testKey)


        trial = 0
        ntreesList = [5, 10, 20, 30]
        # ntreesList = [2]
        nbinsList  = [10, 100, 1000]

        if TRY == 'max_depth':
            tryList = depthList
        elif TRY == 'ntrees':
            tryList = ntreesList
        elif TRY == 'nbins':
            tryList = nbinsList
        else:
            raise Exception("huh? %s" % TRY)

        for d in tryList:
            if TRY == 'max_depth':
                paramDict['max_depth'] = d
            elif TRY == 'ntrees':
                paramDict['ntrees'] = d
            elif TRY == 'nbins':
                paramDict['nbins'] = d
            else:
                raise Exception("huh? %s" % TRY)

            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            if DO_OOBE:
                paramDict['validation'] = None
            else:
                paramDict['validation'] = parseTestResult['destination_key']

            timeoutSecs = 30 + paramDict['ntrees'] * 200

            
            # do ten starts, to see the bad id problem?
            trial += 1
            kwargs = paramDict.copy()
            modelKey = 'RFModel_' + str(trial)
            kwargs['destination_key'] = modelKey

            start = time.time()
            rfResult = h2o_cmd.runRF(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, **kwargs)
            trainElapsed = time.time() - start
            print 'rf train end on', csvTrainPathname, 'took', trainElapsed, 'seconds'

            h2o.nodes[0].export_files(src_key=testKey, path=SYNDATASETS_DIR + "/" + testKey, force=1)
            h2o.nodes[0].export_files(src_key=trainKey, path=SYNDATASETS_DIR + "/" + trainKey, force=1)
            # h2o.nodes[0].export_files(src_key=modelKey, path=SYNDATASETS_DIR + "/" + modelKey, force=1)


            rf_model = rfResult['drf_model']
            cms = rf_model['cms']
            ### print "cm:", h2o.dump_json(cm)
            ntrees = rf_model['N']
            errs = rf_model['errs']
            N = rf_model['N']
            varimp = rf_model['varimp']
            treeStats = rf_model['treeStats']

            print "maxDepth:", treeStats['maxDepth']
            print "maxLeaves:", treeStats['maxLeaves']
            print "minDepth:", treeStats['minDepth']
            print "minLeaves:", treeStats['minLeaves']
            print "meanLeaves:", treeStats['meanLeaves']
            print "meanDepth:", treeStats['meanDepth']
            print "errs[0]:", errs[0]
            print "errs[-1]:", errs[-1]
            print "errs:", errs

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)
            print "classErrorPctList:", classErrorPctList
            self.assertEqual(len(classErrorPctList), 7, "Should be 7 output classes, so should have 7 class error percentages from a reasonable predict")
            # FIX! should update this expected classification error
            predict = h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=testKey)
Example #38
    def test_rfview_score(self):
        csvPathnameTrain = 'UCI/UCI-large/covtype/covtype.data'
        print "Train with:", csvPathnameTrain
        parseResultTrain = h2i.import_parse(bucket='datasets', path=csvPathnameTrain, schema='put', 
            hex_key="covtype.hex", timeoutSecs=15)
        dataKeyTrain = parseResultTrain['destination_key']

        csvPathnameTest = 'UCI/UCI-large/covtype/covtype.data'
        print "Test with:", csvPathnameTest
        parseResultTest = h2i.import_parse(bucket='datasets', path=csvPathnameTest, schema='put', 
            hex_key="covtype.hex", timeoutSecs=15)
        dataKeyTest = parseResultTest['destination_key']

        for trial in range(5):
            # params is mutable. This is default.
            params = {'ntree': 13, 'parallel': 1, 'out_of_bag_error_estimate': 0}
            colX = h2o_rf.pickRandRfParams(paramDict, params)
            kwargs = params.copy()
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            timeoutSecs = 30 + kwargs['ntree'] * 10 * (kwargs['parallel'] and 1 or 5)
            rfv = h2o_cmd.runRF(parseResult=parseResultTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
            ### print "rf response:", h2o.dump_json(rfv)

            model_key = rfv['model_key']
            # pop the stuff from kwargs that were passing as params
            kwargs.pop('model_key',None)

            data_key = rfv['data_key']
            kwargs.pop('data_key',None)

            ntree = rfv['ntree']
            kwargs.pop('ntree',None)
            # scoring
            # RFView.html?
            # dataKeyTest=a5m.hex&
            # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628&
            # response_variable=1&
            # ntree=50&
            # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0&
            # out_of_bag_error_estimate=1&
            rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, 
                timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)
            # new web page for predict? throw it in here for now

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
            # don't check error if stratified
            if kwargs.get('sampling_strategy') != 'STRATIFIED_LOCAL':
                check_err = True
            else:
                check_err = False

            if check_err:
                self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)

            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
            elapsed = time.time() - start
            print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

            kwargs['iterative_cm'] = 0
            rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree,
                timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)
            # FIX! should update this expected classification error
            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
            # don't check error if stratified
            if check_err:
                self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
            elapsed = time.time() - start
            print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

            kwargs['iterative_cm'] = 1
            rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, 
                timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)
            # FIX! should update this expected classification error
            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
            # don't check error if stratified
            if check_err:
                self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
            elapsed = time.time() - start
            print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

            kwargs['iterative_cm'] = 1
            kwargs['class_weights'] = '1=1,2=2,3=3,4=4,5=5,6=6,7=7'
            rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree,
                timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)
            # FIX! should update this expected classification error
            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
            # don't check error if stratified
            if check_err:
                self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
            elapsed = time.time() - start
            print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

            print "Trial #", trial, "completed"
Example #39
    def test_rf_predict3_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        timeoutSecs = 600
        predictHexKey = 'predict_0.hex'
        predictCsv = 'predict_0.csv'
        actualCsv = 'actual_0.csv'

        if 1 == 1:
            y = 4  # last col
            response = 'response'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 40
            bucket = 'smalldata'
            csvPathname = 'iris/iris2.csv'
            hexKey = 'iris2.csv.hex'
            # translate = {'setosa': 0.0, 'versicolor': 1.0, 'virginica': 2.0}
            # No translate because we're using an Exec to get the data out?, and that loses the encoding?
            translate = None
            # one wrong will be 0.66667. I guess with random, that can happen?
            expectedPctWrong = 0.7

        elif 1 == 0:
            y = 54  # last col
            response = 'C55'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 6
            # try smaller data set compared to covtype
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.shuffled.10pct.data'
            hexKey = 'covtype.shuffled.10pct.data.hex'
            translate = {
                '1': 1,
                '2': 2,
                '3': 3,
                '4': 4,
                '5': 5,
                '6': 6,
                '7': 7
            }
            expectedPctWrong = 0.7
        elif 1 == 0:
            y = 54  # last col
            response = 'C55'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 40
            # try smaller data set compared to covtype
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.shuffled.10pct.data'
            hexKey = 'covtype.shuffled.10pct.data.hex'
            # translate = {1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0}
            translate = {
                '1': 1,
                '2': 2,
                '3': 3,
                '4': 4,
                '5': 5,
                '6': 6,
                '7': 7
            }
            expectedPctWrong = 0.7
        elif 1 == 0:
            y = 54  # last col
            response = 'C55'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 6
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.data'
            hexKey = 'covtype.data.hex'
            translate = {
                '1': 1,
                '2': 2,
                '3': 3,
                '4': 4,
                '5': 5,
                '6': 6,
                '7': 7
            }
            expectedPctWrong = 0.7
        else:
            y = 0  # first col
            response = 'C1'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 6
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'mnist/mnist_training.csv.gz'
            hexKey = 'mnist_training.hex'
            translate = { \
                '0': 0, '1': 1, '2': 2, '3': 3, '4': 4, \
                '5': 5, '6': 6, '7': 7, '8': 8, '9': 9 }
            expectedPctWrong = 0.7

        csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv
        csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv
        # for using below in csv reader
        csvFullname = h2i.find_folder_and_filename(bucket,
                                                   csvPathname,
                                                   schema='put',
                                                   returnFullPath=True)

        def predict_and_compare_csvs(model_key, hex_key, translate=None, y=0):
            # have to slice out col 0 (the output) and feed result to predict
            # cols are 0:784 (1 output plus 784 input features)
            # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30)
            dataKey = "P.hex"
            h2e.exec_expr(execExpr=dataKey + "=" + hex_key,
                          timeoutSecs=30)  # unneeded but interesting
            if skipSrcOutputHeader:
                print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer"
                print "hack for now, can't chop out col 0 in Exec currently"
                dataKey = hex_key
            else:
                print "No header in dataset, can't chop out cols, since col numbers are used for names"
                dataKey = hex_key

            # +1 col index because R-like
            h2e.exec_expr(execExpr="Z.hex=" + hex_key + "[," + str(y + 1) +
                          "]",
                          timeoutSecs=30)

            start = time.time()
            predict = h2o.nodes[0].generate_predictions(
                model_key=model_key,
                data_key=hexKey,
                destination_key=predictHexKey)
            print "generate_predictions end on ", hexKey, " took", time.time(
            ) - start, 'seconds'
            h2o.check_sandbox_for_errors()
            inspect = h2o_cmd.runInspect(key=predictHexKey)
            h2o_cmd.infoFromInspect(inspect, 'predict.hex')

            h2o.nodes[0].csv_download(src_key="Z.hex",
                                      csvPathname=csvSrcOutputPathname)
            h2o.nodes[0].csv_download(src_key=predictHexKey,
                                      csvPathname=csvPredictPathname)
            h2o.check_sandbox_for_errors()

            print "Do a check of the original output col against predicted output"
            (rowNum1, originalOutput) = compare_csv_at_one_col(
                csvSrcOutputPathname,
                msg="Original",
                colIndex=0,
                translate=translate,
                skipHeader=skipSrcOutputHeader)
            (rowNum2, predictOutput) = compare_csv_at_one_col(
                csvPredictPathname,
                msg="Predicted",
                colIndex=0,
                skipHeader=skipPredictHeader)

            # no header on source
            if (rowNum1 - skipSrcOutputHeader) != (rowNum2 - skipPredictHeader):
                raise Exception(
                    "original rowNum1: %s - %d not same as downloaded predict rowNum2: %s - %d" %
                    (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader))

            wrong = 0
            for rowNum, (o, p) in enumerate(zip(originalOutput,
                                                predictOutput)):
                # if float(o)!=float(p):
                if str(o) != str(p):
                    if wrong == 10:
                        print "Not printing any more mismatches\n"
                    elif wrong < 10:
                        msg = "Comparing original output col vs predicted. row %s differs. \
                            original: %s predicted: %s" % (rowNum, o, p)
                        print msg
                    wrong += 1

            print "\nTotal wrong:", wrong
            print "Total:", len(originalOutput)
            pctWrong = (100.0 * wrong) / len(originalOutput)
            print "wrong/Total * 100 ", pctWrong
            # I looked at what h2o can do for modelling with binomial and it should get better than 25% error?
            if pctWrong > 2.0:
                raise Exception(
                    "pctWrong too high. Expect < 2% error because it's reusing training data"
                )
            return pctWrong

        #*****************************************************************************

        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexKey)
        kwargs = {
            'destination_key': 'rf_model',
            'response': response,
            'ntrees': trees,
            'classification': 1,
        }

        rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                                      timeoutSecs=timeoutSecs,
                                      **kwargs)
        rfResult["drf_model"] = rfResult.pop("speedrf_model")
        (classification_error, classErrorPctList,
         totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)

        print "Use H2O GeneratePredictionsPage with a H2O generated model and the same data key."
        print "Does this work? (feeding in same data key)if you're predicting, "
        print "don't you need one less column (the last is output?)"
        print "WARNING: max_iter set to 8 for benchmark comparisons"
        print "y=", y
        pctWrong = predict_and_compare_csvs(model_key='rf_model',
                                            hex_key=hexKey,
                                            translate=translate,
                                            y=y)

        # we are predicting using training data...so error is really low
        # self.assertAlmostEqual(pctWrong, classification_error, delta = 0.2,
        #     msg="predicted pctWrong: %s should be close to training classification error %s" % (pctWrong, classification_error))
        # can be zero if memorized (iris is either 0 or 0.667?)
        # just make delta 0.7 for now
        self.assertAlmostEqual(
            pctWrong,
            expectedPctWrong,
            delta=0.7,
            msg=
            "predicted pctWrong: %s should be small because we're predicting with training data"
            % pctWrong)
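compare_csv_at_one_col is defined elsewhere in this test file. A hedged sketch of what it is assumed to do: read one column out of a CSV, optionally translating values and skipping a header, and return the raw row count plus the column values:

import csv

def compare_csv_at_one_col_sketch(csvPathname, msg, colIndex=0, translate=None, skipHeader=0):
    output = []
    f = open(csvPathname, 'rb')
    for rowNum, row in enumerate(csv.reader(f)):
        if rowNum < skipHeader:
            continue    # skip header rows
        value = row[colIndex]
        if translate:
            value = translate.get(value, value)    # map labels when a translation is given
        output.append(value)
    f.close()
    print msg, "rows read (excluding header):", len(output)
    return (len(output) + skipHeader, output)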
Example #40
    def test_RF_mnist_both(self):
        importFolderPath = "/home/0xdiag/datasets/mnist_repl"
        csvFilelist = [
            # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027),
            ("mnist_training.csv.gz", "mnist_testing_0.csv.gz", 600, None,
             '*mnist_training*gz'),
            ("mnist_training.csv.gz", "mnist_testing_0.csv.gz", 600, None,
             '*mnist_training*gz'),
            ("mnist_training.csv.gz", "mnist_testing_0.csv.gz", 600, None,
             '*mnist_training*gz'),
            ("mnist_training.csv.gz", "mnist_testing_0.csv.gz", 600, None,
             '*mnist_training*gz'),
        ]
        # IMPORT**********************************************
        # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list
        importFolderResult = h2i.setupImportFolder(None, importFolderPath)
        ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
        succeededList = importFolderResult['files']
        ### print "succeededList:", h2o.dump_json(succeededList)

        self.assertGreater(len(succeededList), 1,
                           "Should see more than 1 file in the import")
        # why does this hang? can't look at storeview after import?
        print "\nTrying StoreView after the import folder"
        h2o_cmd.runStoreView(timeoutSecs=30)

        trial = 0
        allDelta = []
        for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed,
             parsePattern) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None,
                                                 testCsvFilename,
                                                 importFolderPath,
                                                 key2=testKey2,
                                                 timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0  # first column is pixel value
            print "y:"
            x = h2o_glm.goodXFromColumnInfo(y,
                                            key=parseKey['destination_key'],
                                            timeoutSecs=300)

            # PARSE train****************************************
            print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training"
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None,
                                                 parsePattern,
                                                 importFolderPath,
                                                 key2=trainKey2,
                                                 timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            # RF+RFView (train)****************************************
            print "This is the 'ignore=' we'll use"
            ignore_x = h2o_glm.goodXFromColumnInfo(
                y,
                key=parseKey['destination_key'],
                timeoutSecs=300,
                forRF=True)
            ntree = 100
            params = {
                'response_variable': 0,
                'ignore': ignore_x,
                'ntree': ntree,
                'iterative_cm': 1,
                'out_of_bag_error_estimate': 1,
                # 'data_key='mnist_training.csv.hex'
                'features': 28,  # fix because we ignore some cols, which will change the sqrt(cols) calc?
                'exclusive_split_limit': None,
                'depth': 2147483647,
                'stat_type': 'ENTROPY',
                'sampling_strategy': 'RANDOM',
                'sample': 67,
                # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77',
                'model_key': 'RF_model',
                'bin_limit': 1024,
                # 'seed': 784834182943470027,
                'parallel': 1,
                'use_non_local_data': 0,
                'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0',
            }

            if rfSeed is None:
                params['seed'] = random.randint(0, sys.maxint)
            else:
                params['seed'] = rfSeed
            print "RF seed:", params['seed']

            kwargs = params.copy()
            print "Trying rf"
            timeoutSecs = 1800
            start = time.time()
            rfView = h2o_cmd.runRFOnly(parseKey=parseKey,
                                       rfView=False,
                                       timeoutSecs=timeoutSecs,
                                       pollTimeoutSecs=60,
                                       retryDelaySecs=2,
                                       **kwargs)
            elapsed = time.time() - start
            print "RF completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o_rf.simpleCheckRFView(None, rfView, **params)
            modelKey = rfView['model_key']

            # RFView (score on test)****************************************
            start = time.time()
            # FIX! 1 on oobe causes stack trace?
            kwargs = {'response_variable': y}
            rfView = h2o_cmd.runRFView(data_key=testKey2,
                                       model_key=modelKey,
                                       ntree=ntree,
                                       out_of_bag_error_estimate=0,
                                       timeoutSecs=60,
                                       pollTimeoutSecs=60,
                                       noSimpleCheck=False,
                                       **kwargs)
            elapsed = time.time() - start
            print "RFView in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params)
            print "classification error is expected to be low because we included the test data in with the training!"
            self.assertAlmostEqual(
                classification_error,
                0.028,
                delta=0.01,
                msg="Classification error %s differs too much" %
                classification_error)

            leaves = rfView['trees']['leaves']
            # Expected values are from this case:
            # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027),
            leavesExpected = {'min': 4996, 'mean': 5064.1, 'max': 5148}
            for l in leaves:
                # self.assertAlmostEqual(leaves[l], leavesExpected[l], delta=10, msg="leaves %s %s %s differs too much" % (l, leaves[l], leavesExpected[l]))
                delta = ((leaves[l] - leavesExpected[l]) / float(leaves[l])) * 100
                d = "seed: %s leaves %s %s %s pct. different %s" % (
                    params['seed'], l, leaves[l], leavesExpected[l], delta)
                print d
                allDelta.append(d)

            depth = rfView['trees']['depth']
            depthExpected = {'min': 21, 'mean': 23.8, 'max': 25}
            for l in depth:
                # self.assertAlmostEqual(depth[l], depthExpected[l], delta=1, msg="depth %s %s %s differs too much" % (l, depth[l], depthExpected[l]))
                delta = ((depth[l] - depthExpected[l]) / float(depth[l])) * 100
                d = "seed: %s depth %s %s %s pct. different %s" % (
                    params['seed'], l, depth[l], depthExpected[l], delta)
                print d
                allDelta.append(d)

            # Predict (on test)****************************************
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(
                model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "generate_predictions in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # Done *******************************************************
        print "\nShowing the results again from all the trials, to see variance"
        for d in allDelta:
            print d
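The leaves/depth checks above recompute the same percent-difference expression inline. A minimal helper sketch of that calculation; the name pct_diff is mine, not part of the test suite:

def pct_diff(actual, expected):
    # percent difference of an observed tree stat vs. its expected value;
    # float() guards against Python 2 integer division truncating the result
    return ((actual - expected) / float(actual)) * 100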
Example #41
0
    def test_RF_mnist_both(self):
        importFolderPath = "mnist"
        csvFilelist = [
            # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027),
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'),
            # to see results a 2nd time
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'),
        ]
        # IMPORT**********************************************
        # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list
        (importFolderResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=importFolderPath + "/*")
        ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
        if 'files' in importFolderResult:
            succeededList = importFolderResult['files']
        else:
            succeededList = importFolderResult['succeeded']

        ### print "succeededList:", h2o.dump_json(succeededList)

        self.assertGreater(len(succeededList),1,"Should see more than 1 file in the import?")
        # why does this hang? can't look at storeview after import?
        print "\nTrying StoreView after the import folder"
        h2o_cmd.runStoreView(timeoutSecs=30)

        trial = 0
        allDelta = []
        for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed, parsePattern) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath+"/"+testCsvFilename,
                hex_key=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0 # first column is the label (digit)
            print "y:"
            x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)

            # PARSE train****************************************
            print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training"
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath+"/"+parsePattern,
                hex_key=trainKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # RF+RFView (train)****************************************
            # print "This is the 'ignore=' we'll use"
            # no longer used; depend on h2o to get it right.
            ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, forRF=True)
            ntree = 25
            params = {
                'response_variable': 0,
                # 'ignore': ignore_x, 
                'ntree': ntree,
                'iterative_cm': 1,
                'out_of_bag_error_estimate': 1,
                # 'data_key='mnist_training.csv.hex'
                'features': 28, # fix because we ignore some cols, which will change the sqrt(cols) calc?
                'exclusive_split_limit': None,
                'depth': 2147483647,
                'stat_type': 'ENTROPY',
                'sampling_strategy': 'RANDOM',
                'sample': 67,
                # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77',
                'model_key': 'RF_model',
                'bin_limit': 1024,
                # 'seed': 784834182943470027,
                'use_non_local_data': 1,
               #  'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0',
                }

            if rfSeed is None:
                params['seed'] = random.randint(0,sys.maxint)
            else:
                params['seed'] = rfSeed
            print "RF seed:", params['seed']

            kwargs = params.copy()
            print "Trying rf"
            timeoutSecs = 1800
            start = time.time()
            rfView = h2o_cmd.runRF(parseResult=parseResult, rfView=True,
                timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs)
            elapsed = time.time() - start
            print "RF completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o_rf.simpleCheckRFView(None, rfView, **params)
            modelKey = rfView['model_key']

            # RFView (score on test)****************************************
            start = time.time()
            # FIX! 1 on oobe causes stack trace?
            kwargs = {'response_variable': y}
            rfView = h2o_cmd.runRFView(data_key=testKey2, model_key=modelKey, ntree=ntree, out_of_bag_error_estimate=0, 
                timeoutSecs=60, pollTimeoutSecs=60, noSimpleCheck=False, **kwargs)
            elapsed = time.time() - start
            print "RFView in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params)
            print "classification error is expected to be low because we included the test data in with the training!"
            self.assertAlmostEqual(classification_error, 0.0003, delta=0.0003, msg="Classification error %s differs too much" % classification_error)
        
            leaves = rfView['trees']['leaves']
            # Expected values are from this case:
            # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027),
            leavesExpected = {'min': 4996, 'mean': 5064.1, 'max': 5148}
            for l in leaves:
                # self.assertAlmostEqual(leaves[l], leavesExpected[l], delta=10, msg="leaves %s %s %s differs too much" % (l, leaves[l], leavesExpected[l]))
                delta = ((leaves[l] - leavesExpected[l]) / float(leaves[l])) * 100
                d = "seed: %s %s leaves: %s expected: %s pct. different %s" % (params['seed'], l, leaves[l], leavesExpected[l], delta)
                print d
                allDelta.append(d)

            depth = rfView['trees']['depth']
            depthExpected = {'min': 21, 'mean': 23.8, 'max': 25}
            for l in depth:
                # self.assertAlmostEqual(depth[l], depthExpected[l], delta=1, msg="depth %s %s %s differs too much" % (l, depth[l], depthExpected[l]))
                delta = ((depth[l] - depthExpected[l]) / float(depth[l])) * 100
                d = "seed: %s %s depth: %s expected: %s pct. different %s" % (params['seed'], l, depth[l], depthExpected[l], delta)
                print d
                allDelta.append(d)

            # Predict (on test)****************************************
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "generate_predictions in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # Done *******************************************************
        print "\nShowing the results again from all the trials, to see variance"
    
        for d in allDelta:
            print d
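Every parse, RF, and RFView step in these examples reports elapsed time as a percentage of its timeout. A small sketch of that reporting pattern as a reusable helper; report_elapsed is a hypothetical name and only the standard time module is assumed:

import time

def report_elapsed(label, start, timeoutSecs):
    # mirrors the "took N seconds ... pct. of timeout" prints used throughout
    elapsed = time.time() - start
    print label, 'took', elapsed, 'seconds', \
        "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)
    return elapsed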
Example #42
0
    def test_rf_mnist_both_fvec(self):
        h2o.beta_features = True
        importFolderPath = "mnist"
        csvFilelist = [
            # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027),
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'),
            # to see results a 2nd time
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'),
        ]
        # IMPORT**********************************************
        # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list
        (importFolderResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=importFolderPath + "/*")
        ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
        if 'files' in importFolderResult:
            succeededList = importFolderResult['files']
        else:
            succeededList = importFolderResult['succeeded']

        ### print "succeededList:", h2o.dump_json(succeededList)

        self.assertGreater(len(succeededList),1,"Should see more than 1 file in the import?")
        # why does this hang? can't look at storeview after import?
        print "\nTrying StoreView after the import folder"
        h2o_cmd.runStoreView(timeoutSecs=30)

        trial = 0
        allDelta = []
        for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed, parsePattern) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath+"/"+testCsvFilename,
                hex_key=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0 # first column is the label (digit)
            print "y:"
            x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)

            # PARSE train****************************************
            print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training"
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath+"/"+parsePattern,
                hex_key=trainKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # RF+RFView (train)****************************************
            print "Not using ignore from this..have to adjust cols?"
            h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, forRF=True)
            ntree = 2
            params = {
                'response': 'C1',
                # 'ignored_cols_by_name': ignore_x, 
                'ntrees': ntree,
                'mtries': 28, # fix because we ignore some cols, which will change the sqrt(cols) calc?
                'max_depth': 20,
                'sample_rate': 0.67,
                'destination_key': 'RF_model',
                'nbins': 100,
                'importance': 0,
                'balance_classes': 0,
                }

            if rfSeed is None:
                params['seed'] = random.randint(0,sys.maxint)
            else:
                params['seed'] = rfSeed
            print "RF seed:", params['seed']

            kwargs = params.copy()
            print "Trying rf"
            timeoutSecs = 1800
            start = time.time()
            rfView = h2o_cmd.runRF(parseResult=parseResult, rfView=True,
                timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs)
            elapsed = time.time() - start
            print "RF completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            # print 'rfView:', h2o.dump_json(rfView)
            h2o_rf.simpleCheckRFView(None, rfView, **params)
            modelKey = rfView['drf_model']['_key']

            # RFView (score on test)****************************************
            start = time.time()
            # FIX! 1 on oobe causes stack trace?
            kwargs = {'response': y}
            rfView = h2o_cmd.runRFView(data_key=testKey2, model_key=modelKey, ntree=ntree, out_of_bag_error_estimate=0, 
                timeoutSecs=60, pollTimeoutSecs=60, noSimpleCheck=False, **kwargs)
            elapsed = time.time() - start
            print "RFView in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params)
            # training and test data are unique, so error won't be low?
            # self.assertAlmostEqual(classification_error, 0.0003, delta=0.0003, msg="Classification error %s differs too much" % classification_error)

            leaves = {
                'min': rfView['drf_model']['treeStats']['minLeaves'],
                'mean': rfView['drf_model']['treeStats']['meanLeaves'],
                'max': rfView['drf_model']['treeStats']['maxLeaves'],
            }
            # Expected values are from this case:
            # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027),
            leavesExpected = {'min': 537, 'mean': 1118.05, 'max': 1701}
            for l in leaves:
                # self.assertAlmostEqual(leaves[l], leavesExpected[l], delta=10, msg="leaves %s %s %s differs too much" % (l, leaves[l], leavesExpected[l]))
                delta = ((leaves[l] - leavesExpected[l]) / float(leaves[l])) * 100
                d = "seed: %s leaves %s %s %s pct. different %s" % (params['seed'], l, leaves[l], leavesExpected[l], delta)
                print d
                allDelta.append(d)

            depth = {
                'min': rfView['drf_model']['treeStats']['minDepth'],
                'mean': rfView['drf_model']['treeStats']['meanDepth'],
                'max': rfView['drf_model']['treeStats']['maxDepth'],
            }
            depthExpected = {'min': 20, 'mean': 20, 'max': 20}
            for l in depth:
                # self.assertAlmostEqual(depth[l], depthExpected[l], delta=1, msg="depth %s %s %s differs too much" % (l, depth[l], depthExpected[l]))
                delta = ((depth[l] - depthExpected[l]) / float(depth[l])) * 100
                d = "seed: %s depth %s %s %s pct. different %s" % (params['seed'], l, depth[l], depthExpected[l], delta)
                print d
                allDelta.append(d)

            # Predict (on test)****************************************
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "generate_predictions in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # Done *******************************************************
        print "\nShowing the results again from all the trials, to see variance"
    
        for d in allDelta:
            print d
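Example #42 rebuilds the min/mean/max leaf and depth dictionaries inline from the fvec model view. A compact sketch of that extraction, assuming the same 'drf_model'/'treeStats' keys seen in the response above (tree_stats is a hypothetical helper name):

def tree_stats(rfView):
    # pull leaf and depth summaries out of an fvec DRFView response
    ts = rfView['drf_model']['treeStats']
    leaves = {'min': ts['minLeaves'], 'mean': ts['meanLeaves'], 'max': ts['maxLeaves']}
    depth = {'min': ts['minDepth'], 'mean': ts['meanDepth'], 'max': ts['maxDepth']}
    return leaves, depth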
Example #43
0
    def test_RF_mnist_reals_fvec(self):
        importFolderPath = "mnist"
        csvFilelist = [
            # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz",    600), 
            # ("a.csv", "b.csv", 60),
            # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz",    600), 
            ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz",    600), 
        ]
        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + testCsvFilename,
                hex_key=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0 # first column is the label (digit)
            print "y:"
            x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)

            # PARSE train****************************************
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + trainCsvFilename,
                hex_key=trainKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # RF+RFView (train)****************************************
            ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, returnIgnoreX=True)
            ntrees = 10
            params = {
                'response': 'C1',
                'ignored_cols_by_name': ignore_x, 
                'ntrees': ntrees,
                'mtries': 28, # fix because we ignore some cols, which will change the sqrt(cols) calc?
                'max_depth': 15,
                'sample_rate': 0.67,
                'destination_key': 'RF_model',
                'nbins': 1024,
                'seed': 784834182943470027,
                'importance': 0,
                'balance_classes': 0,
                }

            kwargs = params.copy()
            print "Trying rf"
            timeoutSecs = 1800
            start = time.time()
            rfv = h2o_cmd.runRF(parseResult=parseResult, rfView=True,
                timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs)
            elapsed = time.time() - start
            print "RF completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o_rf.simpleCheckRFView(None, rfv, **params)
            rf_model = rfv['drf_model']
            used_trees = rf_model['N']
            data_key = rf_model['_dataKey']
            model_key = rf_model['_key']


            # RFView (score on test)****************************************
            start = time.time()
            # FIX! 1 on oobe causes stack trace?
            kwargs = {'response_variable': y}
            rfv = h2o_cmd.runRFView(data_key=testKey2, model_key=model_key, ntrees=ntrees, out_of_bag_error_estimate=0, 
                timeoutSecs=60, pollTimeoutSecs=60, noSimpleCheck=False, **kwargs)
            elapsed = time.time() - start
            print "RFView in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfv, **params)
            self.assertAlmostEqual(classification_error, 9, delta=1.0, msg="Classification error %s differs too much" % classification_error)
            # Predict (on test)****************************************
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "generate_predictions in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
Example #44
0
    def test_RF_mnist_fvec(self):
        h2o.beta_features = True
        importFolderPath = "mnist"
        csvFilelist = [
            # ("mnist_testing.csv.gz", "mnist_testing.csv.gz",    600),
            # ("a.csv", "b.csv", 60),
            # ("mnist_testing.csv.gz", "mnist_testing.csv.gz",    600),
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600),
        ]

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=importFolderPath + "/" +
                                           testCsvFilename,
                                           hex_key=testKey2,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0  # first column is the label (digit)
            print "y:"
            # x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)

            # PARSE train****************************************
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=importFolderPath + "/" +
                                           trainCsvFilename,
                                           schema='local',
                                           hex_key=trainKey2,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # RF+RFView (train)****************************************
            print "This is the 'ignore=' we'll use"
            ignore_x = h2o_glm.goodXFromColumnInfo(
                y,
                key=parseResult['destination_key'],
                timeoutSecs=300,
                forRF=True)

            params = {
                'response': 'C' + str(y),
                'cols': None,
                'ignored_cols_by_name': ignore_x,
                'classification': 1,
                'validation': None,
                'ntrees': 10,
                'max_depth': 20,
                'min_rows': None,
                'nbins': 1000,
                'mtries': None,
                'sample_rate': 0.66,
                'seed': None,
            }

        rfViewInitial = []
        for jobDispatch in range(1):
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            params['destination_key'] = 'RFModel_' + str(jobDispatch)
            kwargs = params.copy()
            timeoutSecs = 1200

            start = time.time()
            rfResult = h2o_cmd.runRF(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     noPoll=not DO_POLL,
                                     rfView=DO_POLL,
                                     **kwargs)
            elapsed = time.time() - start

            # print h2o.dump_json(rfResult)
            print "rf job dispatch end on ", trainCsvFilename, 'took', time.time(
            ) - start, 'seconds'
            print "\njobDispatch #", jobDispatch
            # FIX! are these already in there?
            rfView = {}
            rfView['data_key'] = trainKey2
            rfView['model_key'] = kwargs['destination_key']
            rfView['ntrees'] = kwargs['ntrees']
            rfViewInitial.append(rfView)

            if not DO_POLL:
                h2o_jobs.pollStatsWhileBusy(timeoutSecs=1200,
                                            pollTimeoutSecs=120,
                                            retryDelaySecs=5)

        # FIX! need to add the rfview and predict stuff
        # we saved the initial response?
        # if we do another poll they should be done now, and better to get it that
        # way rather than the inspect (to match what simpleCheckGLM expects)
        print "rfViewInitial", rfViewInitial
        for rfView in rfViewInitial:
            print "Checking completed job:", rfView
            print "rfView", h2o.dump_json(rfView)
            data_key = rfView['data_key']
            model_key = rfView['model_key']
            ntrees = rfView['ntrees']

            rfView = h2o_cmd.runRFView(None,
                                       model_key=model_key,
                                       timeoutSecs=60,
                                       noPoll=not DO_POLL,
                                       doSimpleCheck=False)
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)
            self.assertAlmostEqual(
                classification_error,
                10,
                delta=2,
                msg="Classification error %s differs too much" %
                classification_error)

            if not DO_POLL:
                h2o_jobs.pollStatsWhileBusy(timeoutSecs=300,
                                            pollTimeoutSecs=120,
                                            retryDelaySecs=5)
            # rfView = h2o_cmd.runRFView(None, data_key, model_key, timeoutSecs=60, noPoll=True, doSimpleCheck=False)
            # print "rfView:", h2o.dump_json(rfView)

            # "N":1,
            # "errs":[0.25,0.1682814508676529],
            # "testKey":"syn_binary_10000x10.hex",
            # "cm":[[3621,1399],[1515,3465]]}}
            rf_model = rfView['drf_model']
            cms = rf_model['cms']
            ntrees = rf_model['N']
            errs = rf_model['errs']
            N = rf_model['N']

            # FIX! should update this expected classification error
            ## (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntrees)
            ## self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=data_key)
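Example #44 dispatches RF without polling and then waits on the jobs API before fetching the model view. A minimal sketch of that dispatch-then-poll pattern, reusing the h2o_cmd and h2o_jobs calls from the test (dispatch_then_poll is a hypothetical name; kwargs is assumed to carry destination_key as above):

def dispatch_then_poll(parseResult, **kwargs):
    # fire off the RF job without blocking on its completion
    h2o_cmd.runRF(parseResult=parseResult, noPoll=True, rfView=False,
        timeoutSecs=1200, **kwargs)
    # wait for outstanding jobs to finish before asking for the model view
    h2o_jobs.pollStatsWhileBusy(timeoutSecs=1200, pollTimeoutSecs=120, retryDelaySecs=5)
    return h2o_cmd.runRFView(None, model_key=kwargs['destination_key'],
        timeoutSecs=60, doSimpleCheck=False)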
Example #45
0
    def test_rf_enums_score_superset_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 3000
        tryList = [
            (n, 1, 'cD', 300),
            (n, 2, 'cE', 300),
            (n, 3, 'cF', 300),
            (n, 4, 'cG', 300),
            (n, 5, 'cH', 300),
            (n, 6, 'cI', 300),
        ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c'  # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a'  # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list(listSize=10)
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList, 5)

            # add an extra enum for scoring that's not in the model enumList
            enumListForScore.append("xyzzy")

            print "Creating random", csvPathname, "for rf model building"
            write_syn_dataset(csvPathname,
                              enumList,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            print "Creating random", csvScorePathname, "for rf scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname,
                              enumListForScore,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            scoreDataKey = "score_" + hex_key
            parseResult = h2i.import_parse(path=csvScorePathname,
                                           schema='put',
                                           hex_key=scoreDataKey,
                                           timeoutSecs=30,
                                           separator=colSepInt)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           separator=colSepInt)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            modelKey = 'enums'
            ntrees = 5
            kwargs = {
                'destination_key': modelKey,
                'response': y,
                'classification': 1,
                'ntrees': ntrees,
                'validation': scoreDataKey,
            }

            start = time.time()
            rfResult = h2o_cmd.runRF(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=180,
                                     **kwargs)
            print "rf end on ", parseResult[
                'destination_key'], 'took', time.time() - start, 'seconds'
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult,
                                                     ntree=ntrees)
            predictKey = 'Predict.hex'
            h2o_cmd.runScore(dataKey=scoreDataKey,
                             modelKey=modelKey,
                             vactual=y,
                             vpredict=1,
                             expectedAuc=0.5)
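Example #45 deliberately scores with an enum level ("xyzzy") that never appears in the training data, so the model must handle an unseen category. A sketch of building such a superset scoring list from the training enums (create_enum_list comes from the test module; the helper name is mine):

import random

def enums_with_unseen_level(enumList):
    # use half of the training enums, then add a level guaranteed
    # to be absent from the model's training domain
    enumListForScore = random.sample(enumList, len(enumList) / 2)
    enumListForScore.append("xyzzy")
    return enumListForScore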
Example #46
0
    def test_rf_enums_mappings(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            # (n, 1, 'cD', 300),
            # (n, 2, 'cE', 300),
            # (n, 3, 'cF', 300),
            # (n, 4, 'cG', 300),
            # (n, 5, 'cH', 300),
            # (n, 6, 'cI', 300),
            (ROWS, COLS, 'cI', 300),
            (ROWS, COLS, 'cI', 300),
            (ROWS, COLS, 'cI', 300),
        ]

        # SEED_FOR_TRAIN = random.randint(0, sys.maxint)
        SEED_FOR_TRAIN = 1234567890
        SEED_FOR_SCORE = 9876543210
        errorHistory = []
        enumHistory = []
        lastcolsTrainHistory = []
        lastcolsScoreHistory = []

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            enumList = create_enum_list(listSize=ENUMS)
            # reverse the list
            enumList.reverse()

            # using the comma is nice to ensure no craziness
            colSepHexString = '2c'  # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a'  # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            # use same enum List
            enumListForScore = enumList

            print "Creating random", csvPathname, "for rf model building"
            lastcols = write_syn_dataset(csvPathname,
                                         enumList,
                                         rowCount,
                                         colCount,
                                         colSepChar=colSepChar,
                                         rowSepChar=rowSepChar,
                                         SEED=SEED_FOR_TRAIN)

            lastcolsTrainHistory.append(lastcols)

            print "Creating random", csvScorePathname, "for rf scoring with prior model (using same enum list)"
            # same enum list/mapping, but different dataset?
            lastcols = write_syn_dataset(csvScorePathname,
                                         enumListForScore,
                                         rowCount,
                                         colCount,
                                         colSepChar=colSepChar,
                                         rowSepChar=rowSepChar,
                                         SEED=SEED_FOR_SCORE)
            lastcolsScoreHistory.append(lastcols)

            scoreDataKey = "score_" + hex_key
            parseResult = h2i.import_parse(path=csvScorePathname,
                                           schema='put',
                                           hex_key=scoreDataKey,
                                           timeoutSecs=30,
                                           separator=colSepInt)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           separator=colSepInt)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            modelKey = 'enums'
            # limit depth and number of trees to accentuate the issue with categorical split decisions

            # use mtries so both look at all cols at every split? doesn't matter for speedrf
            # does speedrf try one more time? with 3 cols, mtries=2, so another try might
            # get a look at the missing col
            # does matter for drf2. does it "just stop"
            # trying mtries always looking at all columns or 1 col might be interesting
            if SPEEDRF:
                kwargs = {
                    'sample_rate': 0.999,
                    'destination_key': modelKey,
                    'response': y,
                    'ntrees': 1,
                    'max_depth': 100,
                    # 'oobee': 1,
                    'validation': hex_key,
                    # 'validation': scoreDataKey,
                    'seed': 123456789,
                    'mtries': COLS,
                }
            elif GBM:
                kwargs = {
                    'destination_key': modelKey,
                    'response': y,
                    'validation': scoreDataKey,
                    'seed': 123456789,
                    # 'learn_rate': .1,
                    'ntrees': 1,
                    'max_depth': 100,
                    'min_rows': 1,
                    'classification': 1,
                }
            else:
                kwargs = {
                    'sample_rate': 0.999,
                    'destination_key': modelKey,
                    'response': y,
                    'classification': 1,
                    'ntrees': 1,
                    'max_depth': 100,
                    'min_rows': 1,
                    'validation': hex_key,
                    # 'validation': scoreDataKey,
                    'seed': 123456789,
                    'nbins': 1024,
                    'mtries': COLS,
                }

            for r in range(2):
                start = time.time()

                if GBM:
                    gbmResult = h2o_cmd.runGBM(parseResult=parseResult,
                                               timeoutSecs=timeoutSecs,
                                               pollTimeoutSecs=180,
                                               **kwargs)

                    print "gbm end on ", parseResult[
                        'destination_key'], 'took', time.time(
                        ) - start, 'seconds'
                    # print h2o.dump_json(gbmResult)
                    (classification_error, classErrorPctList,
                     totalScores) = h2o_gbm.simpleCheckGBMView(gbmv=gbmResult)

                elif SPEEDRF:
                    rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                                                  timeoutSecs=timeoutSecs,
                                                  pollTimeoutSecs=180,
                                                  **kwargs)
                    print "speedrf end on ", parseResult[
                        'destination_key'], 'took', time.time(
                        ) - start, 'seconds'
                    (classification_error, classErrorPctList,
                     totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)

                else:
                    rfResult = h2o_cmd.runRF(parseResult=parseResult,
                                             timeoutSecs=timeoutSecs,
                                             pollTimeoutSecs=180,
                                             **kwargs)
                    print "rf end on ", parseResult[
                        'destination_key'], 'took', time.time(
                        ) - start, 'seconds'
                    (classification_error, classErrorPctList,
                     totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)

                h2o_cmd.runScore(dataKey=scoreDataKey,
                                 modelKey=modelKey,
                                 vactual=y,
                                 vpredict=1,
                                 doAUC=not MULTINOMIAL)  # , expectedAuc=0.5)

                errorHistory.append(classification_error)
                enumHistory.append(enumList)

            print "error from all runs on this dataset (with different enum mappings)"
            print errorHistory
            for e in enumHistory:
                print e

            print "last row from all train datasets, as integer"
            for l in lastcolsTrainHistory:
                print l
            print "last row from all score datasets, as integer"
            for l in lastcolsScoreHistory:
                print l
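Example #46 switches between GBM, SpeeDRF, and DRF with an if/elif chain inside the trial loop. A compact sketch of the same dispatch pulled into one place, assuming the module-level SPEEDRF/GBM flags and h2o_cmd runners used above (run_model is a hypothetical name):

def run_model(parseResult, timeoutSecs, **kwargs):
    # pick the runner that matches the module-level flags set by the test
    if GBM:
        return h2o_cmd.runGBM(parseResult=parseResult, timeoutSecs=timeoutSecs,
            pollTimeoutSecs=180, **kwargs)
    elif SPEEDRF:
        return h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs,
            pollTimeoutSecs=180, **kwargs)
    return h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs,
        pollTimeoutSecs=180, **kwargs)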
Example #47
0
    def test_speedrf_mnist(self):
        importFolderPath = "mnist"
        csvFilelist = [
            # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz",    600), 
            # ("a.csv", "b.csv", 60),
            # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz",    600), 
            ("train.csv.gz", "test.csv.gz", 600),
            ]
        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='smalldata', path=importFolderPath + "/" + testCsvFilename,
                                           hex_key=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds', \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 784 # last column is the label (digit)
            print "y:"
            x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)

            # PARSE train****************************************
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='smalldata', path=importFolderPath + "/" + trainCsvFilename,
                                           hex_key=trainKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds', \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # RF+RFView (train)****************************************
            ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, returnIgnoreX=True)
            ntrees = 10
            params = {
                'response': y,
                'ignored_cols_by_name': ignore_x,
                'ntrees': ntrees,
                'mtries': 28, # fix because we ignore some cols, which will change the sqrt(cols) calc?
                'max_depth': 15,
                'sample_rate': 0.67,
                'destination_key': 'SpeeDRF_model',
                'nbins': 1024,
                'seed': 784834182943470027,
                'oobee': 1,
                }
            kwargs = params.copy()
            print "Trying rf"
            timeoutSecs = 1800
            start = time.time()
            rfv = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                                timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs)
            elapsed = time.time() - start
            print "RF completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            rfv["drf_model"] = rfv.pop("speedrf_model")
            h2o_rf.simpleCheckRFView(None, rfv, **params)
            rf_model = rfv['drf_model']
            used_trees = rf_model['N']
            data_key = rf_model['_dataKey']
            model_key = rf_model['_key']

            print "Total trees: ", used_trees
            print "On data key: ", data_key
            print "Produced model key: ", model_key
Example #48
0
    def test_rf_change_data_key_fvec(self):
        importFolderPath = 'standard'

        csvFilenameTrain = 'covtype.data'
        csvPathname = importFolderPath + "/" + csvFilenameTrain
        parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets',
                                            path=csvPathname,
                                            timeoutSecs=500)
        h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
        dataKeyTrain = parseResultTrain['destination_key']
        print "Parse end", dataKeyTrain

        # we could train on covtype, and then use covtype20x for test? or vice versa
        # parseResult = parseResult
        # dataKeyTest = dataKeyTrain
        csvFilenameTest = 'covtype20x.data'
        csvPathname = importFolderPath + "/" + csvFilenameTest
        parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           timeoutSecs=500)
        print "Parse result['destination_key']:", parseResultTest[
            'destination_key']
        inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
        dataKeyTest = parseResultTest['destination_key']

        print "Parse end", dataKeyTest

        # train
        # this does RFView to understand when RF completes, so the time reported for RFView here should be
        # considered the "first RFView" time; subsequent ones may benefit from some caching,
        # unless the no_confusion_matrix works

        # params is mutable. This is default.
        params = {'ntrees': 2, 'destination_key': 'RF_model'}

        # colX = h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = params.copy()
        kwargs["response"] = "C55"
        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow

        timeoutSecs = 100
        start = time.time()
        h2o_cmd.runSpeeDRF(parseResult=parseResultTrain,
                           timeoutSecs=timeoutSecs,
                           retryDelaySecs=1,
                           noPoll=True,
                           **kwargs)
        print "rf job dispatch end on ", dataKeyTrain, 'took', time.time(
        ) - start, 'seconds'
        ### print "rf response:", h2o.dump_json(rfv)

        start = time.time()
        h2o_jobs.pollWaitJobs(pattern='RF_model',
                              timeoutSecs=360,
                              pollTimeoutSecs=120,
                              retryDelaySecs=5)
        print "rf job end on ", dataKeyTrain, 'took', time.time(
        ) - start, 'seconds'

        print "\nRFView start after job completion"
        model_key = kwargs['destination_key']
        ntrees = kwargs['ntrees']
        start = time.time()
        h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs)
        print "First rfview end on ", dataKeyTrain, 'took', time.time(
        ) - start, 'seconds'

        for trial in range(3):
            # scoring
            start = time.time()
            rfView = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs)
            print "rfview", trial, "end on ", dataKeyTest, 'took', time.time(
            ) - start, 'seconds.'
            rfView["drf_model"] = rfView.pop("speedrf_model")
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntrees)
            # FIX! should update this expected classification error
            # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=dataKeyTest)
            print "predict", trial, "end on ", dataKeyTest, 'took', time.time(
            ) - start, 'seconds.'

            print "Trial #", trial, "completed"
Example #49
0
    def test_RF_mnist_both(self):
        h2o.beta_features = True
        importFolderPath = "mnist"
        csvFilelist = [
            # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027),
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None,
             '*mnist*gz'),
            # to see results a 2nd time
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None,
             '*mnist*gz'),
        ]
        # IMPORT**********************************************
        # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list
        (importFolderResult,
         importPattern) = h2i.import_only(bucket='home-0xdiag-datasets',
                                          path=importFolderPath + "/*")
        ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
        if 'files' in importFolderResult:
            succeededList = importFolderResult['files']
        else:
            succeededList = importFolderResult['succeeded']

        ### print "succeededList:", h2o.dump_json(succeededList)

        self.assertGreater(len(succeededList), 1,
                           "Should see more than 1 files in the import?")
        # why does this hang? can't look at storeview after import?
        print "\nTrying StoreView after the import folder"
        h2o_cmd.runStoreView(timeoutSecs=30)

        trial = 0
        allDelta = []
        for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed,
             parsePattern) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=importFolderPath + "/" +
                                           testCsvFilename,
                                           hex_key=testKey2,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0  # first column is the label (digit)
            x = h2o_glm.goodXFromColumnInfo(y,
                                            key=parseResult['destination_key'],
                                            timeoutSecs=300)

            # PARSE train****************************************
            print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training"
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=importFolderPath + "/" +
                                           parsePattern,
                                           hex_key=trainKey2,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # RF+RFView (train)****************************************
            # print "This is the 'ignore=' we'll use"
            # no longer used; depend on h2o to get it right
            ntree = 25
            params = {
                'response': 0,
                'ntrees': ntree,
                # 'data_key='mnist_training.csv.hex'
                'mtries': 28,  # fixed because we ignore some cols, which will change the sqrt(cols) calc?
                'max_depth': 2147483647,
                'select_stat_type': 'ENTROPY',
                'sampling_strategy': 'RANDOM',
                'sample_rate': 0.67,
                'oobee': 1,
                # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77',
                'destination_key': 'RF_model',
                'nbins': 1024,
                # 'seed': 784834182943470027,
                # 'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0',
            }

            if rfSeed is None:
                params['seed'] = random.randint(0, sys.maxint)
            else:
                params['seed'] = rfSeed
            print "RF seed:", params['seed']

            kwargs = params.copy()
            print "Trying rf"
            timeoutSecs = 1800
            start = time.time()
            rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                                        timeoutSecs=timeoutSecs,
                                        pollTimeoutSecs=180,
                                        retryDelaySecs=2,
                                        **kwargs)
            elapsed = time.time() - start
            print "RF completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            # RFView (score on test)****************************************
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params)
            # was 2.84
            # sometimes get 2.87?
            self.assertAlmostEqual(
                classification_error,
                1.6,
                delta=1.6,
                msg="Classification error %s differs too much" %
                classification_error)

            treeStats = rfView['speedrf_model']['treeStats']
            leaves = {
                'min': treeStats['minLeaves'],
                'mean': treeStats['meanLeaves'],
                'max': treeStats['maxLeaves']
            }
            # Expected values are from this case:
            # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027),
            leavesExpected = {'min': 4996, 'mean': 5064.1, 'max': 5148}
            for l in leaves:
                # self.assertAlmostEqual(leaves[l], leavesExpected[l], delta=10, msg="leaves %s %s %s differs too much" % (l, leaves[l], leavesExpected[l]))
                delta = ((leaves[l] - leavesExpected[l]) / float(leaves[l])) * 100
                d = "seed: %s %s leaves: %s expected: %s pct. different %s" % (
                    params['seed'], l, leaves[l], leavesExpected[l], delta)
                print d
                allDelta.append(d)

            depth = {
                'min': treeStats['minDepth'],
                'mean': treeStats['meanDepth'],
                'max': treeStats['maxDepth']
            }
            depthExpected = {'min': 21, 'mean': 23.8, 'max': 25}
            for l in depth:
                # self.assertAlmostEqual(depth[l], depthExpected[l], delta=1, msg="depth %s %s %s differs too much" % (l, depth[l], depthExpected[l]))
                delta = ((depth[l] - depthExpected[l]) / float(depth[l])) * 100
                d = "seed: %s %s depth: %s expected: %s pct. different %s" % (
                    params['seed'], l, depth[l], depthExpected[l], delta)
                print d
                allDelta.append(d)

            # Predict (on test)****************************************
            start = time.time()
            modelKey = rfView['speedrf_model']['_key']
            predict = h2o.nodes[0].generate_predictions(
                model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "generate_predictions in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # Done *******************************************************
        print "\nShowing the results again from all the trials, to see variance"

        for d in allDelta:
            print d
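        # Illustrative addition (an assumption, not in the original test): parse the
        # pct. difference back out of each summary line in allDelta and track the
        # largest one, so a bound could be asserted instead of only eyeballing variance.
        maxPctDiff = 0.0
        for d in allDelta:
            maxPctDiff = max(maxPctDiff, abs(float(d.split('pct. different ')[-1])))
        print "largest pct. difference across trials:", maxPctDiff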
Example #50
0
def run_rf(files,configs):
    overallWallStart = time.time()
    output = None
    #if not os.path.exists('rfbench.csv'):
    #    output = open('rfbench.csv','w')
    #    output.write(','.join(csv_header)+'\n')
    #else:
    #    output = open('rfbench.csv','a')
    #csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None, 
    #            dialect='excel', extrasaction='ignore',delimiter=',')
    #csvWrt.writeheader()
    try:
        java_heap_GB = h2o.nodes[0].java_heap_GB
        #Train File Parsing#
        trainParseWallStart = time.time()
        print "Training file is: ", files['train']
        importFolderPath = "mnist/mnist8m"
        csvPathname = importFolderPath + "/" + files['train']
        hex_key = files['train'] + '.hex'
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key,
                        timeoutSecs=3600, retryDelaySecs=5, pollTimeoutSecs=120)
        trainParseWallTime = time.time() - trainParseWallStart
        #End Train File Parse#

        inspect = h2o.nodes[0].inspect(parseResult['destination_key'])
        row = {'java_heap_GB':java_heap_GB,'dataset':'mnist8m',
                'nTrainRows': inspect['numRows'],'nCols':inspect['numCols'],
                #'nIgnoredCols':nIgnoredCols,'ignoredCols':ignoredCols,
                'trainParseWallTime':trainParseWallTime}

        #RF+RFView (train)#
        kwargs = configs.copy()
        trainRFStart = time.time()
        rfView = h2o_cmd.runRF(parseResult=parseResult,rfView=True,
             timeoutSecs= 3600,pollTimeoutSecs= 60,retryDelaySecs = 2, **kwargs)
        trainViewTime = time.time() - trainRFStart
        #End RF+RFView (train)#
        row.update({'trainViewTime':trainViewTime})
        
        h2o_rf.simpleCheckRFView(None, rfView, **kwargs)
        modelKey = rfView['model_key']
        
        #Test File Parsing#
        testParseWallStart = time.time()
        print "Testing file is: ", files['test']
        importFolderPath = "mnist/mnist8m"
        csvPathname = importFolderPath + "/" + files['test']
        hex_key = files['test'] + '.hex'
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key,
                        timeoutSecs=3600, retryDelaySecs=5, pollTimeoutSecs=120)
        testParseWallTime = time.time() - testParseWallStart
        #End Test File Parse#
        inspect = h2o.nodes[0].inspect(parseResult['destination_key'])
        row.update({'nTestRows':inspect['numRows']})
        row.update({'testParseWallTime':testParseWallTime})
        modelKey = rfView['model_key']
        
        #RFView (score on test)#
        kwargs = configs.copy()
        testRFStart = time.time()
        kwargs.update({'model_key':modelKey,'ntree':10})
        rfView = h2o_cmd.runRFView(data_key=hex_key,timeoutSecs=180,
                                       doSimpleCheck=False,**kwargs)
        testViewTime = time.time() - testRFStart
        #End RFView (score on test)#
        pprint(rfView)
        errRate = rfView['confusion_matrix']['classification_error']
        row.update({'testViewTime':testViewTime})
        overallWallTime = time.time() - overallWallStart 
        row.update({'overallWallTime':overallWallTime})
        row.update({'errRate':errRate})
        print row
        #csvWrt.writerow(row)
        #h2o.nodes[0].remove_key(k)
    finally:
        # the csv output file is only opened when the commented-out code above is enabled
        if output is not None:
            output.close()
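A hypothetical invocation of run_rf, sketched only to show the shape of the files and configs arguments it expects; the file names and RF parameters below are placeholders, not values from the original benchmark.

if __name__ == '__main__':
    # 'train'/'test' are the keys run_rf reads above; configs is passed straight
    # through to runRF as **kwargs (placeholder values, illustration only)
    files = {'train': 'mnist8m_train.csv.gz', 'test': 'mnist8m_test.csv.gz'}
    configs = {'ntree': 10, 'response_variable': 0}
    run_rf(files, configs)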
Example #51
0
    def test_rf_covtype20x_fvec(self):
        h2o.beta_features = True
        importFolderPath = 'standard'

        if DO_SMALL:
            csvFilenameTrain = 'covtype.data'
            hex_key = 'covtype1x.data.A.hex'
        else:
            csvFilenameTrain = 'covtype20x.data'
            hex_key = 'covtype20x.data.A.hex'

        csvPathname = importFolderPath + "/" + csvFilenameTrain
        parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500)
        inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
        dataKeyTrain = parseResultTrain['destination_key']
        print "Parse end", dataKeyTrain

        # have to re import since source key is gone
        # we could just copy the key, but sometimes we change the test/train data  to covtype.data
        if DO_SMALL:
            csvFilenameTest = 'covtype.data'
            hex_key = 'covtype1x.data.B.hex'
            dataKeyTest2 = 'covtype1x.data.C.hex'
        else:
            csvFilenameTest = 'covtype20x.data'
            hex_key = 'covtype20x.data.B.hex'
            dataKeyTest2 = 'covtype20x.data.C.hex'

        csvPathname = importFolderPath + "/" + csvFilenameTest
        parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500)
        print "Parse result['destination_key']:", parseResultTest['destination_key']
        inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
        dataKeyTest = parseResultTest['destination_key']
        print "Parse end", dataKeyTest

        # make a 3rd key so the predict is uncached too!
        execExpr = dataKeyTest2 + "=" + dataKeyTest
        if h2o.beta_features:
            kwargs = {'str': execExpr, 'timeoutSecs': 15}
        else:
            kwargs = {'expression': execExpr, 'timeoutSecs': 15}

        resultExec = h2o_cmd.runExec(**kwargs)

        # train
        # this does RFView to understand when RF completes, so the time reported for RFView here should be
        # considered the "first RFView" time; subsequent ones may benefit from some caching,
        # unless no_confusion_matrix is in effect

        # params is mutable. This is default.
        if h2o.beta_features:
            paramDict = drf2ParamDict
            params = {
                'ntrees': 20, 
                'destination_key': 'RF_model'
            }
        else:
            paramDict = drf1ParamDict
            params = {
                'ntree': 20, 
                'out_of_bag_error_estimate': 1, 
                'model_key': 'RF_model'
            }

        colX = h2o_rf.pickRandRfParams(paramDict, params)

        kwargs = params.copy()
        if h2o.beta_features:
            timeoutSecs = 30 + kwargs['ntrees'] * 60
        else:
            timeoutSecs = 30 + kwargs['ntree'] * 60 

        start = time.time()
        rf = h2o_cmd.runRF(parseResult=parseResultTrain,
            timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
        print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

        print "\nRFView start after job completion"
        if h2o.beta_features:
            model_key = kwargs['destination_key']
            ntree = kwargs['ntrees']
        else:
            model_key = kwargs['model_key']
            ntree = kwargs['ntree']

        start = time.time()
        # this does the RFModel view for v2. but only model_key is used. Data doesn't matter? (nor ntree)
        h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntree=ntree, timeoutSecs=timeoutSecs)
        print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

        for trial in range(1):
            # scoring
            start = time.time()
            rfView = h2o_cmd.runRFView(None, dataKeyTest, 
                model_key, ntree=ntree, timeoutSecs=timeoutSecs, out_of_bag_error_estimate=0, retryDelaySecs=1)
            print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
            self.assertAlmostEqual(classification_error, 50, delta=50, 
                msg="Classification error %s differs too much" % classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest2)
            print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

            parseKey = parseResultTrain['destination_key']
            rfModelKey  = rfView['drf_model']['_key']
            predictKey = 'Predict.hex'
            start = time.time()

            predictResult = h2o_cmd.runPredict(
                data_key=parseKey,
                model_key=rfModelKey,
                destination_key=predictKey,
                timeoutSecs=timeoutSecs)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=parseKey,
                vactual='C54',
                predict=predictKey,
                vpredict='predict',
                )

            cm = predictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm);
            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)

            print "Trial #", trial, "completed"
Example #52
0
    def test_rf_predict3_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        timeoutSecs = 600
        predictHexKey = 'predict_0.hex'
        predictCsv = 'predict_0.csv'
        actualCsv = 'actual_0.csv'

        if 1==1:
            y = 4 # last col
            response = 'response'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 40
            bucket = 'smalldata'
            csvPathname = 'iris/iris2.csv'
            hexKey = 'iris2.csv.hex'
            # translate = {'setosa': 0.0, 'versicolor': 1.0, 'virginica': 2.0}
            # No translate because we're using an Exec to get the data out, and that loses the encoding?
            translate = None
            # one wrong will be 0.66667. I guess with random, that can happen?
            expectedPctWrong = 0.7

        elif 1==0:
            y = 54 # last col
            response = 'C55'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 6
            # try smaller data set compared to covtype
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.shuffled.10pct.data'
            hexKey = 'covtype.shuffled.10pct.data.hex'
            translate = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7}
            expectedPctWrong = 0.7
        elif 1==0:
            y = 54 # last col
            response = 'C55'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 40
            # try smaller data set compared to covtype
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.shuffled.10pct.data'
            hexKey = 'covtype.shuffled.10pct.data.hex'
            # translate = {1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0}
            translate = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7}
            expectedPctWrong = 0.7
        elif 1==0:
            y = 54 # last col
            response = 'C55'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 6
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.data'
            hexKey = 'covtype.data.hex'
            translate = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7}
            expectedPctWrong = 0.7
        else:
            y = 0 # first col
            response = 'C1'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 6
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'mnist/mnist_training.csv.gz'
            hexKey = 'mnist_training.hex'
            translate = { \
                '0': 0, '1': 1, '2': 2, '3': 3, '4': 4, \
                '5': 5, '6': 6, '7': 7, '8': 8, '9': 9 }
            expectedPctWrong = 0.7

        csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv
        csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv
        # for using below in csv reader
        csvFullname = h2i.find_folder_and_filename(bucket, csvPathname, schema='put', returnFullPath=True)

        def predict_and_compare_csvs(model_key, hex_key, translate=None, y=0):
            # have to slice out col 0 (the output) and feed result to predict
            # cols are 0:784 (1 output plus 784 input features)
            # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30)
            dataKey = "P.hex"
            h2e.exec_expr(execExpr=dataKey+"="+hex_key, timeoutSecs=30) # unneeded but interesting
            if skipSrcOutputHeader:
                print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer"
                print "hack for now, can't chop out col 0 in Exec currently"
                dataKey = hex_key
            else:
                print "No header in dataset, can't chop out cols, since col numbers are used for names"
                dataKey = hex_key

            # +1 col index because R-like
            h2e.exec_expr(execExpr="Z.hex="+hex_key+"[,"+str(y+1)+"]", timeoutSecs=30)

            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=hexKey, destination_key=predictHexKey)
            print "generate_predictions end on ", hexKey, " took", time.time() - start, 'seconds'
            h2o.check_sandbox_for_errors()
            inspect = h2o_cmd.runInspect(key=predictHexKey)
            h2o_cmd.infoFromInspect(inspect, 'predict.hex')

            h2o.nodes[0].csv_download(src_key="Z.hex", csvPathname=csvSrcOutputPathname)
            h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname)
            h2o.check_sandbox_for_errors()

            print "Do a check of the original output col against predicted output"
            (rowNum1, originalOutput) = compare_csv_at_one_col(csvSrcOutputPathname,
                                                               msg="Original", colIndex=0, translate=translate, skipHeader=skipSrcOutputHeader)
            (rowNum2, predictOutput)  = compare_csv_at_one_col(csvPredictPathname,
                                                               msg="Predicted", colIndex=0, skipHeader=skipPredictHeader)

            # no header on source
            if ((rowNum1-skipSrcOutputHeader) != (rowNum2-skipPredictHeader)):
                raise Exception("original rowNum1: %s - %d not same as downloaded predict: rowNum2: %s - %d \
                    %s" % (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader))

            wrong = 0
            for rowNum,(o,p) in enumerate(zip(originalOutput, predictOutput)):
                # if float(o)!=float(p):
                if str(o)!=str(p):
                    if wrong==10:
                        print "Not printing any more mismatches\n"
                    elif wrong<10:
                        msg = "Comparing original output col vs predicted. row %s differs. \
                            original: %s predicted: %s"  % (rowNum, o, p)
                        print msg
                    wrong += 1

            print "\nTotal wrong:", wrong
            print "Total:", len(originalOutput)
            pctWrong = (100.0 * wrong)/len(originalOutput)
            print "wrong/Total * 100 ", pctWrong
            # I looked at what h2o can do for modelling with binomial and it should get better than 25% error?
            if pctWrong > 2.0:
                raise Exception("pctWrong too high. Expect < 2% error because it's reusing training data")
            return pctWrong

        #*****************************************************************************

        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
        kwargs = {
            'destination_key': 'rf_model',
            'response': response,
            'ntrees': trees,
            'classification': 1,
            }


        rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        rfResult["drf_model"] = rfResult.pop("speedrf_model")
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)

        print "Use H2O GeneratePredictionsPage with a H2O generated model and the same data key."
        print "Does this work? (feeding in same data key)if you're predicting, "
        print "don't you need one less column (the last is output?)"
        print "WARNING: max_iter set to 8 for benchmark comparisons"
        print "y=", y
        pctWrong = predict_and_compare_csvs(model_key='rf_model', hex_key=hexKey, translate=translate, y=y)

        # we are predicting using training data...so error is really low
        # self.assertAlmostEqual(pctWrong, classification_error, delta = 0.2, 
        #     msg="predicted pctWrong: %s should be close to training classification error %s" % (pctWrong, classification_error))
        # can be zero if memorized (iris is either 0 or 0.667?)
        # just make delta 0.7 for now
        self.assertAlmostEqual(pctWrong, expectedPctWrong, delta = 0.7,
                               msg="predicted pctWrong: %s should be small because we're predicting with training data" % pctWrong)
Example #53
0
    def test_rfview_score(self):
        csvPathnameTrain = 'standard/covtype.data'
        print "Train with:", csvPathnameTrain
        parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets',
                                            path=csvPathnameTrain,
                                            schema='put',
                                            hex_key="covtype.hex",
                                            timeoutSecs=15)
        dataKeyTrain = parseResultTrain['destination_key']

        csvPathnameTest = 'standard/covtype.data'
        print "Test with:", csvPathnameTest
        parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathnameTest,
                                           schema='put',
                                           hex_key="covtype.hex",
                                           timeoutSecs=15)
        dataKeyTest = parseResultTest['destination_key']

        for trial in range(5):
            # params is mutable. This is default.
            params = {'ntree': 13, 'out_of_bag_error_estimate': 0}
            colX = h2o_rf.pickRandRfParams(paramDict, params)
            kwargs = params.copy()
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            timeoutSecs = 30 + kwargs['ntree'] * 10
            rfv = h2o_cmd.runRF(parseResult=parseResultTrain,
                                timeoutSecs=timeoutSecs,
                                retryDelaySecs=1,
                                **kwargs)
            ### print "rf response:", h2o.dump_json(rfv)

            model_key = rfv['model_key']
            # pop the stuff from kwargs that were passing as params
            kwargs.pop('model_key', None)

            data_key = rfv['data_key']
            kwargs.pop('data_key', None)

            ntree = rfv['ntree']
            kwargs.pop('ntree', None)
            # scoring
            # RFView.html?
            # dataKeyTest=a5m.hex&
            # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628&
            # response_variable=1&
            # ntree=50&
            # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0&
            # out_of_bag_error_estimate=1&
            rfView = h2o_cmd.runRFView(None,
                                       dataKeyTest,
                                       model_key,
                                       ntree,
                                       timeoutSecs,
                                       retryDelaySecs=1,
                                       **kwargs)
            # new web page for predict? throw it in here for now

            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
            # don't check error if stratified
            if 'sampling_strategy' in kwargs and kwargs[
                    'sampling_strategy'] != 'STRATIFIED_LOCAL':
                check_err = True
            else:
                check_err = False

            if check_err:
                self.assertAlmostEqual(
                    classification_error,
                    0.03,
                    delta=0.5,
                    msg="Classification error %s differs too much" %
                    classification_error)

            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=dataKeyTest)
            elapsed = time.time() - start
            print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

            kwargs['iterative_cm'] = 0
            rfView = h2o_cmd.runRFView(None,
                                       dataKeyTest,
                                       model_key,
                                       ntree,
                                       timeoutSecs,
                                       retryDelaySecs=1,
                                       print_params=True,
                                       **kwargs)
            # FIX! should update this expected classification error
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
            # don't check error if stratified
            if check_err:
                self.assertAlmostEqual(
                    classification_error,
                    0.03,
                    delta=0.5,
                    msg="Classification error %s differs too much" %
                    classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=dataKeyTest)
            elapsed = time.time() - start
            print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

            kwargs['iterative_cm'] = 1
            rfView = h2o_cmd.runRFView(None,
                                       dataKeyTest,
                                       model_key,
                                       ntree,
                                       timeoutSecs,
                                       retryDelaySecs=1,
                                       print_params=True,
                                       **kwargs)
            # FIX! should update this expected classification error
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
            # don't check error if stratified
            if check_err:
                self.assertAlmostEqual(
                    classification_error,
                    0.03,
                    delta=0.5,
                    msg="Classification error %s differs too much" %
                    classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=dataKeyTest)
            elapsed = time.time() - start
            print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

            kwargs['iterative_cm'] = 1
            kwargs['class_weights'] = '1=1,2=2,3=3,4=4,5=5,6=6,7=7'
            rfView = h2o_cmd.runRFView(None,
                                       dataKeyTest,
                                       model_key,
                                       ntree,
                                       timeoutSecs,
                                       retryDelaySecs=1,
                                       print_params=True,
                                       **kwargs)
            # FIX! should update this expected classification error
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
            # don't check error if stratified
            if check_err:
                self.assertAlmostEqual(
                    classification_error,
                    0.03,
                    delta=0.5,
                    msg="Classification error %s differs too much" %
                    classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=dataKeyTest)
            elapsed = time.time() - start
            print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

            print "Trial #", trial, "completed"
Example #54
0
    def test_rf_covtype_fvec(self):
        h2o.beta_features = True  # fvec
        importFolderPath = "standard"

        # Parse Train ******************************************************
        csvTrainFilename = 'covtype.shuffled.90pct.data'
        csvTrainPathname = importFolderPath + "/" + csvTrainFilename
        hex_key = csvTrainFilename + ".hex"
        parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                            path=csvTrainPathname,
                                            hex_key=hex_key,
                                            timeoutSecs=180,
                                            doSummary=False)
        inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'])

        # Parse Test ******************************************************
        csvTestFilename = 'covtype.shuffled.10pct.data'
        csvTestPathname = importFolderPath + "/" + csvTestFilename
        hex_key = csvTestFilename + ".hex"
        parseTestResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvTestPathname,
                                           hex_key=hex_key,
                                           timeoutSecs=180)
        inspect = h2o_cmd.runInspect(None, parseTestResult['destination_key'])

        rfViewInitial = []
        xList = []
        eList = []
        fList = []
        trial = 0

        depthList = [10, 20, 30, 40]
        ntreesList = [5, 10, 20, 30]
        # ntreesList = [2]
        nbinsList = [10, 100, 1000]

        if TRY == 'max_depth':
            tryList = depthList
        elif TRY == 'ntrees':
            tryList = ntreesList
        elif TRY == 'nbins':
            tryList = nbinsList
        else:
            raise Exception("huh? %s" % TRY)

        for d in tryList:
            if TRY == 'max_depth':
                paramDict['max_depth'] = d
            elif TRY == 'ntrees':
                paramDict['ntrees'] = d
            elif TRY == 'nbins':
                paramDict['nbins'] = d
            else:
                raise Exception("huh? %s" % TRY)

            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            if DO_OOBE:
                paramDict['validation'] = None
            else:
                paramDict['validation'] = parseTestResult['destination_key']

            timeoutSecs = 30 + paramDict['ntrees'] * 200

            # do several quick starts, to see the bad id problem?
            TRIES = 5
            for i in range(TRIES):
                lastOne = i == (TRIES - 1)

                # have unique model names
                trial += 1
                kwargs = paramDict.copy()
                model_key = 'RFModel_' + str(trial)
                kwargs['destination_key'] = model_key
                data_key = parseTrainResult['destination_key']

                start = time.time()
                rfResult = h2o_cmd.runRF(parseResult=parseTrainResult,
                                         timeoutSecs=timeoutSecs,
                                         noPoll=True,
                                         rfView=False,
                                         **kwargs)
                trainElapsed = time.time() - start
                print 'rf train end', i, 'on', csvTrainPathname, 'took', trainElapsed, 'seconds'

                # don't cancel the last one
                if not lastOne:
                    time.sleep(1)
                    h2o_jobs.cancelAllJobs(timeoutSecs=2)

            ### print "rfView", h2o.dump_json(rfView)
            print "We have a result from the RF above, completed but didn't do RFView yet"
            # could the RF indicate 'done' too soon?
            # if rfResult['state']=='RUNNING':
            #    raise Exception("Why is this RF still in RUNNING state? %s" % h2o.dump_json(rfResult))

            # if 'drf_model' not in rfResult:
            #    raise Exception("How come there's no drf_model in this RF result? %s" % h2o.dump_json(rfResult))
            h2o_jobs.pollWaitJobs(timeoutSecs=300)
            rfView = h2o_cmd.runRFView(None,
                                       model_key=model_key,
                                       timeoutSecs=60,
                                       retryDelaySecs=5,
                                       doSimpleCheck=False)
            print "rfView:", h2o.dump_json(rfView)

            rf_model = rfView['drf_model']
            cms = rf_model['cms']
            ### print "cm:", h2o.dump_json(cm)
            ntrees = rf_model['N']
            errs = rf_model['errs']
            N = rf_model['N']
            varimp = rf_model['varimp']
            treeStats = rf_model['treeStats']

            print "maxDepth:", treeStats['maxDepth']
            print "maxLeaves:", treeStats['maxLeaves']
            print "minDepth:", treeStats['minDepth']
            print "minLeaves:", treeStats['minLeaves']
            print "meanLeaves:", treeStats['meanLeaves']
            print "meanDepth:", treeStats['meanDepth']
            print "errs[0]:", errs[0]
            print "errs[-1]:", errs[-1]
            print "errs:", errs

            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)
            # we iterate over params, so can't really do this check
            # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)

            print "classErrorPctList:", classErrorPctList
            self.assertEqual(
                len(classErrorPctList), 7,
                "Should be 7 output classes, so should have 7 class error percentages from a reasonable predict"
            )
            # FIX! should update this expected classification error
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=data_key)

            eList.append(classErrorPctList[4])
            fList.append(trainElapsed)
            if DO_PLOT:
                if TRY == 'max_depth':
                    xLabel = 'max_depth'
                elif TRY == 'ntrees':
                    xLabel = 'ntrees'
                elif TRY == 'nbins':
                    xLabel = 'nbins'
                else:
                    raise Exception("huh? %s" % TRY)
                xList.append(paramDict[xLabel])

        if DO_PLOT:
            eLabel = 'class 4 pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)
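paramDict is defined outside this fragment. Based on the keys the sweep reads and overrides, a plausible baseline (values are assumptions, shown only for illustration) would be:

# Hypothetical baseline paramDict for the sweep above; the loop overwrites
# max_depth / ntrees / nbins depending on TRY, and sets 'validation' and
# 'destination_key' per trial.
paramDict = {
    'response': 'C55',   # assumed covtype class column
    'ntrees': 5,
    'max_depth': 20,
    'nbins': 100,
}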
Example #55
0
    def test_rf_log_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (10000, 100, 'cA', 300),
            ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            # CREATE test dataset******************************************************
            csvFilename = 'syn_test_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            testParseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, schema='put', timeoutSecs=10)
            print "Test Parse result['destination_key']:", testParseResult['destination_key']
            dataKeyTest = testParseResult['destination_key']

            # CREATE train dataset******************************************************
            csvFilename = 'syn_train_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            trainParseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, schema='put', timeoutSecs=10)
            print "Train Parse result['destination_key']:", trainParseResult['destination_key']
            dataKeyTrain = trainParseResult['destination_key']


            # RF train******************************************************
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            kwargs = paramDict.copy()
            timeoutSecs = 30 + kwargs['ntrees'] * 20
            start = time.time()
            # do oobe
            kwargs['response'] = "C" + str(colCount+1)
            
            rfv = h2o_cmd.runRF(parseResult=trainParseResult, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            rf_model = rfv['drf_model']
            used_trees = rf_model['N']
            data_key = rf_model['_dataKey']
            model_key = rf_model['_key']

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, ntree=used_trees)
            oobeTrainPctRight = 100.0 - classification_error
            expectTrainPctRight = 94
            self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRight,\
                msg="OOBE: pct. right for training not close enough %6.2f %6.2f"% (oobeTrainPctRight, expectTrainPctRight), delta=5)

            # RF score******************************************************
            print "Now score with the 2nd random dataset"
            rfv = h2o_cmd.runRFView(data_key=dataKeyTest, model_key=model_key, 
                timeoutSecs=timeoutSecs, retryDelaySecs=1)

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, ntree=used_trees)
            self.assertTrue(classification_error<=5.0, msg="Classification error %s too big" % classification_error)
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)

            fullScorePctRight = 100.0 - classification_error
            expectScorePctRight = 94
            self.assertTrue(fullScorePctRight >= expectScorePctRight,
                msg="Full: pct. right for scoring not close enough %6.2f %6.2f" % (fullScorePctRight, expectScorePctRight))