예제 #1
0
    def test_rf_covtype_fvec(self):
        h2o.beta_features = True  # fvec
        importFolderPath = "standard"

        # Parse Train ******************************************************
        csvTrainFilename = 'covtype.shuffled.90pct.data'
        csvTrainPathname = importFolderPath + "/" + csvTrainFilename
        hex_key = csvTrainFilename + ".hex"
        parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                            path=csvTrainPathname,
                                            hex_key=hex_key,
                                            timeoutSecs=180,
                                            doSummary=False)
        inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'])

        # Parse Test ******************************************************
        csvTestFilename = 'covtype.shuffled.10pct.data'
        csvTestPathname = importFolderPath + "/" + csvTestFilename
        hex_key = csvTestFilename + ".hex"
        parseTestResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvTestPathname,
                                           hex_key=hex_key,
                                           timeoutSecs=180)
        inspect = h2o_cmd.runInspect(None, parseTestResult['destination_key'])

        rfViewInitial = []
        xList = []
        eList = []
        fList = []
        trial = 0

        depthList = [10, 20, 30, 40]
        ntreesList = [5, 10, 20, 30]
        # ntreesList = [2]
        nbinsList = [10, 100, 1000]

        if TRY == 'max_depth':
            tryList = depthList
        elif TRY == 'ntrees':
            tryList = ntreesList
        elif TRY == 'nbins':
            tryList = nbinsList
        else:
            raise Exception("huh? %s" % TRY)

        for d in tryList:
            if TRY == 'max_depth':
                paramDict['max_depth'] = d
            elif TRY == 'ntrees':
                paramDict['ntrees'] = d
            elif TRY == 'nbins':
                paramDict['nbins'] = d
            else:
                raise Exception("huh? %s" % TRY)

            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            if DO_OOBE:
                paramDict['validation'] = None
            else:
                paramDict['validation'] = parseTestResult['destination_key']

            timeoutSecs = 30 + paramDict['ntrees'] * 200

            # do ten starts, to see the bad id problem?
            TRIES = 5
            for i in range(TRIES):
                lastOne = i == (TRIES - 1)

                # have unique model names
                trial += 1
                kwargs = paramDict.copy()
                model_key = 'RFModel_' + str(trial)
                kwargs['destination_key'] = model_key
                data_key = parseTrainResult['destination_key']

                start = time.time()
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseTrainResult,
                                              timeoutSecs=timeoutSecs,
                                              noPoll=True,
                                              **kwargs)
                trainElapsed = time.time() - start
                print 'rf train end', i, 'on', csvTrainPathname, 'took', trainElapsed, 'seconds'

                # don't cancel the last one
                if not lastOne:
                    time.sleep(1)
                    h2o_jobs.cancelAllJobs(timeoutSecs=2)

            ### print "rfView", h2o.dump_json(rfView)
            print "We have a result from the RF above, completed but didn't do RFView yet"
            # could the RF indicate 'done' too soon?
            # if rfResult['state']=='RUNNING':
            #    raise Exception("Why is this RF still in RUNNING state? %s" % h2o.dump_json(rfResult))

            # if 'drf_model' not in rfResult:
            #    raise Exception("How come there's no drf_model in this RF result? %s" % h2o.dump_json(rfResult))
            h2o_jobs.pollWaitJobs(timeoutSecs=300)
            rfView = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60)
            print "rfView:", h2o.dump_json(rfView)

            rfView["drf_model"] = rfView.pop("speedrf_model")
            rf_model = rfView['drf_model']
            cms = rf_model['cms']
            ### print "cm:", h2o.dump_json(cm)
            ntrees = rf_model['N']
            errs = rf_model['errs']
            N = rf_model['N']
            varimp = rf_model['varimp']
            treeStats = rf_model['treeStats']

            print "maxDepth:", treeStats['maxDepth']
            print "maxLeaves:", treeStats['maxLeaves']
            print "minDepth:", treeStats['minDepth']
            print "minLeaves:", treeStats['minLeaves']
            print "meanLeaves:", treeStats['meanLeaves']
            print "meanDepth:", treeStats['meanDepth']
            print "errs[0]:", errs[0]
            print "errs[-1]:", errs[-1]
            print "errs:", errs

            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)
            # we iterate over params, so can't really do this check
            # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)

            print "classErrorPctList:", classErrorPctList
            self.assertEqual(
                len(classErrorPctList), 7,
                "Should be 7 output classes, so should have 7 class error percentages from a reasonable predict"
            )
            # FIX! should update this expected classification error
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=data_key)

            eList.append(classErrorPctList[4])
            fList.append(trainElapsed)
            if DO_PLOT:
                if TRY == 'max_depth':
                    xLabel = 'max_depth'
                elif TRY == 'ntrees':
                    xLabel = 'ntrees'
                elif TRY == 'nbins':
                    xLabel = 'nbins'
                else:
                    raise Exception("huh? %s" % TRY)
                xList.append(paramDict[xLabel])

        if DO_PLOT:
            eLabel = 'class 4 pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)
    def test_rf_change_data_key_fvec(self):
        importFolderPath = 'standard'

        csvFilenameTrain = 'covtype.data'
        csvPathname = importFolderPath + "/" + csvFilenameTrain
        parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets',
                                            path=csvPathname,
                                            timeoutSecs=500)
        h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
        dataKeyTrain = parseResultTrain['destination_key']
        print "Parse end", dataKeyTrain

        # we could train on covtype, and then use covtype20x for test? or vice versa
        # parseResult = parseResult
        # dataKeyTest = dataKeyTrain
        csvFilenameTest = 'covtype20x.data'
        csvPathname = importFolderPath + "/" + csvFilenameTest
        parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           timeoutSecs=500)
        print "Parse result['destination_key']:", parseResultTest[
            'destination_key']
        inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
        dataKeyTest = parseResultTest['destination_key']

        print "Parse end", dataKeyTest

        # train
        # this does RFView to understand when RF completes, so the time reported for RFView here, should be
        # considered the "first RFView" times..subsequent have some caching?.
        # unless the no_confusion_matrix works

        # params is mutable. This is default.
        params = {'ntrees': 2, 'destination_key': 'RF_model'}

        # colX = h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = params.copy()
        kwargs["response"] = "C55"
        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow

        timeoutSecs = 100
        start = time.time()
        h2o_cmd.runSpeeDRF(parseResult=parseResultTrain,
                           timeoutSecs=timeoutSecs,
                           retryDelaySecs=1,
                           noPoll=True,
                           **kwargs)
        print "rf job dispatch end on ", dataKeyTrain, 'took', time.time(
        ) - start, 'seconds'
        ### print "rf response:", h2o.dump_json(rfv)

        start = time.time()
        h2o_jobs.pollWaitJobs(pattern='RF_model',
                              timeoutSecs=360,
                              pollTimeoutSecs=120,
                              retryDelaySecs=5)
        print "rf job end on ", dataKeyTrain, 'took', time.time(
        ) - start, 'seconds'

        print "\nRFView start after job completion"
        model_key = kwargs['destination_key']
        ntrees = kwargs['ntrees']
        start = time.time()
        h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs)
        print "First rfview end on ", dataKeyTrain, 'took', time.time(
        ) - start, 'seconds'

        for trial in range(3):
            # scoring
            start = time.time()
            rfView = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs)
            print "rfview", trial, "end on ", dataKeyTest, 'took', time.time(
            ) - start, 'seconds.'
            rfView["drf_model"] = rfView.pop("speedrf_model")
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntrees)
            # FIX! should update this expected classification error
            # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=dataKeyTest)
            print "predict", trial, "end on ", dataKeyTest, 'took', time.time(
            ) - start, 'seconds.'

            print "Trial #", trial, "completed"
예제 #3
0
    def test_rf_change_data_key_fvec(self):
        importFolderPath = 'standard'

        csvFilenameTrain = 'covtype.data'
        csvPathname = importFolderPath + "/" + csvFilenameTrain
        parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500)
        h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
        dataKeyTrain = parseResultTrain['destination_key']
        print "Parse end", dataKeyTrain

        # we could train on covtype, and then use covtype20x for test? or vice versa
        # parseResult = parseResult
        # dataKeyTest = dataKeyTrain
        csvFilenameTest = 'covtype20x.data'
        csvPathname = importFolderPath + "/" + csvFilenameTest
        parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500)
        print "Parse result['destination_key']:", parseResultTest['destination_key']
        inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
        dataKeyTest = parseResultTest['destination_key']

        print "Parse end", dataKeyTest

        # train
        # this does RFView to understand when RF completes, so the time reported for RFView here, should be 
        # considered the "first RFView" times..subsequent have some caching?. 
        # unless the no_confusion_matrix works

        # params is mutable. This is default.
        params = {
            'ntrees': 2,
            'destination_key': 'RF_model'
        }

        # colX = h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = params.copy()
        kwargs["response"] = "C55"
        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow

        timeoutSecs = 100
        start = time.time()
        h2o_cmd.runSpeeDRF(parseResult=parseResultTrain,
                            timeoutSecs=timeoutSecs, retryDelaySecs=1, noPoll=True, **kwargs)
        print "rf job dispatch end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'
        ### print "rf response:", h2o.dump_json(rfv)


        start = time.time()
        h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=360, pollTimeoutSecs=120, retryDelaySecs=5)
        print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

        print "\nRFView start after job completion"
        model_key = kwargs['destination_key']
        ntrees = kwargs['ntrees']
        start = time.time()
        h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs)
        print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

        for trial in range(3):
            # scoring
            start = time.time()
            rfView = h2o_cmd.runSpeeDRFView(None,  model_key,timeoutSecs)
            print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'
            rfView["drf_model"] = rfView.pop("speedrf_model")
            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntrees)
            # FIX! should update this expected classification error
            # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
            print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

            print "Trial #", trial, "completed"
예제 #4
0
    def test_rf_big1_nopoll_fvec(self):
        h2o.beta_features = True
        csvFilename = 'hhp_107_01.data.gz'
        hex_key = csvFilename + ".hex"

        print "\n" + csvFilename

        parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename,
                                       hex_key=hex_key, timeoutSecs=15, schema='put')
        rfViewInitial = []
        # dispatch multiple jobs back to back
        for jobDispatch in range(3):
            start = time.time()
            kwargs = {}
            model_key = ""
            if OVERWRITE_RF_MODEL:
                print "Since we're overwriting here, we have to wait for each to complete noPoll=False"
                model_key = 'SRF_model'
            else:
                model_key = 'SRF_model' + str(jobDispatch)
            kwargs['ntrees'] = 1

            if OVERWRITE_RF_MODEL:
                print "Change the number of trees, while keeping the rf model key name the same"
                print "Checks that we correctly overwrite previous rf model"
                kwargs['ntrees'] += 1

            kwargs['seed'] = random.randint(0, sys.maxint)
            kwargs['response'] = "C107"

            # FIX! what model keys do these get?
            randomNode = h2o.nodes[random.randint(0,len(h2o.nodes)-1)]
            h2o_cmd.runSpeeDRF(node=randomNode, parseResult=parseResult, destination_key=model_key,
                          timeoutSecs=300, noPoll=False, **kwargs)
            print "rf job dispatch end on ", csvFilename, 'took', time.time() - start, 'seconds'
            print "\njobDispatch #", jobDispatch
            print "\n MODEL KEY: ", model_key
            rfViewInitial.append(h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60))

        # h2o_jobs.pollWaitJobs(pattern='SRF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

        # we saved the initial response?
        # if we do another poll they should be done now, and better to get it that 
        # way rather than the inspect (to match what simpleCheckGLM is expected
        first = None
        print "rfViewInitial", rfViewInitial
        for rfView in rfViewInitial:
            print "Checking completed job:", rfView
            print "rfView", h2o.dump_json(rfView)
            model_key = rfView["speedrf_model"]['_key']
            ntree = rfView["speedrf_model"]["parameters"]['ntrees']
            print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)"
            # allow it to poll to complete
            rfViewResult = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60)
            if first is None: # we'll use this to compare the others
                first = rfViewResult.copy()
                firstModelKey = model_key
                print "first", h2o.dump_json(first)
            else:
                print "Comparing", model_key, "to", firstModelKey
                df = h2o_util.JsonDiff(rfViewResult, first, vice_versa=True, with_values=True)

                print "df.difference:", h2o.dump_json(df.difference)
예제 #5
0
    def test_rf_covtype_fvec(self):
        h2o.beta_features = True  # fvec
        importFolderPath = "standard"

        # Parse Train ******************************************************
        csvTrainFilename = 'covtype.shuffled.90pct.data'
        csvTrainPathname = importFolderPath + "/" + csvTrainFilename
        hex_key = csvTrainFilename + ".hex"
        parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTrainPathname, hex_key=hex_key,
                                            timeoutSecs=180, doSummary=False)
        inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'])

        # Parse Test ******************************************************
        csvTestFilename = 'covtype.shuffled.10pct.data'
        csvTestPathname = importFolderPath + "/" + csvTestFilename
        hex_key = csvTestFilename + ".hex"
        parseTestResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTestPathname, hex_key=hex_key,
                                           timeoutSecs=180)
        inspect = h2o_cmd.runInspect(None, parseTestResult['destination_key'])

        rfViewInitial = []
        xList = []
        eList = []
        fList = []
        trial = 0

        depthList  = [10, 20, 30, 40]
        ntreesList = [5, 10, 20, 30]
        # ntreesList = [2]
        nbinsList  = [10, 100, 1000]

        if TRY == 'max_depth':
            tryList = depthList
        elif TRY == 'ntrees':
            tryList = ntreesList
        elif TRY == 'nbins':
            tryList = nbinsList
        else:
            raise Exception("huh? %s" % TRY)

        for d in tryList:
            if TRY == 'max_depth':
                paramDict['max_depth'] = d
            elif TRY == 'ntrees':
                paramDict['ntrees'] = d
            elif TRY == 'nbins':
                paramDict['nbins'] = d
            else:
                raise Exception("huh? %s" % TRY)

            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            if DO_OOBE:
                paramDict['validation'] = None
            else:
                paramDict['validation'] = parseTestResult['destination_key']

            timeoutSecs = 30 + paramDict['ntrees'] * 200


            # do ten starts, to see the bad id problem?
            TRIES = 5
            for i in range(TRIES):
                lastOne = i==(TRIES-1)

                # have unique model names
                trial += 1
                kwargs = paramDict.copy()
                model_key = 'RFModel_' + str(trial)
                kwargs['destination_key'] = model_key
                data_key = parseTrainResult['destination_key']

                start = time.time()
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseTrainResult, timeoutSecs=timeoutSecs,
                                         noPoll=True, **kwargs)
                trainElapsed = time.time() - start
                print 'rf train end', i, 'on', csvTrainPathname, 'took', trainElapsed, 'seconds'

                # don't cancel the last one
                if not lastOne:
                    time.sleep(1)
                    h2o_jobs.cancelAllJobs(timeoutSecs=2)


            ### print "rfView", h2o.dump_json(rfView)
            print "We have a result from the RF above, completed but didn't do RFView yet"
            # could the RF indicate 'done' too soon?
            # if rfResult['state']=='RUNNING':
            #    raise Exception("Why is this RF still in RUNNING state? %s" % h2o.dump_json(rfResult))

            # if 'drf_model' not in rfResult:
            #    raise Exception("How come there's no drf_model in this RF result? %s" % h2o.dump_json(rfResult))
            h2o_jobs.pollWaitJobs(timeoutSecs=300)
            rfView = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60)
            print "rfView:", h2o.dump_json(rfView)

            rfView["drf_model"] = rfView.pop("speedrf_model")
            rf_model = rfView['drf_model']
            cms = rf_model['cms']
            ### print "cm:", h2o.dump_json(cm)
            ntrees = rf_model['N']
            errs = rf_model['errs']
            N = rf_model['N']
            varimp = rf_model['varimp']
            treeStats = rf_model['treeStats']

            print "maxDepth:", treeStats['maxDepth']
            print "maxLeaves:", treeStats['maxLeaves']
            print "minDepth:", treeStats['minDepth']
            print "minLeaves:", treeStats['minLeaves']
            print "meanLeaves:", treeStats['meanLeaves']
            print "meanDepth:", treeStats['meanDepth']
            print "errs[0]:", errs[0]
            print "errs[-1]:", errs[-1]
            print "errs:", errs

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)
            # we iterate over params, so can't really do this check
            # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)

            print "classErrorPctList:", classErrorPctList
            self.assertEqual(len(classErrorPctList), 7, "Should be 7 output classes, so should have 7 class error percentages from a reasonable predict")
            # FIX! should update this expected classification error
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=data_key)

            eList.append(classErrorPctList[4])
            fList.append(trainElapsed)
            if DO_PLOT:
                if TRY == 'max_depth':
                    xLabel = 'max_depth'
                elif TRY == 'ntrees':
                    xLabel = 'ntrees'
                elif TRY == 'nbins':
                    xLabel = 'nbins'
                else:
                    raise Exception("huh? %s" % TRY)
                xList.append(paramDict[xLabel])

        if DO_PLOT:
            eLabel = 'class 4 pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
    def test_rf_big1_overwrite_model_fvec(self):
        h2o.beta_features = True
        csvFilename = 'hhp_107_01.data.gz'
        hex_key = csvFilename + ".hex"
        print "\n" + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename,
                                       hex_key=hex_key, timeoutSecs=15, schema='put')
        firstRfView = None
        # dispatch multiple jobs back to back
        for jobDispatch in range(3):
            start = time.time()
            kwargs = {}
            if OVERWRITE_RF_MODEL:
                print "Since we're overwriting here, we have to wait for each to complete noPoll=False"
                model_key = 'RF_model'
            else:
                model_key = 'RF_model' + str(jobDispatch)

            print "Change the number of trees, while keeping the rf model key name the same"
            print "Checks that we correctly overwrite previous rf model"
            if OVERWRITE_RF_MODEL:
                kwargs['num_trees'] = 1 + jobDispatch
            else:
                kwargs['num_trees'] = 1
                # don't change the seed if we're overwriting the model. It should get 
                # different results just from changing the tree count
                kwargs['seed'] = random.randint(0, sys.maxint)

            kwargs["response"] = "C107"

            # FIX! what model keys do these get?
            randomNode = h2o.nodes[random.randint(0,len(h2o.nodes)-1)]
            h2o_cmd.runSpeeDRF(node=randomNode, parseResult=parseResult, destination_key=model_key, timeoutSecs=300,
                          noPoll=True, **kwargs)
            # FIX! are these already in there?
            rfView = {}
            rfView['_dataKey'] = hex_key
            rfView['_key'] = model_key

            print "rf job dispatch end on ", csvFilename, 'took', time.time() - start, 'seconds'
            print "\njobDispatch #", jobDispatch

            # we're going to compare rf results to previous as we go along (so we save rf view results
            h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

            # In this test we're waiting after each one, so we can save the RFView results for comparison to future
            print "Checking completed job:", rfView
            print "rfView", h2o.dump_json(rfView)
            data_key = rfView['_dataKey']
            model_key = rfView['_key']
            print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)"
            # allow it to poll to complete
            rfViewResult = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60)
            if firstRfView is None: # we'll use this to compare the others
                firstRfView = rfViewResult.copy()
                firstModelKey = model_key
                print "firstRfView", h2o.dump_json(firstRfView)
            else:
                print "Comparing", model_key, "to", firstModelKey
                df = h2o_util.JsonDiff(rfViewResult, firstRfView, vice_versa=True, with_values=True)
                print "df.difference:", h2o.dump_json(df.difference)
                self.assertGreater(len(df.difference), 29,
                                   msg="Want >=30 , not %d differences between the two rfView json responses. %s" % \
                                       (len(df.difference), h2o.dump_json(df.difference)))
예제 #7
0
    def test_rf_covtype20x_fvec(self):
        importFolderPath = 'standard'

        if DO_SMALL:
            csvFilenameTrain = 'covtype.data'
            hex_key = 'covtype1x.data.A.hex'
        else:
            csvFilenameTrain = 'covtype20x.data'
            hex_key = 'covtype20x.data.A.hex'

        csvPathname = importFolderPath + "/" + csvFilenameTrain
        parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500)
        inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
        dataKeyTrain = parseResultTrain['destination_key']
        print "Parse end", dataKeyTrain

        # have to re import since source key is gone
        # we could just copy the key, but sometimes we change the test/train data  to covtype.data
        if DO_SMALL:
            csvFilenameTest = 'covtype.data'
            hex_key = 'covtype1x.data.B.hex'
            dataKeyTest2 = 'covtype1x.data.C.hex'
        else:
            csvFilenameTest = 'covtype20x.data'
            hex_key = 'covtype20x.data.B.hex'
            dataKeyTest2 = 'covtype20x.data.C.hex'

        csvPathname = importFolderPath + "/" + csvFilenameTest
        parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500)
        print "Parse result['destination_key']:", parseResultTest['destination_key']
        inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
        dataKeyTest = parseResultTest['destination_key']
        print "Parse end", dataKeyTest

        # make a 3rd key so the predict is uncached too!
        execExpr = dataKeyTest2 + "=" + dataKeyTest
        kwargs = {'str': execExpr, 'timeoutSecs': 15}
        resultExec = h2o_cmd.runExec(**kwargs)

        # train
        # this does RFView to understand when RF completes, so the time reported for RFView here, should be 
        # considered the "first RFView" times..subsequent have some caching?. 
        # unless the no_confusion_matrix works

        # params is mutable. This is default.
        paramDict = drf2ParamDict
        params = {
            'ntrees': 20,
            'destination_key': 'RF_model',
            'response': "C55",
        }

        # colX = h2o_rf.pickRandRfParams(paramDict, params)

        kwargs = params.copy()
        timeoutSecs = 30 + kwargs['ntrees'] * 60

        start = time.time()
        h2o_cmd.runSpeeDRF(parseResult=parseResultTrain,
                           timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
        print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

        print "\nRFView start after job completion"
        model_key = kwargs['destination_key']
        ntree = kwargs['ntrees']

        start = time.time()
        # this does the RFModel view for v2. but only model_key is used. Data doesn't matter? (nor ntree)
        h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=timeoutSecs)
        print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

        for trial in range(1):
            # scoring
            start = time.time()
            rfView = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=timeoutSecs)
            print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'
            rfView["drf_model"] = rfView.pop("speedrf_model")
            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
            self.assertAlmostEqual(classification_error, 50, delta=50,
                                   msg="Classification error %s differs too much" % classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest2)
            print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

            parseKey = parseResultTrain['destination_key']
            rfModelKey  = rfView['drf_model']['_key']
            predictKey = 'Predict.hex'
            start = time.time()

            predictResult = h2o_cmd.runPredict(
                data_key=parseKey,
                model_key=rfModelKey,
                destination_key=predictKey,
                timeoutSecs=timeoutSecs)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=parseKey,
                vactual='C55',
                predict=predictKey,
                vpredict='predict',
                )

            cm = predictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm);
            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)

            print "Trial #", trial, "completed"
예제 #8
0
    def test_rf_big1_nopoll_fvec(self):
        h2o.beta_features = True
        csvFilename = 'hhp_107_01.data.gz'
        hex_key = csvFilename + ".hex"

        print "\n" + csvFilename

        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvFilename,
                                       hex_key=hex_key,
                                       timeoutSecs=15,
                                       schema='put')
        rfViewInitial = []
        # dispatch multiple jobs back to back
        for jobDispatch in range(3):
            start = time.time()
            kwargs = {}
            model_key = ""
            if OVERWRITE_RF_MODEL:
                print "Since we're overwriting here, we have to wait for each to complete noPoll=False"
                model_key = 'SRF_model'
            else:
                model_key = 'SRF_model' + str(jobDispatch)
            kwargs['ntrees'] = 1

            if OVERWRITE_RF_MODEL:
                print "Change the number of trees, while keeping the rf model key name the same"
                print "Checks that we correctly overwrite previous rf model"
                kwargs['ntrees'] += 1

            kwargs['seed'] = random.randint(0, sys.maxint)
            kwargs['response'] = "C107"

            # FIX! what model keys do these get?
            randomNode = h2o.nodes[random.randint(0, len(h2o.nodes) - 1)]
            h2o_cmd.runSpeeDRF(node=randomNode,
                               parseResult=parseResult,
                               destination_key=model_key,
                               timeoutSecs=300,
                               noPoll=False,
                               **kwargs)
            print "rf job dispatch end on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'
            print "\njobDispatch #", jobDispatch
            print "\n MODEL KEY: ", model_key
            rfViewInitial.append(
                h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60))

        # h2o_jobs.pollWaitJobs(pattern='SRF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

        # we saved the initial response?
        # if we do another poll they should be done now, and better to get it that
        # way rather than the inspect (to match what simpleCheckGLM is expected
        first = None
        print "rfViewInitial", rfViewInitial
        for rfView in rfViewInitial:
            print "Checking completed job:", rfView
            print "rfView", h2o.dump_json(rfView)
            model_key = rfView["speedrf_model"]['_key']
            ntree = rfView["speedrf_model"]["parameters"]['ntrees']
            print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)"
            # allow it to poll to complete
            rfViewResult = h2o_cmd.runSpeeDRFView(None,
                                                  model_key,
                                                  timeoutSecs=60)
            if first is None:  # we'll use this to compare the others
                first = rfViewResult.copy()
                firstModelKey = model_key
                print "first", h2o.dump_json(first)
            else:
                print "Comparing", model_key, "to", firstModelKey
                df = h2o_util.JsonDiff(rfViewResult,
                                       first,
                                       vice_versa=True,
                                       with_values=True)

                print "df.difference:", h2o.dump_json(df.difference)
    def test_rf_big1_overwrite_model_fvec(self):
        csvFilename = 'hhp_107_01.data.gz'
        hex_key = csvFilename + ".hex"
        print "\n" + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvFilename,
                                       hex_key=hex_key,
                                       timeoutSecs=15,
                                       schema='put')
        firstRfView = None
        # dispatch multiple jobs back to back
        for jobDispatch in range(3):
            start = time.time()
            kwargs = {}
            if OVERWRITE_RF_MODEL:
                print "Since we're overwriting here, we have to wait for each to complete noPoll=False"
                model_key = 'RF_model'
            else:
                model_key = 'RF_model' + str(jobDispatch)

            print "Change the number of trees, while keeping the rf model key name the same"
            print "Checks that we correctly overwrite previous rf model"
            if OVERWRITE_RF_MODEL:
                kwargs['ntrees'] = 1 + jobDispatch
            else:
                kwargs['ntrees'] = 1
                # don't change the seed if we're overwriting the model. It should get
                # different results just from changing the tree count
                kwargs['seed'] = random.randint(0, sys.maxint)

            kwargs["response"] = "C107"

            # FIX! what model keys do these get?
            randomNode = h2o.nodes[random.randint(0, len(h2o.nodes) - 1)]
            h2o_cmd.runSpeeDRF(node=randomNode,
                               parseResult=parseResult,
                               destination_key=model_key,
                               timeoutSecs=300,
                               noPoll=True,
                               **kwargs)
            # FIX! are these already in there?
            rfView = {}
            rfView['_dataKey'] = hex_key
            rfView['_key'] = model_key

            print "rf job dispatch end on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'
            print "\njobDispatch #", jobDispatch

            # we're going to compare rf results to previous as we go along (so we save rf view results
            h2o_jobs.pollWaitJobs(pattern='RF_model',
                                  timeoutSecs=300,
                                  pollTimeoutSecs=10,
                                  retryDelaySecs=5)

            # In this test we're waiting after each one, so we can save the RFView results for comparison to future
            print "Checking completed job:", rfView
            print "rfView", h2o.dump_json(rfView)
            data_key = rfView['_dataKey']
            model_key = rfView['_key']
            print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)"
            # allow it to poll to complete
            rfViewResult = h2o_cmd.runSpeeDRFView(None,
                                                  model_key,
                                                  timeoutSecs=60)
            if firstRfView is None:  # we'll use this to compare the others
                firstRfView = rfViewResult.copy()
                firstModelKey = model_key
                print "firstRfView", h2o.dump_json(firstRfView)
            else:
                print "Comparing", model_key, "to", firstModelKey
                df = h2o_util.JsonDiff(rfViewResult,
                                       firstRfView,
                                       vice_versa=True,
                                       with_values=True)
                print "df.difference:", h2o.dump_json(df.difference)
                self.assertGreater(len(df.difference), 29,
                                   msg="Want >=30 , not %d differences between the two rfView json responses. %s" % \
                                       (len(df.difference), h2o.dump_json(df.difference)))