Example #1
    def test_RF(self):
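        # Train and score RF on two dataset variants (sorted vs. non-sorted), then
        # JSON-diff the train responses and the score responses against each other.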
        trainKey1 = self.loadData(trainDS1)
        kwargs = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

        scoreKey1 = self.loadData(scoreDS1)
        kwargs = paramsScoreRF.copy()
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
        print "\nTrain1\n=========={0}".format(
            h2o_rf.pp_rf_result(trainResult1))
        print "\nScore1\n========={0}".format(
            h2o_rf.pp_rf_result(scoreResult1))

        trainKey2 = self.loadData(trainDS2)
        kwargs = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

        scoreKey2 = self.loadData(scoreDS2)
        kwargs = paramsScoreRF.copy()
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
        print "\nTrain2\n=========={0}".format(
            h2o_rf.pp_rf_result(trainResult2))
        print "\nScore2\n========={0}".format(
            h2o_rf.pp_rf_result(scoreResult2))

        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)
Example #2
    def test_exec2_runif(self):
        print "h2o syntax is not full R. Doesn't take min/max interval params. assumed 0/1 interval"
        print " just one param, it must be a column or row vector. Result is same length"
        print " R allows a scalar to be param"
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'
        hexKey = 'r.hex'
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexKey)
        # work up to the failing case incrementally
        execExprList = [
            # hack to make them keys? (not really needed but interesting)
            'r0.hex = r.hex[,1]',
            's0.hex = runif(r.hex[,1],-1)',
            's1.hex = runif(r.hex[,1],-1)',
            's2.hex = runif(r.hex[,1],-1)',
            # error. this causes exception
            # 's3.hex = runif(nrow(r.hex), -1)',
        ]

        results = []
        for execExpr in execExprList:
            start = time.time()
            (resultExec, result) = h2e.exec_expr(
                execExpr=execExpr, timeoutSecs=30)  # unneeded but interesting
            results.append(result)
            print "exec end on ", "operators", 'took', time.time(
            ) - start, 'seconds'
            print "exec result:", result
            print "exec result (full):", h2o.dump_json(resultExec)
            h2o.check_sandbox_for_errors()

        rSummary = h2o_cmd.runSummary(key='r0.hex', cols='0')
        # h2o_cmd.infoFromSummary(rSummary)

        rSummary = h2o_cmd.runSummary(key='s0.hex', cols='0')
        # h2o_cmd.infoFromSummary(rSummary)

        sSummary = h2o_cmd.runSummary(key='s1.hex', cols='0')
        # h2o_cmd.infoFromSummary(sSummary)

        sSummary = h2o_cmd.runSummary(key='s2.hex', cols='0')
        # h2o_cmd.infoFromSummary(sSummary)

        # since there are no NAs in covtype, r.hex and s.hex should be identical?
        if 1 == 0:
            print "Comparing summary of r.hex to summary of s.hex"
            df = h2o_util.JsonDiff(rSummary, sSummary, with_values=True)
            # time can be different
            print "df.difference:", h2o.dump_json(df.difference)
            self.assertLess(len(df.difference), 2)

            print "results from the individual exec expresssions (ignore last which was an apply)"
            print "results:", results
            self.assertEqual(results, [
                0.0, 0.0, 0.0, 1859.0, 581012.0, 581012.0, 2959.365300544567,
                1859.0, 1859.0
            ])
Example #3
    def test_rf_covtype_train_oobe(self):
        print "\nRun test iterations/compare with covtype.data"
        rfv1 = self.rf_covtype_train_oobe('covtype.data',
                                          checkExpectedResults=False)

        print "\nRun test iterations/compare with covtype.shuffled.data"
        rfv2 = self.rf_covtype_train_oobe('covtype.shuffled.data',
                                          checkExpectedResults=True)

        print "\nRun test iterations/compare with covtype.sorted.data"
        rfv3 = self.rf_covtype_train_oobe('covtype.sorted.data',
                                          checkExpectedResults=False)

        print "rfv3, from covtype.sorted.data"
        print h2o.dump_json(rfv3)
        print "\nJsonDiff covtype.data rfv, to covtype.sorted.data rfv"
        df = h2o_util.JsonDiff(rfv1, rfv3, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)
        ## self.assertEqual(len(df.difference), 0,
        ##    msg="Want 0 , not %d differences between the two rfView json responses. %s" % \
        ##        (len(df.difference), h2o.dump_json(df.difference)))
        ce1 = rfv1['confusion_matrix']['classification_error']
        ce3 = rfv3['confusion_matrix']['classification_error']
        self.assertAlmostEqual(
            ce1,
            ce3,
            places=3,
            msg="classication error %s isn't close to that when sorted %s" %
            (ce1, ce3))
Example #4
    def test_exec2_runif(self):
        print "in h2o-dev, params are column, min, max, seed"
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'
        hexKey = 'r.hex'
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexKey)
        # work up to the failing case incrementally
        execExprList = [
            # hack to make them keys? (not really needed but interesting)
            # params for h2o-dev runif are: column, min, max, seed
            AssignObj('r0.hex', KeyIndexed('r.hex', col=0)),
            AssignObj('s0.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=0),
                                    1)),
            AssignObj('s1.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=1),
                                    -1)),
            AssignObj('s2.hex',
                      Fcn("h2o.runif", KeyIndexed('r.hex', col=54), -1)),
        ]

        results = []
        for execExpr in execExprList:
            start = time.time()
            result = execExpr.do(timeoutSecs=30)
            results.append(result)
            execResult = execExpr.execResult
            print "exec took", time.time() - start, "seconds"
            print "exec result:", result
            print "exec result (full):", h2o.dump_json(execResult)
            h2o.check_sandbox_for_errors()

        rSummary = h2o_cmd.runSummary(key='r0.hex', cols='0')
        # h2o_cmd.infoFromSummary(rSummary)

        rSummary = h2o_cmd.runSummary(key='s0.hex', cols='0')
        # h2o_cmd.infoFromSummary(rSummary)

        sSummary = h2o_cmd.runSummary(key='s1.hex', cols='0')
        # h2o_cmd.infoFromSummary(sSummary)

        sSummary = h2o_cmd.runSummary(key='s2.hex', cols='0')
        # h2o_cmd.infoFromSummary(sSummary)

        # since there are no NAs in covtype, r.hex and s.hex should be identical?
        if 1 == 0:
            print "Comparing summary of r.hex to summary of s.hex"
            df = h2o_util.JsonDiff(rSummary, sSummary, with_values=True)
            # time can be different
            print "df.difference:", h2o.dump_json(df.difference)
            self.assertLess(len(df.difference), 2)

            print "results from the individual exec expresssions (ignore last which was an apply)"
            print "results:", results
            self.assertEqual(results, [
                0.0, 0.0, 0.0, 1859.0, 581012.0, 581012.0, 2959.365300544567,
                1859.0, 1859.0
            ])
Example #5
    def test_rf_covtype_train_oobe_fvec(self):
        h2o.beta_features = True
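        # Compare RF classification error, overall and for class 1, across the original,
        # shuffled, and sorted covtype data; the three runs should come out close.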
        print "\nRun test iterations/compare with covtype.data"
        rfv1 = self.rf_covtype_train_oobe('covtype.data',
                                          checkExpectedResults=False)
        (ce1, classErrorPctList,
         totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv1)
        # since we created a binomial output class..look at the error rate for class 1
        ce1pct1 = classErrorPctList[1]

        print "\nRun test iterations/compare with covtype.shuffled.data"
        rfv2 = self.rf_covtype_train_oobe('covtype.shuffled.data',
                                          checkExpectedResults=True)
        (ce2, classErrorPctList,
         totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv2)
        ce2pct1 = classErrorPctList[1]

        print "\nRun test iterations/compare with covtype.sorted.data"
        rfv3 = self.rf_covtype_train_oobe('covtype.sorted.data',
                                          checkExpectedResults=False)
        (ce3, classErrorPctList,
         totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv3)
        ce3pct1 = classErrorPctList[1]

        print "rfv3, from covtype.sorted.data"
        print "\nJsonDiff covtype.data rfv, to covtype.sorted.data rfv"
        print "rfv1:", h2o.dump_json(rfv1)
        print "rfv3:", h2o.dump_json(rfv3)
        # df = h2o_util.JsonDiff(rfv1, rfv3, with_values=True)
        df = h2o_util.JsonDiff(rfv1, rfv3)
        print "df.difference:", h2o.dump_json(df.difference)

        self.assertAlmostEqual(
            ce1,
            ce2,
            delta=0.5,
            msg="classification error %s isn't close to that when sorted %s" %
            (ce1, ce2))
        self.assertAlmostEqual(
            ce1,
            ce3,
            delta=0.5,
            msg="classification error %s isn't close to that when sorted %s" %
            (ce1, ce3))

        self.assertAlmostEqual(
            ce1pct1,
            ce2pct1,
            delta=1.0,
            msg="classErrorPctList[1] %s isn't close to that when sorted %s" %
            (ce1pct1, ce2pct1))
        self.assertAlmostEqual(
            ce1pct1,
            ce3pct1,
            delta=1.0,
            msg="classErrorPctList[1] %s isn't close to that when sorted %s" %
            (ce1pct1, ce3pct1))
Example #6
    def test_rf_big1_overwrite_model_fvec(self):
        h2o.beta_features = True
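        # Dispatch several RF jobs back to back, optionally overwriting a single model key,
        # and JSON-diff each completed RFView against the first (expecting at least 30 diffs).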
        csvFilename = 'hhp_107_01.data.gz'
        hex_key = csvFilename + ".hex"
        print "\n" + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvFilename,
                                       hex_key=hex_key,
                                       timeoutSecs=15,
                                       schema='put')
        firstRfView = None
        # dispatch multiple jobs back to back
        for jobDispatch in range(3):
            start = time.time()
            kwargs = {}
            if OVERWRITE_RF_MODEL:
                print "Since we're overwriting here, we have to wait for each to complete noPoll=False"
                model_key = 'RF_model'
            else:
                model_key = 'RF_model' + str(jobDispatch)

            print "Change the number of trees, while keeping the rf model key name the same"
            print "Checks that we correctly overwrite previous rf model"
            if OVERWRITE_RF_MODEL:
                kwargs['ntrees'] = 1 + jobDispatch
            else:
                kwargs['ntrees'] = 1
                # don't change the seed if we're overwriting the model. It should get
                # different results just from changing the tree count
                kwargs['seed'] = random.randint(0, sys.maxint)

            # FIX! what model keys do these get?
            randomNode = h2o.nodes[random.randint(0, len(h2o.nodes) - 1)]
            h2o_cmd.runRF(node=randomNode,
                          parseResult=parseResult,
                          destination_key=model_key,
                          timeoutSecs=300,
                          noPoll=True,
                          **kwargs)
            # FIX! are these already in there?
            rfView = {}
            rfView['_dataKey'] = hex_key
            rfView['_key'] = model_key

            print "rf job dispatch end on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'
            print "\njobDispatch #", jobDispatch

            # we're going to compare rf results to previous as we go along (so we save rf view results)
            h2o_jobs.pollWaitJobs(pattern='RF_model',
                                  timeoutSecs=300,
                                  pollTimeoutSecs=10,
                                  retryDelaySecs=5)

            # In this test we're waiting after each one, so we can save the RFView results for comparison to future
            print "Checking completed job:", rfView
            print "rfView", h2o.dump_json(rfView)
            data_key = rfView['_dataKey']
            model_key = rfView['_key']
            print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)"
            # allow it to poll to complete
            rfViewResult = h2o_cmd.runRFView(None,
                                             data_key,
                                             model_key,
                                             timeoutSecs=60,
                                             noPoll=False)
            if firstRfView is None:  # we'll use this to compare the others
                firstRfView = rfViewResult.copy()
                firstModelKey = model_key
                print "firstRfView", h2o.dump_json(firstRfView)
            else:
                print "Comparing", model_key, "to", firstModelKey
                df = h2o_util.JsonDiff(rfViewResult,
                                       firstRfView,
                                       vice_versa=True,
                                       with_values=True)
                print "df.difference:", h2o.dump_json(df.difference)
                self.assertGreater(len(df.difference), 29,
                    msg="Want >=30 , not %d differences between the two rfView json responses. %s" % \
                        (len(df.difference), h2o.dump_json(df.difference)))
Example #7
    def test_exec2_na2mean(self):
        h2o.beta_features = True
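        # Build exec expressions that compute a column mean ignoring NAs and substitute the
        # mean for the NAs, then compare the Summary of r.hex and s.hex (covtype has no NAs,
        # so they should barely differ).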
        print "https://0xdata.atlassian.net/browse/PUB-228"
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'
        hexKey = 'r.hex'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
        # work up to the failing case incrementally
        execExprList = [
            # hack to make them keys? (not really needed but interesting)
            'rcnt = c(0)',
            'total = c(0)',
            'mean = c(0)',
            's.hex = r.hex',
            "x=r.hex[,1]; rcnt=nrow(x)-sum(is.na(x))",
            "x=r.hex[,1]; total=sum(ifelse(is.na(x),0,x)); rcnt=nrow(x)-sum(is.na(x))",
            "x=r.hex[,1]; total=sum(ifelse(is.na(x),0,x)); rcnt=nrow(x)-sum(is.na(x)); mean=total / rcnt",
            "x=r.hex[,1]; total=sum(ifelse(is.na(x),0,x)); rcnt=nrow(x)-sum(is.na(x)); mean=total / rcnt; x=ifelse(is.na(x),mean,x)",
        ]

        execExprList2 = [
            "s.hex = apply(r.hex,2," +
                "function(x){total=sum(ifelse(is.na(x),0,x)); " + \
                "rcnt=nrow(x)-sum(is.na(x)); " + \
                "mean=total / rcnt; " + \
                "ifelse(is.na(x),mean,x)} " + \
            ")" ,
            # this got an exception. note I forgot to assign to x here
            "s=r.hex[,1]; s.hex[,1]=ifelse(is.na(x),0,x)",
            # throw in a na flush to 0
            "x=r.hex[,1]; s.hex[,1]=ifelse(is.na(x),0,x)",
        ]
        execExprList += execExprList2

        results = []
        for execExpr in execExprList:
            start = time.time()
            (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) # unneeded but interesting 
            results.append(result)
            print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
            print "exec result:", result
            print "exec result (full):", h2o.dump_json(resultExec)
            h2o.check_sandbox_for_errors()

        # compare it to summary
        rSummary = h2o_cmd.runSummary(key='r.hex', cols='0')
        h2o_cmd.infoFromSummary(rSummary)

        sSummary = h2o_cmd.runSummary(key='s.hex', cols='0')
        h2o_cmd.infoFromSummary(sSummary)

        # since there are no NAs in covtype, r.hex and s.hex should be identical?
        print "Comparing summary of r.hex to summary of s.hex"
        df = h2o_util.JsonDiff(rSummary, sSummary, with_values=True)
        # time can be different
        print "df.difference:", h2o.dump_json(df.difference)
        self.assertLess(len(df.difference), 2)
    

        print "results from the individual exec expresssions (ignore last which was an apply)"
        print "results:", results
        self.assertEqual(results, [0.0, 0.0, 0.0, 1859.0, 581012.0, 581012.0, 2959.365300544567, 1859.0, 1859.0, 1859.0, 1859.0])
Example #8
    def test_rf_big1_nopoll(self):
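        # Dispatch several RF jobs (noPoll unless overwriting one model key), save a stub
        # RFView per job, then poll for completion and JSON-diff each finished RFView
        # against the first.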
        csvFilename = 'hhp_107_01.data.gz'
        csvPathname = h2o.find_file("smalldata/" + csvFilename)
        key2 = csvFilename + ".hex"

        print "\n" + csvPathname

        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                     key2=key2,
                                     timeoutSecs=15)
        rfViewInitial = []
        # dispatch multiple jobs back to back
        for jobDispatch in range(3):
            start = time.time()
            kwargs = {}
            if OVERWRITE_RF_MODEL:
                print "Since we're overwriting here, we have to wait for each to complete nopoll=False"
                model_key = 'RF_model'
            else:
                model_key = 'RF_model' + str(jobDispatch)
            kwargs['ntree'] = 7

            if OVERWRITE_RF_MODEL:
                print "Change the number of trees, while keeping the rf model key name the same"
                print "Checks that we correctly overwrite previous rf model"
                kwargs['ntree'] += 1

            kwargs['seed'] = random.randint(0, sys.maxint)

            # FIX! what model keys do these get?
            randomNode = h2o.nodes[random.randint(0, len(h2o.nodes) - 1)]
            h2o_cmd.runRFOnly(node=randomNode,
                              parseKey=parseKey,
                              model_key=model_key,
                              timeoutSecs=300,
                              noPoll=False if OVERWRITE_RF_MODEL else True,
                              **kwargs)
            # FIX! are these already in there?
            rfView = {}
            rfView['data_key'] = key2
            rfView['model_key'] = model_key
            rfView['ntree'] = kwargs['ntree']
            rfViewInitial.append(rfView)

            print "rf job dispatch end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            print "\njobDispatch #", jobDispatch

        h2o_jobs.pollWaitJobs(pattern='RF_model',
                              timeoutSecs=300,
                              pollTimeoutSecs=10,
                              retryDelaySecs=5)

        # we saved the initial response?
        # if we do another poll they should be done now, and better to get it that
        # way rather than the inspect (to match what simpleCheckGLM expects)
        first = None
        print "rfViewInitial", rfViewInitial
        for rfView in rfViewInitial:
            print "Checking completed job:", rfView
            print "rfView", h2o.dump_json(rfView)
            data_key = rfView['data_key']
            model_key = rfView['model_key']
            ntree = rfView['ntree']
            # a = h2o.nodes[0].random_forest_view(data_key, model_key, noPoll=True)
            print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)"
            # allow it to poll to complete
            rfViewResult = h2o_cmd.runRFView(None,
                                             data_key,
                                             model_key,
                                             ntree=ntree,
                                             timeoutSecs=60,
                                             noPoll=False)
            if first is None:  # we'll use this to compare the others
                first = rfViewResult.copy()
                firstModelKey = model_key
                print "first", h2o.dump_json(first)
            else:
                print "Comparing", model_key, "to", firstModelKey
                df = h2o_util.JsonDiff(rfViewResult,
                                       first,
                                       vice_versa=True,
                                       with_values=True)

                print "df.difference:", h2o.dump_json(df.difference)
Example #9
    def test_RF(self):
        h2o.beta_features = True
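        # Same train/score/JsonDiff pattern as Example #1, with parameter dicts chosen for
        # either the beta (FVec) RF API or the classic one.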

        if h2o.beta_features:
            paramsTrainRF = {
                'ntrees': 10,
                'max_depth': 300,
                'nbins': 200,
                'timeoutSecs': 600,
                'response': 'C55',
            }

            paramsScoreRF = {
                'vactual': 'C55',
                'timeoutSecs': 600,
            }

        else:
            paramsTrainRF = {
                'use_non_local_data': 1,
                'ntree': 10,
                'depth': 300,
                'bin_limit': 20000,
                'stat_type': 'ENTROPY',
                'out_of_bag_error_estimate': 1,
                'exclusive_split_limit': 0,
                'timeoutSecs': 60,
            }

            paramsScoreRF = {
                # scoring requires the response_variable. it defaults to last, so normally
                # we don't need to specify. But put this here and (above if used)
                # in case a dataset doesn't use last col
                'response_variable': None,
                'timeoutSecs': 60,
                'out_of_bag_error_estimate': 0,
            }

        trainKey1 = self.loadData(trainDS1)
        kwargs = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

        scoreKey1 = self.loadData(scoreDS1)
        kwargs = paramsScoreRF.copy()
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)

        trainKey2 = self.loadData(trainDS2)
        kwargs = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

        scoreKey2 = self.loadData(scoreDS2)
        kwargs = paramsScoreRF.copy()
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)

        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)
Example #10
    def test_RF(self):
        h2o.beta_features = True
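        # Train RF with a validation frame on the 90% and 10% splits, check the expected
        # error rates, then JSON-diff the train and score responses (more than two diffs
        # is a failure).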
        paramsTrainRF = { 
            'seed': '1234567890',
            # if I use 100, and just one tree, I should get same results for sorted/shuffled?
            # i.e. the bagging always sees everything. Means oobe will be messed up
            # so will specify validation = the 10pct holdout data (could reuse the training data?)
            'sample_rate': 1.0,
            'ntrees': 3, 
            'max_depth': 300,
            'nbins': 200,
            'timeoutSecs': 600,
            'response': 'C55',
        }

        paramsScoreRF = {
            'vactual': 'C55',
            'timeoutSecs': 600,
        }

        # 90% data
        trainKey1 = self.loadData(trainDS1)
        scoreKey1 = self.loadData(scoreDS1)
        kwargs   = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, scoreKey1, **kwargs)
        (classification_error1, classErrorPctList1, totalScores1) = h2o_rf.simpleCheckRFView(rfv=trainResult1)
        # self.assertEqual(4.29, classification_error1)
        # self.assertEqual([4.17, 2.98, 4.09, 14.91, 21.12, 15.38, 5.22], classErrorPctList1)
        # with new RNG 9/26/14
        self.assertEqual(4.4, classification_error1)
        self.assertEqual([3.71, 3.56, 4.32, 18.55, 21.22, 13.51, 5.82], classErrorPctList1)
        self.assertEqual(58101, totalScores1)

        kwargs   = paramsScoreRF.copy()
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)

        # 10% data
        trainKey2 = self.loadData(trainDS2)
        scoreKey2 = self.loadData(scoreDS2)
        kwargs   = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, scoreKey2, **kwargs)
        (classification_error2, classErrorPctList2, totalScores2) = h2o_rf.simpleCheckRFView(rfv=trainResult2)
        # self.assertEqual(4.29, classification_error2)
        # self.assertEqual([4.17, 2.98, 4.09, 14.91, 21.12, 15.38, 5.22], classErrorPctList2)
        # with new RNG 9/26/14
        self.assertEqual(4.4, classification_error2)
        self.assertEqual([3.71, 3.56, 4.32, 18.55, 21.22, 13.51, 5.82], classErrorPctList2)
        self.assertEqual(58101, totalScores2)

        kwargs   = paramsScoreRF.copy()
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)

      
        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        # should only be two diffs
        if len(df.difference) > 2:
            raise Exception ("Too many diffs in JsonDiff sorted vs non-sorted %s" % len(df.difference))
Example #11
    def test_RF(self):
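        # Single-tree RF with sample_rate 1.0 on two dataset variants; inspect the score
        # frames and predictions, then JSON-diff the responses (more than two diffs is a
        # failure).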

        paramsTrainRF = {
            'seed': '1234567890',
            'ntrees': 1,
            'max_depth': 10,
            # 'sample_rate': 1.0,
            'sample_rate': 1.0,
            'nbins': 50,
            'timeoutSecs': 600,
            'response': 'C55',
            'classification': 1,
        }

        paramsScoreRF = {
            'vactual': 'C55',
            'timeoutSecs': 600,
        }

        # train1
        trainKey1 = self.loadData(trainDS1)
        scoreKey1 = self.loadData(scoreDS1)
        kwargs = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, scoreKey1, **kwargs)
        kwargs = paramsScoreRF.copy()
        h2o_cmd.runInspect(key='scoreDS1.hex', verbose=True)
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
        h2o_cmd.runInspect(key='Predict.hex', verbose=True)
        print "\nTrain1\n=========="
        h2o_rf.simpleCheckRFScore(node=None,
                                  rfv=trainResult1,
                                  noPrint=False,
                                  **kwargs)
        print "\nScore1\n=========+"
        print h2o.dump_json(scoreResult1)
        h2o_rf.simpleCheckRFScore(node=None,
                                  rfv=scoreResult1,
                                  noPrint=False,
                                  **kwargs)

        # train2
        trainKey2 = self.loadData(trainDS2)
        scoreKey2 = self.loadData(scoreDS2)
        kwargs = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, scoreKey2, **kwargs)
        kwargs = paramsScoreRF.copy()
        h2o_cmd.runInspect(key='scoreDS2.hex', verbose=True)
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
        h2o_cmd.runInspect(key='Predict.hex', verbose=True)
        print "\nTrain2\n=========="
        h2o_rf.simpleCheckRFScore(node=None,
                                  rfv=trainResult2,
                                  noPrint=False,
                                  **kwargs)
        print "\nScore2\n=========="
        h2o_rf.simpleCheckRFScore(node=None,
                                  rfv=scoreResult2,
                                  noPrint=False,
                                  **kwargs)

        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)
        # should only be two diffs
        if len(df.difference) > 2:
            raise Exception(
                "Too many diffs in JsonDiff sorted vs non-sorted %s" %
                len(df.difference))
Example #12
    def test_RF(self):
        h2o.beta_features = True
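        # Train and score RF on two dataset variants and run simpleCheckRFScore on each;
        # the final JsonDiff comparison is currently disabled (the "if 1 == 0" block).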

        if h2o.beta_features:
            paramsTrainRF = {
                'ntrees': 3,
                'max_depth': 10,
                'nbins': 50,
                'timeoutSecs': 600,
                'response': 'C54',
                'classification': 1,
            }

            paramsScoreRF = {
                'vactual': 'C54',
                'timeoutSecs': 600,
            }

        else:
            paramsTrainRF = {
                'use_non_local_data': 1,
                'ntree': 10,
                'depth': 300,
                'bin_limit': 20000,
                'stat_type': 'ENTROPY',
                'out_of_bag_error_estimate': 1,
                'exclusive_split_limit': 0,
                'timeoutSecs': 60,
            }

            paramsScoreRF = {
                # scoring requires the response_variable. it defaults to last, so normally
                # we don't need to specify. But put this here and (above if used)
                # in case a dataset doesn't use last col
                'response_variable': None,
                'timeoutSecs': 60,
                'out_of_bag_error_estimate': 0,
            }

        # train1
        trainKey1 = self.loadData(trainDS1)
        kwargs = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

        scoreKey1 = self.loadData(scoreDS1)
        kwargs = paramsScoreRF.copy()
        h2o_cmd.runInspect(key='scoreDS1.hex', verbose=True)
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
        h2o_cmd.runInspect(key='Predict.hex', verbose=True)
        print "\nTrain1\n=========="
        h2o_rf.simpleCheckRFScore(node=None,
                                  rfv=trainResult1,
                                  noPrint=False,
                                  **kwargs)
        print "\nScore1\n=========+"
        print h2o.dump_json(scoreResult1)
        h2o_rf.simpleCheckRFScore(node=None,
                                  rfv=scoreResult1,
                                  noPrint=False,
                                  **kwargs)

        # train2
        trainKey2 = self.loadData(trainDS2)
        kwargs = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

        scoreKey2 = self.loadData(scoreDS2)
        kwargs = paramsScoreRF.copy()
        h2o_cmd.runInspect(key='scoreDS2.hex', verbose=True)
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
        h2o_cmd.runInspect(key='Predict.hex', verbose=True)
        print "\nTrain2\n=========="
        h2o_rf.simpleCheckRFScore(node=None,
                                  rfv=trainResult2,
                                  noPrint=False,
                                  **kwargs)
        print "\nScore2\n=========="
        h2o_rf.simpleCheckRFScore(node=None,
                                  rfv=scoreResult2,
                                  noPrint=False,
                                  **kwargs)

        if 1 == 0:
            print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
            df = h2o_util.JsonDiff(trainResult1,
                                   trainResult2,
                                   with_values=True)
            print "df.difference:", h2o.dump_json(df.difference)

            print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
            df = h2o_util.JsonDiff(scoreResult1,
                                   scoreResult2,
                                   with_values=True)
            print "df.difference:", h2o.dump_json(df.difference)
Example #13
    def test_parse_libsvm(self):
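        # Parse a set of libsvm datasets, check col 0 min/max against expected values, and
        # optionally download each parsed key as CSV, re-parse it, and JSON-diff the two
        # Inspect responses (only a few known-noisy fields are allowed to differ).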
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # just do the import folder once
        importFolderPath = "libsvm"

        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        csvFilenameList = [
            ("mnist_train.svm", "cM", 30, 0, 9.0, False, False),
            ("covtype.binary.svm", "cC", 30, 1, 2.0, True, True),
            # multi-label target like 1,2,5 ..not sure what that means
            # ("tmc2007_train.svm",  "cJ", 30, 0, 21.0, False, False),
            # illegal non-ascending cols
            # ("syn_6_1000_10.svm",  "cK", 30, -36, 36, True, False),
            # ("syn_0_100_1000.svm", "cL", 30, -36, 36, True, False),
            # fails csvDownload
            ("duke.svm", "cD", 30, -1.000000, 1.000000, False, False),
            ("colon-cancer.svm", "cA", 30, -1.000000, 1.000000, False, False),
            ("news20.svm", "cH", 30, 1, 20.0, False, False),
            ("connect4.svm", "cB", 30, -1, 1.0, False, False),
            # too many features? 150K inspect timeout?
            # ("E2006.train.svm",    "cE", 30, 1, -7.89957807346873 -0.519409526940154, False, False)
            ("gisette_scale.svm", "cF", 30, -1, 1.0, False, False),
            ("mushrooms.svm", "cG", 30, 1, 2.0, False, False),
        ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvFilename, hex_key, timeoutSecs, expectedCol0Min,
             expectedCol0Max, enableDownloadReparse,
             enableSizeChecks) in csvFilenameList:
            # have to import each time, because h2o deletes source after parse
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           hex_key=hex_key,
                                           timeoutSecs=2000)
            print csvPathname, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            # INSPECT******************************************
            start = time.time()
            inspectFirst = h2o_cmd.runInspect(None,
                                              parseResult['destination_key'],
                                              timeoutSecs=360)
            print "Inspect:", parseResult[
                'destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspectFirst, csvFilename)
            # look at the min/max for the target col (0) and compare to expected for the dataset

            imin = float(inspectFirst['cols'][0]['min'])
            # print h2o.dump_json(inspectFirst['cols'][0])
            imax = float(inspectFirst['cols'][0]['max'])

            if expectedCol0Min:
                self.assertEqual(
                    imin,
                    expectedCol0Min,
                    msg='col %s min %s is not equal to expected min %s' %
                    (0, imin, expectedCol0Min))
            if expectedCol0Max:
                h2o_util.assertApproxEqual(
                    imax,
                    expectedCol0Max,
                    tol=0.00000001,
                    msg='col %s max %s is not equal to expected max %s' %
                    (0, imax, expectedCol0Max))

            print "\nmin/max for col0:", imin, imax

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            if DO_SUMMARY:
                goodX = h2o_glm.goodXFromColumnInfo(
                    y=0,
                    key=parseResult['destination_key'],
                    timeoutSecs=300,
                    noPrint=True)
                summaryResult = h2o_cmd.runSummary(key=hex_key,
                                                   timeoutSecs=360)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            if DO_DOWNLOAD_REPARSE and enableDownloadReparse:
                missingValuesListA = h2o_cmd.infoFromInspect(
                    inspectFirst, csvPathname)
                num_colsA = inspectFirst['num_cols']
                num_rowsA = inspectFirst['num_rows']
                row_sizeA = inspectFirst['row_size']
                value_size_bytesA = inspectFirst['value_size_bytes']

                # do a little testing of saving the key as a csv
                csvDownloadPathname = SYNDATASETS_DIR + "/" + csvFilename + "_csvDownload.csv"
                print "Trying csvDownload of", csvDownloadPathname
                h2o.nodes[0].csv_download(src_key=hex_key,
                                          csvPathname=csvDownloadPathname)

                # remove the original parsed key. source was already removed by h2o
                # don't have to now. we use a new name for hex_keyB
                # h2o.nodes[0].remove_key(hex_key)
                start = time.time()
                hex_keyB = hex_key + "_B"
                parseResultB = h2o_cmd.parseResult = h2i.import_parse(
                    path=csvDownloadPathname, schema='put', hex_key=hex_keyB)
                print csvDownloadPathname, "download/reparse (B) parse end. Original data from", \
                    csvFilename, 'took', time.time() - start, 'seconds'
                inspect = h2o_cmd.runInspect(key=hex_keyB)

                missingValuesListB = h2o_cmd.infoFromInspect(
                    inspect, csvPathname)
                num_colsB = inspect['num_cols']
                num_rowsB = inspect['num_rows']
                row_sizeB = inspect['row_size']
                value_size_bytesB = inspect['value_size_bytes']

                df = h2o_util.JsonDiff(inspectFirst, inspect, with_values=True)
                print "df.difference:", h2o.dump_json(df.difference)

                for i, d in enumerate(df.difference):
                    # ignore mismatches in these
                    #  "variance"
                    #  "response.time"
                    #  "key"
                    if "variance" in d or "response.time" in d or "key" in d or "value_size_bytes" in d or "row_size" in d:
                        pass
                    else:
                        raise Exception(
                            "testing %s, found unexpected mismatch in df.difference[%d]: %s"
                            % (csvPathname, i, d))

                if DO_SIZE_CHECKS and enableSizeChecks:
                    # if we're allowed to do size checks, compare the full json response!
                    print "Comparing original inspect to the inspect after parsing the downloaded csv"
                    # vice_versa=True

                    # ignore the variance diffs. reals mismatch when they're not?
                    filtered = [
                        v for v in df.difference if not 'variance' in v
                    ]
                    self.assertLess(len(filtered), 3,
                        msg="Want < 3, not %d differences between the two rfView json responses. %s" % \
                            (len(filtered), h2o.dump_json(filtered)))

                    # this fails because h2o writes out zeroes as 0.0000* which gets loaded as fp even if col is all zeroes
                    # only in the case where the libsvm dataset specified vals = 0, which shouldn't happen
                    # make the check conditional based on the dataset
                    self.assertEqual(
                        row_sizeA, row_sizeB,
                        "row_size mismatches after re-parse of downloadCsv result %d %d"
                        % (row_sizeA, row_sizeB))
                    h2o_util.assertApproxEqual(
                        value_size_bytesA,
                        value_size_bytesB,
                        tol=0.00000001,
                        msg=
                        "value_size_bytes mismatches after re-parse of downloadCsv result %d %d"
                        % (value_size_bytesA, value_size_bytesB))

                print "missingValuesListA:", missingValuesListA
                print "missingValuesListB:", missingValuesListB
                self.assertEqual(
                    missingValuesListA, missingValuesListB,
                    "missingValuesList mismatches after re-parse of downloadCsv result"
                )
                self.assertEqual(
                    num_colsA, num_colsB,
                    "num_cols mismatches after re-parse of downloadCsv result %d %d"
                    % (num_colsA, num_colsB))
                self.assertEqual(
                    num_rowsA, num_rowsB,
                    "num_rows mismatches after re-parse of downloadCsv result %d %d"
                    % (num_rowsA, num_rowsB))

            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()
Example #14
    def test_rf_big1_nopoll_fvec(self):
        h2o.beta_features = True
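        # Run several SpeeDRF jobs back to back, save each initial SpeeDRFView, then
        # re-view each model and JSON-diff the result against the first.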
        csvFilename = 'hhp_107_01.data.gz'
        hex_key = csvFilename + ".hex"

        print "\n" + csvFilename

        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvFilename,
                                       hex_key=hex_key,
                                       timeoutSecs=15,
                                       schema='put')
        rfViewInitial = []
        # dispatch multiple jobs back to back
        for jobDispatch in range(3):
            start = time.time()
            kwargs = {}
            model_key = ""
            if OVERWRITE_RF_MODEL:
                print "Since we're overwriting here, we have to wait for each to complete noPoll=False"
                model_key = 'SRF_model'
            else:
                model_key = 'SRF_model' + str(jobDispatch)
            kwargs['ntrees'] = 1

            if OVERWRITE_RF_MODEL:
                print "Change the number of trees, while keeping the rf model key name the same"
                print "Checks that we correctly overwrite previous rf model"
                kwargs['ntrees'] += 1

            kwargs['seed'] = random.randint(0, sys.maxint)
            kwargs['response'] = "C107"

            # FIX! what model keys do these get?
            randomNode = h2o.nodes[random.randint(0, len(h2o.nodes) - 1)]
            h2o_cmd.runSpeeDRF(node=randomNode,
                               parseResult=parseResult,
                               destination_key=model_key,
                               timeoutSecs=300,
                               noPoll=False,
                               **kwargs)
            print "rf job dispatch end on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'
            print "\njobDispatch #", jobDispatch
            print "\n MODEL KEY: ", model_key
            rfViewInitial.append(
                h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60))

        # h2o_jobs.pollWaitJobs(pattern='SRF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

        # we saved the initial response?
        # if we do another poll they should be done now, and better to get it that
        # way rather than the inspect (to match what simpleCheckGLM expects)
        first = None
        print "rfViewInitial", rfViewInitial
        for rfView in rfViewInitial:
            print "Checking completed job:", rfView
            print "rfView", h2o.dump_json(rfView)
            model_key = rfView["speedrf_model"]['_key']
            ntree = rfView["speedrf_model"]["parameters"]['ntrees']
            print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)"
            # allow it to poll to complete
            rfViewResult = h2o_cmd.runSpeeDRFView(None,
                                                  model_key,
                                                  timeoutSecs=60)
            if first is None:  # we'll use this to compare the others
                first = rfViewResult.copy()
                firstModelKey = model_key
                print "first", h2o.dump_json(first)
            else:
                print "Comparing", model_key, "to", firstModelKey
                df = h2o_util.JsonDiff(rfViewResult,
                                       first,
                                       vice_versa=True,
                                       with_values=True)

                print "df.difference:", h2o.dump_json(df.difference)
Example #15
    def test_rf_covtype_train_oobe_fvec(self):
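        # Compare RF classification error, overall and for class 1, across the original,
        # shuffled, and sorted covtype data; wider per-class deltas are allowed because the
        # separate test/train splits add variance.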
        print "\nRun test iterations/compare with covtype.data"
        rfv1 = self.rf_covtype_train_oobe('covtype.data',
                                          checkExpectedResults=False,
                                          expectedAuc=0.95)
        (ce1, classErrorPctList,
         totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfv1)
        # since we created a binomial output class..look at the error rate for class 1
        ce1pct1 = classErrorPctList[1]

        print "\nRun test iterations/compare with covtype.shuffled.data"
        rfv2 = self.rf_covtype_train_oobe('covtype.shuffled.data',
                                          checkExpectedResults=True,
                                          expectedAuc=0.95)
        (ce2, classErrorPctList,
         totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfv2)
        ce2pct1 = classErrorPctList[1]

        print "\nRun test iterations/compare with covtype.sorted.data"
        rfv3 = self.rf_covtype_train_oobe('covtype.sorted.data',
                                          checkExpectedResults=False,
                                          expectedAuc=0.95)
        (ce3, classErrorPctList,
         totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfv3)
        ce3pct1 = classErrorPctList[1]

        print "rfv3, from covtype.sorted.data"
        print "\nJsonDiff covtype.data rfv, to covtype.sorted.data rfv"
        print "rfv1:", h2o.dump_json(rfv1)
        print "rfv3:", h2o.dump_json(rfv3)
        # df = h2o_util.JsonDiff(rfv1, rfv3, with_values=True)
        df = h2o_util.JsonDiff(rfv1, rfv3)
        print "df.difference:", h2o.dump_json(df.difference)

        self.assertAlmostEqual(
            ce1,
            ce2,
            delta=0.5,
            msg="classification error %s isn't close to that when sorted %s" %
            (ce1, ce2))
        self.assertAlmostEqual(
            ce1,
            ce3,
            delta=0.5,
            msg="classification error %s isn't close to that when sorted %s" %
            (ce1, ce3))

        # we're doing separate test/train splits..so we're going to get variance
        # really should not do test/train split and use all the data? if we're comparing sorted or not?
        # but need the splits to be sorted or not. I think I have those files
        self.assertAlmostEqual(
            ce1pct1,
            ce2pct1,
            delta=10.0,
            msg="classErrorPctList[1] %s isn't close to that when sorted %s" %
            (ce1pct1, ce2pct1))
        self.assertAlmostEqual(
            ce1pct1,
            ce3pct1,
            delta=10.0,
            msg="classErrorPctList[1] %s isn't close to that when sorted %s" %
            (ce1pct1, ce3pct1))