Пример #1
0
    def test_expr_rpy2(self):

        for k in range(20):
            a = random.randint(1, 10)
            # b = random.randint(49,50)
            b = random.randint(1, 10)
            c = random.randint(0, 3)
            for k in range(50):
                execExpr = "a=" + str(h2o_eqns.Expression(a, b, c)) + ";"
                (resultExec, hResult) = h2e.exec_expr(execExpr=execExpr)
                print "h2o:", hResult

                rResult = robjects.r(execExpr)[0]
                print "R:", rResult

                if math.isinf(rResult):
                    # covers pos/neg inf?
                    if not 'Infinity' in str(hResult):
                        raise Exception("h2o: %s R: %s not equal" %
                                        (hResult, rResult))
                elif math.isnan(rResult):
                    if not 'NaN' in str(hResult):
                        raise Exception("h2o: %s R: %s not equal" %
                                        (hResult, rResult))
                elif 'Infinity' in str(hResult) or 'NaN' in str(hResult):
                    raise Exception("h2o: %s R: %s not equal" %
                                    (hResult, rResult))
                else:
                    # skip Inf
                    # don't do logicals..h2o 1/0, R True/False
                    h2o_util.approxEqual(
                        rResult,
                        hResult,
                        tol=1e-12,
                        msg='mismatch h2o/R expression result')
Пример #2
0
    def test_expr_rpy2(self):
        h2o.beta_features = True

        for k in range(20):
            a = random.randint(1, 10)
            # b = random.randint(49,50)
            b = random.randint(1, 10)
            c = random.randint(0, 3)
            for k in range(50):
                execExpr = "a=" + str(h2o_eqns.Expression(a, b, c)) + ";"
                (resultExec, hResult) = h2e.exec_expr(execExpr=execExpr)
                print "h2o:", hResult

                rResult = robjects.r(execExpr)[0]
                print "R:", rResult

                if math.isinf(rResult):
                    # covers pos/neg inf?
                    if not "Infinity" in str(hResult):
                        raise Exception("h2o: %s R: %s not equal" % (hResult, rResult))
                elif math.isnan(rResult):
                    if not "NaN" in str(hResult):
                        raise Exception("h2o: %s R: %s not equal" % (hResult, rResult))
                elif "Infinity" in str(hResult) or "NaN" in str(hResult):
                    raise Exception("h2o: %s R: %s not equal" % (hResult, rResult))
                else:
                    # skip Inf
                    # don't do logicals..h2o 1/0, R True/False
                    h2o_util.approxEqual(rResult, hResult, tol=1e-12, msg="mismatch h2o/R expression result")
Пример #3
0
    def test_summary2_unifiles(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # new with 1000 bins. copy expected from R
        tryList = [
            ('cars.csv', 'c.hex', [
                (None, None,None,None,None,None),
                ('economy (mpg)', None,None,None,None,None),
                ('cylinders', None,None,None,None,None),
            ],
            ),
            ('runifA.csv', 'A.hex', [
                (None,  1.00, 25.00, 50.00, 75.00, 100.0),
                ('x', -99.9, -44.7, 8.26, 58.00, 91.7),
            ],
            ),
            # colname, (min, 25th, 50th, 75th, max)
            ('runif.csv', 'x.hex', [
                (None,  1.00, 5000.0, 10000.0, 15000.0, 20000.00),
                ('D', -5000.00, -3735.0, -2443, -1187.0, 99.8),
                ('E', -100000.0, -49208.0, 1783.8, 50621.9, 100000.0),
                ('F', -1.00, -0.4886, 0.00868, 0.5048, 1.00),
            ],
            ),
            ('runifB.csv', 'B.hex', [
                (None,  1.00, 2501.00, 5001.00, 7501.00, 10000.00),
                ('x', -100.00, -50.1, 0.974, 51.7, 100,00),
            ],
            ),

            ('runifC.csv', 'C.hex', [
                (None,  1.00, 25002.00, 50002.00, 75002.00, 100000.00),
                ('x', -100.00, -50.45, -1.135, 49.28, 100.00),
            ],
            ),
        ]


        timeoutSecs = 15
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        timeoutSecs = 60
        for (csvFilename, hex_key, expectedCols) in tryList:

            csvPathname = csvFilename
            csvPathnameFull = h2i.find_folder_and_filename('smalldata', csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname,
                schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False)

            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            # okay to get more cols than we want
            # okay to vary MAX_QBINS because we adjust the expected accuracy
            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))
            summaries = summaryResult['summaries']

            scipyCol = 0
            for expected, column in zip(expectedCols, summaries):
                colname = column['colname']
                if expected[0]:
                    self.assertEqual(colname, expected[0]), colname, expected[0]
                else:
                    # if the colname is None, skip it (so we don't barf on strings on the h2o quantile page
                    scipyCol += 1
                    continue

                quantile = 0.5 if DO_MEDIAN else .999
                # h2o has problem if a list of columns (or dictionary) is passed to 'column' param
                q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'],
                    quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # for comparing to summary2
                qresult = q['result']
                qresult_single = q['result_single']
                h2p.blue_print("h2o quantiles result:", qresult)
                h2p.blue_print("h2o quantiles result_single:", qresult_single)
                h2p.blue_print("h2o quantiles iterations:", q['iterations'])
                h2p.blue_print("h2o quantiles interpolated:", q['interpolated'])
                print h2o.dump_json(q)

                # ('',  '1.00', '25002.00', '50002.00', '75002.00', '100000.00'),

                coltype = column['type']
                nacnt = column['nacnt']

                stats = column['stats']
                stattype= stats['type']
                print stattype

                # FIX! we should compare mean and sd to expected?
                # enums don't have mean or sd?
                if stattype!='Enum':
                    mean = stats['mean']
                    sd = stats['sd']
                    zeros = stats['zeros']
                    mins = stats['mins']
                    maxs = stats['maxs']

                    print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
                    print "colname:", colname, "std dev. (2 places):",  h2o_util.twoDecimals(sd)

                    pct = stats['pct']
                    print "pct:", pct
                    print ""

                    # the thresholds h2o used, should match what we expected
                    expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]
                    pctile = stats['pctile']


                # figure out the expected max error
                # use this for comparing to sklearn/sort
                if expected[1] and expected[5]:
                    expectedRange = expected[5] - expected[1]
                    # because of floor and ceil effects due we potentially lose 2 bins (worst case)
                    # the extra bin for the max value, is an extra bin..ignore
                    expectedBin = expectedRange/(MAX_QBINS-2)
                    maxErr = 0.5 * expectedBin # should we have some fuzz for fp?

                else:
                    print "Test won't calculate max expected error"
                    maxErr = 0
                    

                # hack..assume just one None is enough to ignore for cars.csv
                if expected[1]:
                    h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxErr, msg='min is not approx. expected')
                if expected[2]:
                    h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxErr, msg='25th percentile is not approx. expected')
                if expected[3]:
                    h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxErr, msg='50th percentile (median) is not approx. expected')
                if expected[4]:
                    h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxErr, msg='75th percentile is not approx. expected')
                if expected[5]:
                    h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxErr, msg='max is not approx. expected')

                hstart = column['hstart']
                hstep = column['hstep']
                hbrk = column['hbrk']
                hcnt = column['hcnt']

                for b in hcnt:
                    # should we be able to check for a uniform distribution in the files?
                    e = .1 * numRows
                    # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount,
                    #     msg="Bins not right. b: %s e: %s" % (b, e))

                if stattype!='Enum':
                    pt = h2o_util.twoDecimals(pctile)
                    print "colname:", colname, "pctile (2 places):", pt
                    mx = h2o_util.twoDecimals(maxs)
                    mn = h2o_util.twoDecimals(mins)
                    print "colname:", colname, "maxs: (2 places):", mx
                    print "colname:", colname, "mins: (2 places):", mn

                    # FIX! we should do an exec and compare using the exec quantile too
                    actual = mn[0], pt[3], pt[5], pt[7], mx[0]
                    print "min/25/50/75/max colname:", colname, "(2 places):", actual
                    print "maxs colname:", colname, "(2 places):", mx
                    print "mins colname:", colname, "(2 places):", mn

                    # don't check if colname is empty..means it's a string and scipy doesn't parse right?
                    # need to ignore the car names
                    if colname!='' and expected[scipyCol]:
                        # don't do for enums
                        # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                        h2o_summ.quantile_comparisons(
                            csvPathnameFull,
                            skipHeader=True,
                            col=scipyCol,
                            datatype='float',
                            quantile=0.5 if DO_MEDIAN else 0.999,
                            # FIX! ignore for now
                            h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                            h2oQuantilesApprox=qresult_single,
                            h2oQuantilesExact=qresult,
                            h2oSummary2MaxErr=maxErr,
                            )

                        if False and h2o_util.approxEqual(pctile[5], 0.990238116744, tol=0.002, msg='stop here'):
                            raise Exception("stopping to look")
                                


                scipyCol += 1

            trial += 1
Пример #4
0
    def test_GLM1_GLM2_predict(self):
        # h2b.browseTheCloud()
        h2o.beta_features = False
        SYNDATASETS_DIR = h2o.make_syn_dir()

        trees = 15
        timeoutSecs = 120
        predictHexKey = 'predict_0.hex'
        predictCsv = 'predict_0.csv'
        actualCsv = 'actual_0.csv'

        if 1==0:
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.data'
            hexKey = 'covtype.data.hex'
            y = 54
            expectedPctWrong = 0
        
        if 1==0:
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.shuffled.10pct.data'
            hexKey = 'covtype.shuffled.10pct.data.hex'
            y = 54
            expectedPctWrong = 0

        if 1==1:
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            bucket = 'smalldata'
            # no header
            csvPathname = 'iris/iris.csv'
            hexKey = 'iris.hex'
            y = 4
            expectedPctWrong = 26
            

        csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv
        csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv
        # for using below in csv reader
        csvFullname = h2i.find_folder_and_filename(bucket, csvPathname, schema='put', returnFullPath=True)

        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
        h2o_cmd.runSummary(key=hexKey)

        # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise)
        trainKey = parseResult['destination_key']

        # just to check. are there any NA/constant cols?
        ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)

        #**************************************************************************
        # first glm1
        h2o.beta_features = False
        CLASS = 1
        # try ignoring the constant col to see if it makes a diff
        kwargs = {
            'lsm_solver': LSM_SOLVER,
            'standardize': STANDARDIZE,
            'y': 'C' + str(y+1),
            'family': FAMILY,
            'n_folds': 0,
            'max_iter': MAX_ITER,
            'beta_epsilon': BETA_EPSILON,
            'case': CLASS,
            'case_mode': '=',
         }

        timeoutSecs = 120
        kwargs.update({'alpha': TRY_ALPHA, 'lambda': TRY_LAMBDA})
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        # hack. fix bad 'family' ('link' is bad too)..so h2o_glm.py works right
        glm['GLMModel']['GLMParams']['family'] = FAMILY
        print "glm1 end on ", csvPathname, 'took', time.time() - start, 'seconds'
        (warnings, coefficients1, intercept1) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        iterations1 = glm['GLMModel']['iterations']
        err1 = glm['GLMModel']['validations'][0]['err']
        nullDev1 = glm['GLMModel']['validations'][0]['nullDev']
        resDev1 = glm['GLMModel']['validations'][0]['resDev']

        if FAMILY == 'binomial':
            classErr1 = glm['GLMModel']['validations'][0]['classErr']
            auc1 = glm['GLMModel']['validations'][0]['auc']

        #**************************************************************************
        # then glm2
        h2o.beta_features = True
        kwargs = {
            # 'ignored_cols': 'C29',
            'standardize': STANDARDIZE,
            'response': 'C' + str(y+1),
            'family': FAMILY,
            'n_folds': 0,
            'max_iter': MAX_ITER,
            'beta_epsilon': BETA_EPSILON}

        timeoutSecs = 120

        # class 1=1, all else 0
        if FAMILY == 'binomial':
            execExpr="B.hex=%s; B.hex[,%s]=(%s[,%s]==%s)" % (trainKey, y+1, trainKey, y+1, CLASS)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            bHack = {'destination_key': 'B.hex'}
        else:
            bHack = parseResult
        kwargs.update({'alpha': TRY_ALPHA, 'lambda': TRY_LAMBDA})

#        kwargs.update({'alpha': 0.0, 'lambda': 0})
        # kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
        # kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
        # bad model (auc=0.5)
        # kwargs.update({'alpha': 0.0, 'lambda': 0.0})
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=bHack, timeoutSecs=timeoutSecs, **kwargs)
        print "glm2 end on ", csvPathname, 'took', time.time() - start, 'seconds'
        (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

        #**************************************************************************
        modelKey = glm['glm_model']['_key']
        submodels = glm['glm_model']['submodels']
        # hackery to make it work when there's just one
        validation = submodels[-1]['validation']
        iteration = submodels[-1]['iteration']

        resDev = validation['residual_deviance']
        nullDev = validation['null_deviance']
        if FAMILY == 'binomial':
            auc = validation['auc']

        self.assertLess(iterations1, MAX_ITER-1, msg="GLM1: Too many iterations, didn't converge %s" % iterations1) 
        self.assertLess(iteration, MAX_ITER-1, msg="GLM2: Too many iterations, didn't converge %s" % iteration)

        nullDevExpected = nullDev1
        # self.assertAlmostEqual(nullDev, nullDevExpected, delta=2, 
        #     msg='GLM2 nullDev %s is too different from GLM1 %s' % (nullDev, nullDevExpected))

        iterationExpected = iterations1
        # self.assertAlmostEqual(iteration, iterationExpected, delta=2, 
        #     msg='GLM2 iteration %s is too different from GLM1 %s' % (iteration, iterationExpected))


        # coefficients is a list.
        coeff0 = coefficients[0]
        coeff0Expected = coefficients1[0]
        print "coeff0 pct delta:", "%0.3f" % (100.0 * (abs(coeff0) - abs(coeff0Expected))/abs(coeff0Expected))
        self.assertTrue(h2o_util.approxEqual(coeff0, coeff0Expected, rel=0.5),
            msg='GLM2 coefficient 0 %s is too different from GLM1 %s' % (coeff0, coeff0Expected))

        
        coeff2 = coefficients[2]
        coeff2Expected = coefficients1[2]
        print "coeff2 pct delta:", "%0.3f" % (100.0 * (abs(coeff2) - abs(coeff2Expected))/abs(coeff2Expected))
        self.assertTrue(h2o_util.approxEqual(coeff2, coeff2Expected, rel=0.5),
            msg='GLM2 coefficient 2 %s is too different from GLM1 %s' % (coeff2, coeff2Expected))

        # compare to known values GLM1 got for class 1 case, with these parameters
        # aucExpected = 0.8428
        if FAMILY == 'binomial':
            aucExpected = auc1
            self.assertAlmostEqual(auc, aucExpected, delta=10, 
                msg='GLM2 auc %s is too different from GLM1 %s' % (auc, aucExpected))

        interceptExpected = intercept1
        print "intercept pct delta:", 100.0 * (abs(intercept) - abs(interceptExpected))/abs(interceptExpected)
        self.assertTrue(h2o_util.approxEqual(intercept, interceptExpected, rel=0.5),
            msg='GLM2 intercept %s is too different from GLM1 %s' % (intercept, interceptExpected))


        # avg_errExpected = 0.2463
        avg_errExpected = err1
        # self.assertAlmostEqual(avg_err, avg_errExpected, delta=0.50*avg_errExpected, 
        #     msg='GLM2 avg_err %s is too different from GLM1 %s' % (avg_err, avg_errExpected))

        # self.assertAlmostEqual(best_threshold, 0.35, delta=0.10*best_threshold, 
        #     msg='GLM2 best_threshold %s is too different from GLM1 %s' % (best_threshold, 0.35))

        #********************
        # Print comparison
        #********************
        interceptDelta = abs(abs(intercept1) - abs(intercept))
        cDelta = [abs(abs(a) - abs(b)) for a,b in zip(coefficients1, coefficients)]

        def printit(self, a, b, c, d):
            pctDiff = abs(d/c)*100
            print "%-20s %-20.5e %8s %5.2f%% %10s %-20.5e" % \
                ("GLM2: " + a + " " + b + ":", c, "pct. diff:", pctDiff, "abs diff:", d)
            # self.assertLess(pctDiff,1,"Expect <1% difference between H2O and R coefficient/intercept")

        printit(self, "intercept", "", intercept1, interceptDelta)
        print "compare lengths coefficients1, coefficients, cDelta:", len(coefficients1), len(coefficients), len(cDelta)
        print "GLM1:", coefficients1
        print "GLM2:", coefficients
        print "cDelta:", cDelta

        for i,cValue in enumerate(coefficients):
            printit(self , "coefficient", "C"+str(i), cValue, cDelta[i])

        hexKey = 'B.hex'
        pctWrong = h2o_rf.predict_and_compare_csvs(modelKey, hexKey, predictHexKey, 
            csvSrcOutputPathname, csvPredictPathname, 
            skipSrcOutputHeader, skipPredictHeader,
            translate=None, y=y)

        # we are predicting using training data...so error is really low
        # self.assertAlmostEqual(pctWrong, classification_error, delta = 0.2, 
        #     msg="predicted pctWrong: %s should be close to training classification error %s" % (pctWrong, classification_error))
        self.assertAlmostEqual(pctWrong, expectedPctWrong, delta = 2.0,
            msg="predicted pctWrong: %s should be small because we're predicting with training data %s" % (pctWrong, expectedPctWrong))
Пример #5
0
    def test_impute_with_na(self):
        csvFilename = 'covtype.data'
        csvPathname = 'standard/' + csvFilename
        hex_key = "covtype.hex"
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       hex_key=hex_key,
                                       schema='local',
                                       timeoutSecs=20)

        print "Just insert some NAs and see what happens"
        inspect = h2o_cmd.runInspect(key=hex_key)
        origNumRows = inspect['numRows']
        origNumCols = inspect['numCols']
        missing_fraction = 0.1

        # NOT ALLOWED TO SET AN ENUM COL?
        if 1 == 0:
            # since insert missing values (below) doesn't insert NA into enum rows, make it NA with exec?
            # just one in row 1
            for enumCol in enumColList:
                print "hack: Putting NA in row 0 of col %s" % enumCol
                execExpr = '%s[1, %s+1] = NA' % (hex_key, enumCol)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)

            inspect = h2o_cmd.runInspect(key=hex_key)
            missingValuesList = h2o_cmd.infoFromInspect(inspect)
            print "missingValuesList after exec:", missingValuesList
            if len(missingValuesList) != len(enumColList):
                raise Exception(
                    "Didn't get missing values in expected number of cols: %s %s"
                    % (enumColList, missingValuesList))

        for trial in range(5):
            # copy the dataset
            hex_key2 = 'c.hex'
            execExpr = '%s = %s' % (hex_key2, hex_key)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)

            imvResult = h2o.nodes[0].insert_missing_values(
                key=hex_key2, missing_fraction=missing_fraction, seed=SEED)
            print "imvResult", h2o.dump_json(imvResult)
            # maybe make the output col a factor column
            # maybe one of the 0,1 cols too?
            # java.lang.IllegalArgumentException: Method `mode` only applicable to factor columns.
            # ugh. ToEnum2 and ToInt2 take 1-based column indexing. This should really change back to 0 based for h2o-dev? (like Exec3)

            print "Doing the ToEnum2 AFTER the NA injection, because h2o doesn't work right if we do it before"
            expectedMissing = missing_fraction * origNumRows  # per col
            enumColList = [49, 50, 51, 52, 53, 54]
            for e in enumColList:
                enumResult = h2o.nodes[0].to_enum(src_key=hex_key2,
                                                  column_index=(e + 1))

            inspect = h2o_cmd.runInspect(key=hex_key2)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            self.assertEqual(origNumRows, numRows)
            self.assertEqual(origNumCols, numCols)

            missingValuesList = h2o_cmd.infoFromInspect(inspect)
            print "missingValuesList", missingValuesList
            if len(missingValuesList) != numCols:
                raise Exception(
                    "Why is missingValuesList not right afer ToEnum2?: %s %s" %
                    (enumColList, missingValuesList))

            for mv in missingValuesList:
                self.assertAlmostEqual(mv,
                                       expectedMissing,
                                       delta=0.1 * mv,
                                       msg='mv %s is not approx. expected %s' %
                                       (mv, expectedMissing))

            summaryResult = h2o_cmd.runSummary(key=hex_key2)
            h2o_cmd.infoFromSummary(summaryResult)
            # h2o_cmd.infoFromSummary(summaryResult)

            print "I don't understand why the values don't increase every iteration. It seems to stay stuck with the first effect"
            print "trial", trial
            print "expectedMissing:", expectedMissing

            print "Now get rid of all the missing values, but imputing means. We know all columns should have NAs from above"
            print "Do the columns in random order"

            # don't do the enum cols ..impute doesn't support right?
            if AVOID_BUG:
                shuffledColList = range(0, 49)  # 0 to 48
                execExpr = '%s = %s[,1:49]' % (hex_key2, hex_key2)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)
                # summaryResult = h2o_cmd.runSummary(key=hex_key2)
                # h2o_cmd.infoFromSummary(summaryResult)
                inspect = h2o_cmd.runInspect(key=hex_key2)
                numCols = inspect['numCols']
                missingValuesList = h2o_cmd.infoFromInspect(inspect)
                print "missingValuesList after impute:", missingValuesList
                if len(missingValuesList) != 49:
                    raise Exception(
                        "expected missing values in all cols after pruning enum cols: %s"
                        % missingValuesList)
            else:
                shuffledColList = range(0, 55)  # 0 to 54

            origInspect = inspect
            random.shuffle(shuffledColList)

            for column in shuffledColList:
                # get a random set of column. no duplicate. random order? 0 is okay? will be []
                groupBy = random.sample(range(55), random.randint(0, 54))
                # header names start with 1, not 0. Empty string if []
                groupByNames = ",".join(
                    map(lambda x: "C" + str(x + 1), groupBy))

                # what happens if column and groupByNames overlap?? Do we loop here and choose until no overlap
                columnName = "C%s" % (column + 1)
                print "don't use mode if col isn't enum"
                badChoices = True
                while badChoices:
                    method = random.choice(["mean", "median", "mode"])
                    badChoices = column not in enumColList and method == "mode"

                NEWSEED = random.randint(0, sys.maxint)
                print "does impute modify the source key?"
                # we get h2o error (argument exception) if no NAs
                impResult = h2o.nodes[0].impute(source=hex_key2,
                                                column=column,
                                                method=method)

            print "Now check that there are no missing values"
            print "FIX! broken..insert missing values doesn't insert NAs in enum cols"

            inspect = h2o_cmd.runInspect(key=hex_key2)
            numRows2 = inspect['numRows']
            numCols2 = inspect['numCols']
            self.assertEqual(
                numRows, numRows2,
                "imput shouldn't have changed frame numRows: %s %s" %
                (numRows, numRows2))

            self.assertEqual(
                numCols, numCols2,
                "imput shouldn't have changed frame numCols: %s %s" %
                (numCols, numCols2))

            # check that the mean didn't change for the col
            # the enum cols with mode, we'll have to think of something else

            missingValuesList = h2o_cmd.infoFromInspect(inspect)
            print "missingValuesList after impute:", missingValuesList
            if missingValuesList:
                raise Exception(
                    "Not expecting any missing values after imputing all cols: %s"
                    % missingValuesList)

            cols = inspect['cols']
            origCols = origInspect['cols']
            for i, (c, oc) in enumerate(zip(cols, origCols)):
                # I suppose since we impute to either median or mean, we can't assume the mean stays the same
                # but for this tolerance it's okay (maybe a different dataset, that wouldn't be true
                h2o_util.approxEqual(
                    c['mean'],
                    oc['mean'],
                    tol=0.000000001,
                    msg=
                    "col %i original mean: %s not equal to mean after impute: %s"
                    % (i, c['mean'], oc['mean']))
Пример #6
0
    def test_summary2_unifiles(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # new with 1000 bins. copy expected from R
        tryList = [
            (
                'cars.csv',
                'c.hex',
                [
                    (None, None, None, None, None, None),
                    ('economy (mpg)', None, None, None, None, None),
                    ('cylinders', None, None, None, None, None),
                ],
            ),
            (
                'runifA.csv',
                'A.hex',
                [
                    (None, 1.00, 25.00, 50.00, 75.00, 100.0),
                    ('x', -99.9, -44.7, 8.26, 58.00, 91.7),
                ],
            ),
            # colname, (min, 25th, 50th, 75th, max)
            (
                'runif.csv',
                'x.hex',
                [
                    (None, 1.00, 5000.0, 10000.0, 15000.0, 20000.00),
                    ('D', -5000.00, -3735.0, -2443, -1187.0, 99.8),
                    ('E', -100000.0, -49208.0, 1783.8, 50621.9, 100000.0),
                    ('F', -1.00, -0.4886, 0.00868, 0.5048, 1.00),
                ],
            ),
            (
                'runifB.csv',
                'B.hex',
                [
                    (None, 1.00, 2501.00, 5001.00, 7501.00, 10000.00),
                    ('x', -100.00, -50.1, 0.974, 51.7, 100, 00),
                ],
            ),
            (
                'runifC.csv',
                'C.hex',
                [
                    (None, 1.00, 25002.00, 50002.00, 75002.00, 100000.00),
                    ('x', -100.00, -50.45, -1.135, 49.28, 100.00),
                ],
            ),
        ]

        timeoutSecs = 15
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        timeoutSecs = 60
        for (csvFilename, hex_key, expectedCols) in tryList:

            csvPathname = csvFilename
            csvPathnameFull = h2i.find_folder_and_filename('smalldata',
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(bucket='smalldata',
                                           path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           doSummary=False)

            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            # okay to get more cols than we want
            # okay to vary MAX_QBINS because we adjust the expected accuracy
            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))
            summaries = summaryResult['summaries']

            scipyCol = 0
            for expected, column in zip(expectedCols, summaries):
                colname = column['colname']
                if expected[0]:
                    self.assertEqual(colname,
                                     expected[0]), colname, expected[0]
                else:
                    # if the colname is None, skip it (so we don't barf on strings on the h2o quantile page
                    scipyCol += 1
                    continue

                quantile = 0.5 if DO_MEDIAN else .999
                # h2o has problem if a list of columns (or dictionary) is passed to 'column' param
                q = h2o.nodes[0].quantiles(
                    source_key=hex_key,
                    column=column['colname'],
                    quantile=quantile,
                    max_qbins=MAX_QBINS,
                    multiple_pass=2,
                    interpolation_type=7)  # for comparing to summary2
                qresult = q['result']
                qresult_single = q['result_single']
                h2p.blue_print("h2o quantiles result:", qresult)
                h2p.blue_print("h2o quantiles result_single:", qresult_single)
                h2p.blue_print("h2o quantiles iterations:", q['iterations'])
                h2p.blue_print("h2o quantiles interpolated:",
                               q['interpolated'])
                print h2o.dump_json(q)

                # ('',  '1.00', '25002.00', '50002.00', '75002.00', '100000.00'),

                coltype = column['type']
                nacnt = column['nacnt']

                stats = column['stats']
                stattype = stats['type']
                print stattype

                # FIX! we should compare mean and sd to expected?
                # enums don't have mean or sd?
                if stattype != 'Enum':
                    mean = stats['mean']
                    sd = stats['sd']
                    zeros = stats['zeros']
                    mins = stats['mins']
                    maxs = stats['maxs']

                    print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                        mean)
                    print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                        sd)

                    pct = stats['pct']
                    print "pct:", pct
                    print ""

                    # the thresholds h2o used, should match what we expected
                    expectedPct = [
                        0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9,
                        0.95, 0.99
                    ]
                    pctile = stats['pctile']

                # figure out the expected max error
                # use this for comparing to sklearn/sort
                if expected[1] and expected[5]:
                    expectedRange = expected[5] - expected[1]
                    # because of floor and ceil effects due we potentially lose 2 bins (worst case)
                    # the extra bin for the max value, is an extra bin..ignore
                    expectedBin = expectedRange / (MAX_QBINS - 2)
                    maxErr = 0.5 * expectedBin  # should we have some fuzz for fp?

                else:
                    print "Test won't calculate max expected error"
                    maxErr = 0

                # hack..assume just one None is enough to ignore for cars.csv
                if expected[1]:
                    h2o_util.assertApproxEqual(
                        mins[0],
                        expected[1],
                        tol=maxErr,
                        msg='min is not approx. expected')
                if expected[2]:
                    h2o_util.assertApproxEqual(
                        pctile[3],
                        expected[2],
                        tol=maxErr,
                        msg='25th percentile is not approx. expected')
                if expected[3]:
                    h2o_util.assertApproxEqual(
                        pctile[5],
                        expected[3],
                        tol=maxErr,
                        msg='50th percentile (median) is not approx. expected')
                if expected[4]:
                    h2o_util.assertApproxEqual(
                        pctile[7],
                        expected[4],
                        tol=maxErr,
                        msg='75th percentile is not approx. expected')
                if expected[5]:
                    h2o_util.assertApproxEqual(
                        maxs[0],
                        expected[5],
                        tol=maxErr,
                        msg='max is not approx. expected')

                hstart = column['hstart']
                hstep = column['hstep']
                hbrk = column['hbrk']
                hcnt = column['hcnt']

                for b in hcnt:
                    # should we be able to check for a uniform distribution in the files?
                    e = .1 * numRows
                    # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount,
                    #     msg="Bins not right. b: %s e: %s" % (b, e))

                if stattype != 'Enum':
                    pt = h2o_util.twoDecimals(pctile)
                    print "colname:", colname, "pctile (2 places):", pt
                    mx = h2o_util.twoDecimals(maxs)
                    mn = h2o_util.twoDecimals(mins)
                    print "colname:", colname, "maxs: (2 places):", mx
                    print "colname:", colname, "mins: (2 places):", mn

                    # FIX! we should do an exec and compare using the exec quantile too
                    actual = mn[0], pt[3], pt[5], pt[7], mx[0]
                    print "min/25/50/75/max colname:", colname, "(2 places):", actual
                    print "maxs colname:", colname, "(2 places):", mx
                    print "mins colname:", colname, "(2 places):", mn

                    # don't check if colname is empty..means it's a string and scipy doesn't parse right?
                    # need to ignore the car names
                    if colname != '' and expected[scipyCol]:
                        # don't do for enums
                        # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                        h2o_summ.quantile_comparisons(
                            csvPathnameFull,
                            skipHeader=True,
                            col=scipyCol,
                            datatype='float',
                            quantile=0.5 if DO_MEDIAN else 0.999,
                            # FIX! ignore for now
                            h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                            h2oQuantilesApprox=qresult_single,
                            h2oQuantilesExact=qresult,
                            h2oSummary2MaxErr=maxErr,
                        )

                        if False and h2o_util.approxEqual(pctile[5],
                                                          0.990238116744,
                                                          tol=0.002,
                                                          msg='stop here'):
                            raise Exception("stopping to look")

                scipyCol += 1

            trial += 1
Пример #7
0
    def test_GLM1_GLM2_predict(self):
        # h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()

        trees = 15
        timeoutSecs = 120
        predictHexKey = 'predict_0.hex'
        predictCsv = 'predict_0.csv'
        actualCsv = 'actual_0.csv'

        if 1 == 0:
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.data'
            hexKey = 'covtype.data.hex'
            y = 54
            expectedPctWrong = 0

        if 1 == 0:
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.shuffled.10pct.data'
            hexKey = 'covtype.shuffled.10pct.data.hex'
            y = 54
            expectedPctWrong = 0

        if 1 == 1:
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            bucket = 'smalldata'
            # no header
            csvPathname = 'iris/iris.csv'
            hexKey = 'iris.hex'
            y = 4
            expectedPctWrong = 26

        csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv
        csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv
        # for using below in csv reader
        csvFullname = h2i.find_folder_and_filename(bucket,
                                                   csvPathname,
                                                   schema='put',
                                                   returnFullPath=True)

        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexKey)
        h2o_cmd.runSummary(key=hexKey)

        # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise)
        trainKey = parseResult['destination_key']

        # just to check. are there any NA/constant cols?
        ignore_x = h2o_glm.goodXFromColumnInfo(
            y, key=parseResult['destination_key'], timeoutSecs=300)

        #**************************************************************************
        # first glm1
        CLASS = 1
        # try ignoring the constant col to see if it makes a diff
        kwargs = {
            'lsm_solver': LSM_SOLVER,
            'standardize': STANDARDIZE,
            'y': 'C' + str(y + 1),
            'family': FAMILY,
            'n_folds': 0,
            'max_iter': MAX_ITER,
            'beta_epsilon': BETA_EPSILON,
            'case': CLASS,
            'case_mode': '=',
        }

        timeoutSecs = 120
        kwargs.update({'alpha': TRY_ALPHA, 'lambda': TRY_LAMBDA})
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult,
                             timeoutSecs=timeoutSecs,
                             **kwargs)
        # hack. fix bad 'family' ('link' is bad too)..so h2o_glm.py works right
        glm['GLMModel']['GLMParams']['family'] = FAMILY
        print "glm1 end on ", csvPathname, 'took', time.time(
        ) - start, 'seconds'
        (warnings, coefficients1,
         intercept1) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        iterations1 = glm['GLMModel']['iterations']
        err1 = glm['GLMModel']['validations'][0]['err']
        nullDev1 = glm['GLMModel']['validations'][0]['nullDev']
        resDev1 = glm['GLMModel']['validations'][0]['resDev']

        if FAMILY == 'binomial':
            classErr1 = glm['GLMModel']['validations'][0]['classErr']
            auc1 = glm['GLMModel']['validations'][0]['auc']

        #**************************************************************************
        # then glm2
        kwargs = {
            # 'ignored_cols': 'C29',
            'standardize': STANDARDIZE,
            'response': 'C' + str(y + 1),
            'family': FAMILY,
            'n_folds': 0,
            'max_iter': MAX_ITER,
            'beta_epsilon': BETA_EPSILON
        }

        timeoutSecs = 120

        # class 1=1, all else 0
        if FAMILY == 'binomial':
            execExpr = "B.hex=%s; B.hex[,%s]=(%s[,%s]==%s)" % (
                trainKey, y + 1, trainKey, y + 1, CLASS)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            bHack = {'destination_key': 'B.hex'}
        else:
            bHack = parseResult
        kwargs.update({'alpha': TRY_ALPHA, 'lambda': TRY_LAMBDA})

        #        kwargs.update({'alpha': 0.0, 'lambda': 0})
        # kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
        # kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
        # bad model (auc=0.5)
        # kwargs.update({'alpha': 0.0, 'lambda': 0.0})
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=bHack,
                             timeoutSecs=timeoutSecs,
                             **kwargs)
        print "glm2 end on ", csvPathname, 'took', time.time(
        ) - start, 'seconds'
        (warnings, coefficients,
         intercept) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

        #**************************************************************************
        modelKey = glm['glm_model']['_key']
        submodels = glm['glm_model']['submodels']
        # hackery to make it work when there's just one
        validation = submodels[-1]['validation']
        iteration = submodels[-1]['iteration']

        resDev = validation['residual_deviance']
        nullDev = validation['null_deviance']
        if FAMILY == 'binomial':
            auc = validation['auc']

        self.assertLess(iterations1,
                        MAX_ITER - 1,
                        msg="GLM1: Too many iterations, didn't converge %s" %
                        iterations1)
        self.assertLess(iteration,
                        MAX_ITER - 1,
                        msg="GLM2: Too many iterations, didn't converge %s" %
                        iteration)

        nullDevExpected = nullDev1
        # self.assertAlmostEqual(nullDev, nullDevExpected, delta=2,
        #     msg='GLM2 nullDev %s is too different from GLM1 %s' % (nullDev, nullDevExpected))

        iterationExpected = iterations1
        # self.assertAlmostEqual(iteration, iterationExpected, delta=2,
        #     msg='GLM2 iteration %s is too different from GLM1 %s' % (iteration, iterationExpected))

        # coefficients is a list.
        coeff0 = coefficients[0]
        coeff0Expected = coefficients1[0]
        print "coeff0 pct delta:", "%0.3f" % (
            100.0 * (abs(coeff0) - abs(coeff0Expected)) / abs(coeff0Expected))
        self.assertTrue(
            h2o_util.approxEqual(coeff0, coeff0Expected, rel=0.5),
            msg='GLM2 coefficient 0 %s is too different from GLM1 %s' %
            (coeff0, coeff0Expected))

        coeff2 = coefficients[2]
        coeff2Expected = coefficients1[2]
        print "coeff2 pct delta:", "%0.3f" % (
            100.0 * (abs(coeff2) - abs(coeff2Expected)) / abs(coeff2Expected))
        self.assertTrue(
            h2o_util.approxEqual(coeff2, coeff2Expected, rel=0.5),
            msg='GLM2 coefficient 2 %s is too different from GLM1 %s' %
            (coeff2, coeff2Expected))

        # compare to known values GLM1 got for class 1 case, with these parameters
        # aucExpected = 0.8428
        if FAMILY == 'binomial':
            aucExpected = auc1
            self.assertAlmostEqual(
                auc,
                aucExpected,
                delta=10,
                msg='GLM2 auc %s is too different from GLM1 %s' %
                (auc, aucExpected))

        interceptExpected = intercept1
        print "intercept pct delta:", 100.0 * (
            abs(intercept) - abs(interceptExpected)) / abs(interceptExpected)
        self.assertTrue(h2o_util.approxEqual(intercept,
                                             interceptExpected,
                                             rel=0.5),
                        msg='GLM2 intercept %s is too different from GLM1 %s' %
                        (intercept, interceptExpected))

        # avg_errExpected = 0.2463
        avg_errExpected = err1
        # self.assertAlmostEqual(avg_err, avg_errExpected, delta=0.50*avg_errExpected,
        #     msg='GLM2 avg_err %s is too different from GLM1 %s' % (avg_err, avg_errExpected))

        # self.assertAlmostEqual(best_threshold, 0.35, delta=0.10*best_threshold,
        #     msg='GLM2 best_threshold %s is too different from GLM1 %s' % (best_threshold, 0.35))

        #********************
        # Print comparison
        #********************
        interceptDelta = abs(abs(intercept1) - abs(intercept))
        cDelta = [
            abs(abs(a) - abs(b)) for a, b in zip(coefficients1, coefficients)
        ]

        def printit(self, a, b, c, d):
            pctDiff = abs(d / c) * 100
            print "%-20s %-20.5e %8s %5.2f%% %10s %-20.5e" % \
                ("GLM2: " + a + " " + b + ":", c, "pct. diff:", pctDiff, "abs diff:", d)
            # self.assertLess(pctDiff,1,"Expect <1% difference between H2O and R coefficient/intercept")

        printit(self, "intercept", "", intercept1, interceptDelta)
        print "compare lengths coefficients1, coefficients, cDelta:", len(
            coefficients1), len(coefficients), len(cDelta)
        print "GLM1:", coefficients1
        print "GLM2:", coefficients
        print "cDelta:", cDelta

        for i, cValue in enumerate(coefficients):
            printit(self, "coefficient", "C" + str(i), cValue, cDelta[i])

        hexKey = 'B.hex'
        pctWrong = h2o_rf.predict_and_compare_csvs(modelKey,
                                                   hexKey,
                                                   predictHexKey,
                                                   csvSrcOutputPathname,
                                                   csvPredictPathname,
                                                   skipSrcOutputHeader,
                                                   skipPredictHeader,
                                                   translate=None,
                                                   y=y)

        # we are predicting using training data...so error is really low
        # self.assertAlmostEqual(pctWrong, classification_error, delta = 0.2,
        #     msg="predicted pctWrong: %s should be close to training classification error %s" % (pctWrong, classification_error))
        self.assertAlmostEqual(
            pctWrong,
            expectedPctWrong,
            delta=2.0,
            msg=
            "predicted pctWrong: %s should be small because we're predicting with training data %s"
            % (pctWrong, expectedPctWrong))
Пример #8
0
    def test_many_fp_formats_libsvm_2_fvec(self):
        #h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 10000, 'cA', 300, 'sparse50'),
            (100, 10000, 'cB', 300, 'sparse'),
            # (100, 40000, 'cC', 300, 'sparse50'),
            # (100, 40000, 'cD', 300, 'sparse'),
        ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs,
             distribution) in tryList:
            NUM_CASES = h2o_util.fp_format()
            for sel in [random.randint(0, NUM_CASES - 1)]:  # len(caseList)
                SEEDPERFILE = random.randint(0, sys.maxint)
                csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel,
                                                       rowCount, colCount)
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename

                print "Creating random", csvPathname
                # dict of col sums for comparison to exec col sums below
                (colNumberMax,
                 synColSumDict) = write_syn_dataset(csvPathname, rowCount,
                                                    colCount, SEEDPERFILE, sel,
                                                    distribution)

                selKey2 = hex_key + "_" + str(sel)
                print "This dataset requires telling h2o parse it's a libsvm..doesn't detect automatically"
                parseResult = h2i.import_parse(path=csvPathname,
                                               schema='put',
                                               hex_key=selKey2,
                                               timeoutSecs=timeoutSecs,
                                               doSummary=False,
                                               parser_type='SVMLight')
                print "Parse result['destination_key']:", parseResult[
                    'destination_key']
                inspect = h2o_cmd.runInspect(None,
                                             parseResult['destination_key'],
                                             max_column_display=colNumberMax +
                                             1,
                                             timeoutSecs=timeoutSecs)
                numCols = inspect['numCols']
                numRows = inspect['numRows']
                print "\n" + csvFilename

                # SUMMARY****************************************
                # gives us some reporting on missing values, constant values,
                # to see if we have x specified well
                # figures out everything from parseResult['destination_key']
                # needs y to avoid output column (which can be index or name)
                # assume all the configs have the same y..just check with the firs tone
                goodX = h2o_glm.goodXFromColumnInfo(
                    y=0,
                    key=parseResult['destination_key'],
                    timeoutSecs=300,
                    noPrint=True)

                if DO_SUMMARY:
                    summaryResult = h2o_cmd.runSummary(
                        key=selKey2,
                        max_column_display=colNumberMax + 1,
                        timeoutSecs=timeoutSecs)
                    h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

                self.assertEqual(
                    colNumberMax + 1,
                    numCols,
                    msg=
                    "generated %s cols (including output).  parsed to %s cols"
                    % (colNumberMax + 1, numCols))

                # Exec (column sums)*************************************************
                if DO_COMPARE_SUM:
                    h2e.exec_zero_list(zeroList)
                    colResultList = h2e.exec_expr_list_across_cols(
                        None,
                        exprList,
                        selKey2,
                        maxCol=colNumberMax + 1,
                        timeoutSecs=timeoutSecs,
                        print_params=False)
                    #print "\n*************"
                    #print "colResultList", colResultList
                    #print "*************"

                self.assertEqual(rowCount,
                                 numRows,
                                 msg="generated %s rows, parsed to %s rows" %
                                 (rowCount, numRows))
                # need to fix this for compare to expected
                # we should be able to keep the list of fp sums per col above
                # when we generate the dataset

                sortedColSumDict = OrderedDict(sorted(synColSumDict.items()))
                print sortedColSumDict
                for k, v in sortedColSumDict.iteritems():
                    print k
                    if DO_COMPARE_SUM:
                        # k should be integers that match the number of cols
                        self.assertTrue(k >= 0 and k < len(colResultList))
                        compare = colResultList[k]
                        print "\nComparing col sums:", v, compare
                        # Even though we're comparing floating point sums, the operations probably should have
                        # been done in same order, so maybe the comparison can be exact (or not!)
                        self.assertAlmostEqual(
                            v,
                            compare,
                            places=0,
                            msg='%0.6f col sum is not equal to expected %0.6f'
                            % (v, compare))

                    synMean = (v + 0.0) / rowCount
                    # enums don't have mean, but we're not enums
                    mean = float(inspect['cols'][k]['mean'])
                    # our fp formats in the syn generation sometimes only have two places?
                    if not h2o_util.approxEqual(mean, synMean, tol=1e-3):
                        execExpr = 'sum(%s[,%s])' % (selKey2, k + 1)
                        resultExec = h2o_cmd.runExec(str=execExpr,
                                                     timeoutSecs=300)
                        print "Result of exec sum on failing col:..:", k, h2o.dump_json(
                            resultExec)
                        print "Result of remembered sum on failing col:..:", k, v
                        print "Result of inspect mean * rowCount on failing col..:", mean * rowCount
                        print "k: ", k, "mean: ", mean, "remembered sum/rowCount : ", synMean
                        sys.stdout.flush()
                        raise Exception(
                            'col %s mean %0.6f is not equal to generated mean %0.6f'
                            % (k, mean, synMean))

                    naCnt = inspect['cols'][k]['naCnt']
                    self.assertEqual(0,
                                     naCnt,
                                     msg='col %s naCnt %d should be 0' %
                                     (k, naCnt))
    def test_many_fp_formats_libsvm_2_fvec(self):
        #h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 10000, 'cA', 300, 'sparse50'),
            (100, 10000, 'cB', 300, 'sparse'),
            # (100, 40000, 'cC', 300, 'sparse50'),
            # (100, 40000, 'cD', 300, 'sparse'),
            ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
            NUM_CASES = h2o_util.fp_format()
            for sel in [random.randint(0,NUM_CASES-1)]: # len(caseList)
                SEEDPERFILE = random.randint(0, sys.maxint)
                csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename

                print "Creating random", csvPathname
                # dict of col sums for comparison to exec col sums below
                (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel, distribution)

                selKey2 = hex_key + "_" + str(sel)
                print "This dataset requires telling h2o parse it's a libsvm..doesn't detect automatically"
                parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2, 
                    timeoutSecs=timeoutSecs, doSummary=False, parser_type='SVMLight')
                print "Parse result['destination_key']:", parseResult['destination_key']
                inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], max_column_display=colNumberMax+1, timeoutSecs=timeoutSecs)
                numCols = inspect['numCols']
                numRows = inspect['numRows']
                print "\n" + csvFilename

                # SUMMARY****************************************
                # gives us some reporting on missing values, constant values, 
                # to see if we have x specified well
                # figures out everything from parseResult['destination_key']
                # needs y to avoid output column (which can be index or name)
                # assume all the configs have the same y..just check with the firs tone
                goodX = h2o_glm.goodXFromColumnInfo(y=0,
                    key=parseResult['destination_key'], timeoutSecs=300, noPrint=True)

                if DO_SUMMARY:
                    summaryResult = h2o_cmd.runSummary(key=selKey2, max_column_display=colNumberMax+1, timeoutSecs=timeoutSecs)
                    h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

                self.assertEqual(colNumberMax+1, numCols, msg="generated %s cols (including output).  parsed to %s cols" % (colNumberMax+1, numCols))

                # Exec (column sums)*************************************************
                if DO_COMPARE_SUM:
                    h2e.exec_zero_list(zeroList)
                    colResultList = h2e.exec_expr_list_across_cols(None, exprList, selKey2, maxCol=colNumberMax+1,
                        timeoutSecs=timeoutSecs, print_params=False)
                    #print "\n*************"
                    #print "colResultList", colResultList
                    #print "*************"

                self.assertEqual(rowCount, numRows, msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))
                # need to fix this for compare to expected
                # we should be able to keep the list of fp sums per col above
                # when we generate the dataset

                sortedColSumDict = OrderedDict(sorted(synColSumDict.items()))
                print sortedColSumDict
                for k,v in sortedColSumDict.iteritems():
                    print k
                    if DO_COMPARE_SUM:
                        # k should be integers that match the number of cols
                        self.assertTrue(k>=0 and k<len(colResultList))
                        compare = colResultList[k]
                        print "\nComparing col sums:", v, compare
                        # Even though we're comparing floating point sums, the operations probably should have
                        # been done in same order, so maybe the comparison can be exact (or not!)
                        self.assertAlmostEqual(v, compare, places=0, 
                            msg='%0.6f col sum is not equal to expected %0.6f' % (v, compare))

                    synMean = (v + 0.0)/rowCount
                    # enums don't have mean, but we're not enums
                    mean = float(inspect['cols'][k]['mean'])
                    # our fp formats in the syn generation sometimes only have two places?
                    if not h2o_util.approxEqual(mean, synMean, tol=1e-3):
                        execExpr = 'sum(%s[,%s])' % (selKey2, k+1)
                        resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300) 
                        print "Result of exec sum on failing col:..:", k, h2o.dump_json(resultExec)
                        print "Result of remembered sum on failing col:..:", k, v
                        print "Result of inspect mean * rowCount on failing col..:", mean * rowCount
                        print "k: ",k , "mean: ", mean, "remembered sum/rowCount : ", synMean
                        sys.stdout.flush()
                        raise Exception('col %s mean %0.6f is not equal to generated mean %0.6f' % (k, mean, synMean))

                    naCnt = inspect['cols'][k]['naCnt']
                    self.assertEqual(0, naCnt,
                        msg='col %s naCnt %d should be 0' % (k, naCnt))
Пример #10
0
    def test_impute_with_na(self):
        csvFilename = 'covtype.data'
        csvPathname = 'standard/' + csvFilename
        hex_key = "covtype.hex"
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, schema='local', timeoutSecs=20)

        print "Just insert some NAs and see what happens"
        inspect = h2o_cmd.runInspect(key=hex_key)
        origNumRows = inspect['numRows']
        origNumCols = inspect['numCols']
        missing_fraction = 0.1



        # NOT ALLOWED TO SET AN ENUM COL?
        if 1==0:
            # since insert missing values (below) doesn't insert NA into enum rows, make it NA with exec?
            # just one in row 1
            for enumCol in enumColList:
                print "hack: Putting NA in row 0 of col %s" % enumCol
                execExpr = '%s[1, %s+1] = NA' % (hex_key, enumCol)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)

            inspect = h2o_cmd.runInspect(key=hex_key)
            missingValuesList = h2o_cmd.infoFromInspect(inspect)
            print "missingValuesList after exec:", missingValuesList
            if len(missingValuesList) != len(enumColList):
                raise Exception ("Didn't get missing values in expected number of cols: %s %s" % (enumColList, missingValuesList))


        for trial in range(5):
            # copy the dataset
            hex_key2 = 'c.hex'
            execExpr = '%s = %s' % (hex_key2, hex_key)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)

            imvResult = h2o.nodes[0].insert_missing_values(key=hex_key2, missing_fraction=missing_fraction, seed=SEED)
            print "imvResult", h2o.dump_json(imvResult)
            # maybe make the output col a factor column
            # maybe one of the 0,1 cols too? 
            # java.lang.IllegalArgumentException: Method `mode` only applicable to factor columns.
            # ugh. ToEnum2 and ToInt2 take 1-based column indexing. This should really change back to 0 based for h2o-dev? (like Exec3)

            print "Doing the ToEnum2 AFTER the NA injection, because h2o doesn't work right if we do it before"
            expectedMissing = missing_fraction * origNumRows # per col
            enumColList = [49, 50, 51, 52, 53, 54] 
            for e in enumColList:
                enumResult = h2o.nodes[0].to_enum(src_key=hex_key2, column_index=(e+1))

            inspect = h2o_cmd.runInspect(key=hex_key2)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            self.assertEqual(origNumRows, numRows)
            self.assertEqual(origNumCols, numCols)

            missingValuesList = h2o_cmd.infoFromInspect(inspect)
            print "missingValuesList", missingValuesList
            if len(missingValuesList) != numCols:
                raise Exception ("Why is missingValuesList not right afer ToEnum2?: %s %s" % (enumColList, missingValuesList))

            for mv in missingValuesList:
                self.assertAlmostEqual(mv, expectedMissing, delta=0.1 * mv, msg='mv %s is not approx. expected %s' % (mv, expectedMissing))

            summaryResult = h2o_cmd.runSummary(key=hex_key2)
            h2o_cmd.infoFromSummary(summaryResult)
            # h2o_cmd.infoFromSummary(summaryResult)

            print "I don't understand why the values don't increase every iteration. It seems to stay stuck with the first effect"
            print "trial", trial
            print "expectedMissing:", expectedMissing

            print "Now get rid of all the missing values, but imputing means. We know all columns should have NAs from above"
            print "Do the columns in random order"


            # don't do the enum cols ..impute doesn't support right?
            if AVOID_BUG:
                shuffledColList = range(0,49) # 0 to 48
                execExpr = '%s = %s[,1:49]' % (hex_key2, hex_key2)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)
                # summaryResult = h2o_cmd.runSummary(key=hex_key2)
                # h2o_cmd.infoFromSummary(summaryResult)
                inspect = h2o_cmd.runInspect(key=hex_key2)
                numCols = inspect['numCols']
                missingValuesList = h2o_cmd.infoFromInspect(inspect)
                print "missingValuesList after impute:", missingValuesList
                if len(missingValuesList) != 49:
                    raise Exception ("expected missing values in all cols after pruning enum cols: %s" % missingValuesList)
            else:
                shuffledColList = range(0,55) # 0 to 54
            
            origInspect = inspect
            random.shuffle(shuffledColList)

            for column in shuffledColList: 
                # get a random set of column. no duplicate. random order? 0 is okay? will be []
                groupBy = random.sample(range(55), random.randint(0, 54))
                # header names start with 1, not 0. Empty string if []
                groupByNames = ",".join(map(lambda x: "C" + str(x+1), groupBy))

                # what happens if column and groupByNames overlap?? Do we loop here and choose until no overlap
                columnName = "C%s" % (column + 1) 
                print "don't use mode if col isn't enum"
                badChoices = True
                while badChoices:
                    method = random.choice(["mean", "median", "mode"])
                    badChoices = column not in enumColList and method=="mode"

                NEWSEED = random.randint(0, sys.maxint)
                print "does impute modify the source key?"
                # we get h2o error (argument exception) if no NAs
                impResult = h2o.nodes[0].impute(source=hex_key2, column=column, method=method)

            print "Now check that there are no missing values"
            print "FIX! broken..insert missing values doesn't insert NAs in enum cols"

            inspect = h2o_cmd.runInspect(key=hex_key2)
            numRows2 = inspect['numRows']
            numCols2 = inspect['numCols']
            self.assertEqual(numRows, numRows2, "imput shouldn't have changed frame numRows: %s %s" % (numRows, numRows2))

            
            self.assertEqual(numCols, numCols2, "imput shouldn't have changed frame numCols: %s %s" % (numCols, numCols2))


            # check that the mean didn't change for the col
            # the enum cols with mode, we'll have to think of something else

            missingValuesList = h2o_cmd.infoFromInspect(inspect)
            print "missingValuesList after impute:", missingValuesList
            if missingValuesList:
                raise Exception ("Not expecting any missing values after imputing all cols: %s" % missingValuesList)
            
            cols = inspect['cols']
            origCols = origInspect['cols']
            for i, (c, oc) in enumerate(zip(cols, origCols)):
                # I suppose since we impute to either median or mean, we can't assume the mean stays the same
                # but for this tolerance it's okay (maybe a different dataset, that wouldn't be true
                h2o_util.approxEqual(c['mean'], oc['mean'], tol=0.000000001, 
                    msg="col %i original mean: %s not equal to mean after impute: %s" % (i, c['mean'], oc['mean']))