Example #1
File: test_GLM2.py Project: zhuyuecai/h2o
    def process_dataset(self, parseResult, Y, e_coefs, e_ndev, e_rdev, e_aic, **kwargs):
        # no regularization
        kwargs['alpha'] = 0
        kwargs['lambda'] = 0
        kwargs['response'] = 'CAPSULE'
        glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=10, **kwargs)

        (warnings, clist, intercept) = h2o_glm.simpleCheckGLM(self, glmResult, None, **kwargs)
        cstring = "".join([("%.5e  " % c) for c in clist])
        h2p.green_print("h2o coefficient list:", cstring)
        h2p.green_print("h2o intercept", "%.5e  " %  intercept)

        # other stuff in the json response

        # the first submodel is the right one, if only one lambda is provided as a parameter above
        glm_model = glmResult['glm_model']
        submodels = glm_model['submodels'][0]
        validation = submodels['validation']
        null_deviance = validation['null_deviance']
        residual_deviance = validation['residual_deviance']

        errors = []
        # FIX! our null deviance doesn't seem to match
        h2o.verboseprint("Comparing:", null_deviance, e_ndev)
        # if abs(float(null_deviance) - e_ndev) > (0.001 * e_ndev):
        #    errors.append('NullDeviance: %f != %s' % (e_ndev, null_deviance))

        # FIX! our res deviance doesn't seem to match
        h2o.verboseprint("Comparing:", residual_deviance, e_rdev)
        # if abs(float(residual_deviance) - e_rdev) > (0.001 * e_rdev):
        #    errors.append('ResDeviance: %f != %s' % (e_rdev, residual_deviance))

        # FIX! we don't have an AIC to compare?
        return errors
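
A minimal driver sketch for the helper above, assuming the prostate dataset is parsed with h2i.import_parse first; the bucket/path and the None placeholders for the expected statistics are assumptions, not from the original test:

    def test_glm_prostate_sketch(self):
        # Hypothetical driver for process_dataset; parse the csv, then let the
        # helper run GLM with no regularization and collect any mismatches.
        parseResult = h2i.import_parse(bucket='smalldata', path='logreg/prostate.csv',
            schema='local', timeoutSecs=180)
        kwargs = {'family': 'binomial', 'n_folds': 0}
        errors = self.process_dataset(parseResult, 'CAPSULE',
            e_coefs=None, e_ndev=None, e_rdev=None, e_aic=None, **kwargs)
        if errors:
            raise Exception('\n'.join(errors))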
Example #2
    def test_build_for_clone(self):
        # python gets confused about which 'start' is meant if I use 'start' here
        elapsed = time.time() - beginning
        print "\n%0.2f seconds to get here from start" % elapsed

        # might as well open a browser on it? (because the ip/port will vary)
        # maybe just print the ip/port for now
        ## h2b.browseTheCloud()

        maxTime = 4*3600
        totalTime = 0
        incrTime = 60
        h2p.purple_print("\nSleeping for total of", (maxTime+0.0)/3600, "hours.")
        print "Will check h2o logs every", incrTime, "seconds"
        print "Should be able to run another test using h2o-nodes.json to clone cloud"
        print "i.e. h2o.build_cloud_with_json()"
        print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"

        h2p.green_print("To watch cloud in browser follow address:")
        h2p.green_print("   http://{0}:{1}/Cloud.html".format(h2o.nodes[0].http_addr, h2o.nodes[0].port))
        h2p.blue_print("You can start a test (or tests) now!") 

        h2p.blue_print("Will Check cloud status every %s secs and kill cloud if wrong or no answer" % incrTime)
        if CHECK_WHILE_SLEEPING:        
            h2p.blue_print("Will also look at redirected stdout/stderr logs in sandbox every %s secs" % incrTime)

        h2p.red_print("No checking of logs while sleeping, or check of cloud status")
        h2p.yellow_print("So if H2O stack traces, it's up to you to kill me if 4 hours is too long")
        h2p.yellow_print("ctrl-c will cause all jvms to die(thru psutil terminate, paramiko channel death or h2o shutdown...")


        while (totalTime<maxTime): # die after 4 hours
            time.sleep(incrTime)
            totalTime += incrTime
            # good to touch all the nodes to see if they're still responsive
            # give them up to 120 secs to respond (each individually)

            ### h2o.verify_cloud_size(timeoutSecs=120)
            if CHECK_WHILE_SLEEPING:        
                print "Checking sandbox log files"
                h2o.check_sandbox_for_errors(cloudShutdownIsError=True)
            else:
                print str(datetime.datetime.now()), h2o_args.python_cmd_line, "still here", totalTime, maxTime, incrTime

        # don't do this, as the cloud may be hung?
        if 1==0:
            print "Shutting down cloud, but first delete all keys"
            start = time.time()
            h2i.delete_keys_at_all_nodes()
            elapsed = time.time() - start
            print "delete_keys_at_all_nodes(): took", elapsed, "secs"
Example #3
    def test_build_for_clone(self):
        # python gets confused about which 'start' is meant if I use 'start' here
        elapsed = time.time() - beginning
        print "\n%0.2f seconds to get here from start" % elapsed

        # might as well open a browser on it? (because the ip/port will vary)
        # maybe just print the ip/port for now
        ## h2b.browseTheCloud()

        maxTime = 4*3600
        totalTime = 0
        incrTime = 60
        h2p.purple_print("\nSleeping for total of", (maxTime+0.0)/3600, "hours.")
        print "Will check h2o logs every", incrTime, "seconds"
        print "Should be able to run another test using h2o-nodes.json to clone cloud"
        print "i.e. h2o.build_cloud_with_json()"
        print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"

        h2p.green_print("To watch cloud in browser follow address:")
        h2p.green_print("   http://{0}:{1}/Cloud.html".format(h2o.nodes[0].http_addr, h2o.nodes[0].port))
        h2p.blue_print("You can start a test (or tests) now!") 

        h2p.blue_print("Will Check cloud status every %s secs and kill cloud if wrong or no answer" % incrTime)
        if CHECK_WHILE_SLEEPING:        
            h2p.blue_print("Will also look at redirected stdout/stderr logs in sandbox every %s secs" % incrTime)

        h2p.red_print("No checking of logs while sleeping, or check of cloud status")
        h2p.yellow_print("So if H2O stack traces, it's up to you to kill me if 4 hours is too long")
        h2p.yellow_print("ctrl-c will cause all jvms to die(thru psutil terminate, paramiko channel death or h2o shutdown...")


        while (totalTime<maxTime): # die after 4 hours
            h2o.sleep(incrTime)
            totalTime += incrTime
            # good to touch all the nodes to see if they're still responsive
            # give them up to 120 secs to respond (each individually)
            h2o.verify_cloud_size(timeoutSecs=120)
            if CHECK_WHILE_SLEEPING:        
                print "Checking sandbox log files"
                h2o.check_sandbox_for_errors(cloudShutdownIsError=True)
            else:
                print str(datetime.datetime.now()), h2o.python_cmd_line, "still here", totalTime, maxTime, incrTime

        # don't do this, as the cloud may be hung?
        if 1==0:
            print "Shutting down cloud, but first delete all keys"
            start = time.time()
            h2i.delete_keys_at_all_nodes()
            elapsed = time.time() - start
            print "delete_keys_at_all_nodes(): took", elapsed, "secs"
Example #4
File: cloud.py Project: zjliang/h2o-2
    def test_build_for_clone(self):
        # python gets confused about which 'start' is meant if I use 'start' here
        elapsed = time.time() - beginning
        print "\n%0.2f seconds to get here from start" % elapsed

        # might as well open a browser on it? (because the ip/port will vary)
        # maybe just print the ip/port for now
        ## h2b.browseTheCloud()

        maxTime = 4 * 3600
        totalTime = 0
        incrTime = 60
        h2p.purple_print("\nSleeping for total of", (maxTime + 0.0) / 3600,
                         "hours.")
        print "Will check h2o logs every", incrTime, "seconds"
        print "Should be able to run another test using h2o-nodes.json to clone cloud"
        print "i.e. h2o.build_cloud_with_json()"
        print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"

        h2p.green_print("To watch cloud in browser follow address:")
        h2p.green_print("   http://{0}:{1}/Cloud.html".format(
            h2o.nodes[0].http_addr, h2o.nodes[0].port))
        h2p.blue_print("You can start a test (or tests) now!")
        h2p.blue_print(
            "Will spin looking at redirected stdout/stderr logs in sandbox for h2o errors every %s secs"
            % incrTime)
        h2p.red_print("This is just for fun")
        h2p.yellow_print("So is this")

        while (totalTime < maxTime):  # die after 4 hours
            h2o.sleep(incrTime)
            totalTime += incrTime
            # good to touch all the nodes to see if they're still responsive
            # give them up to 120 secs to respond (each individually)
            h2o.verify_cloud_size(timeoutSecs=120)
            print "Checking sandbox log files"
            h2o.check_sandbox_for_errors(cloudShutdownIsError=True)

        start = time.time()
        h2i.delete_keys_at_all_nodes()
        elapsed = time.time() - start
        print "delete_keys_at_all_nodes(): took", elapsed, "secs"
Example #5
File: cloud.py Project: 100star/h2o
    def test_build_for_clone(self):
        # python gets confused about which 'start' is meant if I use 'start' here
        elapsed = time.time() - beginning
        print "\n%0.2f seconds to get here from start" % elapsed

        # might as well open a browser on it? (because the ip/port will vary)
        # maybe just print the ip/port for now
        ## h2b.browseTheCloud()

        maxTime = 4*3600
        totalTime = 0
        incrTime = 60
        h2p.purple_print("\nSleeping for total of", (maxTime+0.0)/3600, "hours.")
        print "Will check h2o logs every", incrTime, "seconds"
        print "Should be able to run another test using h2o-nodes.json to clone cloud"
        print "i.e. h2o.build_cloud_with_json()"
        print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"

        h2p.green_print("To watch cloud in browser follow address:")
        h2p.green_print("   http://{0}:{1}/Cloud.html".format(h2o.nodes[0].http_addr, h2o.nodes[0].port))
        h2p.blue_print("You can start a test (or tests) now!") 
        h2p.blue_print("Will spin looking at redirected stdout/stderr logs in sandbox for h2o errors every %s secs" % incrTime)
        h2p.red_print("This is just for fun")
        h2p.yellow_print("So is this")

        while (totalTime<maxTime): # die after 4 hours
            h2o.sleep(incrTime)
            totalTime += incrTime
            # good to touch all the nodes to see if they're still responsive
            # give them up to 120 secs to respond (each individually)
            h2o.verify_cloud_size(timeoutSecs=120)
            print "Checking sandbox log files"
            h2o.check_sandbox_for_errors(cloudShutdownIsError=True)

        start = time.time()
        h2i.delete_keys_at_all_nodes()
        elapsed = time.time() - start
        print "delete_keys_at_all_nodes(): took", elapsed, "secs"
Example #6
def do_scipy_glm(self, bucket, csvPathname, L, family='binomial'):
    
    h2p.red_print("Now doing sklearn")
    h2p.red_print("\nsee http://scikit-learn.org/0.11/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression")

    import numpy as np
    import scipy as sp
    from sklearn.linear_model import LogisticRegression
    from numpy import loadtxt

    csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)

    # make sure it does fp divide
    C = 1/(L+0.0)
    print "C regularization:", C
    dataset = np.loadtxt(
        open(csvPathnameFull,'r'),
        skiprows=1, # skip the header
        delimiter=',',
        dtype='float')

    print "\ncsv read for training, done"

    n_features = len(dataset[0]) - 1
    print "n_features:", n_features

    # don't want ID (col 0) or CAPSULE (col 1)
    # get CAPSULE
    target = [x[1] for x in dataset]
    # slice off the first 2
    train = np.array([x[2:] for x in dataset])


    n_samples, n_features = train.shape
    print "n_samples:", n_samples, "n_features:",  n_features

    print "histogram of target"
    print sp.histogram(target,3)

    print "len(train):",  len(train)
    print "len(target):", len(target)
    print "dataset shape:", dataset.shape

    if family!='binomial':
        raise Exception("Only have binomial logistic for scipy")
    print "\nTrying l2"
    clf2 = LogisticRegression(
        C=C,
        dual=False,
        fit_intercept=True,
        intercept_scaling=1,
        penalty='l2',
        tol=0.0001)

    # train the classifier
    start = time.time()
    clf2.fit(train, target)
    print "L2 fit took", time.time() - start, "seconds"

    # print "coefficients:", clf2.coef_
    cstring = "".join([("%.5e  " % c) for c in clf2.coef_[0]])
    h2p.green_print("sklearn L2 C", C)
    h2p.green_print("sklearn coefficients:", cstring)
    h2p.green_print("sklearn intercept:", "%.5e" % clf2.intercept_[0])
    h2p.green_print("sklearn score:", clf2.score(train,target))

    print "\nTrying l1"
    clf1 = LogisticRegression(
        C=C,
        dual=False,
        fit_intercept=True,
        intercept_scaling=1,
        penalty='l1',
        tol=0.0001)

    # train the classifier
    start = time.time()
    clf1.fit(train, target)
    print "L1 fit took", time.time() - start, "seconds"

    # print "coefficients:", clf1.coef_
    cstring = "".join([("%.5e  " % c) for c in clf1.coef_[0]])
    h2p.green_print("sklearn L1 C", C)
    h2p.green_print("sklearn coefficients:", cstring)
    h2p.green_print("sklearn intercept:", "%.5e" % clf1.intercept_[0])
    h2p.green_print("sklearn score:", clf1.score(train,target))

    # attributes are accessed in the normal python way
    dx = clf1.__dict__
    print dx.keys()
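
The C passed to LogisticRegression above is sklearn's inverse regularization strength, so the helper's C = 1/(L+0.0) maps h2o's lambda onto it directly. A quick worked instance of that mapping (the value of L is illustrative):

    # Illustrative mapping: h2o lambda -> sklearn C, using float divide as
    # the helper does (py2 '/' on ints would truncate).
    L = 1e-4
    C = 1 / (L + 0.0)
    print "C regularization:", C    # prints 10000.0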
Example #7
    def test_summary2_NY0(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        choicesList = [
            ('N', 'Y', '0'),
            ('n', 'y', '0'),
            ('F', 'T', '0'),
            ('f', 't', '0'),
            (' N', ' Y', ' 0'),
            (' n', ' y', ' 0'),
            (' F', ' T', ' 0'),
            (' f', ' t', ' 0'),
        ]

        # white space is stripped
        expectedList = [
            ('N', 'Y', '0'),
            ('n', 'y', '0'),
            ('F', 'T', '0'),
            ('f', 't', '0'),
            ('N', 'Y', '0'),
            ('n', 'y', '0'),
            ('F', 'T', '0'),
            ('f', 't', '0'),
        ]

        tryList = [
            # rowCount, colCount, hex_key, choices, expected
            (100, 200, 'x.hex', choicesList[4], expectedList[4]),
            (100, 200, 'x.hex', choicesList[5], expectedList[5]),
            (100, 200, 'x.hex', choicesList[6], expectedList[6]),
            (100, 200, 'x.hex', choicesList[7], expectedList[7]),
            (100, 200, 'x.hex', choicesList[3], expectedList[3]),
            (1000, 200, 'x.hex', choicesList[2], expectedList[2]),
            (10000, 200, 'x.hex', choicesList[1], expectedList[1]),
            (100000, 200, 'x.hex', choicesList[0], expectedList[0]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, choices, expected) in tryList:
            # max error = half the bin size?

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)

            print "Creating random", csvPathname
            expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount,
                                              SEEDPERFILE, choices)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult,
                                  expectedNumRows=rowCount,
                                  expectedNumCols=colCount)
            print pA.numRows, pA.numCols, pA.parse_key

            iA = h2o_cmd.InspectObj(pA.parse_key,
                                    expectedNumRows=rowCount,
                                    expectedNumCols=colCount,
                                    expectedMissinglist=[])
            print iA.missingList, iA.labelList, iA.numRows, iA.numCols

            for i in range(colCount):
                # walks across the columns triggering a summary on the col desired
                # runSummary returns a column object now. inspect and parse don't. They return json.
                # maybe eventually will make them return object? But I also pass expected stuff to them
                # should I pass expected to summary? no, more complex?
                co = h2o_cmd.runSummary(key=hex_key, column=i)
                print co.label, co.type, co.missing_count, co.domain, sum(
                    co.histogram_bins)

                print "\nComparing column %s to expected" % i
                self.assertEqual(expectedNaCnt[i], co.missing_count, "Column %s Expected %s. missing: %s is incorrect" % \
                    (i, expectedNaCnt[i], co.missing_count))
                self.assertEqual(rowCount - expectedNaCnt[i],
                                 sum(co.histogram_bins))

            h2p.green_print("\nDone with trial", trial)
            trial += 1

            h2i.delete_keys_at_all_nodes()
Example #8
    def test_summary2_small(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # rowCount, colCount, hex_key, values, expected (colname, min, 25th, 50th, 75th, max)
            # if rowCount is None, we'll just use the number of data values as the row count
            # None in expected values means no compare
            (None, 1, 'x.hex', [-1,0,1],        ('C1',  None, None, -1, None, None)),
            (None, 2, 'x.hex', [-1,0,1],        ('C1',  None, None, -1, None, None)),
            (None, 10, 'x.hex', [-1,0,1],        ('C1',  None, None, -1, None, None)),
            (None, 100, 'x.hex', [-1,0,1],        ('C1',  None, None, -1, None, None)),
            (None, 1000, 'x.hex', [-1,0,1],        ('C1',  None, None, -1, None, None)),
            (None, 10000, 'x.hex', [-1,0,1],        ('C1',  None, None, -1, None, None)),
            # (COLS, 1, 'x.hex', [1,0,-1],        ('C1',  None, None, None, None, None)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, values, expected) in tryList:
            # max error = half the bin size?
        
            expectedMax = max(values)
            expectedMin = min(values)
            maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta


            # hmm...say we should be 100% accurate for these tests?
            maxDelta = 0

            h2o.beta_features = False
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            if not rowCount:
                rowFile = len(values)
            else:
                rowFile = rowCount
            csvFilename = 'syn_' + "binary" + "_" + str(rowFile) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, values, SEEDPERFILE)

            h2o.beta_features = False
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["num_rows"]
            numCols = inspect["num_cols"]

            h2o.beta_features = True
            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS, timeoutSecs=45)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            quantile = 0.5 if DO_MEDIAN else .999
            q = h2o.nodes[0].quantiles(source_key=hex_key, column=0, interpolation_type=7,
                quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=1)
            qresult = q['result']
            qresult_single = q['result_single']
            qresult_iterations = q['iterations']
            qresult_interpolated = q['interpolated']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
            h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
            print h2o.dump_json(q)

            self.assertLess(qresult_iterations, 16,
                msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?")


            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']

            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']
            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

            pctile = stats['pctile']
            if expected[0]:
                self.assertEqual(colname, expected[0])
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
            if expected[2]:
                h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            if expected[3]:
                h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            if expected[4]:
                h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')
            if expected[5]:
                h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the first or last (edge) bins
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows/len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                self.assertAlmostEqual(b, e, delta=.01*numRows,
                    msg="Bins not right. b: %s e: %s" % (b, e))

            pt = twoDecimals(pctile)
            mx = twoDecimals(maxs)
            mn = twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0
            if DO_TRY_SCIPY and colname!='':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                print scipyCol, pctile[10]
                generate_scipy_comparison(csvPathnameFull, col=scipyCol,
                     # h2oMedian=pctile[5 if DO_MEDIAN else 10], result_single)
                    h2oMedian=pctile[5 if DO_MEDIAN else 10], h2oMedian2=qresult)



            h2i.delete_keys_at_all_nodes()
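
To make the uniform-bin expectation in the loop above concrete: 21 thresholds give 20 bins, so each interior bin should hold about numRows/20 rows. A tiny standalone check with illustrative numbers:

    # Illustrative bin check: 1000 rows over 20 bins means ~50 rows per bin,
    # allowing a tolerance of 1% of the row count and skipping the edge bins.
    numRows = 1000
    hcnt = [50] * 20
    e = numRows / len(hcnt)    # 50 expected per bin
    for b in hcnt[1:-1]:
        assert abs(b - e) <= .01 * numRows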
Example #9
    def test_summary2_int2B(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # rowCount, colCount, hex_key, expectedMin, expectedMax, expected (colname, min, 25th, 50th, 75th, max)
            (100000, 1, 'B.hex', 2533255332, 2633256000,   ('C1',  None, None, None, None, None)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?
        
            maxDelta = ((expectedMax - expectedMin)/(MAX_QBINS + 0.0)) 
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta
            # also need to add some variance due to random distribution?
            # maybe a percentage of the mean
            distMean = (expectedMax - expectedMin) / 2
            maxShift = distMean * .01
            maxDelta = maxDelta + maxShift

            h2o.beta_features = False
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            h2o.beta_features = False
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=60, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            h2o.beta_features = True
            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            if expected[0]:
                self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

            pctile = stats['pctile']
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
                h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
                h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
                h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')

                h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the first or last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows/len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # apparently we can't estimate any more
                # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, 
                #     msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1

            scipyCol = 0 
Example #10
    def test_summary2_exp(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        LAMBD = random.uniform(0.005, 0.5)
        tryList = [
            # rowCount, colCount, hex_key, rangeMin, rangeMax, expected (colname, min, 25th, 50th, 75th, max)
            (10, 1, 'x.hex', 1, 20000, ('C1', None, None, None, None, None)),
            (100, 1, 'x.hex', 1, 20000, ('C1', None, None, None, None, None)),
            (1000, 1, 'x.hex', -5000, 0, ('C1', None, None, None, None, None)),
            (10000, 1, 'x.hex', -100000, 100000, ('C1', None, None, None, None,
                                                  None)),
            (100000, 1, 'x.hex', -1, 1, ('C1', None, None, None, None, None)),
            (1000000, 1, 'A.hex', 1, 100, ('C1', None, None, None, None,
                                           None)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        # rangeMin and rangeMax are not used right now
        for (rowCount, colCount, hex_key, rangeMin, rangeMax,
             expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname, "lambd:", LAMBD
            (expectedMin, expectedMax) = write_syn_dataset(csvPathname,
                                                           rowCount,
                                                           colCount,
                                                           lambd=LAMBD,
                                                           SEED=SEEDPERFILE)
            print "expectedMin:", expectedMin, "expectedMax:", expectedMax
            maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               max_qbins=MAX_QBINS)
            h2o.verboseprint("Summary2 summaryResult:",
                             h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']
            pct = stats['pct']
            expectedPct = [
                0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99
            ]
            pctile = stats['pctile']
            # the thresholds h2o used, should match what we expected
            if expected[0]:
                self.assertEqual(colname, expected[0])
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0],
                                           expected[1],
                                           tol=maxDelta,
                                           msg='min is not approx. expected')
            if expected[2]:
                h2o_util.assertApproxEqual(
                    pctile[3],
                    expected[2],
                    tol=maxDelta,
                    msg='25th percentile is not approx. expected')
            if expected[3]:
                h2o_util.assertApproxEqual(
                    pctile[5],
                    expected[3],
                    tol=maxDelta,
                    msg='50th percentile (median) is not approx. expected')
            if expected[4]:
                h2o_util.assertApproxEqual(
                    pctile[7],
                    expected[4],
                    tol=maxDelta,
                    msg='75th percentile is not approx. expected')
            if expected[5]:
                h2o_util.assertApproxEqual(maxs[0],
                                           expected[5],
                                           tol=maxDelta,
                                           msg='max is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            print "Can't estimate the bin distribution"

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname,
                            "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0
            if colname != '' and expected[scipyCol]:
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    skipHeader=True,
                    col=scipyCol,
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                )
Example #11
def import_only(node=None, schema='local', bucket=None, path=None,
    timeoutSecs=30, retryDelaySecs=0.1, initialDelaySecs=0, pollTimeoutSecs=180, noise=None,
    benchmarkLogging=None, noPoll=False, doSummary=True, src_key=None, noPrint=False, 
    importParentDir=True, **kwargs):

    # FIX! hack all put to local, since h2o-dev doesn't have put yet?
    # multi-machine put will fail as a result.
    if schema=='put':
        h2p.yellow_print("WARNING: hacking schema='put' to 'local'..h2o-dev doesn't support upload." +  
            "\nMeans multi-machine with 'put' will fail")
        schema = 'local'

    if src_key and schema!='put':
        raise Exception("can only specify a 'src_key' param for schema='put'. You have %s %s" % (schema, src_key))

    # no bucket is sometimes legal (fixed path)
    if not node: node = h2o_nodes.nodes[0]

    if path is None:
        raise Exception("import_only: path parameter needs to be specified")

    if "/" in path:
        (head, pattern) = os.path.split(path)
    else:
        (head, pattern)  = ("", path)

    verboseprint("head:", head)
    verboseprint("pattern:", pattern)

    # to train users / okay here
    # normally we import the folder above, but if we import exactly, the path can't have regex
    # the folder can't have regex in any case
    if importParentDir:
        if re.search(r"[\*<>{}[\]~`]", head):
            raise Exception("h2o folder path %s can't be regex. path= was %s" % (head, path))
    else:
        if re.search(r"[\*<>{}[\]~`]", path):
            raise Exception("h2o path %s can't be regex. path= was %s" % (path, path))

    if schema=='put':
        # to train users
        if re.search(r"[/\*<>{}[\]~`]", pattern):
            raise Exception("h2o putfile basename %s can't be regex. path= was %s" % (pattern, path))

        if not path: 
            raise Exception("path= didn't say what file to put")

        (folderPath, filename) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, filename)
        verboseprint("put filename:", filename, "folderPath:", folderPath, "filePath:", filePath)

        if not noPrint:
            h2p.green_print("\nimport_only:", h2o_args.python_test_name, "uses put:/%s" % filePath) 
            h2p.green_print("Local path to file that will be uploaded: %s" % filePath)
            h2p.blue_print("That path resolves as:", os.path.realpath(filePath))

        
        if h2o_args.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")
    
        key = node.put_file(filePath, key=src_key, timeoutSecs=timeoutSecs)

        # hmm.. what should importResult be in the put case
        # set it to None. No import is done, and shouldn't be used if you're doing schema='put'
        importResult = None
        
        return (None, key)

    if schema=='local' and not \
            (node.redirect_import_folder_to_s3_path or node.redirect_import_folder_to_s3n_path):
        (folderPath, pattern) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, pattern)
        h2p.green_print("\nimport_only:", h2o_args.python_test_name, "uses local:/%s" % filePath)
        h2p.green_print("Path h2o will be told to use: %s" % filePath)
        h2p.blue_print("If local jvms, path resolves locally as:", os.path.realpath(filePath))
        if h2o_args.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        # FIX! why are we returning importPattern here..it's different than finalImportString if we import a folder?
        # is it used for key matching by others?

        # FIX! hack ..h2o-dev is creating key names with the absolute path, not the sym link path
        # messes up for import folders that go thru /home/<user>/home-0xdiag-datasets
        # importPattern = folderURI + "/" + pattern
        # could include this on the entire importPattern if we no longer have regex basename in h2o-dev?
          
        # folderURI = 'nfs:/' + folderPath
        folderURI = 'nfs:/' + os.path.realpath(folderPath)
        if importParentDir:
            finalImportString = folderPath
        else:
            finalImportString = folderPath + "/" + pattern
        importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

    else:
        if bucket is not None and re.match("/", head):
            verboseprint("You said bucket:", bucket, "so stripping incorrect leading '/' from", head)
            head = head.lstrip('/')
    
        # strip leading / in head if present
        if bucket and head!="":
            folderOffset = bucket + "/" + head
        elif bucket:
            folderOffset = bucket
        else:
            folderOffset = head

        if h2o_args.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        n = h2o_nodes.nodes[0]
        if schema=='s3' or node.redirect_import_folder_to_s3_path:
            # this is just like s3n now? i.e. we can point down inside the s3 bucket like s3n?
            folderOffset = re.sub("smalldata", "h2o-smalldata", folderOffset)
            folderURI = "s3://" + folderOffset
            if not n.aws_credentials:
                print "aws_credentials: %s" % n.aws_credentials
                # raise Exception("Something was missing for s3 on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3 on the java -jar cmd line when the cloud was built"

            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

        elif schema=='s3n' or node.redirect_import_folder_to_s3n_path:
            # FIX! hack for now...when we change import folder to import s3, point to unique bucket name for h2o
            # should probably deal with this up in the bucket resolution 
            # this may change other cases, but smalldata should only exist as a "bucket" for us?
            folderOffset = re.sub("smalldata", "h2o-smalldata", folderOffset)
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s" % (n.use_hdfs, n.hdfs_version, n.hdfs_name_node)
                if n.hdfs_config:
                    print "hdfs_config: %s" % n.hdfs_config
                # raise Exception("Something was missing for s3n on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3n on the java -jar cmd line when the cloud was built"
            folderURI = "s3n://" + folderOffset
            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

        elif schema=='maprfs':
            if not n.use_maprfs:
                print "use_maprfs: %s" % n.use_maprfs
                # raise Exception("Something was missing for maprfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for maprfs on the java -jar cmd line when the cloud was built"
            # if I use the /// and default, the key names that get created by h2o only have 1 slash
            # so the parse doesn't find the key name
            if n.hdfs_name_node:
                folderURI = "maprfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                # folderURI = "maprfs:///" + folderOffset
                folderURI = "maprfs:/" + folderOffset
            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

        elif schema=='hdfs':
            # check that some state from the cloud building time was right
            # the requirements for this may change and require updating
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s" % (n.use_hdfs, n.hdfs_version, n.hdfs_name_node)
                if n.hdfs_config:
                    print "hdfs_config: %s" % n.hdfs_config
                # raise Exception("Something was missing for hdfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for hdfs on the java -jar cmd line when the cloud was built"

            if n.hdfs_name_node:
                folderURI = "hdfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                folderURI = "hdfs://" + folderOffset
            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

        else: 
            raise Exception("schema not understood: %s" % schema)

    print "\nimport_only:", h2o_args.python_test_name, schema, "uses", finalImportString
    importPattern = folderURI + "/" + pattern
    return (importResult, importPattern)
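
A minimal sketch of calling import_only directly for a local import; the bucket and path here are placeholders, and the returned importPattern is what a later parse step would match against:

    # Hypothetical usage: import the folder containing the file, keep the
    # (importResult, importPattern) pair for the parse that follows.
    (importResult, importPattern) = import_only(bucket='smalldata',
        path='logreg/prostate.csv', schema='local', timeoutSecs=30)
    print "import pattern:", importPattern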
Example #12
File: h2o_import.py Project: Eliak/h2o
def import_only(
    node=None,
    schema="local",
    bucket=None,
    path=None,
    timeoutSecs=30,
    retryDelaySecs=0.1,
    initialDelaySecs=0,
    pollTimeoutSecs=180,
    noise=None,
    benchmarkLogging=None,
    noPoll=False,
    doSummary=True,
    src_key=None,
    noPrint=False,
    importParentDir=True,
    **kwargs
):

    if src_key and schema != "put":
        raise Exception("can only specify a 'src_key' param for schema='put'. You have %s %s" % (schema, src_key))

    # no bucket is sometimes legal (fixed path)
    if not node:
        node = h2o.nodes[0]

    if path is None:
        raise Exception("import_only: path parameter needs to be specified")

    if "/" in path:
        (head, pattern) = os.path.split(path)
    else:
        (head, pattern) = ("", path)

    h2o.verboseprint("head:", head)
    h2o.verboseprint("pattern:", pattern)

    # to train users / okay here
    # normally we import the folder above, but if we import exactly, the path can't have regex
    # the folder can't have regex in any case
    if importParentDir:
        if re.search(r"[\*<>{}[\]~`]", head):
            raise Exception("h2o folder path %s can't be regex. path= was %s" % (head, path))
    else:
        if re.search(r"[\*<>{}[\]~`]", path):
            raise Exception("h2o path %s can't be regex. path= was %s" % (head, path))

    if schema == "put":
        # to train users
        if re.search(r"[/\*<>{}[\]~`]", pattern):
            raise Exception("h2o putfile basename %s can't be regex. path= was %s" % (pattern, path))

        if not path:
            raise Exception("path= didn't say what file to put")

        (folderPath, filename) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, filename)
        h2o.verboseprint("put filename:", filename, "folderPath:", folderPath, "filePath:", filePath)

        if not noPrint:
            h2p.green_print("\nimport_only:", h2o.python_test_name, "uses put:/%s" % filePath)
            h2p.green_print("Local path to file that will be uploaded: %s" % filePath)
            h2p.blue_print("That path resolves as:", os.path.realpath(filePath))

        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        key = node.put_file(filePath, key=src_key, timeoutSecs=timeoutSecs)

        # hmm.. what should importResult be in the put case
        # set it to None. No import is done, and shouldn't be used if you're doing schema='put'
        importResult = None

        return (None, key)

    if schema == "local" and not (node.redirect_import_folder_to_s3_path or node.redirect_import_folder_to_s3n_path):
        (folderPath, pattern) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, pattern)
        h2p.green_print("\nimport_only:", h2o.python_test_name, "uses local:/%s" % filePath)
        h2p.green_print("Path h2o will be told to use: %s" % filePath)
        h2p.blue_print("If local jvms, path resolves locally as:", os.path.realpath(filePath))
        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        folderURI = "nfs:/" + folderPath
        if importParentDir:
            importResult = node.import_files(folderPath, timeoutSecs=timeoutSecs)
        else:
            importResult = node.import_files(folderPath + "/" + pattern, timeoutSecs=timeoutSecs)

    else:
        if bucket is not None and re.match("/", head):
            h2o.verboseprint("You said bucket:", bucket, "so stripping incorrect leading '/' from", head)
            head = head.lstrip("/")

        # strip leading / in head if present
        if bucket and head != "":
            folderOffset = bucket + "/" + head
        elif bucket:
            folderOffset = bucket
        else:
            folderOffset = head

        print "\nimport_only:", h2o.python_test_name, schema, "uses", schema + "://" + folderOffset + "/" + pattern
        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        n = h2o.nodes[0]
        if schema == "s3" or node.redirect_import_folder_to_s3_path:
            # FIX! hack for now...when we change import folder to import s3, point to unique bucket name for h2o
            # should probably deal with this up in the bucket resolution
            # this may change other cases, but smalldata should only exist as a "bucket" for us?
            folderOffset = re.sub("smalldata", "h2o-smalldata", folderOffset)
            folderURI = "s3://" + folderOffset
            if not n.aws_credentials:
                print "aws_credentials: %s" % n.aws_credentials
                # raise Exception("Something was missing for s3 on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3 on the java -jar cmd line when the cloud was built"

            if importParentDir:
                importResult = node.import_files(folderURI, timeoutSecs=timeoutSecs)
            else:
                importResult = node.import_files(folderURI + "/" + pattern, timeoutSecs=timeoutSecs)

        elif schema == "s3n" or node.redirect_import_folder_to_s3n_path:
            # FIX! hack for now...when we change import folder to import s3, point to unique bucket name for h2o
            # should probably deal with this up in the bucket resolution
            # this may change other cases, but smalldata should only exist as a "bucket" for us?
            folderOffset = re.sub("smalldata", "h2o-smalldata", folderOffset)
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s" % (
                    n.use_hdfs,
                    n.hdfs_version,
                    n.hdfs_name_node,
                )
                if n.hdfs_config:
                    print "hdfs_config: %s" % n.hdfs_config
                # raise Exception("Something was missing for s3n on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3n on the java -jar cmd line when the cloud was built"
            folderURI = "s3n://" + folderOffset
            if importParentDir:
                importResult = node.import_files(folderURI, timeoutSecs=timeoutSecs)
            else:
                importResult = node.import_files(folderURI + "/" + pattern, timeoutSecs=timeoutSecs)

        elif schema == "maprfs":
            if not n.use_maprfs:
                print "use_maprfs: %s" % n.use_maprfs
                # raise Exception("Something was missing for maprfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for maprfs on the java -jar cmd line when the cloud was built"
            # if I use the /// and default, the key names that get created by h2o only have 1 slash
            # so the parse doesn't find the key name
            if n.hdfs_name_node:
                folderURI = "maprfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                # folderURI = "maprfs:///" + folderOffset
                folderURI = "maprfs:/" + folderOffset
            if importParentDir:
                importResult = node.import_files(folderURI, timeoutSecs=timeoutSecs)
            else:
                importResult = node.import_files(folderURI + "/" + pattern, timeoutSecs=timeoutSecs)

        elif schema == "hdfs":
            # check that some state from the cloud building time was right
            # the requirements for this may change and require updating
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s" % (
                    n.use_hdfs,
                    n.hdfs_version,
                    n.hdfs_name_node,
                )
                if n.hdfs_config:
                    print "hdfs_config: %s" % n.hdfs_config
                # raise Exception("Something was missing for hdfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for hdfs on the java -jar cmd line when the cloud was built"

            if n.hdfs_name_node:
                folderURI = "hdfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                folderURI = "hdfs://" + folderOffset
            if importParentDir:
                importResult = node.import_files(folderURI, timeoutSecs=timeoutSecs)
            else:
                importResult = node.import_files(folderURI + "/" + pattern, timeoutSecs=timeoutSecs)

        else:
            raise Exception("schema not understood: %s" % schema)

    importPattern = folderURI + "/" + pattern
    return (importResult, importPattern)
Example #13
def do_h2o_glm(self, bucket, csvPathname, L, family="binomial"):

    h2p.red_print("\nNow doing h2o")
    h2o.beta_features = True
    parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, schema="local", timeoutSecs=180)
    # save the resolved pathname for use in the sklearn csv read below

    inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
    print inspect
    print "\n" + csvPathname, "    numRows:", "{:,}".format(inspect["numRows"]), "    numCols:", "{:,}".format(
        inspect["numCols"]
    )

    x = "ID"
    y = "CAPSULE"
    alpha = "0"
    lambda_ = L
    nfolds = "0"
    f = "prostate"
    modelKey = "GLM_" + f

    kwargs = {
        "response": y,
        "ignored_cols": x,
        "family": family,
        "lambda": lambda_,
        "alpha": alpha,
        "n_folds": nfolds,  # passes if 0, fails otherwise
        "destination_key": modelKey,
    }

    timeoutSecs = 60
    start = time.time()
    glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)

    # this stuff was left over from when we got the result after polling the jobs list
    # okay to do it again
    # GLM2: when it redirects to the model view, we no longer have the job_key! (unlike the first response and polling)
    (warnings, clist, intercept) = h2o_glm.simpleCheckGLM(self, glmResult, None, **kwargs)
    cstring = "".join([("%.5e  " % c) for c in clist])
    h2p.green_print("h2o alpha ", alpha)
    h2p.green_print("h2o lambda ", lambda_)
    h2p.green_print("h2o coefficient list:", cstring)
    h2p.green_print("h2o intercept", "%.5e  " % intercept)

    # other stuff in the json response
    glm_model = glmResult["glm_model"]
    _names = glm_model["_names"]
    coefficients_names = glm_model["coefficients_names"]

    # the first submodel is the right one, if only one lambda is provided as a parameter above
    submodels = glm_model["submodels"][0]

    beta = submodels["beta"]
    h2p.red_print("beta:", beta)
    norm_beta = submodels["norm_beta"]
    iteration = submodels["iteration"]

    validation = submodels["validation"]
    avg_err = validation["avg_err"]
    auc = validation["auc"]
    aic = validation["aic"]
    null_deviance = validation["null_deviance"]
    residual_deviance = validation["residual_deviance"]

    print "_names", _names
    print "coefficients_names", coefficients_names
    # did beta get shortened? the simple check confirms names/beta/norm_beta are same length
    print "beta", beta
    print "iteration", iteration
    print "avg_err", avg_err
    print "auc", auc
Example #14
    def test_summary2_uniform(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, 1, 'x.hex', 1, 20000,        ('C1',  1.10, 5000.0, 10000.0, 15000.0, 20000.00)),
            (ROWS, 1, 'x.hex', -5000, 0,        ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)),
            (ROWS, 1, 'x.hex', -100000, 100000, ('C1',  -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)),
            (ROWS, 1, 'x.hex', -1, 1,           ('C1',  -1.05, -0.48, 0.0087, 0.50, 1.00)),

            (ROWS, 1, 'A.hex', 1, 100,          ('C1',   1.05, 26.00, 51.00, 76.00, 100.0)),
            (ROWS, 1, 'A.hex', -99, 99,         ('C1',  -99, -50.0, 0, 50.00, 99)),

            (ROWS, 1, 'B.hex', 1, 10000,        ('C1',   1.05, 2501.00, 5001.00, 7501.00, 10000.00)),
            (ROWS, 1, 'B.hex', -100, 100,       ('C1',  -100.10, -50.0, 0.85, 51.7, 100.00)),

            (ROWS, 1, 'C.hex', 1, 100000,       ('C1',   1.05, 25002.00, 50002.00, 75002.00, 100000.00)),
            (ROWS, 1, 'C.hex', -101, 101,       ('C1',  -100.10, -50.45, -1.18, 49.28, 100.00)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?
        
            maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            h2o.beta_features = False
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            h2o.beta_features = False
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["num_rows"]
            numCols = inspect["num_cols"]
            h2o.beta_features = True
            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            self.assertEqual(colname, expected[0])

            quantile = 0.5 if DO_MEDIAN else .999
            # get both answers since we feed both below for checking
            q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'],
                quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # linear
            qresult = q['result']
            qresult_single = q['result_single']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", q['iterations'])
            h2p.blue_print("h2o quantiles interpolated:", q['interpolated'])
            print h2o.dump_json(q)

            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']
            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

            pctile = stats['pctile']
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            # too hard to estimate when there are ints now, due to floor/ceil int alignment?
            # don't check the last two bins
            for b in hcnt[1:(-2 if len(hcnt)>2 else -1)]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows/len(hcnt)
                self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount,
                    msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1

            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname!='':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist())
                h2o_summ.quantile_comparisons(
                    csvPathnameFull, 
                    col=0, # what col to extract from the csv
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10], 
                    h2oQuantilesApprox=qresult_single,
                    h2oQuantilesExact=qresult,
                    )

            h2o.nodes[0].remove_all_keys()
Example #15
    def test_summary2_uniform(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, 1, 'x.hex', 0.0, 20000.0,
             ['C1', 0, 5000.0, 10000.0, 15000.0, 20000.0]),
            (ROWS, 1, 'x.hex', -5000.0, 0.0,
             ['C1', -5000.0, -3750.0, -2500.0, -1250.0, 0.0]),
            (ROWS, 1, 'x.hex', -100000.0, 100000.0,
             ['C1', -100000.0, -50000.0, 0.0, 50000.0, 100000.0]),
            (ROWS, 1, 'x.hex', -1.0, 1.0, ['C1', -1.0, -0.50, 0.0, 0.50, 1.0]),
            (ROWS, 1, 'A.hex', 1.0, 100.0,
             ['C1', 1.0, 26.0, 51.0, 76.0, 100.0]),
            (ROWS, 1, 'A.hex', -99.0, 99.0,
             ['C1', -99.0, -50.0, 0.0, 50.0, 99.0]),
            (ROWS, 1, 'B.hex', 1.0, 10000.0,
             ['C1', 1.0, 2501.0, 5001.0, 7501.0, 10000.0]),
            (ROWS, 1, 'B.hex', -100.0, 100.0,
             ['C1', -100.0, -50.0, 0.0, 50.0, 100.0]),
            (ROWS, 1, 'C.hex', 1.0, 100000.0,
             ['C1', 1.0, 25001.0, 50001.0, 75001.0, 100000.0]),
            (ROWS, 1, 'C.hex', -100.0, 100.0,
             ['C1', -100.0, -50.0, 0.0, 50.0, 100.0]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax,
             expected) in tryList:

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            (actualMax, actualMin) = write_syn_dataset(csvPathname, rowCount,
                                                       colCount, expectedMin,
                                                       expectedMax,
                                                       SEEDPERFILE)
            # adjust the min/max depending on what the min/max actually was!
            # the expected 25%/50%/75% will still be off
            expected[1] = actualMin
            expected[5] = actualMax

            # max error = half the bin size?
            # use this for comparing to sklearn/sort
            expectedRange = expectedMax - expectedMin
            # because of floor and ceil effects due we potentially lose 2 bins (worst case)
            # the extra bin for the max value, is an extra bin..ignore
            expectedBin = expectedRange / (MAX_QBINS - 2)
            maxDelta = 1 * expectedBin

            # how much error do we get in the random distribution gen? pain. It's a probability issue
            # smaller error likely with larger # of values.
            # the maxDelta used for the scipy/sort compare can be tighter, since it's looking
            # at actual data
            # this is way too coarse. can't get the distribution tight?
            maxDeltaPlusDistVariance = 10 * maxDelta
            # allow some fuzz in the comparison to scipy/sort
            maxDelta = 1.1 * maxDelta
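            # worked example of these tolerances, assuming MAX_QBINS = 1000 and the
            # 0..20000 case above: expectedBin = 20000/998 ~= 20.04, so the h2o
            # percentile checks get maxDeltaPlusDistVariance ~= 200.4 and the
            # scipy/sort comparison gets maxDelta ~= 22.0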

            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]
            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            self.assertEqual(colname, expected[0])

            quantile = 0.5 if DO_MEDIAN else .999
            # get both answers since we feed both below for checking
            q = h2o.nodes[0].quantiles(source_key=hex_key,
                                       column=column['colname'],
                                       quantile=quantile,
                                       max_qbins=MAX_QBINS,
                                       multiple_pass=2,
                                       interpolation_type=7)  # linear
            qresult = q['result']
            qresult_single = q['result_single']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", q['iterations'])
            h2p.blue_print("h2o quantiles interpolated:", q['interpolated'])
            print h2o.dump_json(q)

            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']
            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']
            mins = stats['mins']
            # these should match exactly except for fp compare error?
            h2o_util.assertApproxEqual(mins[0],
                                       expected[1],
                                       rel=.00001,
                                       msg='min is not expected')
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0],
                                       expected[5],
                                       rel=.00001,
                                       msg='max is not expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [
                0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99
            ]

            pctile = stats['pctile']
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDeltaPlusDistVariance,
                msg='25th percentile is not approx. expected for generated uniform range %s %s' %\
                    (expectedMin, expectedMax))
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDeltaPlusDistVariance,
                msg='50th percentile is not approx. expected for generated uniform range %s %s' %\
                    (expectedMin, expectedMax))
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDeltaPlusDistVariance,
                msg='75th percentile is not approx. expected for generated uniform range %s %s' %\
                    (expectedMin, expectedMax))

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            # too hard to estimate when there are ints now, due to floor/ceil int alignment?
            # don't check the last two bins
            for b in hcnt[1:(-2 if len(hcnt) > 2 else -1)]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows / len(hcnt)
                self.assertAlmostEqual(b,
                                       rowCount / len(hcnt),
                                       delta=.01 * rowCount,
                                       msg="Bins not right. b: %s e: %s" %
                                       (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname,
                            "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1

            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname != '':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist())
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0,  # what col to extract from the csv
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    h2oQuantilesApprox=qresult_single,
                    h2oQuantilesExact=qresult,
                    h2oSummary2MaxErr=maxDelta,
                )

            h2o.nodes[0].remove_all_keys()
Example #16
File: binquant.py Project: earlh/h2o
def findQuantile(d, dmin, dmax, threshold):
    # return the value at the threshold, or the mean of the two rows that bound it.
    # fixed bin count per pass. Stops at maxIterations if not resolved to one true answer
    maxIterations = 30

    # totalRows should be cleansed of NAs. assume d doesn't have NAs (cleaned elsewhere)
    totalRows = len(d)
    desiredBinCnt = BIN_COUNT
    maxBinCnt = desiredBinCnt + 1 # might go one over due to FP issues

    # initialize
    newValStart = dmin
    newValEnd   = dmax
    newValRange = newValEnd - newValStart
    desiredBinCnt = BIN_COUNT # Could do per-pass adjustment, but fixed works fine.
    newValBinSize  = newValRange / (desiredBinCnt + 0.0)
    newLowCount = 0 # count of rows below the bins
    # yes there is no newHighCount. Created during the pass, though.

    # state shared by each pass
    assert maxBinCnt > 0

    hcnt2 = [None for b in range(maxBinCnt)]
    hcnt2_min = [None for b in range(maxBinCnt)]
    hcnt2_max = [None for b in range(maxBinCnt)]
    hcnt2_low = 0
    hcnt2_high = 0

    assert newValBinSize != 0 # can be negative
    assert newValEnd > newValStart
    assert newValRange > 0

    # break out on stopping condition
    # reuse the histogram array hcnt2[]
    iteration = 0
    done = False
    # append to a list of best guesses per pass
    best_result = []

    def htot2():
        return sum(hcnt2) + hcnt2_low + hcnt2_high
        
    while iteration <= maxIterations and not done:
        h2p.green_print("newValStart", newValStart)
        h2p.green_print("newValEnd", newValEnd)
        h2p.green_print("newValRange", newValRange)
        h2p.green_print("newValBinSize", newValBinSize)
        h2p.green_print("newLowCount", newLowCount)
        h2p.green_print("threshold", threshold)

        valStart = newValStart
        valEnd   = newValEnd
        valRange = newValRange
        valBinSize = newValBinSize
        lowCount = newLowCount
        desiredBinCnt = BIN_COUNT
        maxBinCnt = desiredBinCnt + 1 # might go one over due to FP issues

        # playing with creating relative NUDGE values to make sure bin range
        # is always inclusive of target.
        # ratio it down from valBinSize. 
        # It doesn't need to be as big as valBinSize.
        # implicitly, it shouldn't need to be as large as valBinSize
        # can't seem to make it work yet. leave NUDGE=0
        NUDGE = 0

        # init to zero for each pass
        for b in range(maxBinCnt):
            hcnt2[b] = 0.0

        # Init counts outside of the bins
        hcnt2_low = 0
        hcnt2_high = 0

        # minimum value for higher than the bin. Needed for interpolation
        hcnt2_high_min = None

        for val in d:
            # Need to count the stuff outside the bin-gathering, 
            # since threshold compare is based on total row compare
            # on first pass, shouldn't see anything exceed the start/end bounds
            # since those are min/max for the column? (shouldn't be any fp precision issue? or ??)
            # oh wait, this valOffset math creates possible precision issue?
            # maybe we should address it with the NUDGE value below? but what about first pass?
            valOffset = val - valStart
            # where are we zeroing in? (start)
            binIdx2 = int(math.floor(valOffset / (valBinSize + 0.0))) # make sure it's always an fp divide?

            # do some close looking for possible fp arith issues
            cA = valOffset < 0
            cB = binIdx2 < 0
            t = {True: 1, False: 0}
            # we get the 10 case
            if ((cA and not cB) or (not cA and cB)):
                h2p.red_print("AB Interesting lower bin edge case %s%s" % (t[cA], t[cB]), "cA", cA, "cB", cB, "valOffSet", valOffSet, \
                    "binIdx2", binIdx2)
            cC = val > valEnd
            cD = binIdx2 >= (maxBinCnt-1) # tighten the compare for printing
            if ((cC and not cD) or (not cC and cD)):
                h2p.red_print("CD Interesting upper bin edge case %s%s" % (t[cC], t[cD]), "cC", cC, "cB", cD, "val", val, "valEnd", valEnd, \
                    "binIdx2", binIdx2, "maxBinCnt", maxBinCnt)
                # example hits this case..i.e. the max value
                # CD Interesting upper bin edge case 01 cC False cB True val 100.995097486 valEnd 100.995097486 binIdx2 2 maxBinCnt 3
                
            if valOffset < 0 or binIdx2<0:
            # if valOffset < 0:
            # if binIdx2<0:
                hcnt2_low += 1
            # prevent the extra bin from being used..i.e. eliminate the fuzziness for sure!
            # have to use both compares, since can wrap the index (due to start/end shift)
            # elif val > valEnd or binIdx2>=(maxBinCnt-1):
            # should this really be a valOffset compare?
            elif val > valEnd or binIdx2 >= maxBinCnt:
            # elif val > valEnd:
            # elif binIdx2>=(maxBinCnt-1):
                if (hcnt2_high==0) or (val < hcnt2_high_min):
                    hcnt2_high_min = val
                    print "hcnt2_high_min update:", hcnt2_high_min, valOffset, val, valStart, hcnt2_high, val, valEnd
                hcnt2_high += 1
            else:
                # print "(multi) val: ",val," valOffset: ",valOffset," valBinSize: ",valBinSize

                assert binIdx2 >=0 and binIdx2<=(maxBinCnt-1), "val %s %s %s %s binIdx2: %s maxBinCnt: %s valBinSize: %s" % \
                    (val, valStart, valEnd, valOffset, binIdx2, maxBinCnt, valBinSize)
                if hcnt2[binIdx2]==0 or (val < hcnt2_min[binIdx2]):
                    hcnt2_min[binIdx2] = val
                if hcnt2[binIdx2]==0 or (val > hcnt2_max[binIdx2]):
                    hcnt2_max[binIdx2] = val
                hcnt2[binIdx2] += 1

                # check if we went into the magic extra bin
                if binIdx2 == (maxBinCnt-1):
                    print "\nFP! val went into the extra maxBinCnt bin:", \
                    binIdx2, hcnt2_high_min, valOffset, val, valStart, hcnt2_high, val, valEnd,"\n"
        
            # check the legal states for these two
            # we don't have None for checking hcnt2_high_min in java
            assert hcnt2_high==0 or (hcnt2_high_min is not None)
            assert (hcnt2_high_min is None) or hcnt2_high!=0

        # everything should either be in low, the bins, or high
        totalBinnedRows = htot2()
        print "totalRows check: %s htot2(): %s should be equal. hcnt2_low: %s hcnt2_high: %s" % \
            (totalRows, totalBinnedRows, hcnt2_low, hcnt2_high) 

        assert totalRows==totalBinnedRows, "totalRows: %s htot2() %s not equal. hcnt2_low: %s hcnt2_high: %s" % \
            (totalRows, totalBinnedRows, hcnt2_low, hcnt2_high) 

        # now walk thru and find out what bin to look inside
        currentCnt = hcnt2_low
        targetCntFull = threshold * (totalRows-1)  # zero based indexing
        targetCntInt = int(math.floor(threshold * (totalRows-1)))
        targetCntFract = targetCntFull  - targetCntInt
        assert targetCntFract>=0 and targetCntFract<=1
        print "targetCntInt:", targetCntInt, "targetCntFract", targetCntFract

        k = 0
        while ((currentCnt + hcnt2[k]) <= targetCntInt): 
            # print "looping for k (multi): ",k," ",currentCnt," ",targetCntInt," ",totalRows," ",hcnt2[k]," ",hcnt2_min[k]," ",hcnt2_max[k]
            currentCnt += hcnt2[k]
            # ugly but have to break out if we'd cycle along with == adding h0's until we go too far
            # are we supposed to advance to a none zero bin?
            k += 1 # goes over in the equal case?
            # if currentCnt >= targetCntInt:
            #     break
            if k==maxBinCnt:
                break
            assert k<maxBinCnt, "k too large, k: %s maxBinCnt %s %s %s %s" % (k, maxBinCnt, currentCnt, targetCntInt, hcnt2[k-1])

        # format string to match java Log.info() in Quantiles.java
        print "Found k (multi): ",k," ",currentCnt," ",targetCntInt," ",totalRows," ",hcnt2[k]," ",hcnt2_min[k]," ",hcnt2_max[k]
        assert hcnt2[k]!=1 or hcnt2_min[k]==hcnt2_max[k]

        # some possibily interpolating guesses first, in guess we have to iterate (best guess)
        done = False
        guess = (hcnt2_max[k] - hcnt2_min[k]) / 2

        if currentCnt==targetCntInt:
            if hcnt2[k]>2 and (hcnt2_min[k]==hcnt2_max[k]):
                guess = hcnt2_min[k]
                print "Guess A", guess, k, hcnt2[k]

            if hcnt2[k]==2:
                print "\nTwo values in this bin but we could be aligned to the 2nd. so can't stop"
                # no matter what size the fraction, it would land on this number
                guess = (hcnt2_max[k] + hcnt2_min[k]) / 2.0

                if INTERPOLATION_TYPE==2: # type 2 (mean)
                    guess = (hcnt2_max[k] + hcnt2_min[k]) / 2.0
                else: # default to type 7 (linear interpolation)
                    # Unlike mean, which just depends on two adjacent values, this adjustment
                    # adds possible errors related to the arithmetic on the total # of rows.
                    dDiff = hcnt2_max[k] - hcnt2_min[k] # two adjacent..as if sorted!
                    pctDiff = targetCntFract # This is the fraction of total rows
                    guess = hcnt2_min[k] + (pctDiff * dDiff)

                done = False
                print "Guess B", guess

            if hcnt2[k]==1 and targetCntFract==0:
                assert hcnt2_min[k]==hcnt2_max[k]
                guess = hcnt2_min[k]
                done = True
                print "k", k
                print "Guess C", guess

            if hcnt2[k]==1 and targetCntFract!=0:
                assert hcnt2_min[k]==hcnt2_max[k]
                print "\nSingle value in this bin, but fractional means we need to interpolate to next non-zero"
                if k<maxBinCnt:
                    nextK = k + 1 # could put it over maxBinCnt
                else:
                    nextK = k
                while nextK<maxBinCnt and hcnt2[nextK]==0:
                    nextK += 1

                # have the "extra bin" for this
                if nextK >= maxBinCnt:
                    assert hcnt2_high!=0
                    print "Using hcnt2_high_min for interpolate:", hcnt2_high_min
                    nextVal = hcnt2_high_min
                else:
                    print "Using nextK for interpolate:", nextK
                    assert hcnt2[nextK]!=0
                    nextVal = hcnt2_min[nextK]

                guess = (hcnt2_max[k] + nextVal) / 2.0
                # OH! fixed bin as opposed to sort. Of course there are gaps between k and nextK

                if INTERPOLATION_TYPE==2: # type 2 (mean)
                    guess = (hcnt2_max[k] + nextVal) / 2.0
                    pctDiff = 0.5
                else: # default to type 7 (linear interpolation)
                    dDiff = nextVal - hcnt2_max[k] # two adjacent, as if sorted!
                    pctDiff = targetCntFract # This is the fraction of total rows
                    guess = hcnt2_max[k] + (pctDiff * dDiff)


                done = True # has to be one above us when needed. (or we're at end)

                print "k:", k, "hcnt2_max[k]:", hcnt2_max[k], "nextVal:", nextVal
                print "\nInterpolating result using nextK: %s nextVal: %s" % (nextK, nextVal)
                print "Guess D", guess

        if not done:
            print "Not done, setting new range",\
                "k: ", k,\
                "currentCnt: ", currentCnt,\
                "hcnt2_min[k]: ", hcnt2_min[k],\
                "hcnt2_max[k]: ", hcnt2_max[k]

            # possible bin leakage at start/end edges due to fp arith.
            # the bin index arith may resolve OVER the boundary created by the compare for hcnt2_high compare
            # rather than using NUDGE, see if there's a non-zero bin below (min) or above (max) you.
            # Just need to check the one bin below and above k, if they exist. 
            if k > 0 and hcnt2[k-1]>0 and (hcnt2_max[k-1]<hcnt2_min[k]):
                newValStart = hcnt2_max[k-1]
            else:
                newValStart = hcnt2_min[k]

            # subtle. we do put stuff in the extra end bin (see the print above that happens)
            # k might be pointing to one less than that (like k=0 for 1 bin case)
            # k+1 must stay in range; hcnt2 only has maxBinCnt entries
            if k < (maxBinCnt-1) and hcnt2[k+1]>0 and (hcnt2_min[k+1]>hcnt2_max[k]):
                newValEnd = hcnt2_min[k+1]
            else:
                newValEnd = hcnt2_max[k]
            
            newValRange = newValEnd - newValStart 
            # maxBinCnt is always binCount + 1, since we might cover over due to rounding/fp issues?
            newValBinSize = newValRange / (desiredBinCnt + 0.0)
            
            # the start/end should never change if we're just using one bin
            # this is a bin leakage test, if you use one bin. (we should never resolve exactly; stop at max iterations)
            # assumes NUDGE is 0
            if NUDGE == 0.0:
                assert desiredBinCnt>1 or (valStart==newValStart and valEnd==newValEnd),\
                    "if 1 bin, should be no per-pass edge leakage %s %s %s %s %s %s" % (k, hcnt2_high, valStart, newValStart, valEnd, newValEnd)
            newLowCount = currentCnt
            if newValBinSize==0:
                # assert done or newValBinSize!=0 and live with current guess
                print "Assuming done because newValBinSize is 0."
                print "newValRange: %s, hcnt2[k]: %s hcnt2_min[k]: %s hcnt2_max[k]: %s" %\
                     (newValRange, hcnt2[k], hcnt2_min[k], hcnt2_max[k])
                guess = newValStart
                print "Guess E", guess
                done = True

            # if we have to interpolate
            # if it falls into this bin, interpolate to this bin means one answer?

            # cover the case above with multiple entries in a bin, all the same value
            # will be zero on the last pass?
            # assert newValBinSize != 0 or done
            # need the count up to but not including newValStart

        best_result.append(guess)
        iteration += 1

        h2p.blue_print("Ending Pass", iteration)
        h2p.blue_print("best_result:", best_result, "done:", done, "hcnt2[k]", hcnt2[k])
        print "currentCnt", currentCnt, "targetCntInt", targetCntInt, "hcnt2_low", hcnt2_low, "hcnt2_high", hcnt2_high
        print "was", valStart, valEnd, valRange, valBinSize
        print "next", newValStart, newValEnd, newValRange, newValBinSize

    return best_result[-1]
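
A minimal usage sketch for findQuantile, assuming the module-level BIN_COUNT and INTERPOLATION_TYPE globals that binquant.py defines elsewhere (the values below are illustrative, not the originals):

import random

BIN_COUNT = 4            # assumed global read by findQuantile
INTERPOLATION_TYPE = 7   # assumed global; 7 selects linear interpolation

d = sorted(random.uniform(-100, 100) for _ in xrange(10001))
# d must be NA-free; pass the true column min/max as the starting range
median = findQuantile(d, d[0], d[-1], 0.5)
print "findQuantile median:", median, "sort-based median:", d[5000]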
Example #17
    def test_summary2_uniform_int_w_NA(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        M = 100
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, 1, 'B.hex', 1, 1000 * M,
             ('C1', 1.0 * M, 250.0 * M, 500.0 * M, 750.0 * M, 1000.0 * M)),
            (ROWS, 1, 'B.hex', 1, 1000, ('C1', 1.0, 250.0, 500.0, 750.0,
                                         1000.0)),
            (ROWS, 1, 'x.hex', 1, 20000, ('C1', 1.0, 5000.0, 10000.0, 15000.0,
                                          20000.0)),
            (ROWS, 1, 'x.hex', -5000, 0, ('C1', -5000.00, -3750.0, -2500.0,
                                          -1250.0, 0)),
            (ROWS, 1, 'x.hex', -100000, 100000, ('C1', -100000.0, -50000.0, 0,
                                                 50000.0, 100000.0)),

            # (ROWS, 1, 'A.hex', 1, 101,             ('C1',   1.0, 26.00, 51.00, 76.00, 101.0)),
            # (ROWS, 1, 'A.hex', -99, 99,            ('C1',  -99, -49.0, 0, 49.00, 99)),
            (ROWS, 1, 'B.hex', 1, 10000, ('C1', 1.0, 2501.0, 5001.0, 7501.0,
                                          10000.0)),
            (ROWS, 1, 'B.hex', -100, 100, ('C1', -100.0, -50.0, 0.0, 50.0,
                                           100.0)),
            (ROWS, 1, 'C.hex', 1, 100000, ('C1', 1.0, 25001.0, 50001.0,
                                           75001.0, 100000.0)),
            # (ROWS, 1, 'C.hex', -101, 101,          ('C1',  -101, -51, -1, 49.0, 100.0)),
        ]
        if not DO_REAL:
            # only 3 integer values!
            tryList.append(\
                (1000000, 1, 'x.hex', -1, 1,              ('C1',  -1.0, -1, 0.000, 1, 1.00)) \
                )

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax,
             expected) in tryList:
            # max error = half the bin size?

            maxDelta = ((expectedMax - expectedMin) / (MAX_QBINS + 0.0))
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta
            # also need to add some variance due to random distribution?
            # use a percentage of half the range as the allowed shift
            distMean = (expectedMax - expectedMin) / 2
            maxShift = distMean * .01
            maxDelta = maxDelta + maxShift
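            # e.g. assuming MAX_QBINS = 1000, for the -5000..0 case above:
            # maxDelta starts at 5000/1000 = 5, *1.05 -> 5.25, and
            # maxShift = 2500 * .01 = 25, for a final tolerance of 30.25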

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin,
                              expectedMax, SEEDPERFILE)
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=60,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(mins[0],
                                       expected[1],
                                       tol=maxDelta,
                                       msg='min is not approx. expected')
            h2o_util.assertApproxEqual(maxs[0],
                                       expected[5],
                                       tol=maxDelta,
                                       msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [
                0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99
            ]

            pctile = stats['pctile']
            h2o_util.assertApproxEqual(
                pctile[3],
                expected[2],
                tol=maxDelta,
                msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(
                pctile[5],
                expected[3],
                tol=maxDelta,
                msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(
                pctile[7],
                expected[4],
                tol=maxDelta,
                msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows / len(
                    hcnt
                )  # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # don't check the edge bins
                self.assertAlmostEqual(b,
                                       rowCount / len(hcnt),
                                       delta=.01 * rowCount,
                                       msg="Bins not right. b: %s e: %s" %
                                       (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname,
                            "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1

            scipyCol = 0
            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname != '':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist())
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0,  # what col to extract from the csv
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                )

            h2o.nodes[0].remove_all_keys()
Example #18
def import_only(node=None,
                schema='local',
                bucket=None,
                path=None,
                timeoutSecs=30,
                retryDelaySecs=0.1,
                initialDelaySecs=0,
                pollTimeoutSecs=180,
                noise=None,
                benchmarkLogging=None,
                noPoll=False,
                doSummary=True,
                src_key=None,
                noPrint=False,
                importParentDir=True,
                **kwargs):

    # FIX! hack all put to local, since h2o-dev doesn't have put yet?
    # multi-machine put will fail as a result.

    # if schema=='put':
    #    h2p.yellow_print("WARNING: hacking schema='put' to 'local'..h2o-dev doesn't support upload." +
    #        "\nMeans multi-machine with 'put' will fail")
    #    schema = 'local'

    if src_key and schema != 'put':
        raise Exception(
            "can only specify a 'src_key' param for schema='put'. You have %s %s"
            % (schema, src_key))

    # no bucket is sometimes legal (fixed path)
    if not node: node = h2o_nodes.nodes[0]

    if path is None:
        raise Exception("import_only: path parameter needs to be specified")

    if "/" in path:
        (head, pattern) = os.path.split(path)
    else:
        (head, pattern) = ("", path)

    verboseprint("head:", head)
    verboseprint("pattern:", pattern)

    # to train users / okay here
    # normally we import the folder above, but if we import exactly, the path can't have regex
    # the folder can't have regex in any case
    if importParentDir:
        if re.search(r"[\*<>{}[\]~`]", head):
            raise Exception("h2o folder path %s can't be regex. path= was %s" %
                            (head, path))
    else:
        if re.search(r"[\*<>{}[\]~`]", path):
            raise Exception("h2o path %s can't be regex. path= was %s" %
                            (head, path))

    if schema == 'put':
        # to train users
        if re.search(r"[/\*<>{}[\]~`]", pattern):
            raise Exception(
                "h2o putfile basename %s can't be regex. path= was %s" %
                (pattern, path))

        if not path:
            raise Exception("path= didn't say what file to put")

        (folderPath, filename) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, filename)
        verboseprint("put filename:", filename, "folderPath:", folderPath,
                     "filePath:", filePath)

        if not noPrint:
            h2p.green_print("\nimport_only:", h2o_args.python_test_name,
                            "uses put:/%s" % filePath)
            h2p.green_print("Local path to file that will be uploaded: %s" %
                            filePath)
            h2p.blue_print("That path resolves as:",
                           os.path.realpath(filePath))

        if h2o_args.abort_after_import:
            raise Exception(
                "Aborting due to abort_after_import (-aai) argument's effect in import_only()"
            )

        # h2o-dev: it always wants a key name
        if src_key is None:
            src_key = filename
        key = node.put_file(filePath, key=src_key, timeoutSecs=timeoutSecs)

        # hmm.. what should importResult be in the put case
        # set it to None. No import is done, and shouldn't be used if you're doing schema='put'
        # ..make it look like an import files result..This is just for test consistency
        importResult = json.loads('{\
          "dels": [],\
          "fails": [],\
          "files": ["%s"],\
          "keys": ["%s"],\
          "path": "%s",\
          "schema_name": null, "schema_type": null, "schema_version": null\
        }' % (filename, src_key, filePath))
        return (importResult, key)

    if schema=='local' and not \
            (node.redirect_import_folder_to_s3_path or node.redirect_import_folder_to_s3n_path):
        (folderPath, pattern) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, pattern)
        h2p.green_print("\nimport_only:", h2o_args.python_test_name,
                        "uses local:/%s" % filePath)
        h2p.green_print("Path h2o will be told to use: %s" % filePath)
        h2p.blue_print("If local jvms, path resolves locally as:",
                       os.path.realpath(filePath))
        if h2o_args.abort_after_import:
            raise Exception(
                "Aborting due to abort_after_import (-aai) argument's effect in import_only()"
            )

        # FIX! why are we returning importPattern here..it's different than finalImportString if we import a folder?
        # is it used for key matching by others?

        # FIX! hack ..h2o-dev is creating key names with the absolute path, not the sym link path
        # messes up for import folders that go thru /home/<user>/home-0xdiag-datasets
        # importPattern = folderURI + "/" + pattern
        # could include this on the entire importPattern if we no longer have regex basename in h2o-dev?

        folderURI = 'nfs:/' + folderPath
        # folderURI = 'nfs:/' + os.path.realpath(folderPath)
        if importParentDir:
            finalImportString = folderPath
        else:
            finalImportString = folderPath + "/" + pattern
        importResult = node.import_files(finalImportString,
                                         timeoutSecs=timeoutSecs)

    else:
        if bucket is not None and re.match("/", head):
            verboseprint("You said bucket:", bucket,
                         "so stripping incorrect leading '/' from", head)
            head = head.lstrip('/')

        # strip leading / in head if present
        if bucket and head != "":
            folderOffset = bucket + "/" + head
        elif bucket:
            folderOffset = bucket
        else:
            folderOffset = head

        if h2o_args.abort_after_import:
            raise Exception(
                "Aborting due to abort_after_import (-aai) argument's effect in import_only()"
            )

        n = h2o_nodes.nodes[0]
        if schema == 's3' or node.redirect_import_folder_to_s3_path:
            # this is just like s3n now? i.e. we can point down inside the s3 bucket like s3n?
            folderOffset = re.sub("smalldata", "h2o-smalldata", folderOffset)
            folderURI = "s3://" + folderOffset
            if not n.aws_credentials:
                print "aws_credentials: %s" % n.aws_credentials
                # raise Exception("Something was missing for s3 on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3 on the java -jar cmd line when the cloud was built"

            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString,
                                             timeoutSecs=timeoutSecs)

        elif schema == 's3n' or node.redirect_import_folder_to_s3n_path:
            # FIX! hack for now...when we change import folder to import s3, point to unique bucket name for h2o
            # should probably deal with this up in the bucket resolution
            # this may change other cases, but smalldata should only exist as a "bucket" for us?
            folderOffset = re.sub("smalldata", "h2o-smalldata", folderOffset)
            if not (n.use_hdfs and
                    ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s" % (
                    n.use_hdfs, n.hdfs_version, n.hdfs_name_node)
                if n.hdfs_config:
                    print "hdfs_config: %s" % n.hdfs_config
                # raise Exception("Something was missing for s3n on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3n on the java -jar cmd line when the cloud was built"
            folderURI = "s3n://" + folderOffset
            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString,
                                             timeoutSecs=timeoutSecs)

        elif schema == 'maprfs':
            if not n.use_maprfs:
                print "use_maprfs: %s" % n.use_maprfs
                # raise Exception("Something was missing for maprfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for maprfs on the java -jar cmd line when the cloud was built"
            # if I use the /// and default, the key names that get created by h2o only have 1 slash
            # so the parse doesn't find the key name
            if n.hdfs_name_node:
                folderURI = "maprfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than hdfs? normally we specify the name though
                # folderURI = "maprfs:///" + folderOffset
                folderURI = "maprfs:/" + folderOffset
            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString,
                                             timeoutSecs=timeoutSecs)

        elif schema == 'hdfs':
            # check that some state from the cloud building time was right
            # the requirements for this may change and require updating
            if not (n.use_hdfs and
                    ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s" % (
                    n.use_hdfs, n.hdfs_version, n.hdfs_name_node)
                if n.hdfs_config:
                    print "hdfs_config: %s" % n.hdfs_config
                # raise Exception("Something was missing for hdfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for hdfs on the java -jar cmd line when the cloud was built"

            if n.hdfs_name_node:
                folderURI = "hdfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                folderURI = "hdfs://" + folderOffset
            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString,
                                             timeoutSecs=timeoutSecs)

        else:
            raise Exception("schema not understood: %s" % schema)

    print "\nimport_only:", h2o_args.python_test_name, schema, "uses", finalImportString
    importPattern = folderURI + "/" + pattern
    return (importResult, importPattern)
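
Two hedged usage sketches for import_only (the bucket and paths here are illustrative, not from the original tests). With schema='local' the folder resolves to an nfs:/ URI; with schema='hdfs' the URI is built from the name node recorded when the cloud was built. With the default importParentDir=True the basename may be a regex pattern, since only the parent folder is actually imported:

(importResult, importPattern) = import_only(
    bucket='home-0xdiag-datasets', path='standard/some.csv', schema='local')
print "importPattern:", importPattern

# import the parent folder, then use the pattern for key matching downstream
(importResult, importPattern) = import_only(
    path='datasets/*.csv', schema='hdfs')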
Example #19
    def test_summary2_uniform(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, 1, 'x.hex', 0.0, 20000.0,        ['C1',  0, 5000.0, 10000.0, 15000.0, 20000.0]),
            (ROWS, 1, 'x.hex', -5000.0, 0.0,        ['C1',  -5000.0, -3750.0, -2500.0, -1250.0, 0.0]),
            (ROWS, 1, 'x.hex', -100000.0, 100000.0, ['C1',  -100000.0, -50000.0, 0.0, 50000.0, 100000.0]),
            (ROWS, 1, 'x.hex', -1.0, 1.0,           ['C1',  -1.0, -0.50, 0.0, 0.50, 1.0]),

            (ROWS, 1, 'A.hex', 1.0, 100.0,          ['C1',   1.0, 26.0, 51.0, 76.0, 100.0]),
            (ROWS, 1, 'A.hex', -99.0, 99.0,         ['C1',  -99.0, -50.0, 0.0, 50.0, 99.0]),

            (ROWS, 1, 'B.hex', 1.0, 10000.0,        ['C1',   1.0, 2501.0, 5001.0, 7501.0, 10000.0]),
            (ROWS, 1, 'B.hex', -100.0, 100.0,       ['C1',  -100.0, -50.0, 0.0, 50.0, 100.0]),

            (ROWS, 1, 'C.hex', 1.0, 100000.0,       ['C1',   1.0, 25001.0, 50001.0, 75001.0, 100000.0]),
            (ROWS, 1, 'C.hex', -100.0, 100.0,       ['C1',  -100.0, -50.0, 0.0, 50.0, 100.0]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            (actualMax, actualMin) = write_syn_dataset(csvPathname, rowCount, colCount, 
                expectedMin, expectedMax, SEEDPERFILE)
            # adjust the min/max depending on what the min/max actually was!
            # the expected 25%/50%/75% will still be off
            expected[1] = actualMin
            expected[5] = actualMax

            # max error = half the bin size?
            # use this for comparing to sklearn/sort
            expectedRange = expectedMax - expectedMin
            # because of floor and ceil effects due we potentially lose 2 bins (worst case)
            # the extra bin for the max value, is an extra bin..ignore
            expectedBin = expectedRange/(MAX_QBINS-2)
            maxDelta = 0.5 * expectedBin

            # how much error do we get in the random distribution gen? pain. It's a probability issue
            # smaller error likely with larger # of values.
            # the maxDelta used for the scipy/sort compare can be tighter, since it's looking
            # at actual data
            # this is way too coarse. can't get the distribution tight? 
            maxDeltaPlusDistVariance = 10 * maxDelta
            # allow some fuzz in the comparison to scipy/sort
            maxDelta = 1.1 * maxDelta 

            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=30, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]
            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            self.assertEqual(colname, expected[0])

            quantile = 0.5 if DO_MEDIAN else .999
            # get both answers since we feed both below for checking
            q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'],
                quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # linear
            qresult = q['result']
            qresult_single = q['result_single']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", q['iterations'])
            h2p.blue_print("h2o quantiles interpolated:", q['interpolated'])
            print h2o.dump_json(q)

            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']
            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            # these should match exactly except for fp compare error?
            h2o_util.assertApproxEqual(mins[0], expected[1], rel=.00001, msg='min is not expected')
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0], expected[5], rel=.00001, msg='max is not expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

            pctile = stats['pctile']
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDeltaPlusDistVariance, 
                msg='25th percentile is not approx. expected for generated uniform range %s %s' %\
                    (expectedMin, expectedMax))
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDeltaPlusDistVariance, 
                msg='50th percentile is not approx. expected for generated uniform range %s %s' %\
                    (expectedMin, expectedMax))
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDeltaPlusDistVariance, 
                msg='75th percentile is not approx. expected for generated uniform range %s %s' %\
                    (expectedMin, expectedMax))

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the first bin or the last two bins
            # too hard to estimate the edges when there are ints, due to floor/ceil int alignment?
            for b in hcnt[1:(-2 if len(hcnt)>2 else -1)]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows/len(hcnt)
                self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount,
                    msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1


            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname!='':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull, 
                    col=0, # what col to extract from the csv
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10], 
                    h2oQuantilesApprox=qresult_single,
                    h2oQuantilesExact=qresult,
                    h2oSummary2MaxErr=maxDelta,
                    )

            h2o.nodes[0].remove_all_keys()
Example #20
    def test_exec2_quant_cmp_uniform(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (500000, 1, 'x.hex', 1, 20000,        ('C1',  1.10, 5000.0, 10000.0, 15000.0, 20000.00)),
            (500000, 1, 'x.hex', -5000, 0,        ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)),
            (100000, 1, 'x.hex', -100000, 100000, ('C1',  -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)),
            (100000, 1, 'x.hex', -1, 1,           ('C1',  -1.05, -0.48, 0.0087, 0.50, 1.00)),

            (100000, 1, 'A.hex', 1, 100,          ('C1',   1.05, 26.00, 51.00, 76.00, 100.0)),
            (100000, 1, 'A.hex', -99, 99,         ('C1',  -99, -50.0, 0, 50.00, 99)),

            (100000, 1, 'B.hex', 1, 10000,        ('C1',   1.05, 2501.00, 5001.00, 7501.00, 10000.00)),
            (100000, 1, 'B.hex', -100, 100,       ('C1',  -100.10, -50.0, 0.85, 51.7, 100.00)),

            (100000, 1, 'C.hex', 1, 100000,       ('C1',   1.05, 25002.00, 50002.00, 75002.00, 100000.00)),
            (100000, 1, 'C.hex', -101, 101,       ('C1',  -100.10, -50.45, -1.18, 49.28, 100.00)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?
        
            maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta
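            # e.g. for the 1..20000 row above: (20000-1)/20.0/2.0 ~= 500.0, then *1.05 ~= 525.0
            # (the 20.0 presumably matches the ~20 histogram bins the summary produces here)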

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename
            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']
            self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']
            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999]
            pctile = stats['pctile']
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows/len(hcnt) 
                # apparently we're not able to estimate for these datasets
                # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, 
                #     msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            print "min/25/50/75/max colname:", colname, "(2 places):", compareActual
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2p.blue_print("\nTrying exec quantile")
            # thresholds = "c(0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99)"
            # do the equivalent exec quantile?
            # execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds)

            print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile"
            # 'thresholds' was commented out above; use the thresholds h2o actually used (stats['pct'])
            # so the exec result for thresholds[i] lines up with pctile[i]
            thresholds = pct
            for i, threshold in enumerate(thresholds):
                # FIX! do two of the same?..use same one for the 2nd
                if i!=0:
                    # execExpr = "r2=c(1); r2=quantile(%s[,4],c(0,.05,0.3,0.55,0.7,0.95,0.99))" % hex_key
                    execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s,%s));" % (hex_key, threshold, threshold)
                    (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                    h2p.green_print("\nresultExec: %s" % h2o.dump_json(resultExec))
                    h2p.blue_print("\nthreshold: %.2f Exec quantile: %s Summary2: %s" % (threshold, result, pt[i]))
                    if not result:
                        raise Exception("exec result: %s for quantile: %s is bad" % (result, threshold))
                    h2o_util.assertApproxEqual(result, pctile[i], tol=maxDelta, 
                        msg='exec percentile: %s too different from expected: %s' % (result, pctile[i]))
                # for now, do one with all, but no checking
                else:
                    # This seemed to "work" but how do I get the key name for the list of values returned
                    # the browser result field seemed right, but nulls in the key
                    if 1==0:
                        execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % (hex_key, ",".join(map(str,thresholds)))
                    else:
                        # does this way work? (column getting)
                        execExpr = "r2=c(1); r2=quantile(%s$C1, c(%s));" % (hex_key, ",".join(map(str,thresholds)))
                    (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                    inspect = h2o_cmd.runInspect(key='r2') 
                    numCols = inspect['numCols']
                    numRows = inspect['numRows']

                    self.assertEqual(numCols,1)
                    self.assertEqual(numRows,len(thresholds))
                    # FIX! should run thru the values in the col? how to get them?

            # compare the last one
            if colname!='':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0, # what col to extract from the csv
                    datatype='float',
                    quantile=thresholds[-1],
                    # h2oSummary2=pctile[-1],
                    # h2oQuantilesApprox=result, # from exec
                    h2oExecQuantiles=result,
                    )

            h2o.nodes[0].remove_all_keys()
Example #21
    def test_summary2_uniform_int_w_NA(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        M = 100
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, 1, "B.hex", 1, 1000 * M, ("C1", 1.0 * M, 250.0 * M, 500.0 * M, 750.0 * M, 1000.0 * M)),
            (ROWS, 1, "B.hex", 1, 1000, ("C1", 1.0, 250.0, 500.0, 750.0, 1000.0)),
            (ROWS, 1, "x.hex", 1, 20000, ("C1", 1.0, 5000.0, 10000.0, 15000.0, 20000.0)),
            (ROWS, 1, "x.hex", -5000, 0, ("C1", -5000.00, -3750.0, -2500.0, -1250.0, 0)),
            (ROWS, 1, "x.hex", -100000, 100000, ("C1", -100000.0, -50000.0, 0, 50000.0, 100000.0)),
            # (ROWS, 1, 'A.hex', 1, 101,             ('C1',   1.0, 26.00, 51.00, 76.00, 101.0)),
            # (ROWS, 1, 'A.hex', -99, 99,            ('C1',  -99, -49.0, 0, 49.00, 99)),
            (ROWS, 1, "B.hex", 1, 10000, ("C1", 1.0, 2501.0, 5001.0, 7501.0, 10000.0)),
            (ROWS, 1, "B.hex", -100, 100, ("C1", -100.0, -50.0, 0.0, 50.0, 100.0)),
            (ROWS, 1, "C.hex", 1, 100000, ("C1", 1.0, 25001.0, 50001.0, 75001.0, 100000.0)),
            # (ROWS, 1, 'C.hex', -101, 101,          ('C1',  -101, -51, -1, 49.0, 100.0)),
        ]
        if not DO_REAL:
            # only 3 integer values!
            tryList.append((1000000, 1, "x.hex", -1, 1, ("C1", -1.0, -1, 0.000, 1, 1.00)))

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?

            maxDelta = (expectedMax - expectedMin) / (MAX_QBINS + 0.0)
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta
            # also need to add some variance due to random distribution?
            # maybe a percentage of the mean
            distMean = (expectedMax - expectedMin) / 2
            maxShift = distMean * 0.01
            maxDelta = maxDelta + maxShift
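            # e.g. for the 1..20000 row (assuming MAX_QBINS=1000): bin term ~= 19999/1000 * 1.05 ~= 21,
            # shift term = 0.01 * (19999/2) ~= 100, so maxDelta ~= 121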

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = "syn_" + "binary" + "_" + str(rowCount) + "x" + str(colCount) + ".csv"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(
                path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=60, doSummary=False
            )
            print "Parse result['destination_key']:", parseResult["destination_key"]

            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult["summaries"][0]
            colname = column["colname"]
            self.assertEqual(colname, expected[0])

            coltype = column["type"]
            nacnt = column["nacnt"]

            stats = column["stats"]
            stattype = stats["type"]

            # FIX! we should compare mean and sd to expected?
            mean = stats["mean"]
            sd = stats["sd"]

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats["zeros"]
            mins = stats["mins"]
            maxs = stats["maxs"]
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg="min is not approx. expected")
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg="max is not approx. expected")

            pct = stats["pct"]
            # the thresholds h2o used, should match what we expected
            expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

            pctile = stats["pctile"]
            h2o_util.assertApproxEqual(
                pctile[3], expected[2], tol=maxDelta, msg="25th percentile is not approx. expected"
            )
            h2o_util.assertApproxEqual(
                pctile[5], expected[3], tol=maxDelta, msg="50th percentile (median) is not approx. expected"
            )
            h2o_util.assertApproxEqual(
                pctile[7], expected[4], tol=maxDelta, msg="75th percentile is not approx. expected"
            )

            hstart = column["hstart"]
            hstep = column["hstep"]
            hbrk = column["hbrk"]
            hcnt = column["hcnt"]

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows / len(hcnt)  # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # don't check the edge bins
                self.assertAlmostEqual(
                    b, rowCount / len(hcnt), delta=0.01 * rowCount, msg="Bins not right. b: %s e: %s" % (b, e)
                )

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1

            scipyCol = 0
            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname != "":
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0,  # what col to extract from the csv
                    datatype="float",
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                )

            h2o.nodes[0].remove_all_keys()
Example #22
    def test_summary2_uniform(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (5000000, 1, 'x.hex', 1, 20000,        ('C1',  1.10, 5000.0, 10000.0, 15000.0, 20000.00)),
            (5000000, 1, 'x.hex', -5000, 0,        ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)),
            (1000000, 1, 'x.hex', -100000, 100000, ('C1',  -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)),
            (1000000, 1, 'x.hex', -1, 1,           ('C1',  -1.05, -0.48, 0.0087, 0.50, 1.00)),

            (1000000, 1, 'A.hex', 1, 100,          ('C1',   1.05, 26.00, 51.00, 76.00, 100.0)),
            (1000000, 1, 'A.hex', -99, 99,         ('C1',  -99, -50.0, 0, 50.00, 99)),

            (1000000, 1, 'B.hex', 1, 10000,        ('C1',   1.05, 2501.00, 5001.00, 7501.00, 10000.00)),
            (1000000, 1, 'B.hex', -100, 100,       ('C1',  -100.10, -50.0, 0.85, 51.7, 100.00)),

            (1000000, 1, 'C.hex', 1, 100000,       ('C1',   1.05, 25002.00, 50002.00, 75002.00, 100000.00)),
            (1000000, 1, 'C.hex', -101, 101,       ('C1',  -100.10, -50.45, -1.18, 49.28, 100.00)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?
        
            maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            h2o.beta_features = False
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            h2o.beta_features = False
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename
            numRows = inspect["num_rows"]
            numCols = inspect["num_cols"]

            h2o.beta_features = True
            summaryResult = h2o_cmd.runSummary(key=hex_key)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']
            self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']
            print "colname:", colname, "mean (2 places):", twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999]
            pctile = stats['pctile']
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows/len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # don't check the edge bins
                self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, 
                    msg="Bins not right. b: %s e: %s" % (b, e))

            pt = twoDecimals(pctile)
            mx = twoDecimals(maxs)
            mn = twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            print "min/25/50/75/max colname:", colname, "(2 places):", compareActual
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2p.blue_print("\nTrying exec quantile")
            # thresholds = "c(0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99)"
            # do the equivalent exec quantile?
            # execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds)

            print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile"
            # 'thresholds' was commented out above; use the thresholds h2o actually used (stats['pct'])
            # so the exec result for thresholds[i] lines up with pctile[i]. also don't reuse 'trial'
            # as the loop variable..it's the trial counter
            thresholds = pct
            for i, threshold in enumerate(thresholds):
                execExpr = "quantile(%s[,1], c(%s));" % (hex_key, threshold)
                (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                h2p.green_print("\nresultExec: %s" % h2o.dump_json(resultExec))
                ex = twoDecimals(result)
                h2p.blue_print("\nthreshold: %.2f Exec quantile: %s Summary2: %s" % (threshold, ex, pt[i]))
                h2o_util.assertApproxEqual(result, pctile[i], tol=maxDelta,
                    msg='exec percentile: %s is not expected: %s' % (result, pctile[i]))

            if DO_TRY_SCIPY:
                generate_scipy_comparison(csvPathnameFull)
Example #23
    def test_summary2_exp(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        LAMBD = random.uniform(0.005, 0.5)
        tryList = [
            # co.label, (min, 25th, 50th, 75th, max)
# parse setup error
#            (1,     1, 'x.hex', 1, 20000,        ['C1', None, None, None, None, None]),
            (5,     1, 'x.hex', 1, 20000,        ['C1', None, None, None, None, None]),
#            (10,     1, 'x.hex', 1, 20000,        ['C1', None, None, None, None, None]),
#            (100,    1, 'x.hex', 1, 20000,        ['C1', None, None, None, None, None]),
#            (1000,   1, 'x.hex', -5000, 0,        ['C1', None, None, None, None, None]),
#            (10000,  1, 'x.hex', -100000, 100000, ['C1', None, None, None, None, None]),
#            (100000, 1, 'x.hex', -1, 1,           ['C1', None, None, None, None, None]),
#            (1000000, 1, 'A.hex', 1, 100,          ['C1', None, None, None, None, None]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60

        class Column(object):
            def __init__(self, column):
                assert isinstance(column, dict)
                for k,v in column.iteritems():
                    setattr(self, k, v) # achieves self.k = v

        for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname, "lambd:", LAMBD
            (expectedMin, expectedMax) = write_syn_dataset(csvPathname, 
                rowCount, colCount, lambd=LAMBD, SEED=SEEDPERFILE)
            print "expectedMin:", expectedMin, "expectedMax:", expectedMax
            maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            expected[1] = expectedMin
            expected[5] = expectedMax

            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', 
                hex_key=hex_key, timeoutSecs=30, doSummary=False)
            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

            inspect = h2o_cmd.runInspect(key=parse_key)
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
            print "\n" + csvFilename
            # column 0?
            summaryResult = h2o_cmd.runSummary(key=hex_key, column='C1')
            h2o.verboseprint("Summary2 summaryResult:", h2o.dump_json(summaryResult))

            # default_pctiles
            # isText
            # rows
            # off
            # key
            # checksum

            # only one column
            columns = summaryResult['frames'][0]['columns']
            default_pctiles = summaryResult['frames'][0]['default_pctiles']
            co = Column(columns[0])
            # how are enums binned? Stride of 1? (what about domain values)
            coList = [
                co.base,
                len(co.bins),
                len(co.data),
                co.domain,
                co.label,
                co.maxs,
                co.mean,
                co.mins,
                co.missing,
                co.ninfs,
                co.pctiles,
                co.pinfs,
                co.precision,
                co.sigma,
                co.str_data,
                co.stride,
                co.type,
                co.zeros,
                ]

            for c in coList:
                print c

            print "len(co.bins):", len(co.bins)

            print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean)
            # what is precision. -1?
            print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma)

            print "FIX! hacking the co.pctiles because it's short by two"
            pctiles = [0] + co.pctiles + [0]
            
            # the thresholds h2o used, should match what we expected
            if expected[0]:
                self.assertEqual(co.label, expected[0])
            if expected[1]:
                h2o_util.assertApproxEqual(co.mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
            if expected[2]:
                h2o_util.assertApproxEqual(pctiles[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            if expected[3]:
                h2o_util.assertApproxEqual(pctiles[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            if expected[4]:
                h2o_util.assertApproxEqual(pctiles[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')
            if expected[5]:
                h2o_util.assertApproxEqual(co.maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            # figure out the expected max error
            # use this for comparing to sklearn/sort
            if expected[1] and expected[5]:
                expectedRange = expected[5] - expected[1]
                # because of floor and ceil effects, we can potentially lose 2 bins (worst case)
                # the extra bin for the max value is just that..an extra bin, so ignore it
                expectedBin = expectedRange/(MAX_QBINS-2)
                maxErr = expectedBin # should we have some fuzz for fp?

            else:
                print "Test won't calculate max expected error"
                maxErr = 0

            pt = h2o_util.twoDecimals(pctiles)
            mx = h2o_util.twoDecimals(co.maxs)
            mn = h2o_util.twoDecimals(co.mins)

            print "co.label:", co.label, "co.pctiles (2 places):", pt
            print "default_pctiles:", default_pctiles
            print "co.label:", co.label, "co.maxs: (2 places):", mx
            print "co.label:", co.label, "co.mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):", compareActual)
            print "co.label:", co.label, "co.maxs (2 places):", mx
            print "co.label:", co.label, "co.mins (2 places):", mn

            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0
            print "h2oSummary2MaxErr", maxErr
            if co.label!='' and expected[scipyCol]:
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    skipHeader=False,
                    col=scipyCol,
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctiles[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                    h2oSummary2MaxErr=maxErr,
                    )
Example #24
def findQuantile(d, dmin, dmax, threshold):
    # return the value at the threshold, or the mean of the two rows that bound it.
    # fixed bin count per pass. Stops at maxIterations if not resolved to one true answer
    maxIterations = 30

    # totalRows should be cleansed of NAs. assume d doesn't have NAs (cleaned elsewhere)
    totalRows = len(d)
    desiredBinCnt = BIN_COUNT
    maxBinCnt = desiredBinCnt + 1  # might go one over due to FP issues

    # initialize
    newValStart = dmin
    newValEnd = dmax
    newValRange = newValEnd - newValStart
    desiredBinCnt = BIN_COUNT  # Could do per-pass adjustment, but fixed works fine.
    newValBinSize = newValRange / (desiredBinCnt + 0.0)
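    # e.g. if BIN_COUNT were 1000 (hypothetical) and d spanned 0..20000,
    # the first pass would use bins of size 20000/1000.0 = 20.0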
    newLowCount = 0  # count of rows below the bins
    # yes there is no newHighCount. Created during the pass, though.

    # state shared by each pass
    assert maxBinCnt > 0

    hcnt2 = [None for b in range(maxBinCnt)]
    hcnt2_min = [None for b in range(maxBinCnt)]
    hcnt2_max = [None for b in range(maxBinCnt)]
    hcnt2_low = 0
    hcnt2_high = 0

    assert newValBinSize != 0  # can be negative
    assert newValEnd > newValStart
    assert newValRange > 0

    # break out on stopping condition
    # reuse the histogram array hcnt2[]
    iteration = 0
    done = False
    # append to a list of best guesses per pass
    best_result = []

    def htot2():
        return sum(hcnt2) + hcnt2_low + hcnt2_high

    while iteration <= maxIterations and not done:
        h2p.green_print("newValStart", newValStart)
        h2p.green_print("newValEnd", newValEnd)
        h2p.green_print("newValRange", newValRange)
        h2p.green_print("newValBinSize", newValBinSize)
        h2p.green_print("newLowCount", newLowCount)
        h2p.green_print("threshold", threshold)

        valStart = newValStart
        valEnd = newValEnd
        valRange = newValRange
        valBinSize = newValBinSize
        lowCount = newLowCount
        desiredBinCnt = BIN_COUNT
        maxBinCnt = desiredBinCnt + 1  # might go one over due to FP issues

        # playing with creating relative NUDGE values to make sure bin range
        # is always inclusive of target.
        # ratio it down from valBinSize.
        # It doesn't need to be as big as valBinSize.
        # implicitly, it shouldn't need to be as large as valBinSize
        # can't seem to make it work yet. leave NUDGE=0
        NUDGE = 0

        # init to zero for each pass
        for b in range(maxBinCnt):
            hcnt2[b] = 0.0

        # Init counts outside of the bins
        hcnt2_low = 0
        hcnt2_high = 0

        # minimum value for higher than the bin. Needed for interpolation
        hcnt2_high_min = None

        for val in d:
            # Need to count the stuff outside the bin-gathering,
            # since threshold compare is based on total row compare
            # on first pass, shouldn't see anything exceed the start/end bounds
            # since those are min/max for the column? (shouldn't be any fp precision issue? or ??)
            # oh wait, this valOffset math creates possible precision issue?
            # maybe we should address it with the NUDGE value below? but what about first pass?
            valOffset = val - valStart
            # where are we zeroing in? (start)
            binIdx2 = int(math.floor(
                valOffset /
                (valBinSize + 0.0)))  # make sure it's always an fp divide?

            # do some close looking for possible fp arith issues
            cA = valOffset < 0
            cB = binIdx2 < 0
            t = {True: 1, False: 0}
            # we get the 10 case
            if ((cA and not cB) or (not cA and cB)):
                h2p.red_print("AB Interesting lower bin edge case %s%s" % (t[cA], t[cB]), "cA", cA, "cB", cB, "valOffSet", valOffSet, \
                    "binIdx2", binIdx2)
            cC = val > valEnd
            cD = binIdx2 >= (maxBinCnt - 1)  # tighten the compare for printing
            if ((cC and not cD) or (not cC and cD)):
                h2p.red_print("CD Interesting upper bin edge case %s%s" % (t[cC], t[cD]), "cC", cC, "cB", cD, "val", val, "valEnd", valEnd, \
                    "binIdx2", binIdx2, "maxBinCnt", maxBinCnt)
                # example hits this case..i.e. the max value
                # CD Interesting upper bin edge case 01 cC False cB True val 100.995097486 valEnd 100.995097486 binIdx2 2 maxBinCnt 3

            if valOffset < 0 or binIdx2 < 0:
                # if valOffset < 0:
                # if binIdx2<0:
                hcnt2_low += 1
            # prevent the extra bin from being used..i.e. eliminate the fuzziness for sure!
            # have to use both compares, since can wrap the index (due to start/end shift)
            # elif val > valEnd or binIdx2>=(maxBinCnt-1):
            # should this really be a valOffset compare?
            elif val > valEnd or binIdx2 >= maxBinCnt:
                # elif val > valEnd:
                # elif binIdx2>=(maxBinCnt-1):
                if (hcnt2_high == 0) or (val < hcnt2_high_min):
                    hcnt2_high_min = val
                    print "hcnt2_high_min update:", hcnt2_high_min, valOffset, val, valStart, hcnt2_high, val, valEnd
                hcnt2_high += 1
            else:
                # print "(multi) val: ",val," valOffset: ",valOffset," valBinSize: ",valBinSize

                assert binIdx2 >=0 and binIdx2<=(maxBinCnt-1), "val %s %s %s %s binIdx2: %s maxBinCnt: %s valBinSize: %s" % \
                    (val, valStart, valEnd, valOffset, binIdx2, maxBinCnt, valBinSize)
                if hcnt2[binIdx2] == 0 or (val < hcnt2_min[binIdx2]):
                    hcnt2_min[binIdx2] = val
                if hcnt2[binIdx2] == 0 or (val > hcnt2_max[binIdx2]):
                    hcnt2_max[binIdx2] = val
                hcnt2[binIdx2] += 1

                # check if we went into the magic extra bin
                if binIdx2 == (maxBinCnt - 1):
                    print "\nFP! val went into the extra maxBinCnt bin:", \
                    binIdx2, hcnt2_high_min, valOffset, val, valStart, hcnt2_high, val, valEnd,"\n"

            # check the legal states for these two
            # we don't have None for checking hcnt2_high_min in java
            assert hcnt2_high == 0 or (hcnt2_high_min is not None)
            assert (hcnt2_high_min is None) or hcnt2_high != 0

        # everything should either be in low, the bins, or high
        totalBinnedRows = htot2()
        print "totalRows check: %s htot2(): %s should be equal. hcnt2_low: %s hcnt2_high: %s" % \
            (totalRows, totalBinnedRows, hcnt2_low, hcnt2_high)

        assert totalRows==totalBinnedRows, "totalRows: %s htot2() %s not equal. hcnt2_low: %s hcnt2_high: %s" % \
            (totalRows, totalBinnedRows, hcnt2_low, hcnt2_high)

        # now walk thru and find out what bin to look inside
        currentCnt = hcnt2_low
        targetCntFull = threshold * (totalRows - 1)  # zero based indexing
        targetCntInt = int(math.floor(threshold * (totalRows - 1)))
        targetCntFract = targetCntFull - targetCntInt
        assert targetCntFract >= 0 and targetCntFract <= 1
        print "targetCntInt:", targetCntInt, "targetCntFract", targetCntFract

        k = 0
        while ((currentCnt + hcnt2[k]) <= targetCntInt):
            # print "looping for k (multi): ",k," ",currentCnt," ",targetCntInt," ",totalRows," ",hcnt2[k]," ",hcnt2_min[k]," ",hcnt2_max[k]
            currentCnt += hcnt2[k]
            # ugly but have to break out if we'd cycle along with == adding h0's until we go too far
            # are we supposed to advance to a none zero bin?
            k += 1  # goes over in the equal case?
            # if currentCnt >= targetCntInt:
            #     break
            if k == maxBinCnt:
                break
            assert k < maxBinCnt, "k too large, k: %s maxBinCnt %s %s %s %s" % (
                k, maxBinCnt, currentCnt, targetCntInt, hcnt2[k - 1])

        # format string to match java Log.info() in Quantiles.java
        print "Found k (multi): ", k, " ", currentCnt, " ", targetCntInt, " ", totalRows, " ", hcnt2[
            k], " ", hcnt2_min[k], " ", hcnt2_max[k]
        assert hcnt2[k] != 1 or hcnt2_min[k] == hcnt2_max[k]

        # some possibly-interpolated guesses first, in case we have to iterate (best guess per pass)
        done = False
        # initial guess: the midpoint of the target bin
        guess = (hcnt2_max[k] + hcnt2_min[k]) / 2.0

        # we might not have gotten all the way
        if currentCnt == targetCntInt:
            if hcnt2[k] > 2 and (hcnt2_min[k] == hcnt2_max[k]):
                guess = hcnt2_min[k]
                print "Guess A", guess, k, hcnt2[k]

            if hcnt2[k] == 2:
                print "hello"
                print "\nTwo values in this bin but we could be aligned to the 2nd. so can't stop"
                # no mattter what size the fraction it would be on this number
                guess = (hcnt2_max[k] + hcnt2_min[k]) / 2.0
                # no mattter what size the fraction it would be on this number

                if INTERPOLATION_TYPE == 2:  # type 2 (mean)
                    guess = (hcnt2_max[k] + hcnt2_min[k]) / 2.0

                else:  # default to type 7 (linear interpolation)
                    # Unlike mean, which just depends on two adjacent values, this adjustment
                    # adds possible errors related to the arithmetic on the total # of rows.
                    dDiff = hcnt2_max[k] - hcnt2_min[
                        k]  # two adjacent..as if sorted!
                    pctDiff = targetCntFract  # This is the fraction of total rows
                    guess = hcnt2_min[k] + (pctDiff * dDiff)
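                    # e.g. hcnt2_min[k]=10.0, hcnt2_max[k]=20.0, targetCntFract=0.25
                    # -> guess = 10.0 + 0.25*10.0 = 12.5 (linear, like R's quantile type=7)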

                done = False
                print "Guess B", guess

            if hcnt2[k] == 1 and targetCntFract == 0:
                assert hcnt2_min[k] == hcnt2_max[k]
                guess = hcnt2_min[k]
                done = True
                print "k", k
                print "Guess C", guess

            if hcnt2[k] == 1 and targetCntFract != 0:
                assert hcnt2_min[k] == hcnt2_max[k]
                print "\nSingle value in this bin, but fractional means we need to interpolate to next non-zero"
                if k < maxBinCnt:
                    nextK = k + 1  # could put it over maxBinCnt
                else:
                    nextK = k
                while nextK < maxBinCnt and hcnt2[nextK] == 0:
                    nextK += 1

                # have the "extra bin" for this
                if nextK >= maxBinCnt:
                    assert hcnt2_high != 0
                    print "Using hcnt2_high_min for interpolate:", hcnt2_high_min
                    nextVal = hcnt2_high_min
                else:
                    print "Using nextK for interpolate:", nextK
                    assert hcnt2[nextK] != 0
                    nextVal = hcnt2_min[nextK]

                guess = (hcnt2_max[k] + nextVal) / 2.0
                # OH! fixed bin as opposed to sort. Of course there are gaps between k and nextK

                if INTERPOLATION_TYPE == 2:  # type 2 (mean)
                    guess = (hcnt2_max[k] + nextVal) / 2.0
                    pctDiff = 0.5
                else:  # default to type 7 (linear interpolation)
                    dDiff = nextVal - hcnt2_max[
                        k]  # two adjacent, as if sorted!
                    pctDiff = targetCntFract  # This is the fraction of total rows
                    guess = hcnt2_max[k] + (pctDiff * dDiff)

                done = True  # has to be one above us when needed. (or we're at end)

                print "k:", k, "hcnt2_max[k]:", hcnt2_max[k], "nextVal:", nextVal
                print "\nInterpolating result using nextK: %s nextVal: %s" % (
                    nextK, nextVal)
                print "Guess D", guess

        if not done:
            print "%s %s %s %s Not done, setting new range" % (hcnt2[k], currentCnt, targetCntInt, targetCntFract),\
                "k: ", k,\
                "currentCnt: ", currentCnt,\
                "hcnt2_min[k]: ", hcnt2_min[k],\
                "hcnt2_max[k]: ", hcnt2_max[k]

            # possible bin leakage at start/end edges due to fp arith.
            # the bin index arith may resolve OVER the boundary created by the compare for hcnt2_high compare
            # rather than using NUDGE, see if there's a non-zero bin below (min) or above (max) you.
            # Just need to check the one bin below and above k, if they exist.
            if k > 0 and hcnt2[k - 1] > 0 and (hcnt2_max[k - 1] <
                                               hcnt2_min[k]):
                print "1"
                newValStart = hcnt2_max[k - 1]
            else:
                print "2"
                newValStart = hcnt2_min[k]

            # subtle. we do put stuff in the extra end bin (see the print above that happens)
            # k might be pointing to one less than that (like k=0 for 1 bin case)
            if k < (maxBinCnt - 1) and hcnt2[k + 1] > 0 and (hcnt2_min[k + 1] > hcnt2_max[k]):
                print "3"
                newValEnd = hcnt2_min[k + 1]
            else:
                print "4"
                newValEnd = hcnt2_max[k]

            newValRange = newValEnd - newValStart
            # maxBinCnt is always binCount + 1, since we might cover over due to rounding/fp issues?
            newValBinSize = newValRange / (desiredBinCnt + 0.0)

            # the start/end should never change if we're just using one bin
            # this is a bin leakage test, if you use one bin. (we should never resolve exactly; we stop at max iterations)
            # assumes NUDGE is 0
            if NUDGE == 0.0:
                assert desiredBinCnt>1 or (valStart==newValStart and valEnd==newValEnd),\
                    "if 1 bin, should be no per-pass edge leakage %s %s %s %s %s %s" % (k, hcnt2_high, valStart, newValStart, valEnd, newValEnd)
            newLowCount = currentCnt
            if newValBinSize == 0:
                # assert done or newValBinSize!=0 and live with current guess
                print "Assuming done because newValBinSize is 0."
                print "newValRange: %s, hcnt2[k]: %s hcnt2_min[k]: %s hcnt2_max[k]: %s" %\
                     (newValRange, hcnt2[k], hcnt2_min[k], hcnt2_max[k])
                guess = newValStart
                print "Guess E", guess
                # was done = True 3/20/14
                done = True

            # if we have to interpolate
            # if it falls into this bin, interpolate to this bin means one answer?

            # cover the case above with multiple entries in a bin, all the same value
            # will be zero on the last pass?
            # assert newValBinSize != 0 or done
            # need the count up to but not including newValStart

        best_result.append(guess)
        iteration += 1

        h2p.blue_print("Ending Pass", iteration)
        h2p.blue_print("best_result:", best_result, "done:", done, "hcnt2[k]",
                       hcnt2[k])
        print "currentCnt", currentCnt, "targetCntInt", targetCntInt, "hcnt2_low", hcnt2_low, "hcnt2_high", hcnt2_high
        print "was", valStart, valEnd, valRange, valBinSize
        print "next", newValStart, newValEnd, newValRange, newValBinSize

    return best_result[-1]
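
# A minimal usage sketch (hypothetical; assumes BIN_COUNT and INTERPOLATION_TYPE are
# defined at module scope, as the code above expects). d need not be sorted, but must
# be free of NAs:
#   d = [random.random() for _ in xrange(10000)]
#   median = findQuantile(d, min(d), max(d), threshold=0.5)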
Example #25
def runSummary(node=None,
               key=None,
               column=None,
               expected=None,
               maxDelta=None,
               noPrint=False,
               **kwargs):
    if not key: raise Exception('No key for Summary')
    if not node: node = h2o_nodes.nodes[0]
    # return node.summary(key, **kwargs)

    i = InspectObj(key=key)
    # just so I don't have to change names below
    missingList = i.missingList
    labelList = i.labelList
    numRows = i.numRows
    numCols = i.numCols
    print "labelList:", labelList
    assert labelList is not None

    # doesn't take indices? only column labels?
    # return first column, unless specified

    if not (column is None or isinstance(column, (basestring, int))):
        raise Exception(
            "column param should be string or integer index or None %s %s" %
            (type(column), column))

    # either return the first col, or the col identified by label. the column identified could be string or index?
    if column is None:  # means the summary json when we ask for col 0, will be what we return (do all though)
        colNameToDo = labelList
        colIndexToDo = range(len(labelList))
    elif isinstance(column, int):
        colNameToDo = [labelList[column]]
        colIndexToDo = [column]
    elif isinstance(column, basestring):
        colNameToDo = [column]
        if column not in labelList:
            raise Exception("% not in labellist: %s" % (column, labellist))
        colIndexToDo = [labelList.index(column)]
    else:
        raise Exception("wrong type %s for column %s" % (type(column), column))

    # we get the first column as result after walking across all, if no column parameter
    desiredResult = None
    for (colIndex, colName) in zip(colIndexToDo, colNameToDo):
        print "doing summary on %s %s" % (colIndex, colName)
        # ugly looking up the colIndex
        co = SummaryObj(key=key, colIndex=colIndex, colName=colName)
        if not desiredResult:
            desiredResult = co

        if not noPrint:
            for k, v in co:
                # only print [0] of mins and maxs because of the e308 values when they don't have dataset values
                if k == 'mins' or k == 'maxs':
                    print "%s[0]" % k, v[0]
                else:
                    print k, v

        if expected is not None:
            print "len(co.histogram_bins):", len(co.histogram_bins)
            print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(
                co.mean)
            # what is precision. -1?
            print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(
                co.sigma)

            # print "FIX! hacking the co.percentiles because it's short by two"
            # if co.percentiles:
            #     percentiles = [0] + co.percentiles + [0]
            # else:
            #     percentiles = None
            percentiles = co.percentiles
            assert len(co.percentiles) == len(co.default_percentiles)

            # the thresholds h2o used, should match what we expected
            # expected = [0] * 5
            # Fix. doesn't check for expected = 0?

            # max of one bin
            if maxDelta is None:
                maxDelta = (co.maxs[0] - co.mins[0]) / 1000

            if expected[0]:
                h2o_util.assertApproxEqual(co.mins[0],
                                           expected[0],
                                           tol=maxDelta,
                                           msg='min is not approx. expected')
            if expected[1]:
                h2o_util.assertApproxEqual(
                    percentiles[2],
                    expected[1],
                    tol=maxDelta,
                    msg='25th percentile is not approx. expected')
            if expected[2]:
                h2o_util.assertApproxEqual(
                    percentiles[4],
                    expected[2],
                    tol=maxDelta,
                    msg='50th percentile (median) is not approx. expected')
            if expected[3]:
                h2o_util.assertApproxEqual(
                    percentiles[6],
                    expected[3],
                    tol=maxDelta,
                    msg='75th percentile is not approx. expected')
            if expected[4]:
                h2o_util.assertApproxEqual(co.maxs[0],
                                           expected[4],
                                           tol=maxDelta,
                                           msg='max is not approx. expected')

            # figure out the expected max error
            # use this for comparing to sklearn/sort
            MAX_QBINS = 1000
            if expected[0] and expected[4]:
                expectedRange = expected[4] - expected[0]
                # because of floor and ceil effects due we potentially lose 2 bins (worst case)
                # the extra bin for the max value, is an extra bin..ignore
                expectedBin = expectedRange / (MAX_QBINS - 2)
                maxErr = expectedBin  # should we have some fuzz for fp?

            else:
                print "Test won't calculate max expected error"
                maxErr = 0

            pt = h2o_util.twoDecimals(percentiles)

            # only look at [0] for now...big e308 numbers if unpopulated due to not enough unique values in dataset column
            mx = h2o_util.twoDecimals(co.maxs[0])
            mn = h2o_util.twoDecimals(co.mins[0])

            print "co.label:", co.label, "co.percentiles (2 places):", pt
            print "co.default_percentiles:", co.default_percentiles
            print "co.label:", co.label, "co.maxs: (2 places):", mx
            print "co.label:", co.label, "co.mins: (2 places):", mn

            # FIX! why would percentiles be None? enums?
            if pt is None:
                compareActual = mn, [None] * 3, mx
            else:
                compareActual = mn, pt[2], pt[4], pt[6], mx

            h2p.green_print("actual min/25/50/75/max co.label:", co.label,
                            "(2 places):", compareActual)
            h2p.green_print("expected min/25/50/75/max co.label:", co.label,
                            "(2 places):", expected)

    return desiredResult
Example #26
    def test_rand_inspect(self):
        ### h2b.browseTheCloud()
        csvFilename = 'covtype.data'
        csvPathname = 'UCI/UCI-large/covtype/'+ csvFilename
        hex_key = csvFilename + ".hex"
        print "\n" + csvPathname

        parseResult = h2i.import_parse(bucket='datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10)
        destination_key = parseResult['destination_key']
        print csvFilename, 'parse time:', parseResult['response']['time']
        print "Parse result['destination_key']:", destination_key 

        def inspect_and_check(nodeX, destination_key, offset, view, inspectOld=None):
            inspectNew = h2o_cmd.runInspect(h2o.nodes[nodeX], destination_key, offset=offset, view=view)
            if h2o.beta_features:
                pass
                # print "Inspect2:", h2o.dump_json(inspectNew)
            else:
                pass
                # print "Inspect:", h2o.dump_json(inspectNew)

            # FIX! get min/max/mean/variance for a col too?
            constantNames = [
                ('num_cols', 'numCols'),
                ('num_rows', 'numRows'),
                ('value_size_bytes', 'byteSize'),
                ('cols', 'cols'),
                ]

            colNames = [
                ('num_missing_values', 'naCnt'),
                ]

            for (i,j) in constantNames:
                # check the fields, even if you don't have a previous one to compare to
                if h2o.beta_features:
                    # hack in extra info for now, from the new names to old names
                    if not j in inspectNew:
                        raise Exception("Can't find %s, Inspect2 result should have it?" % j)
                    inspectNew[i] = inspectNew[j]

                # don't compare if cols
                if inspectOld  and i != 'cols':
                    if h2o.beta_features and i=='value_size_bytes': # Inspect2 should be smaller
                        self.assertGreater(inspectOld[i], inspectNew[i])
                        
                    else:
                        # for cols it will just compare length?
                        self.assertEqual(inspectOld[i], inspectNew[i])

                if i=='cols':
                    for (m,n) in colNames:
                        if h2o.beta_features:
                            if not n in inspectNew[i][0]:
                                print h2o.dump_json(inspectNew[i][0])
                                raise Exception("Can't find %s, Inspect2 result['cols'][0] should have it?" % n)
                            inspectNew[i][0][m] = inspectNew[i][0][n]
                        # just compare 0
                        if inspectOld is not None:
                            self.assertEqual(inspectOld[i][0][m], inspectNew[i][0][m])

            return inspectNew

        # going to use this to compare against future. num_rows/num_cols should always
        # be the same, regardless of the view. just a coarse sanity check
        origInspect = inspect_and_check(0, destination_key, 0, 1, None)
        h2o.verboseprint(h2o.dump_json(origInspect))
        origStoreViewResult = h2o_cmd.runStoreView(offset=0, view=1024, timeoutSecs=60)

        num_rows = origInspect['num_rows']
        num_cols = origInspect['num_cols']

        lenNodes = len(h2o.nodes)
        for trial in range (10):
            h2p.green_print("\nTrial", trial)
            # we want to use the boundary conditions, so have two level of random choices
            offset = good_choices(num_rows)
            view = good_choices(num_cols)
            # randomize the node used
            nodeX = random.randint(0,lenNodes-1)
            print "nodeX:", nodeX, "offset:", offset, "view:", view
            h2o.beta_features = False
            inspect_and_check(nodeX,destination_key,offset,view,origInspect)
            print "trying Inspect2 by flipping h2o.nodes[0].beta_features"
            h2o.beta_features = True
            # delay between the two inspects...bug around not getting autoframe in storeview?
            time.sleep(1)            
            inspect_and_check(nodeX,destination_key,offset,view,origInspect)
            h2o.beta_features = False

            # a fvec frame should have been created in the storeView
            time.sleep(1)            

            # loop looking for the autoframe to show up
            # o = len(origStoreViewResult['keys'])
            o = h2i.count_keys_at_all_nodes()
            retry = 0
            okay = False
            while retry==0 or not okay:
                newStoreViewResult = h2o_cmd.runStoreView(offset=0, view=1024, timeoutSecs=60)
                ## p = len(newStoreViewResult['keys'])
                p = h2i.count_keys_at_all_nodes()
                print "number of keys in the two StoreViews, o:", o, "p:", p
                ## print "newStoreViewResult:", h2o.dump_json(newStoreViewResult)
                oOkay = {1, 2, 3, 4, 5, 6}
                pOkay = {1, 2, 3, 4, 5}
                print "o:", o, "oOkay:", oOkay, "p:", p, "pOkay:", pOkay
                if (o in oOkay) and (p in pOkay):
                    print "Good"
                    okay = True
                else:
                    print "Unexpected o,p after autoframe, looking at total keys in system: %s %s" % (o,p)

                if retry==10:
                    raise Exception("StoreView didn't get autoframe, after %s retries" % retry)
                ## h2b.browseJsonHistoryAsUrlLastMatch("StoreView")

                # delete it, so it gets recreated??
                deleted = h2i.delete_keys_at_all_nodes(pattern='autoframe')
                # The autoframe key may not show up!!
                if INVISIBLE_AUTOFRAME:
                    # can be 0 or 1
                    if not (deleted==0 or deleted==1):
                        msg = "Should have deleted a total of 0 or 1 keys, looking at all nodes. Did %s" % deleted
                        raise Exception(msg)
                else:
                    # must be exactly 1
                    if not (deleted==1):
                        msg = "Should have deleted a total of 1 key, looking at all nodes. Did %s" % deleted
                        raise Exception(msg)
                
                time.sleep(1)
                retry += 1
Example #27
    def test_quant_cmp_uniform(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (5 * ROWS, 1, 'x.hex', 1, 20000,
             ['C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00]),
            (5 * ROWS, 1, 'x.hex', -5000, 0,
             ['C1', -5001.00, -3750.0, -2445, -1200.0, 99]),
            (1 * ROWS, 1, 'x.hex', -100000, 100000,
             ['C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0]),
            (1 * ROWS, 1, 'x.hex', -1, 1,
             ['C1', -1.05, -0.48, 0.0087, 0.50, 1.00]),
            (1 * ROWS, 1, 'A.hex', 1, 100,
             ['C1', 1.05, 26.00, 51.00, 76.00, 100.0]),
            (1 * ROWS, 1, 'A.hex', -99, 99, ['C1', -99, -50.0, 0, 50.00, 99]),
            (1 * ROWS, 1, 'B.hex', 1, 10000,
             ['C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00]),
            (1 * ROWS, 1, 'B.hex', -100, 100,
             ['C1', -100.10, -50.0, 0.85, 51.7, 100.00]),
            (1 * ROWS, 1, 'C.hex', 1, 100000,
             ['C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00]),
            (1 * ROWS, 1, 'C.hex', -101, 101,
             ['C1', -100.10, -50.45, -1.18, 49.28, 100.00]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax,
             expected) in tryList:
            # max error = half the bin size?
            colname = expected[0]
            maxDelta = ((expectedMax - expectedMin) / 1000.0) / 2.0

            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin,
                              expectedMax, SEEDPERFILE)
            # need the full pathname when python parses the csv for numpy/sort
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)

            #***************************
            # Parse
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult,
                                  expectedNumRows=rowCount,
                                  expectedNumCols=colCount)
            numRows = pA.numRows
            numCols = pA.numCols
            parse_key = pA.parse_key
            # this guy can take json object as first thing, or re-read with key
            iA = h2o_cmd.InspectObj(parse_key,
                                    expectedNumRows=rowCount,
                                    expectedNumCols=colCount,
                                    expectedMissinglist=[])

            #***************************
            # Summary
            co = h2o_cmd.runSummary(key=parse_key)
            default_pctiles = co.default_pctiles

            coList = [
                co.base,
                len(co.bins),
                len(co.data), co.domain, co.label, co.maxs, co.mean, co.mins,
                co.missing, co.ninfs, co.pctiles, co.pinfs, co.precision,
                co.sigma, co.str_data, co.stride, co.type, co.zeros
            ]
            for c in coList:
                print c

            print "len(co.bins):", len(co.bins)
            print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(
                co.mean)
            print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(
                co.sigma)

            print "FIX! hacking the co.pctiles because it's short by two"
            summ_pctiles = [0] + co.pctiles + [0]

            pt = h2o_util.twoDecimals(summ_pctiles)
            mx = h2o_util.twoDecimals(co.maxs)
            mn = h2o_util.twoDecimals(co.mins)
            exp = h2o_util.twoDecimals(expected[1:])

            print "co.label:", co.label, "co.pctiles (2 places):", pt
            print "default_pctiles:", default_pctiles
            print "co.label:", co.label, "co.maxs: (2 places):", mx
            print "co.label:", co.label, "co.mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\
                mn[0], pt[3], pt[5], pt[7], mx[0])
            h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\
                exp[0], exp[1], exp[2], exp[3], exp[4])

            #***************************
            # Quantile
            # the thresholds h2o used, should match what we expected

            # using + here seems to result in an odd tuple..doesn't look right to h2o param
            # so went with this. Could add '[' and ']' to the list first, before the join.
            probsStr = "[%s]" % ",".join(map(str, probsList))
            parameters = {
                'model_id': "a.hex",
                'training_frame': parse_key,
                'validation_frame': parse_key,
                'ignored_columns': None,
                'probs': probsStr,
            }

            model_key = 'qhex'
            bmResult = h2o.n0.build_model(algo='quantile',
                                          model_id=model_key,
                                          training_frame=parse_key,
                                          parameters=parameters,
                                          timeoutSecs=10)
            bm = OutputObj(bmResult, 'bm')

            msec = bm.jobs[0]['msec']
            print "bm msec", msec

            # quantile result is just a job result to a key
            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0], 'model')

            print "model.output:", model.output
            print "model.output:['quantiles']", model.output['quantiles']
            print "model.output:['iterations']", model.output['iterations']
            print "model.output:['names']", model.output['names']
            quantiles = model.output['quantiles'][
                0]  # why is this a double array
            iterations = model.output['iterations']
            assert iterations == 11, iterations
            print "quantiles: ", quantiles
            print "iterations: ", iterations

            # cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            # cmm = OutputObj(cmmResult, 'cmm')

            # mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            # mm = OutputObj(mmResult, 'mm')

            # prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
            # pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
            h2o_cmd.runStoreView()

            trial += 1
            # compare the last threshold
            if colname != '':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0,  # what col to extract from the csv
                    datatype='float',
                    quantile=CHECK_PCTILE,
                    # h2oSummary2=pctile[-1],
                    # h2oQuantilesApprox=result, # from exec
                    h2oExecQuantiles=quantiles[CHECK_PCTILE_INDEX],
                )
            h2o.nodes[0].remove_all_keys()
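
The quantile_comparisons() call above cross-checks the h2o answer against a sort-based percentile computed straight from the csv. A minimal sketch of that idea (nearest-rank on a sorted list; h2o_summ's real comparison also interpolates and takes several h2o sources, so treat this as an illustration only):

def check_quantile(h2oValue, data, quantile, maxErr):
    # sort-based reference value; nearest-rank is a simplification here
    s = sorted(data)
    ref = s[int(round(quantile * (len(s) - 1)))]
    assert abs(h2oValue - ref) <= maxErr, \
        "h2o %s vs sort %s exceeds maxErr %s" % (h2oValue, ref, maxErr)

check_quantile(3.0, [1, 2, 3, 4, 5], 0.5, 0.001)
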
Example #28
    def test_summary2_exp(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        LAMBD = random.uniform(0.005, 0.5)
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (5,     1, 'x.hex', 1, 20000,         ['C1', None, None, None, None, None]),
            (10,     1, 'x.hex', 1, 20000,        ['C1', None, None, None, None, None]),
            (100,    1, 'x.hex', 1, 20000,        ['C1', None, None, None, None, None]),
            (1000,   1, 'x.hex', -5000, 0,        ['C1', None, None, None, None, None]),
            (10000,  1, 'x.hex', -100000, 100000, ['C1', None, None, None, None, None]),
            (100000, 1, 'x.hex', -1, 1,           ['C1', None, None, None, None, None]),
            (1000000, 1, 'A.hex', 1, 100,         ['C1', None, None, None, None, None]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        # rangeMin and rangeMax are not used right now
        for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname, "lambd:", LAMBD
            (expectedMin, expectedMax) = write_syn_dataset(csvPathname, 
                rowCount, colCount, lambd=LAMBD, SEED=SEEDPERFILE)
            print "expectedMin:", expectedMin, "expectedMax:", expectedMax
            maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            expected[1] = expectedMin
            expected[5] = expectedMax

            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', header=0, hex_key=hex_key, timeoutSecs=30, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("Summary2 summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']
            pct = stats['pct']
            expectedPct = [0.001, 0.001, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999]
            pctile = stats['pctile']
            # the thresholds h2o used, should match what we expected
            if expected[0]:
                self.assertEqual(colname, expected[0])
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
            if expected[2]:
                h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            if expected[3]:
                h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            if expected[4]:
                h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')
            if expected[5]:
                h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            print "Can't estimate the bin distribution"

            # figure out the expected max error
            # use this for comparing to sklearn/sort
            if expected[1] and expected[5]:
                expectedRange = expected[5] - expected[1]
                # because of floor and ceil effects, we potentially lose 2 bins (worst case)
                # the extra bin for the max value is ignored
                expectedBin = expectedRange/(MAX_QBINS-2)
                maxErr = expectedBin # should we have some fuzz for fp?

            else:
                print "Test won't calculate max expected error"
                maxErr = 0


            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0
            if colname!='' and expected[scipyCol]:
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    skipHeader=False,
                    col=scipyCol,
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                    h2oSummary2MaxErr=maxErr,
                    )
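
write_syn_dataset() isn't shown in this example. A minimal sketch of what a writer with that signature plausibly does for the exponential case (random.expovariate with the lambd above, returning the observed min/max); this body is an assumption, not the test's actual helper:

import random

def write_syn_dataset(csvPathname, rowCount, colCount, lambd, SEED):
    # one exponential draw per cell; track the observed range so the test
    # can derive maxDelta from it
    r = random.Random(SEED)
    mn = mx = None
    f = open(csvPathname, 'w')
    for _ in xrange(rowCount):
        row = [r.expovariate(lambd) for _ in xrange(colCount)]
        mn = min(row) if mn is None else min(mn, min(row))
        mx = max(row) if mx is None else max(mx, max(row))
        f.write(','.join(['%f' % v for v in row]) + '\n')
    f.close()
    return (mn, mx)
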
Example #29
    def test_summary2_small(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            # if rowCount is None, we'll just use  the data values
            # None in expected values means no compare
            (None, 1, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)),
            (None, 2, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)),
            (None, 10, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)),
            (None, 100, 'x.hex', [-1, 0,
                                  1], ('C1', None, None, 0, None, None)),
            (None, 1000, 'x.hex', [-1, 0,
                                   1], ('C1', None, None, 0, None, None)),
            # (None, 10000, 'x.hex', [-1,0,1],        ('C1',  None, None, 0, None, None)),
            # (COLS, 1, 'x.hex', [1,0,-1],        ('C1',  None, None, None, None, None)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, values, expected) in tryList:
            # max error = half the bin size?

            expectedMax = max(values)
            expectedMin = min(values)
            maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            # hmm...say we should be 100% accurate for these tests?
            maxDelta = 0

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            if not rowCount:
                rowFile = len(values)
            else:
                rowFile = rowCount
            csvFilename = 'syn_' + "binary" + "_" + str(rowFile) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, values,
                              SEEDPERFILE)

            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               max_qbins=MAX_QBINS,
                                               timeoutSecs=45)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            quantile = 0.5 if DO_MEDIAN else 0.999
            q = h2o.nodes[0].quantiles(source_key=hex_key,
                                       column=0,
                                       interpolation_type=7,
                                       quantile=quantile,
                                       max_qbins=MAX_QBINS,
                                       multiple_pass=2)
            qresult = q['result']
            qresult_single = q['result_single']
            qresult_iterations = q['iterations']
            qresult_interpolated = q['interpolated']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
            h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
            print h2o.dump_json(q)

            self.assertLess(
                qresult_iterations,
                16,
                msg=
                "h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?"
            )

            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']

            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']
            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [
                0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99
            ]

            pctile = stats['pctile']
            print "pctile:", pctile
            if expected[0]:
                self.assertEqual(colname, expected[0])
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0],
                                           expected[1],
                                           tol=maxDelta,
                                           msg='min is not approx. expected')
            if expected[2]:
                h2o_util.assertApproxEqual(
                    pctile[3],
                    expected[2],
                    tol=maxDelta,
                    msg='25th percentile is not approx. expected')
            if expected[3]:
                h2o_util.assertApproxEqual(
                    pctile[5],
                    expected[3],
                    tol=maxDelta,
                    msg='50th percentile (median) is not approx. expected')
            if expected[4]:
                h2o_util.assertApproxEqual(
                    pctile[7],
                    expected[4],
                    tol=maxDelta,
                    msg='75th percentile is not approx. expected')
            if expected[5]:
                h2o_util.assertApproxEqual(maxs[0],
                                           expected[5],
                                           tol=maxDelta,
                                           msg='max is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows / len(
                    hcnt
                )  # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # don't check the edge bins
                self.assertAlmostEqual(b,
                                       numRows / len(hcnt),
                                       delta=1 + .01 * numRows,
                                       msg="Bins not right. b: %s e: %s" %
                                       (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname,
                            "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0

            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname != '':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=scipyCol,  # what col to extract from the csv
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    h2oQuantilesExact=qresult,
                )
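
For reference, interpolation_type=7 in the quantiles call above corresponds to R's type-7 rule (linear interpolation between order statistics, also numpy's default). A small sketch of the rule itself, not h2o's implementation:

import math

def quantile_type7(sorted_vals, q):
    # h = (n-1)*q; interpolate linearly between floor(h) and floor(h)+1
    h = (len(sorted_vals) - 1) * q
    k = int(math.floor(h))
    if k + 1 >= len(sorted_vals):
        return sorted_vals[-1]
    return sorted_vals[k] + (h - k) * (sorted_vals[k + 1] - sorted_vals[k])

print quantile_type7([1, 2, 3, 4], 0.5)    # 2.5
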
Example #30
    def test_summary2_int2B(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (100000, 1, 'B.hex', 2533255332, 2633256000,   ('C1',  None, None, None, None, None)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?
        
            maxDelta = ((expectedMax - expectedMin)/(MAX_QBINS + 0.0)) 
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta
            # also need to add some variance due to random distribution?
            # maybe a percentage of the mean
            distMean = (expectedMax - expectedMin) / 2
            maxShift = distMean * .01
            maxDelta = maxDelta + maxShift

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=60, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            if expected[0]:
                self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

            pctile = stats['pctile']
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
                h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
                h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
                h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')

                h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows/len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # apparently we can't estimate any more
                # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, 
                #     msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1

            scipyCol = 0 
Example #31
File: h2o_cmd.py Project: milzod/h2o-dev
def runSummary(node=None, key=None, expected=None, column=None, **kwargs):
    if not key: raise Exception('No key for Summary')
    if not node: node = h2o_nodes.nodes[0]
    # return node.summary(key, **kwargs)

    class Column(object):
        def __init__(self, column):
            assert isinstance(column, dict)
            for k,v in column.iteritems():
                setattr(self, k, v) # achieves self.k = v

        def __iter__(self):
            for attr, value in self.__dict__.iteritems():
                yield attr, value

    inspect = runInspect(key=key)
    # change missingList definition: None if all empty, otherwise align to cols. 0 if 0?
    missingList, labelList, numRows, numCols = infoFromInspect(inspect)

    # doesn't take indices? only column labels?
    lastChecksum = None
    # return first column, unless specified
    desiredResult = None
    for label in labelList:
        print "doing summary on %s" % label
        summaryResult = node.summary(key=key, column=label)
        if not desiredResult or (column and column==label):
            desiredResult = summaryResult
        
        verboseprint("column", column, "summaryResult:", dump_json(summaryResult))

        # this should be the same for all the cols? Or does the checksum change?
        frame = summaryResult['frames'][0]
        default_pctiles = frame['default_pctiles']
        checksum = frame['checksum']
        rows = frame['rows']
        columns = frame['columns']

        # assert len(columns) == numCols
        assert rows == numRows
        assert checksum != 0 and checksum is not None
        assert rows != 0 and rows is not None
        assert not frame['isText']
        # FIX! why is frame['key'] = None here?
        # assert frame['key'] == key, "%s %s" % (frame['key'], key)

        # it changes?
        # assert not lastChecksum or lastChecksum == checksum

        lastChecksum = checksum

        # only one column
        co = Column(columns[0])
        # how are enums binned. Stride of 1? (what about domain values)
        coList = [co.base, len(co.bins), len(co.data),
            co.domain, co.label, co.maxs, co.mean, co.mins, co.missing, co.ninfs, co.pctiles,
            co.pinfs, co.precision, co.sigma, co.str_data, co.stride, co.type, co.zeros]

        # for c in coList:
        #    print c
        for k,v in co:
            print k, v

        print "len(co.bins):", len(co.bins)
        print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean)
        # what is precision. -1?
        print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma)

        print "FIX! hacking the co.pctiles because it's short by two"
        
        if co.pctiles:
            pctiles = [0] + co.pctiles + [0]
        else:
            pctiles = None

        # the thresholds h2o used, should match what we expected
        if expected is None:
            expected = [0] * 5
        # Fix. doesn't check for expected = 0?
        # NOTE: maxDelta is never defined in this function; derive a loose
        # tolerance from the expected range as a fallback (an assumption)
        if expected[0] and expected[4]:
            maxDelta = 0.05 * abs(expected[4] - expected[0])
        else:
            maxDelta = 0
        if expected[0]: h2o_util.assertApproxEqual(co.mins[0], expected[0], tol=maxDelta,
            msg='min is not approx. expected')
        if expected[1]: h2o_util.assertApproxEqual(pctiles[3], expected[1], tol=maxDelta,
            msg='25th percentile is not approx. expected')
        if expected[2]: h2o_util.assertApproxEqual(pctiles[5], expected[2], tol=maxDelta,
            msg='50th percentile (median) is not approx. expected')
        if expected[3]: h2o_util.assertApproxEqual(pctiles[7], expected[3], tol=maxDelta,
            msg='75th percentile is not approx. expected')
        if expected[4]: h2o_util.assertApproxEqual(co.maxs[0], expected[4], tol=maxDelta,
            msg='max is not approx. expected')

        # figure out the expected max error
        # use this for comparing to sklearn/sort
        MAX_QBINS = 1000
        if expected[0] and expected[4]:
            expectedRange = expected[4] - expected[0]
            # because of floor and ceil effects, we potentially lose 2 bins (worst case)
            # the extra bin for the max value is ignored
            expectedBin = expectedRange/(MAX_QBINS-2)
            maxErr = expectedBin # should we have some fuzz for fp?

        else:
            print "Test won't calculate max expected error"
            maxErr = 0

        pt = h2o_util.twoDecimals(pctiles)
        mx = h2o_util.twoDecimals(co.maxs)
        mn = h2o_util.twoDecimals(co.mins)

        print "co.label:", co.label, "co.pctiles (2 places):", pt
        print "default_pctiles:", default_pctiles
        print "co.label:", co.label, "co.maxs: (2 places):", mx
        print "co.label:", co.label, "co.mins: (2 places):", mn

        # FIX! why would pctiles be None? enums?
        if pt is None:
            compareActual = mn[0], [None] * 3, mx[0]
        else:
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]

        h2p.green_print("actual min/25/50/75/max co.label:", co.label, "(2 places):", compareActual)
        h2p.green_print("expected min/25/50/75/max co.label:", co.label, "(2 places):", expected)

    return desiredResult
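
The Column wrapper above just lifts dict keys to attributes and makes the column iterable as (name, value) pairs. A short usage sketch, assuming the class were visible at module scope:

c = Column({'label': 'C1', 'mean': 0.5, 'sigma': 1.2})
print c.label, c.mean       # attribute access instead of dict indexing
for attr, value in c:       # __iter__ yields (attr, value) pairs
    print attr, value
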
Example #32
    def test_summary2_uniform_w_NA(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0,
                                          20000.00)),
            (ROWS, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445,
                                          -1200.0, 99)),
            (ROWS, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0,
                                                 1613.0, 50000.0, 100000.0)),
            (ROWS, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50,
                                       1.00)),
            (ROWS, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00,
                                        100.0)),
            (ROWS, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)),
            (ROWS, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00,
                                          7501.00, 10000.00)),
            (ROWS, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7,
                                           100.00)),
            (ROWS, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00,
                                           75002.00, 100000.00)),
            (ROWS, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28,
                                           100.00)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax,
             expected) in tryList:
            # max error = half the bin size?

            maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            h2o.beta_features = False
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin,
                              expectedMax, SEEDPERFILE)
            h2o.beta_features = False
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["num_rows"]
            numCols = inspect["num_cols"]

            h2o.beta_features = True
            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               noPrint=False,
                                               max_qbins=MAX_QBINS,
                                               numRows=numRows,
                                               numCols=numCols)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']
            self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']

            mins = stats['mins']
            h2o_util.assertApproxEqual(mins[0],
                                       expected[1],
                                       tol=maxDelta,
                                       msg='min is not approx. expected')

            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0],
                                       expected[5],
                                       tol=maxDelta,
                                       msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [
                0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99
            ]

            pctile = stats['pctile']
            h2o_util.assertApproxEqual(
                pctile[3],
                expected[2],
                tol=maxDelta,
                msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(
                pctile[5],
                expected[3],
                tol=maxDelta,
                msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(
                pctile[7],
                expected[4],
                tol=maxDelta,
                msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            print "numRows:", numRows, "rowCount: ", rowCount
            self.assertEqual((1 + NA_ROW_RATIO) * rowCount,
                             numRows,
                             msg="numRows %s should be %s" %
                             (numRows, (1 + NA_ROW_RATIO) * rowCount))

            # don't check the last bin
            # we sometimes get a messed up histogram for all NA cols? just don't let them go thru here
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?

                e = rowCount / len(
                    hcnt
                )  # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # don't check the edge bins
                # NA rows should be ignored
                self.assertAlmostEqual(b,
                                       e,
                                       delta=2 * e,
                                       msg="Bins not right. b: %s e: %s" %
                                       (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname,
                            "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            scipyCol = 1

            h2i.delete_keys_at_all_nodes()
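
The numRows assertion above depends on the dataset writer appending NA rows at a fixed ratio. A tiny sketch of that accounting (NA_ROW_RATIO is defined by the test module; 0.2 here is purely illustrative):

NA_ROW_RATIO = 0.2            # illustrative; the test module sets the real value
rowCount = 1000               # data rows written
naRows = int(NA_ROW_RATIO * rowCount)
numRows = rowCount + naRows   # what Inspect should report after parse
assert numRows == (1 + NA_ROW_RATIO) * rowCount
print "data rows:", rowCount, "NA rows:", naRows, "total:", numRows
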
Example #33
def import_only(node=None, schema='local', bucket=None, path=None,
    timeoutSecs=30, retryDelaySecs=0.5, initialDelaySecs=0.5, pollTimeoutSecs=180, noise=None,
    benchmarkLogging=None, noPoll=False, doSummary=True, src_key=None, noPrint=False, 
    importParentDir=True, **kwargs):

    if src_key and schema!='put':
        raise Exception("can only specify a 'src_key' param for schema='put'. You have %s %s" % (schema, src_key))

    # no bucket is sometimes legal (fixed path)
    if not node: node = h2o.nodes[0]

    if path is None:
        raise Exception("import_only: path parameter needs to be specified")

    if "/" in path:
        (head, pattern) = os.path.split(path)
    else:
        (head, pattern)  = ("", path)

    h2o.verboseprint("head:", head)
    h2o.verboseprint("pattern:", pattern)

    # to train users / okay here
    # normally we import the folder above, but if we import exactly, the path can't have regex
    # the folder can't have regex in any case
    if importParentDir:
        if re.search(r"[\*<>{}[\]~`]", head):
           raise Exception("h2o folder path %s can't be regex. path= was %s" % (head, path))
    else:
        if re.search(r"[\*<>{}[\]~`]", path):
           raise Exception("h2o path %s can't be regex. path= was %s" % (head, path))

    if schema=='put':
        # to train users
        if re.search(r"[/\*<>{}[\]~`]", pattern):
           raise Exception("h2o putfile basename %s can't be regex. path= was %s" % (pattern, path))

        if not path: 
            raise Exception("path= didn't say what file to put")

        (folderPath, filename) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, filename)
        h2o.verboseprint("put filename:", filename, "folderPath:", folderPath, "filePath:", filePath)

        if not noPrint:
            h2p.green_print("\nimport_only:", h2o.python_test_name, "uses put:/%s" % filePath) 
            h2p.green_print("Local path to file that will be uploaded: %s" % filePath)
            h2p.blue_print("That path resolves as:", os.path.realpath(filePath))

        
        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")
    
        key = node.put_file(filePath, key=src_key, timeoutSecs=timeoutSecs)
        return (None, key)

    if schema=='local' and not \
            (node.redirect_import_folder_to_s3_path or node.redirect_import_folder_to_s3n_path):
        (folderPath, pattern) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, pattern)
        h2p.green_print("\nimport_only:", h2o.python_test_name, "uses local:/%s" % filePath)
        h2p.green_print("Path h2o will be told to use: %s" % filePath)
        h2p.blue_print("If local jvms, path resolves locally as:", os.path.realpath(filePath))
        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

          
        folderURI = 'nfs:/' + folderPath
        if importParentDir:
            importResult = node.import_files(folderPath, timeoutSecs=timeoutSecs)
        else:
            importResult = node.import_files(folderPath + "/" + pattern, timeoutSecs=timeoutSecs)

    else:
        if bucket is not None and re.match("/", head):
            h2o.verboseprint("You said bucket:", bucket, "so stripping incorrect leading '/' from", head)
            head = head.lstrip('/')
    
        # strip leading / in head if present
        if bucket and head!="":
            folderOffset = bucket + "/" + head
        elif bucket:
            folderOffset = bucket
        else:
            folderOffset = head

        print "\nimport_only:", h2o.python_test_name, schema, "uses", schema + "://" + folderOffset + "/" + pattern
        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        n = h2o.nodes[0]
        if schema=='s3' or node.redirect_import_folder_to_s3_path:
            folderURI = "s3://" + folderOffset
            if not n.aws_credentials:
                print "aws_credentials: %s" % n.aws_credentials
                # raise Exception("Something was missing for s3 on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3 on the java -jar cmd line when the cloud was built"
            importResult = node.import_s3(bucket, timeoutSecs=timeoutSecs)

        elif schema=='s3n' or node.redirect_import_folder_to_s3n_path:
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s hdfs_config: %s" % \
                    (n.use_hdfs, n.hdfs_version, n.hdfs_name_node, n.hdfs_config)
                # raise Exception("Something was missing for s3n on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3n on the java -jar cmd line when the cloud was built"
            folderURI = "s3n://" + folderOffset
            if importParentDir:
                importResult = node.import_hdfs(folderURI, timeoutSecs=timeoutSecs)
            else:
                importResult = node.import_hdfs(folderURI + "/" + pattern, timeoutSecs=timeoutSecs)

        elif schema=='maprfs':
            if not n.use_maprfs:
                print "use_maprfs: %s" % n.use_maprfs
                # raise Exception("Something was missing for maprfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for maprfs on the java -jar cmd line when the cloud was built"
            # if I use the /// and default, the key names that get created by h2o only have 1 slash
            # so the parse doesn't find the key name
            if n.hdfs_name_node:
                folderURI = "maprfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                # folderURI = "maprfs:///" + folderOffset
                folderURI = "maprfs:/" + folderOffset
            if importParentDir:
                importResult = node.import_hdfs(folderURI, timeoutSecs=timeoutSecs)
            else:
                importResult = node.import_hdfs(folderURI + "/" + pattern, timeoutSecs=timeoutSecs)

        elif schema=='hdfs':
            # check that some state from the cloud building time was right
            # the requirements for this may change and require updating
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s hdfs_config: %s" % \
                    (n.use_hdfs, n.hdfs_version, n.hdfs_name_node, n.hdfs_config)
                # raise Exception("Something was missing for hdfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for hdfs on the java -jar cmd line when the cloud was built"

            if n.hdfs_name_node:
                folderURI = "hdfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                folderURI = "hdfs://" + folderOffset
            if importParentDir:
                importResult = node.import_hdfs(folderURI, timeoutSecs=timeoutSecs)
            else:
                importResult = node.import_hdfs(folderURI + "/" + pattern, timeoutSecs=timeoutSecs)

        else: 
            raise Exception("schema not understood: %s" % schema)

    importPattern = folderURI + "/" + pattern
    return (importResult, importPattern)
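
The bucket/head handling above reduces to a small pure function. A sketch showing how the URI gets composed under the same rules (the bucket and path below are illustrative, not from any test):

def compose_folder_uri(schema, bucket, head):
    # mirrors the branches above: strip a leading '/' from head when a
    # bucket is given, then join the non-empty pieces
    if bucket and head.startswith('/'):
        head = head.lstrip('/')
    if bucket and head != "":
        folderOffset = bucket + "/" + head
    elif bucket:
        folderOffset = bucket
    else:
        folderOffset = head
    return schema + "://" + folderOffset

print compose_folder_uri('s3', 'datasets', '/UCI/UCI-large')    # s3://datasets/UCI/UCI-large
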
Example #34
File: h2o_import.py Project: hihihippp/h2o
def import_only(node=None, schema='local', bucket=None, path=None,
    timeoutSecs=30, retryDelaySecs=0.5, initialDelaySecs=0.5, pollTimeoutSecs=180, noise=None,
    benchmarkLogging=None, noPoll=False, doSummary=True, src_key=None, noPrint=False, **kwargs):

    # no bucket is sometimes legal (fixed path)
    if not node: node = h2o.nodes[0]

    if path is None:
        raise Exception("import_only: path parameter needs to be specified")

    if "/" in path:
        (head, pattern) = os.path.split(path)
    else:
        (head, pattern)  = ("", path)

    h2o.verboseprint("head:", head)
    h2o.verboseprint("pattern:", pattern)

    # to train users / okay here
    if re.search(r"[\*<>{}[\]~`]", head):
       raise Exception("h2o folder path %s can't be regex. path= was %s" % (head, path))

    if schema=='put':
        # to train users
        if re.search(r"[/\*<>{}[\]~`]", pattern):
           raise Exception("h2o putfile basename %s can't be regex. path= was %s" % (pattern, path))

        if not path: 
            raise Exception("path= didn't say what file to put")

        (folderPath, filename) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, filename)
        h2o.verboseprint("put filename:", filename, "folderPath:", folderPath, "filePath:", filePath)

        if not noPrint:
            h2p.green_print("\nimport_only:", h2o.python_test_name, "uses put:/%s" % filePath) 
            h2p.green_print("Local path to file that will be uploaded: %s" % filePath)
            h2p.blue_print("That path resolves as:", os.path.realpath(filePath))

        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")
    
        key = node.put_file(filePath, key=src_key, timeoutSecs=timeoutSecs)
        return (None, key)

    if schema=='local' and not \
            (node.redirect_import_folder_to_s3_path or node.redirect_import_folder_to_s3n_path):
        (folderPath, pattern) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, pattern)
        h2p.green_print("\nimport_only:", h2o.python_test_name, "uses local:/%s" % filePath)
        h2p.green_print("Path h2o will be told to use: %s" % filePath)
        h2p.blue_print("If local jvms, path resolves locally as:", os.path.realpath(filePath))
        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        folderURI = 'nfs:/' + folderPath
        importResult = node.import_files(folderPath, timeoutSecs=timeoutSecs)

    else:
        if bucket is not None and re.match("/", head):
            h2o.verboseprint("You said bucket:", bucket, "so stripping incorrect leading '/' from", head)
            head = head.lstrip('/')
    
        # strip leading / in head if present
        if bucket and head!="":
            folderOffset = bucket + "/" + head
        elif bucket:
            folderOffset = bucket
        else:
            folderOffset = head

        print "\nimport_only:", h2o.python_test_name, schema, "uses", schema + "://" + folderOffset + "/" + pattern
        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        n = h2o.nodes[0]
        if schema=='s3' or node.redirect_import_folder_to_s3_path:
            folderURI = "s3://" + folderOffset
            if not n.aws_credentials:
                print "aws_credentials: %s" % n.aws_credentials
                # raise Exception("Something was missing for s3 on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3 on the java -jar cmd line when the cloud was built"
            importResult = node.import_s3(bucket, timeoutSecs=timeoutSecs)

        elif schema=='s3n' or node.redirect_import_folder_to_s3n_path:
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s hdfs_config: %s" % \
                    (n.use_hdfs, n.hdfs_version, n.hdfs_name_node, n.hdfs_config)
                # raise Exception("Something was missing for s3n on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3n on the java -jar cmd line when the cloud was built"
            folderURI = "s3n://" + folderOffset
            importResult = node.import_hdfs(folderURI, timeoutSecs=timeoutSecs)

        elif schema=='maprfs':
            if not n.use_maprfs:
                print "use_maprfs: %s" % n.use_maprfs
                # raise Exception("Something was missing for maprfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for maprfs on the java -jar cmd line when the cloud was built"
            # if I use the /// and default, the key names that get created by h2o only have 1 slash
            # so the parse doesn't find the key name
            if n.hdfs_name_node:
                folderURI = "maprfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                folderURI = "maprfs:///" + folderOffset
            importResult = node.import_hdfs(folderURI, timeoutSecs=timeoutSecs)

        elif schema=='hdfs':
            # check that some state from the cloud building time was right
            # the requirements for this may change and require updating
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s hdfs_config: %s" % \
                    (n.use_hdfs, n.hdfs_version, n.hdfs_name_node, n.hdfs_config)
                # raise Exception("Something was missing for hdfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for hdfs on the java -jar cmd line when the cloud was built"

            if n.hdfs_name_node:
                folderURI = "hdfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                folderURI = "hdfs://" + folderOffset
            importResult = node.import_hdfs(folderURI, timeoutSecs=timeoutSecs)

        else: 
            raise Exception("schema not understood: %s" % schema)

    importPattern = folderURI + "/" + pattern
    return (importResult, importPattern)
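A minimal usage sketch for import_only(), assuming a cloud built by this harness and a bucket that resolves on the test machine; the dataset path is just an example:

import h2o
import h2o_import as h2i

h2o.build_cloud(1)
try:
    # schema='local' resolves bucket/path on the test machine, then imports the folder
    (importResult, importPattern) = h2i.import_only(
        bucket='home-0xdiag-datasets', path='standard/covtype.data',
        schema='local', timeoutSecs=60)
    print "importPattern:", importPattern   # e.g. nfs:/<resolved folder>/covtype.data
finally:
    h2o.tear_down_cloud()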
Example #35
    def test_summary2_NY0(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        choicesList = [
            ('N', 'Y', '0'),
            ('n', 'y', '0'),
            ('F', 'T', '0'),
            ('f', 't', '0'),
            (' N', ' Y', ' 0'),
            (' n', ' y', ' 0'),
            (' F', ' T', ' 0'),
            (' f', ' t', ' 0'),
        ]

        # white space is stripped
        expectedList = [
            ('N', 'Y', '0'),
            ('n', 'y', '0'),
            ('F', 'T', '0'),
            ('f', 't', '0'),
            ('N', 'Y', '0'),
            ('n', 'y', '0'),
            ('F', 'T', '0'),
            ('f', 't', '0'),
        ]

        tryList = [
            # rowCount, colCount, hex_key, choices, expected (choices after whitespace strip)
            (100, 200, 'x.hex', choicesList[4], expectedList[4]),
            (100, 200, 'x.hex', choicesList[5], expectedList[5]),
            (100, 200, 'x.hex', choicesList[6], expectedList[6]),
            (100, 200, 'x.hex', choicesList[7], expectedList[7]),
            (100, 200, 'x.hex', choicesList[3], expectedList[3]),
            (1000, 200, 'x.hex', choicesList[2], expectedList[2]),
            (10000, 200, 'x.hex', choicesList[1], expectedList[1]),
            (100000, 200, 'x.hex', choicesList[0], expectedList[0]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, choices, expected) in tryList:
            # max error = half the bin size?
        
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

            print "Creating random", csvPathname
            expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, choices)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=10, doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
            print pA.numRows, pA.numCols, pA.parse_key

            iA = h2o_cmd.InspectObj(pA.parse_key,
                expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])
            print iA.missingList, iA.labelList, iA.numRows, iA.numCols

            for i in range(colCount):
                # walks across the columns triggering a summary on the col desired
                # runSummary returns a column object now. inspect and parse don't. They return json.
                # maybe eventually will make them return object? But I also pass expected stuff to them
                # should I pass expected to summary? no, more complex?
                co = h2o_cmd.runSummary(key=hex_key, column=i)
                print co.label, co.type, co.missing_count, co.domain, sum(co.histogram_bins)

                print "\nComparing column %s to expected" % i
                self.assertEqual(expectedNaCnt[i], co.missing_count, "Column %s Expected %s. missing: %s is incorrect" % \
                    (i, expectedNaCnt[i], co.missing_count))
                self.assertEqual(rowCount - expectedNaCnt[i], sum(co.histogram_bins))

            h2p.green_print("\nDone with trial", trial)
            trial += 1

            h2i.delete_keys_at_all_nodes()
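write_syn_dataset() is defined elsewhere in this test file. A hypothetical sketch, consistent with how it's called above (same argument order, returns a per-column NA count); the 1% NA rate is made up:

import random

def write_syn_dataset(csvPathname, rowCount, colCount, SEED, choices):
    # hypothetical sketch, not the original: write rowCount x colCount values
    # drawn from 'choices', occasionally leaving a cell empty (an NA),
    # and return how many NAs landed in each column
    r = random.Random(SEED)
    naCnt = [0] * colCount
    dsf = open(csvPathname, 'w')
    for _ in range(rowCount):
        rowData = []
        for j in range(colCount):
            if r.random() < 0.01:   # made-up NA rate
                rowData.append('')
                naCnt[j] += 1
            else:
                rowData.append(r.choice(choices))
        dsf.write(','.join(rowData) + '\n')
    dsf.close()
    return naCnt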
Example #36
def do_h2o_glm(self, bucket, csvPathname, L, family='binomial'):

    h2p.red_print("\nNow doing h2o")
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='local', timeoutSecs=180)
    # save the resolved pathname for use in the sklearn csv read below

    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    print inspect
    print "\n" + csvPathname, \
        "    numRows:", "{:,}".format(inspect['numRows']), \
        "    numCols:", "{:,}".format(inspect['numCols'])

    x         = 'ID'
    y         = 'CAPSULE'
    family    = family
    alpha     = '0'
    lambda_   = L
    nfolds    = '0'
    f         = 'prostate'
    modelKey  = 'GLM_' + f

    kwargs = {
        'response'           : y,
        'ignored_cols'       : x,
        'family'             : family,
        'lambda'             : lambda_,
        'alpha'              : alpha,
        'n_folds'            : nfolds, # passes if 0, fails otherwise
        'destination_key'    : modelKey,
    }

    timeoutSecs = 60
    start = time.time()
    glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)

    # this stuff was left over from when we got the result after polling the jobs list
    # okay to do it again
    # GLM2: when it redirects to the model view, we no longer have the job_key! (unlike the first response and polling)
    (warnings, clist, intercept) = h2o_glm.simpleCheckGLM(self, glmResult, None, **kwargs)
    cstring = "".join([("%.5e  " % c) for c in clist])
    h2p.green_print("h2o alpha ", alpha)
    h2p.green_print("h2o lambda ", lambda_)
    h2p.green_print("h2o coefficient list:", cstring)
    h2p.green_print("h2o intercept", "%.5e  " %  intercept)

    # other stuff in the json response
    glm_model = glmResult['glm_model']
    _names = glm_model['_names']
    coefficients_names = glm_model['coefficients_names']

    # the first submodel is the right one, if only one lambda is provided as a parameter above
    submodels = glm_model['submodels'][0]

    beta = submodels['beta']
    h2p.red_print("beta:", beta)
    norm_beta = submodels['norm_beta']
    iteration = submodels['iteration']

    validation = submodels['validation']
    auc = validation['auc']
    aic = validation['aic']
    null_deviance = validation['null_deviance']
    residual_deviance = validation['residual_deviance']

    print '_names', _names
    print 'coefficients_names', coefficients_names
    # did beta get shortened? the simple check confirms names/beta/norm_beta are same length
    print 'beta', beta
    print 'iteration', iteration
    print 'auc', auc
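The sklearn side that the comment near the top refers to is not shown in this excerpt. A hedged sketch of that kind of cross-check (do_sklearn_glm and its details are hypothetical; it assumes the prostate layout of ID, CAPSULE, then features, with a header row):

import numpy as np
from sklearn.linear_model import LogisticRegression

def do_sklearn_glm(csvPathnameFull, L):
    # hypothetical sketch: fit logistic regression on the same csv and
    # print coefficients for eyeball comparison with the h2o GLM above
    data = np.genfromtxt(csvPathnameFull, delimiter=',', skip_header=1)
    y = data[:, 1]    # CAPSULE
    X = data[:, 2:]   # drop ID (ignored above) and the response
    # sklearn's C is an inverse regularization strength; L == 0 means
    # (nearly) unregularized, matching alpha=0 in spirit
    C = 1e12 if L == 0 else 1.0 / L
    model = LogisticRegression(C=C).fit(X, y)
    print "sklearn coefficients:", model.coef_
    print "sklearn intercept:", model.intercept_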
Example #37
File: h2o_cmd.py Project: JMR-b/h2o-dev
def runSummary(node=None, key=None, column=None, expected=None, maxDelta=None, noPrint=False, **kwargs):
    if not key: raise Exception('No key for Summary')
    if not node: node = h2o_nodes.nodes[0]
    # return node.summary(key, **kwargs)

    i = InspectObj(key=key)
    # just so I don't have to change names below
    missingList = i.missingList
    labelList = i.labelList
    numRows = i.numRows
    numCols = i.numCols

    # doesn't take indices? only column labels?
    # return first column, unless specified

    if not (column is None or isinstance(column, (basestring, int))):
        raise Exception("column param should be string or integer index or None %s %s" % (type(column), column))

    # either return the first col, or the col identified by label. the column identified could be string or index?
    if column is None: # means the summary json when we ask for col 0, will be what we return (do all though)
        colNameToDo = labelList
        colIndexToDo = range(len(labelList))
    elif isinstance(column, int):
        colNameToDo = [labelList[column]]
        colIndexToDo = [column]
    elif isinstance(column, basestring):
        colNameToDo = [column]
        colIndexToDo = [labelList.index(column)]
    else:
        raise Exception("wrong type %s for column %s" % (type(column), column))

    # we get the first column as result after walking across all, if no column parameter
    desiredResult = None
    for (colIndex, colName) in zip(colIndexToDo, colNameToDo):
        print "doing summary on %s %s" % (colIndex, colName)
        # ugly looking up the colIndex
        co = SummaryObj(key=key, colIndex=colIndex, colName=colName)
        if not desiredResult:
            desiredResult = co

        if not noPrint:
            for k,v in co:
                # only print [0] of mins and maxs because of the e308 values when they don't have dataset values
                if k=='mins' or k=='maxs':
                    print "%s[0]" % k, v[0]
                else:
                    print k, v

        if expected is not None:
            print "len(co.bins):", len(co.bins)
            print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean)
            # what is precision. -1?
            print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma)

            print "FIX! hacking the co.pctiles because it's short by two"
            
            if co.pctiles:
                pctiles = [0] + co.pctiles + [0]
            else:
                pctiles = None

            # the thresholds h2o used, should match what we expected
            # expected = [0] * 5
            # Fix. doesn't check for expected = 0?

            # max of one bin
            if maxDelta is None:
                maxDelta = (co.maxs[0] - co.mins[0])/1000

            if expected[0]: h2o_util.assertApproxEqual(co.mins[0], expected[0], tol=maxDelta, 
                msg='min is not approx. expected')
            if expected[1]: h2o_util.assertApproxEqual(pctiles[3], expected[1], tol=maxDelta, 
                msg='25th percentile is not approx. expected')
            if expected[2]: h2o_util.assertApproxEqual(pctiles[5], expected[2], tol=maxDelta, 
                msg='50th percentile (median) is not approx. expected')
            if expected[3]: h2o_util.assertApproxEqual(pctiles[7], expected[3], tol=maxDelta, 
                msg='75th percentile is not approx. expected')
            if expected[4]: h2o_util.assertApproxEqual(co.maxs[0], expected[4], tol=maxDelta, 
                msg='max is not approx. expected')

            # figure out the expected max error
            # use this for comparing to sklearn/sort
            MAX_QBINS = 1000
            if expected[0] and expected[4]:
                expectedRange = expected[4] - expected[0]
                # because of floor and ceil effects we potentially lose 2 bins (worst case)
                # the extra bin for the max value is an extra bin..ignore it
                expectedBin = expectedRange/(MAX_QBINS-2)
                maxErr = expectedBin # should we have some fuzz for fp?

            else:
                print "Test won't calculate max expected error"
                maxErr = 0

            pt = h2o_util.twoDecimals(pctiles)

            # only look at [0] for now...big e308 numbers if unpopulated due to not enough unique values in dataset column
            mx = h2o_util.twoDecimals(co.maxs[0])
            mn = h2o_util.twoDecimals(co.mins[0])

            print "co.label:", co.label, "co.pctiles (2 places):", pt
            print "co.default_pctiles:", co.default_pctiles
            print "co.label:", co.label, "co.maxs: (2 places):", mx
            print "co.label:", co.label, "co.mins: (2 places):", mn

            # FIX! why would pctiles be None? enums?
            if pt is None:
                compareActual = mn, None, None, None, mx
            else:
                compareActual = mn, pt[3], pt[5], pt[7], mx

            h2p.green_print("actual min/25/50/75/max co.label:", co.label, "(2 places):", compareActual)
            h2p.green_print("expected min/25/50/75/max co.label:", co.label, "(2 places):", expected)

    return desiredResult
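A minimal usage sketch for the expected/maxDelta path of runSummary() above; the key name and numbers are hypothetical (a parsed one-column uniform [0, 100] frame):

import h2o_cmd

# hypothetical: 'x.hex' is a parsed single-column uniform [0, 100] dataset
co = h2o_cmd.runSummary(
    key='x.hex',
    column=0,
    expected=[0.0, 25.0, 50.0, 75.0, 100.0],   # (min, 25th, 50th, 75th, max)
    maxDelta=1.0)   # tolerance; if None it defaults to (max - min) / 1000
print co.label, co.mean, co.sigma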
Example #38
    def test_exec2_quant_cmp_uniform(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (500000, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0,
                                            15000.0, 20000.00)),
            (500000, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445,
                                            -1200.0, 99)),
            (100000, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0,
                                                   1613.0, 50000.0, 100000.0)),
            (100000, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50,
                                         1.00)),
            (100000, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00,
                                          100.0)),
            (100000, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)),
            (100000, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00,
                                            7501.00, 10000.00)),
            (100000, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7,
                                             100.00)),
            (100000, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00,
                                             75002.00, 100000.00)),
            (100000, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18,
                                             49.28, 100.00)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax,
             expected) in tryList:
            # max error = half the bin size?

            maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin,
                              expectedMax, SEEDPERFILE)
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename
            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']
            self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']
            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']
            mins = stats['mins']
            h2o_util.assertApproxEqual(mins[0],
                                       expected[1],
                                       tol=maxDelta,
                                       msg='min is not approx. expected')
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0],
                                       expected[5],
                                       tol=maxDelta,
                                       msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [
                0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999
            ]
            pctile = stats['pctile']
            h2o_util.assertApproxEqual(
                pctile[3],
                expected[2],
                tol=maxDelta,
                msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(
                pctile[5],
                expected[3],
                tol=maxDelta,
                msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(
                pctile[7],
                expected[4],
                tol=maxDelta,
                msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows / len(hcnt)
                # apparently we're not able to estimate for these datasets
                # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount,
                #     msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            print "min/25/50/75/max colname:", colname, "(2 places):", compareActual
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2p.blue_print("\nTrying exec quantile")
            # do the equivalent exec quantile?
            # use the same thresholds summary2 reported above, so pt[i]/pctile[i] line up below
            thresholds = expectedPct
            # execExpr = "quantile(%s[,1],%s);" % (hex_key, ",".join(map(str, thresholds)))

            print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile"
            for i, threshold in enumerate(thresholds):
                # FIX! do two of the same?..use same one for the 2nd
                if i != 0:
                    # execExpr = "r2=c(1); r2=quantile(%s[,4],c(0,.05,0.3,0.55,0.7,0.95,0.99))" % hex_key
                    execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s,%s));" % (
                        hex_key, threshold, threshold)
                    (resultExec, result) = h2e.exec_expr(execExpr=execExpr,
                                                         timeoutSecs=30)
                    h2p.green_print("\nresultExec: %s" %
                                    h2o.dump_json(resultExec))
                    h2p.blue_print(
                        "\nthreshold: %.2f Exec quantile: %s Summary2: %s" %
                        (threshold, result, pt[i]))
                    if not result:
                        raise Exception(
                            "exec result: %s for quantile: %s is bad" %
                            (result, threshold))
                    h2o_util.assertApproxEqual(
                        result,
                        pctile[i],
                        tol=maxDelta,
                        msg=
                        'exec percentile: %s too different from expected: %s' %
                        (result, pctile[i]))
                # for now, do one with all, but no checking
                else:
                    # This seemed to "work" but how do I get the key name for the list of values returned
                    # the browser result field seemed right, but nulls in the key
                    if 1 == 0:
                        execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % (
                            hex_key, ",".join(map(str, thresholds)))
                    else:
                        # does this way work? (column getting)
                        execExpr = "r2=c(1); r2=quantile(%s$C1, c(%s));" % (
                            hex_key, ",".join(map(str, thresholds)))
                    (resultExec, result) = h2e.exec_expr(execExpr=execExpr,
                                                         timeoutSecs=30)
                    inspect = h2o_cmd.runInspect(key='r2')
                    numCols = inspect['numCols']
                    numRows = inspect['numRows']

                    self.assertEqual(numCols, 1)
                    self.assertEqual(numRows, len(thresholds))
                    # FIX! should run thru the values in the col? how to get

            # compare the last one
            if colname != '':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0,  # what col to extract from the csv
                    datatype='float',
                    quantile=thresholds[-1],
                    # h2oSummary2=pctile[-1],
                    # h2oQuantilesApprox=result, # from exec
                    h2oExecQuantiles=result,
                )

            h2o.nodes[0].remove_all_keys()
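h2o_summ.quantile_comparisons() checks the h2o answers against a plain sorted-list percentile computed from the csv. A minimal sketch of that reference computation (linear interpolation between closest ranks; assuming the R type-7 convention):

def percentile_on_sorted_list(sortedList, q):
    # reference percentile by linear interpolation between closest ranks
    # (R type 7); assumes sortedList is already sorted ascending
    n = len(sortedList)
    h = (n - 1) * q
    lo = int(h)
    hi = min(lo + 1, n - 1)
    return sortedList[lo] + (h - lo) * (sortedList[hi] - sortedList[lo])

print percentile_on_sorted_list([1, 2, 3, 4], 0.5)   # 2.5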
Example #39
    def test_quant_cmp_uniform(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (5*ROWS, 1, 'x.hex', 1, 20000,        ['C1',  1.10, 5000.0, 10000.0, 15000.0, 20000.00]),
            (5*ROWS, 1, 'x.hex', -5000, 0,        ['C1', -5001.00, -3750.0, -2445, -1200.0, 99]),
            (1*ROWS, 1, 'x.hex', -100000, 100000, ['C1',  -100001.0, -50000.0, 1613.0, 50000.0, 100000.0]),
            (1*ROWS, 1, 'x.hex', -1, 1,           ['C1',  -1.05, -0.48, 0.0087, 0.50, 1.00]),

            (1*ROWS, 1, 'A.hex', 1, 100,          ['C1',   1.05, 26.00, 51.00, 76.00, 100.0]),
            (1*ROWS, 1, 'A.hex', -99, 99,         ['C1',  -99, -50.0, 0, 50.00, 99]),

            (1*ROWS, 1, 'B.hex', 1, 10000,        ['C1',   1.05, 2501.00, 5001.00, 7501.00, 10000.00]),
            (1*ROWS, 1, 'B.hex', -100, 100,       ['C1',  -100.10, -50.0, 0.85, 51.7, 100.00]),

            (1*ROWS, 1, 'C.hex', 1, 100000,       ['C1',   1.05, 25002.00, 50002.00, 75002.00, 100000.00]),
            (1*ROWS, 1, 'C.hex', -101, 101,       ['C1',  -100.10, -50.45, -1.18, 49.28, 100.00]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?
            colname = expected[0]
            maxDelta = ((expectedMax - expectedMin)/1000.0) / 2.0

            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            # need the full pathname when python parses the csv for numpy/sort
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

            #***************************
            # Parse
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
            numRows = pA.numRows
            numCols = pA.numCols
            parse_key = pA.parse_key
            # this guy can take json object as first thing, or re-read with key
            iA = h2o_cmd.InspectObj(parse_key,
                expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])

            #***************************
            # Summary
            co = h2o_cmd.runSummary(key=parse_key)
            default_pctiles = co.default_pctiles

            coList = [ co.base, len(co.bins), len(co.data), co.domain,
                co.label, co.maxs, co.mean, co.mins, co.missing, co.ninfs, co.pctiles,
                co.pinfs, co.precision, co.sigma, co.str_data, co.stride, co.type, co.zeros]
            for c in coList:
                print c

            print "len(co.bins):", len(co.bins)
            print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean)
            print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma)

            print "FIX! hacking the co.pctiles because it's short by two"
            summ_pctiles = [0] + co.pctiles + [0]

            pt = h2o_util.twoDecimals(summ_pctiles)
            mx = h2o_util.twoDecimals(co.maxs)
            mn = h2o_util.twoDecimals(co.mins)
            exp = h2o_util.twoDecimals(expected[1:])

            print "co.label:", co.label, "co.pctiles (2 places):", pt
            print "default_pctiles:", default_pctiles
            print "co.label:", co.label, "co.maxs: (2 places):", mx
            print "co.label:", co.label, "co.mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\
                mn[0], pt[3], pt[5], pt[7], mx[0])
            h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\
                exp[0], exp[1], exp[2], exp[3], exp[4])

            #***************************
            # Quantile
            # the thresholds h2o used, should match what we expected

            # using + here seems to result in an odd tuple..doesn't look right to h2o param
            # so went with this. Could add '[' and ']' to the list first, before the join.
            probsStr  = "[%s]" % ",".join(map(str,probsList))
            parameters = {
                'model_id': "a.hex",
                'training_frame': parse_key,
                'validation_frame': parse_key,
                'ignored_columns': None,
                'probs': probsStr,
            }

            model_key = 'qhex'
            bmResult = h2o.n0.build_model(
                algo='quantile',
                model_id=model_key,
                training_frame=parse_key,
                parameters=parameters,
                timeoutSecs=10)
            bm = OutputObj(bmResult, 'bm')

            msec = bm.jobs[0]['msec']
            print "bm msec", msec

            # quantile result is just a job result to a key
            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0], 'model')

            print "model.output:", model.output
            print "model.output:['quantiles']", model.output['quantiles']
            print "model.output:['iterations']", model.output['iterations']
            print "model.output:['names']", model.output['names']
            quantiles = model.output['quantiles'][0] # why is this a double array
            iterations = model.output['iterations']
            assert iterations == 11, iterations
            print "quantiles: ", quantiles
            print "iterations: ", iterations

            # cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            # cmm = OutputObj(cmmResult, 'cmm')

            # mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            # mm = OutputObj(mmResult, 'mm')

            # prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
            # pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
            h2o_cmd.runStoreView()

            trial += 1
            # compare the last threshold
            if colname!='':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0, # what col to extract from the csv
                    datatype='float',
                    quantile=CHECK_PCTILE,
                    # h2oSummary2=pctile[-1],
                    # h2oQuantilesApprox=result, # from exec
                    h2oExecQuantiles=quantiles[CHECK_PCTILE_INDEX],
                    )
            h2o.nodes[0].remove_all_keys()
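This test leans on module-level constants outside the excerpt (ROWS, probsList, CHECK_PCTILE, CHECK_PCTILE_INDEX). Plausible, hypothetical definitions that make the snippet self-consistent:

# hypothetical module-level constants assumed by test_quant_cmp_uniform
ROWS = 100000
probsList = [0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999]
CHECK_PCTILE_INDEX = 5                         # index into probsList/quantiles
CHECK_PCTILE = probsList[CHECK_PCTILE_INDEX]   # 0.5, i.e. the median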
Example #40
    def test_summary2_uniform_w_NA(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, 1, 'x.hex', 1, 20000,        ('C1',  1.10, 5000.0, 10000.0, 15000.0, 20000.00)),
            (ROWS, 1, 'x.hex', -5000, 0,        ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)),
            (ROWS, 1, 'x.hex', -100000, 100000, ('C1',  -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)),
            (ROWS, 1, 'x.hex', -1, 1,           ('C1',  -1.05, -0.48, 0.0087, 0.50, 1.00)),

            (ROWS, 1, 'A.hex', 1, 100,          ('C1',   1.05, 26.00, 51.00, 76.00, 100.0)),
            (ROWS, 1, 'A.hex', -99, 99,         ('C1',  -99, -50.0, 0, 50.00, 99)),

            (ROWS, 1, 'B.hex', 1, 10000,        ('C1',   1.05, 2501.00, 5001.00, 7501.00, 10000.00)),
            (ROWS, 1, 'B.hex', -100, 100,       ('C1',  -100.10, -50.0, 0.85, 51.7, 100.00)),

            (ROWS, 1, 'C.hex', 1, 100000,       ('C1',   1.05, 25002.00, 50002.00, 75002.00, 100000.00)),
            (ROWS, 1, 'C.hex', -101, 101,       ('C1',  -100.10, -50.45, -1.18, 49.28, 100.00)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?
        
            maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key, noPrint=False, max_qbins=MAX_QBINS, numRows=numRows, numCols=numCols)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']
            self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats['zeros']

            mins = stats['mins']
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')

            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

            pctile = stats['pctile']
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            print "numRows:", numRows, "rowCount: ", rowCount
            self.assertEqual((1+NA_ROW_RATIO) * rowCount, numRows, 
                msg="numRows %s should be %s" % (numRows, (1+NA_ROW_RATIO) * rowCount))


            # don't check the last bin
            # we sometimes get a messed up histogram for all NA cols? just don't let them go thru here
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                
                e = rowCount/len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # don't check the edge bins
                # NA rows should be ignored
                self.assertAlmostEqual(b, e, delta=2*e,
                    msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            scipyCol = 1

            h2i.delete_keys_at_all_nodes()
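Here write_syn_dataset() (also not shown) must append NA rows on top of the real ones, since the test asserts numRows == (1 + NA_ROW_RATIO) * rowCount. A hypothetical sketch consistent with that:

import random

NA_ROW_RATIO = 1   # hypothetical: one all-NA row appended per real row

def write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEED):
    # hypothetical sketch: rowCount uniform rows, then NA_ROW_RATIO * rowCount
    # rows of NAs, so the parsed frame has (1 + NA_ROW_RATIO) * rowCount rows
    r = random.Random(SEED)
    dsf = open(csvPathname, 'w')
    for _ in range(rowCount):
        row = ['%f' % r.uniform(expectedMin, expectedMax) for _ in range(colCount)]
        dsf.write(','.join(row) + '\n')
    for _ in range(NA_ROW_RATIO * rowCount):
        dsf.write(','.join(['NA'] * colCount) + '\n')
    dsf.close()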
Example #41
    def test_rand_inspect(self):
        ### h2b.browseTheCloud()
        csvFilename = 'covtype.data'
        csvPathname = 'standard/' + csvFilename
        hex_key = csvFilename + ".hex"
        print "\n" + csvPathname

        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hex_key,
                                       timeoutSecs=10)
        destination_key = parseResult['destination_key']
        print csvFilename, 'parse time:', parseResult['response']['time']
        print "Parse result['destination_key']:", destination_key

        def inspect_and_check(nodeX,
                              destination_key,
                              offset,
                              view,
                              inspectOld=None):
            inspectNew = h2o_cmd.runInspect(h2o.nodes[nodeX],
                                            destination_key,
                                            offset=offset,
                                            view=view)
            if h2o.beta_features:
                pass
                # print "Inspect2:", h2o.dump_json(inspectNew)
            else:
                pass
                # print "Inspect:", h2o.dump_json(inspectNew)

            # FIX! get min/max/mean/variance for a col too?
            constantNames = [
                ('num_cols', 'numCols'),
                ('num_rows', 'numRows'),
                ('value_size_bytes', 'byteSize'),
                ('cols', 'cols'),
            ]

            colNames = [
                ('num_missing_values', 'naCnt'),
            ]

            for (i, j) in constantNames:
                # check the fields, even if you don't have a previous one to compare to
                if h2o.beta_features:
                    # hack in extra info for now, from the new names to old names
                    if not j in inspectNew:
                        raise Exception(
                            "Can't find %s, Inspect2 result should have it?" %
                            j)
                    inspectNew[i] = inspectNew[j]

                # don't compare if cols
                if inspectOld and i != 'cols':
                    if h2o.beta_features and i == 'value_size_bytes':  # Inspect2 should be smaller
                        self.assertGreater(inspectOld[i], inspectNew[i])

                    else:
                        # for cols it will just compare length?
                        self.assertEqual(inspectOld[i], inspectNew[i])

                if i == 'cols':
                    for (m, n) in colNames:
                        if h2o.beta_features:
                            if not n in inspectNew[i][0]:
                                print h2o.dump_json(inspectNew[i][0])
                                raise Exception(
                                    "Can't find %s, Inspect2 result['cols'][0] should have it?"
                                    % n)
                            inspectNew[i][0][m] = inspectNew[i][0][n]
                        # just compare 0
                        if inspectOld is not None:
                            self.assertEqual(inspectOld[i][0][m],
                                             inspectNew[i][0][m])

            return inspectNew

        # going to use this to compare against future. num_rows/num_cols should always
        # be the same, regardless of the view. just a coarse sanity check
        origInspect = inspect_and_check(0, destination_key, 0, 1, None)
        h2o.verboseprint(h2o.dump_json(origInspect))
        origStoreViewResult = h2o_cmd.runStoreView(offset=0,
                                                   view=1024,
                                                   timeoutSecs=60)

        num_rows = origInspect['num_rows']
        num_cols = origInspect['num_cols']

        lenNodes = len(h2o.nodes)
        for trial in range(10):
            h2p.green_print("\nTrial", trial)
            # we want to use the boundary conditions, so have two level of random choices
            offset = good_choices(num_rows)
            view = good_choices(num_cols)
            # randomize the node used
            nodeX = random.randint(0, lenNodes - 1)
            print "nodeX:", nodeX, "offset:", offset, "view:", view
            h2o.beta_features = False
            inspect_and_check(nodeX, destination_key, offset, view,
                              origInspect)
            print "trying Inspect2 by flipping h2o.nodes[0].beta_features"
            h2o.beta_features = True
            # delay between the two inspects...bug around not getting autoframe in storeview?
            time.sleep(1)
            inspect_and_check(nodeX, destination_key, offset, view,
                              origInspect)
            h2o.beta_features = False

            # a fvec frame should have been created in the storeView
            time.sleep(1)

            # loop looking for the autoframe to show up
            # o = len(origStoreViewResult['keys'])
            o = h2i.count_keys_at_all_nodes()
            retry = 0
            okay = False
            while retry == 0 or not okay:
                newStoreViewResult = h2o_cmd.runStoreView(offset=0,
                                                          view=1024,
                                                          timeoutSecs=60)
                ## p = len(newStoreViewResult['keys'])
                p = h2i.count_keys_at_all_nodes()
                print "number of keys in the two StoreViews, o:", o, "p:", p
                ## print "newStoreViewResult:", h2o.dump_json(newStoreViewResult)
                oOkay = {1, 2, 3, 4, 5, 6, 7, 8}
                pOkay = {1, 2, 3, 4, 5}
                print o, oOkay, p, pOkay
                if (o in oOkay) and (p in pOkay):
                    print "Good"
                    okay = True
                else:
                    print "Unexpected o,p after autoframe, looking at total keys in system: %s %s" % (
                        o, p)

                if retry == 10:
                    raise Exception(
                        "StoreView didn't get autoframe, after %s retries" %
                        retry)
                ## h2b.browseJsonHistoryAsUrlLastMatch("StoreView")

                # so it gets recreated??
                deleted = h2i.delete_keys_at_all_nodes(pattern='autoframe')
                # The autoframe key may not show up!!
                if INVISIBLE_AUTOFRAME:
                    # can be 0 or 1
                    if not (deleted == 0 or deleted == 1):
                        msg = "Should have deleted a total of 0 or 1 keys, looking at all nodes. Did %s" % deleted
                        raise Exception(msg)
                else:
                    # must be exactly 1
                    if deleted != 1:
                        msg = "Should have deleted a total of 1 key, looking at all nodes. Did %s" % deleted
                        raise Exception(msg)

                time.sleep(1)
                retry += 1
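good_choices() is a helper defined elsewhere in the test. A hypothetical sketch matching the "two levels of random choices" comment, biasing toward boundary values:

import random

def good_choices(n):
    # hypothetical sketch: half the time pick a boundary value,
    # otherwise a uniformly random value in [0, n]
    if random.randint(0, 1) == 0:
        return random.choice([0, 1, n - 1, n])
    return random.randint(0, n)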
Example #42
    def test_summary2_small(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            # if rowCount is None, we'll just use  the data values
            # None in expected values means no compare
            (None, 1, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
            (None, 2, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
            (None, 10, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
            (None, 100, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
            (None, 1000, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
            # (None, 10000, 'x.hex', [-1,0,1],        ('C1',  None, None, 0, None, None)),
            # (COLS, 1, 'x.hex', [1,0,-1],        ('C1',  None, None, None, None, None)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, values, expected) in tryList:
            # max error = half the bin size?

            expectedMax = max(values)
            expectedMin = min(values)
            maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            # hmm...say we should be 100% accurate for these tests?
            maxDelta = 0

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            if not rowCount:
                rowFile = len(values)
            else:
                rowFile = rowCount
            csvFilename = "syn_" + "binary" + "_" + str(rowFile) + "x" + str(colCount) + ".csv"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, values, SEEDPERFILE)

            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(
                path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30, doSummary=False
            )
            print "Parse result['destination_key']:", parseResult["destination_key"]

            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS, timeoutSecs=45)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            quantile = 0.5 if DO_MEDIAN else 0.999
            q = h2o.nodes[0].quantiles(
                source_key=hex_key,
                column=0,
                interpolation_type=7,
                quantile=quantile,
                max_qbins=MAX_QBINS,
                multiple_pass=2,
            )
            qresult = q["result"]
            qresult_single = q["result_single"]
            qresult_iterations = q["iterations"]
            qresult_interpolated = q["interpolated"]
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
            h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
            print h2o.dump_json(q)

            self.assertLess(
                qresult_iterations,
                16,
                msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?",
            )

            # only one column
            column = summaryResult["summaries"][0]

            colname = column["colname"]

            coltype = column["type"]
            nacnt = column["nacnt"]

            stats = column["stats"]
            stattype = stats["type"]

            # FIX! we should compare mean and sd to expected?
            mean = stats["mean"]
            sd = stats["sd"]

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats["zeros"]
            mins = stats["mins"]
            maxs = stats["maxs"]
            pct = stats["pct"]
            # the thresholds h2o used, should match what we expected
            expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

            pctile = stats["pctile"]
            print "pctile:", pctile
            if expected[0]:
                self.assertEqual(colname, expected[0])
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg="min is not approx. expected")
            if expected[2]:
                h2o_util.assertApproxEqual(
                    pctile[3], expected[2], tol=maxDelta, msg="25th percentile is not approx. expected"
                )
            if expected[3]:
                h2o_util.assertApproxEqual(
                    pctile[5], expected[3], tol=maxDelta, msg="50th percentile (median) is not approx. expected"
                )
            if expected[4]:
                h2o_util.assertApproxEqual(
                    pctile[7], expected[4], tol=maxDelta, msg="75th percentile is not approx. expected"
                )
            if expected[5]:
                h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg="max is not approx. expected")

            hstart = column["hstart"]
            hstep = column["hstep"]
            hbrk = column["hbrk"]
            hcnt = column["hcnt"]

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows / len(hcnt)  # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # don't check the edge bins
                self.assertAlmostEqual(
                    b, numRows / len(hcnt), delta=1 + 0.01 * numRows, msg="Bins not right. b: %s e: %s" % (b, e)
                )

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0

            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname != "":
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=scipyCol,  # what col to extract from the csv
                    datatype="float",
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    h2oQuantilesExact=qresult,
                )
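Two module-level constants drive the checks in this test; hypothetical definitions consistent with how they're used:

# hypothetical module-level constants assumed by test_summary2_small
MAX_QBINS = 1000   # bin budget passed to summary and quantiles
DO_MEDIAN = True   # check the 0.5 quantile; False checks 0.999 instead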
Example #43
    def test_summary2_exp(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        LAMBD = random.uniform(0.005, 0.5)
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (10, 1, "x.hex", 1, 20000, ("C1", None, None, None, None, None)),
            (100, 1, "x.hex", 1, 20000, ("C1", None, None, None, None, None)),
            (1000, 1, "x.hex", -5000, 0, ("C1", None, None, None, None, None)),
            (10000, 1, "x.hex", -100000, 100000, ("C1", None, None, None, None, None)),
            (100000, 1, "x.hex", -1, 1, ("C1", None, None, None, None, None)),
            (1000000, 1, "A.hex", 1, 100, ("C1", None, None, None, None, None)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        # rangeMin and rangeMax are not used right now
        for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = "syn_" + "binary" + "_" + str(rowCount) + "x" + str(colCount) + ".csv"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            print "Creating random", csvPathname, "lambd:", LAMBD
            (expectedMin, expectedMax) = write_syn_dataset(
                csvPathname, rowCount, colCount, lambd=LAMBD, SEED=SEEDPERFILE
            )
            print "expectedMin:", expectedMin, "expectedMax:", expectedMax
            maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta
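            # Worked example (illustrative numbers): if expectedMin=1 and
            # expectedMax=100, the span is 99, one of 20 bins is ~4.95 wide,
            # half of that is ~2.48, and the 5% fp allowance brings maxDelta
            # to ~2.60.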

            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(
                path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30, doSummary=False
            )
            print "Parse result['destination_key']:", parseResult["destination_key"]

            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("Summary2 summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult["summaries"][0]
            colname = column["colname"]
            coltype = column["type"]
            nacnt = column["nacnt"]
            stats = column["stats"]
            stattype = stats["type"]

            # FIX! we should compare mean and sd to expected?
            mean = stats["mean"]
            sd = stats["sd"]

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats["zeros"]
            mins = stats["mins"]
            maxs = stats["maxs"]
            pct = stats["pct"]
            expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]
            pctile = stats["pctile"]
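            # Index map, assuming pctile lines up with expectedPct above:
            # pctile[3] -> 0.25, pctile[5] -> 0.50 (median), pctile[7] -> 0.75,
            # pctile[10] -> 0.99 (note: the scipy comparison below asks for 0.999)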
            # the values h2o computed should match what we expected
            if expected[0]:
                self.assertEqual(colname, expected[0])
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg="min is not approx. expected")
            if expected[2]:
                h2o_util.assertApproxEqual(
                    pctile[3], expected[2], tol=maxDelta, msg="25th percentile is not approx. expected"
                )
            if expected[3]:
                h2o_util.assertApproxEqual(
                    pctile[5], expected[3], tol=maxDelta, msg="50th percentile (median) is not approx. expected"
                )
            if expected[4]:
                h2o_util.assertApproxEqual(
                    pctile[7], expected[4], tol=maxDelta, msg="75th percentile is not approx. expected"
                )
            if expected[5]:
                h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg="max is not approx. expected")

            hstart = column["hstart"]
            hstep = column["hstep"]
            hbrk = column["hbrk"]
            hcnt = column["hcnt"]

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            print "Can't estimate the bin distribution"

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0
            if colname != "" and expected[scipyCol]:
                # don't do this for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist())
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    skipHeader=True,
                    col=scipyCol,
                    datatype="float",
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                )
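For reference, a minimal sketch of what the write_syn_dataset helper used above
presumably does for this exponential test (an assumption; the real helper is
defined elsewhere in the file): draw rate-lambd exponential values and report
the observed (min, max) so the caller can size its tolerances.

    import random

    def write_syn_dataset(csvPathname, rowCount, colCount, lambd=0.1, SEED=None):
        # Assumption: one exponential draw per cell, seeded for reproducibility
        r = random.Random(SEED)
        observedMin = float('inf')
        observedMax = float('-inf')
        with open(csvPathname, 'w') as f:
            for _ in range(rowCount):
                row = [r.expovariate(lambd) for _ in range(colCount)]
                observedMin = min(observedMin, min(row))
                observedMax = max(observedMax, max(row))
                f.write(','.join('%f' % v for v in row) + '\n')
        return (observedMin, observedMax)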