Exemplo n.º 1
0
def rapids(self, timeoutSecs=120, ignoreH2oError=False, **kwargs):
    """
    POST a Rapids request to 'Rapids.json' and return the json result.

    timeoutSecs: passed to do_json_request as the request timeout.
    ignoreH2oError: when True, a non-empty 'exception' in the result does not raise.
    kwargs: 'ast' and 'funs' are the only keys in the default parameter dict.
        When present and not None, each must be a string.

    Raises Exception when the result carries a non-empty 'exception' and
    ignoreH2oError is False.
    """
    # FIX! assume both of these are strings for now, not lists
    for name in ('ast', 'funs'):
        value = kwargs.get(name)
        if value is not None:
            assert isinstance(value, basestring), "only string assumed? %s" % value

    # currently runExec only does one or the other
    params_dict = {
        'ast': None,
        'funs': None,
    }

    # kwargs are checked against / folded into params_dict by the helper (defined elsewhere)
    check_params_update_kwargs(params_dict, kwargs, 'rapids', True)

    # The request is always a POST. The old 'if 1==1 ... else' GET alternative was unreachable.
    result = self.do_json_request('Rapids.json', cmd='post', timeout=timeoutSecs, postData=params_dict)

    verboseprint("rapids result:", dump_json(result))

    # FIX! maybe add something for ignoring conditionally?
    if 'exception' in result and result['exception'] and not ignoreH2oError:
        exception = result['exception']
        raise Exception('rapids with kwargs:\n%s\ngot exception:\n"%s"\n' % (dump_json(kwargs), exception))

    h2o_sandbox.check_sandbox_for_errors()
    return result
Exemplo n.º 2
0
    def get_redirect_url(response):
        """
        Work out where to poll next from a polling response.

        Returns a (url, params) tuple. Both are None when the status is 'done'.
        Two response layouts are handled: one with a 'response_info' entry
        (params is always None) and one with a 'response' entry
        (params comes from 'redirect_request_args').

        Raises Exception when the expected redirect fields are absent.
        """
        # StoreView has old style, while beta_features
        if 'response_info' in response:
            info = response['response_info']

            if 'redirect_url' not in info:
                raise Exception("Response during polling must have 'redirect_url'\n%s" % dump_json(response))

            if info['status'] == 'done':
                return (None, None)

            target = info['redirect_url']
            if not target:
                # not done yet, but nothing to follow
                raise Exception(
                    "'redirect_url' during polling is null but status!='done': \n%s" % dump_json(response))

            return (self.url(target), None)

        if 'response' not in response:
            raise Exception("'response' not in response.\n%s" % dump_json(response))

        legacy = response['response']
        if legacy['status'] == 'done':
            return (None, None)

        if 'redirect_request' not in legacy:
            raise Exception("'redirect_request' not in response. \n%s" % dump_json(response))

        return (self.url(legacy['redirect_request']), legacy['redirect_request_args'])
Exemplo n.º 3
0
def simpleCheckGLM(self, model, parameters, labelList, labelListUsed, allowFailWarning=False, allowZeroCoeff=False,
    prettyPrint=False, noPrint=False, maxExpectedIterations=None, doNormalized=False):

    warnings = ''

    intercept = model.global_beta[-1]
    interceptName = model.coefficient_names[-1]

    coeffs = model.global_beta[:-1]
    coeffs_names = model.coefficient_names[:-1]

    assert len(coeffs) == (len(model.coefficient_names)-1)
    assert len(coeffs) == len(labelListUsed), "%s %s" % (coeffs, labelListUsed)
    
    # labelList still has the response column?
    # ignored columns aren't in model.names, but output response is.
    # labelListUsed has the response col removed so add 1
    assert len(model.names) == (len(labelListUsed)+1), "%s %s" % (model.names, labelList)
    assert model.threshold!=0

    print "len(coeffs)", len(coeffs)
    print  "coeffs:", coeffs

    # last one is intercept
    if interceptName != "Intercept" or abs(intercept)<1e-26:
        raise Exception("'Intercept' should be last in coefficient_names and global_beta %s %s" % (interceptName, intercept))

    y = parameters['response_column']

    cString = "\n"
    for i,c in enumerate(coeffs_names):
        cString += "%s: %.5e   " % (coeffs_names[i], coeffs[i])

    print cString
    print "\nH2O intercept:\t\t%.5e" % intercept
    print "\nTotal # of coeffs:", len(coeffs_names)

    # intercept is buried in there too
    absIntercept = abs(float(intercept))
    self.assertGreater(absIntercept, 1e-26, (
        "abs. value of GLM coeffs['Intercept'] is " +
        str(absIntercept) + ", not >= 1e-26 for Intercept" + "\n" +
        "parameters:" + dump_json(parameters)
        ))

    if (not allowZeroCoeff) and (len(coeffs)>1):
        s = 0.0
        for c in coeffs:
            s += abs(float(c))

        self.assertGreater(s, 1e-26, (
            "sum of abs. value of GLM coeffs/intercept is " + str(s) + ", not >= 1e-26\n" +
            "parameters:" + dump_json(parameters)
            ))

    # shouldn't have any errors
    check_sandbox_for_errors()

    return (warnings, coeffs, intercept)
Exemplo n.º 4
0
def checkScalarResult(resultExec, resultKey, allowEmptyResult=False, nanOkay=False):
    # make the common problems easier to debug
    verboseprint("checkScalarResult resultExec:", dump_json(resultExec))

    if 'funstr' not in resultExec:
        emsg = "checkScalarResult: 'funstr' missing"
    if 'result' not in resultExec:
        emsg = "checkScalarResult: 'result' missing"
    if 'scalar' not in resultExec:
        emsg = "checkScalarResult: 'scalar' missing"
    if 'num_cols' not in resultExec:
        emsg = "checkScalarResult: 'num_cols' missing"
    if 'num_rows' not in resultExec:
        emsg = "checkScalarResult: 'num_rows' missing"
    elif 'cols' not in resultExec:
        emsg = "checkScalarResult: 'cols' missing"
    else:
        emsg = None
        num_cols = resultExec["num_cols"]
        num_rows = resultExec["num_rows"]
        cols = resultExec["cols"]
        # print "cols:", dump_json(cols)

    if emsg:
        print "\nKey: '" + str(resultKey) + "' resultExec:\n", dump_json(resultExec)
        sys.stdout.flush()
        raise Exception("exec result (resultExec) missing what we expected. Look at json above. " + emsg)

    if (cols and (not num_rows or num_rows==0) ) and not allowEmptyResult:
        print "resultExec[0]:", dump_json(resultExec)
        raise Exception ("checkScalarResult says 'cols' exist in exec json response,"+\
            " but num_rows: %s is 0 or None. Is that an expected 'empty' key state?" % num_rows+\
            " Use 'allowEmptyResult if so.")

    # Cycle thru rows and extract all the meta-data into a dict?   
    # assume "0" and "row" keys exist for each list entry in rows
    # FIX! the key for the value can be 0 or 1 or ?? (apparently col?) Should change H2O here

    # cols may not exist..if the result was just scalar?
    if not cols:
        # just return the scalar result then
        scalar = resultExec['scalar']
        if scalar is None:
            raise Exception("both cols and scalar are null: %s %s" % (cols, scalar))
        checkForBadFP(scalar, json=resultExec, nanOkay=nanOkay)
        return scalar

    metaDict = cols[0]
    for key,value in metaDict.items():
        print "Inspect metaDict:", key, value
            
    min_value = metaDict['min']
    stype = metaDict['type']
    # if it's an enum col, it's okay for min to be NaN ..
    checkForBadFP(min_value, json=metaDict, nanOkay=nanOkay or stype=='Enum')
    return min_value
Exemplo n.º 5
0
def checkForBadFP(value, name='min_value', nanOkay=False, infOkay=False, json=None):
    # if we passed the json, dump it for debug
    if 'Infinity' in str(value) and not infOkay:
        if json:
            print dump_json(json)
        raise Exception("Infinity in inspected %s can't be good for: %s" % (str(value), name))
    if 'NaN' in str(value) and not nanOkay:
        if json:
            print dump_json(json)
        raise Exception("NaN in inspected %s can't be good for: %s" % (str(value), name))
Exemplo n.º 6
0
def poll_job(self, job_key, timeoutSecs=10, retryDelaySecs=0.5, key=None, **kwargs):
    '''
    Poll a single job from the /Jobs endpoint until it is "status": "DONE" or "CANCELLED" or "FAILED" or we time out.
    '''
    params_dict = {}
    # merge kwargs into params_dict
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'poll_job', False)

    start_time = time.time()
    pollCount = 0
    while True:
        result = self.do_json_request('3/Jobs.json/' + job_key, timeout=timeoutSecs, params=params_dict)
        # print 'Job: ', dump_json(result)

        if key:
            frames_result = self.frames(key=key)
            print 'frames_result for key:', key, dump_json(result)

        jobs = result['jobs'][0]
        description = jobs['description']
        dest = jobs['dest']
        dest_name = dest['name']
        msec = jobs['msec']
        status = jobs['status']
        progress = jobs['progress']
        print description, \
            "dest_name:", dest_name, \
            "\tprogress:", "%-10s" % progress, \
            "\tstatus:", "%-12s" % status, \
            "\tmsec:", msec
        
        if status=='DONE' or status=='CANCELLED' or status=='FAILED':
            h2o_sandbox.check_sandbox_for_errors()
            return result

        # what about 'CREATED'
        # FIX! what are the other legal polling statuses that we should check for?

        if not h2o_args.no_timeout and (time.time() - start_time > timeoutSecs):
            h2o_sandbox.check_sandbox_for_errors()
            emsg = "Job:", job_key, "timed out in:", timeoutSecs

            # for debug
            a = h2o.nodes[0].get_cloud()
            print "cloud.json:", dump_json(a)
            raise Exception(emsg)
            print emsg
            return None

        # check every other poll, for now
        if (pollCount % 2) == 0:
            h2o_sandbox.check_sandbox_for_errors()

        time.sleep(retryDelaySecs)
        pollCount += 1
Exemplo n.º 7
0
def import_parse(node=None, schema='local', bucket=None, path=None,
    src_key=None, hex_key=None, 
    timeoutSecs=30, retryDelaySecs=0.1, initialDelaySecs=0, pollTimeoutSecs=180, noise=None,
    benchmarkLogging=None, noPoll=False, doSummary=True, noPrint=True, 
    importParentDir=True, **kwargs):

    # FIX! hack all put to local, since h2o-dev doesn't have put yet?
    # multi-machine put will fail as a result.
    # if schema=='put':
    #    h2p.yellow_print("WARNING: hacking schema='put' to 'local'..h2o-dev doesn't support upload." +  
    #        "\nMeans multi-machine with 'put' will fail")
    #    schema = 'local'

    if not node: node = h2o_nodes.nodes[0]
    (importResult, importPattern) = import_only(node, schema, bucket, path,
        timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise, 
        benchmarkLogging, noPoll, doSummary, src_key, noPrint, importParentDir, **kwargs)

    verboseprint("importPattern:", importPattern)
    verboseprint("importResult", dump_json(importResult))

    assert len(importResult['keys']) >= 1, "No keys imported, maybe bad bucket %s or path %s" % (bucket, path)
    # print "importResult:", importResult

    # get rid of parse timing in tests now
    start = time.time()
    parseResult = parse_only(node, importPattern, hex_key, importResult['keys'],
        timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise, 
        benchmarkLogging, noPoll, **kwargs)
    elapsed = time.time() - start
    print importPattern, "parsed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs), "\n"
    parseResult['python_elapsed'] = elapsed

    verboseprint("parseResult:", dump_json(parseResult))

    # do SummaryPage here too, just to get some coverage
    # only if not noPoll. otherwise parse isn't done
    if doSummary and not noPoll:
        # if parse blows up, we want error isolation ..i.e. find stack traces here, rather than the next guy blowing up
        check_sandbox_for_errors()
        print "WARNING: not doing inspect/summary for now after parse"
        ## inspect = node.inspect(parseResult['destination_key'], timeoutSecs=timeoutSecs)
        ## numRows = inspect['numRows']
        ## numCols = inspect['numCols']
        # we pass numCols, for detecting whether the na cnt means a col is all NAs, (for ignoring min/max/mean/sigma)
        ## node.summary_page(parseResult['destination_key'], timeoutSecs=timeoutSecs, noPrint=noPrint, numRows=numRows, numCols=numCols)
        # for now, don't worry about error isolating summary 
    else:
        # isolate a parse from the next thing
        check_sandbox_for_errors()

    return parseResult
Exemplo n.º 8
0
def exec_expr(node=None, execExpr=None, resultKey=None, timeoutSecs=10, ignoreH2oError=False):
    if not node:
        node = h2o_nodes.nodes[0]
    kwargs = {'ast': execExpr} 
    start = time.time()
    resultExec = h2o_cmd.runExec(node, timeoutSecs=timeoutSecs, ignoreH2oError=ignoreH2oError, **kwargs)
    verboseprint('exec took', time.time() - start, 'seconds')
    print "exec:", dump_json(resultExec)

    # when do I get cols?

    # "result": "1.0351050710011848E-300", 
    # "scalar": 1.0351050710011848e-300, 
    # "funstr": null, 

    # "key": null, 
    # "col_names": null, 
    # "num_cols": 0, 
    # "num_rows": 0, 

    # "exception": null, 

    # echoing?
    # "string": null
    # "funs": null, 
    # "ast": "(= !x (xorsum ([ $r1 \"null\" #0) $TRUE))", 

    if 'cols' in resultExec and resultExec['cols']: # not null
        if 'funstr' in resultExec and resultExec['funstr']: # not null
            raise Exception("cols and funstr shouldn't both be in resultExec: %s" % dump_json(resultExec))
        else:
            print "Frame return"
            # if test said to look at a resultKey, it's should be in h2o k/v store
            # inspect a result key?
            # Should we get the key name from the exec return?
            if resultKey is not None:
                kwargs = {'ast': resultKey} 
                resultExec = h2o_cmd.runExec(node, timeoutSecs=timeoutSecs, ignoreH2oError=ignoreH2oError, **kwargs)
                print "exec key result:", dump_json(resultExec)

            # handles the 1x1 data frame result. Not really interesting if bigger than 1x1?
            result = resultExec['cols'][0]['min']
        
    else: 
        if 'funstr' in resultExec and resultExec['funstr']: # not null
            print "function return"
            result = resultExec['funstr']
        else:
            print "scalar return"
            result = resultExec['scalar']
            
    return resultExec, result
Exemplo n.º 9
0
def cancelAllJobs(timeoutSecs=10, **kwargs): # I guess you could pass pattern
    # what if jobs had just been dispatched? wait until they get in the queue state correctly
    time.sleep(2)
    a = h2o_nodes.nodes[0].jobs(timeoutSecs=120)
    print "jobs():", dump_json(a)
    jobsList = a['jobs']
    for j in jobsList:
        if j['end_time'] == '':
            b = h2o_nodes.nodes[0].jobs_cancel(key=j['key'])
            print "jobs_cancel():", dump_json(b)

    # it's possible we could be in a bad state where jobs don't cancel cleanly
    pollWaitJobs(timeoutSecs=timeoutSecs, **kwargs) # wait for all the cancels to happen. If we missed one, we might timeout here.
Exemplo n.º 10
0
    def test_simple2(self):
        # h2o-dev doesn't take ../.. type paths? make find_file return absolute path
        a_node = h2o.nodes[0]

        import_result = a_node.import_files(path=find_file("smalldata/logreg/prostate.csv"))
        print dump_json(import_result)

        frames = a_node.frames(key=import_result['keys'][0], len=5)['frames']
        print dump_json(frames)

        parse_result = a_node.parse(key=import_result['keys'][0])
        hex_key = parse_result['frames'][0]['key']['name']
        verboseprint(hex_key, ":", dump_json(parse_result))
Exemplo n.º 11
0
 def test_b_algo_parameters(self):
     # for algo in ['kmeans', 'gbm', 'deeplearning', 'glm', 'word2vec', 'example', 'quantile', 'grep']:
     for algo in ["kmeans", "gbm", "deeplearning", "drf", "glm", "gbm", "pca", "naivebayes"]:
         paramResult = h2o.n0.model_builders(algo=algo)
         self.print_params(paramResult)
         mmResult = h2o.n0.model_metrics(algo=algo)
         print "mmResult", dump_json(mmResult)
Exemplo n.º 12
0
    def test_rapids_ifelse_nested(self):
        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'

        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        for trial in range(2):
            for execObj, expected in zip(objList, resultList):
                freshObj = copy(execObj)
                result = freshObj.do()
                # do some scalar result checking
                if expected is not None:
                    # result is a string now??
                    print "result:", result
                    print "expected:", expected
                    assert float(result)==expected, "%s %s" (result,expected)

                # rows might be zero!
                print "freshObj:", dump_json(freshObj.execResult)
                if 'key' in freshObj.execResult and freshObj.execResult['key']:
                    keys.append(freshObj.execExpr)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
Exemplo n.º 13
0
    def test_parse_rand_utf8(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        print "HACK: reduce rows to 10 for debug"
        tryList = [
            # do two cols to detect bad eol behavior
            (10, 2, 'cA', 120),
            (10, 2, 'cG', 120),
            (10, 2, 'cH', 120),
            ]

        print "What about messages to log (INFO) about unmatched quotes (before eol)"
        # got this ..trying to avoid for now
        # Exception: rjson error in parse: Argument 'source_key' error: Parser setup appears to be broken, got AUTO

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEED=SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', checkHeader=0,
                hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)
            print "parseResult:", dump_json(parseResult)

            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
            inspect = h2o_cmd.runInspect(key=parse_key)
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)

            assert len(missingList) == 0
            # FIX! check type?
        
            # print "inspect:", h2o.dump_json(inspect)
            self.assertEqual(numRows, rowCount, msg='Wrong numRows: %s %s' % (numRows, rowCount))
            self.assertEqual(numCols, colCount, msg='Wrong numCols: %s %s' % (numCols, colCount))
Exemplo n.º 14
0
def showGBMGridResults(GBMResult, expectedErrorMax, classification=True):
    # print "GBMResult:", dump_json(GBMResult)
    jobs = GBMResult['jobs']
    print "GBM jobs:", jobs
    for jobnum, j in enumerate(jobs):
        _distribution = j['_distribution']
        model_key = j['destination_key']
        job_key = j['job_key']
        # inspect = h2o_cmd.runInspect(key=model_key)
        # print "jobnum:", jobnum, dump_json(inspect)
        gbmTrainView = h2o_cmd.runGBMView(model_key=model_key)
        print "jobnum:", jobnum, dump_json(gbmTrainView)

        if classification:
            cms = gbmTrainView['gbm_model']['cms']
            cm = cms[-1]['_arr'] # take the last one
            print "GBM cms[-1]['_predErr']:", cms[-1]['_predErr']
            print "GBM cms[-1]['_classErr']:", cms[-1]['_classErr']

            pctWrongTrain = pp_cm_summary(cm);
            if pctWrongTrain > expectedErrorMax:
                raise Exception("Should have < %s error here. pctWrongTrain: %s" % (expectedErrorMax, pctWrongTrain))

            errsLast = gbmTrainView['gbm_model']['errs'][-1]
            print "\nTrain", jobnum, job_key, "\n==========\n", "pctWrongTrain:", pctWrongTrain, "errsLast:", errsLast
            print "GBM 'errsLast'", errsLast
            print pp_cm(cm)
        else:
            print "\nTrain", jobnum, job_key, "\n==========\n", "errsLast:", errsLast
            print "GBMTrainView errs:", gbmTrainView['gbm_model']['errs']
Exemplo n.º 15
0
    def test_rapids_basic_with_funs_noinc(self):
        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        for i in range(100):
            if i==0:
                # should never see v as a key from the function?
                execExpr1 = '(= !v1 (c {#0}))'
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr1, resultKey='v1', timeoutSecs=5)
                execExpr2 = '(= !v2 (cbind %v1 ))'
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey='v2', timeoutSecs=5)
            else:
                # adding to v shouldn't hurt, but not required cause function output will update it
                # execExpr1 = '(= !v (+ %v #1))'
                # execExpr1 = '(+ %v #1)'
                # add to itself?
                execExpr1 = '(+ %v %v)'
                funs = '[(def anon {v} %s;;;)]' % execExpr1
                execResult, result = h2e.exec_expr(h2o.nodes[0], funs, resultKey=None, timeoutSecs=5, doFuns=True)
                # execExpr2 = '(= !v2 (anon ([ %v2 "null" #0)))'
                # execExpr2 = '(= !v2 (anon %v2))'
                execExpr2 = '(= !v2 (+ %v2 #1))'
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey='v2', timeoutSecs=15)


            # see if the execExpr had a lhs assign. If so, it better be in the storeview
            r = re.search('![a-zA-Z0-9]+', execExpr2)
            if r:
                lhs = r.group(0)[1:]
                print "Found key lhs assign", lhs

                # FIX! check if v is ever there.

                # KeyIndexeds gets too many rollup stats problems. Don't use for now
                if 1==0: 
                    inspect = h2o_cmd.runInspect(key=lhs)
                    missingList, labelList, numRows, numCols = infoFromInspect(inspect)

                    storeview = h2o_cmd.runStoreView()
                    print "\nstoreview:", dump_json(storeview)
                    if not k in storeView['keys']:
                        raise Exception("Expected to find %s in %s", (k, storeView['keys']))
            else: 
                print "No key lhs assign"

            # rows might be zero!
            if execResult['num_rows'] or execResult['num_cols']:
                keys.append(execExpr2)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
Exemplo n.º 16
0
    def test_xl_oobe(self):
        # uses h2o_xl to do magic with Rapids
        # does this DFInit to rows=0 now?
        a = DF('a1') # knon_* key
        b = DF('b1')
        c = DF('c1')
        # look at our secret stash in the base class. Should see the DFInit?

        assert isinstance(a, DF)
        assert isinstance(a, Key)
        assert isinstance(a, Xbase)
        assert not isinstance(a, KeyIndexed)
        assert not isinstance(a, Fcn)
        assert not isinstance(a, Assign)

        Assign(a, 0)
        Assign(b, 0)
        Assign(c, 0)
        print "lastExecResult:", dump_json(h2o_xl.Xbase.lastExecResult)

        assert isinstance(a, Key)
        assert isinstance(b, Key)
        assert isinstance(c, Key)

        print "Referring to non-existent rows causes a problem (AAIOBE)"
        Assign(c[1], (a[2] + b[2]))
        ast = h2o_xl.Xbase.lastExecResult['ast']
        astExpected = "(= ([ $c1 #1 #1) (+ ([ $a1 #2 #2) ([ $b1 #2 #2)))"
        assert ast==astExpected, "Actual: %s    Expected: %s" % (ast, astExpected)

        # print "\nDoes the keyWriteHistoryList work?"
        for k in Xbase.keyWriteHistoryList:
            print k

        h2o.check_sandbox_for_errors()
Exemplo n.º 17
0
def compute_model_metrics(self, model, frame, timeoutSecs=60, **kwargs):
    """
    Score a model on the h2o cluster on the given Frame and return only the model metrics. 
    """
    assert model is not None, '"model" parameter is null'
    assert frame is not None, '"frame" parameter is null'

    models = self.models(key=model, timeoutSecs=timeoutSecs)
    assert models is not None, "/Models REST call failed"
    assert (
        models["models"][0]["model_id"]["name"] == model
    ), "/Models/{0} returned Model {1} rather than Model {2}".format(model, models["models"][0]["key"]["name"], model)

    # TODO: test this assert, I don't think this is working. . .
    frames = self.frames(key=frame)
    assert frames is not None, "/Frames/{0} REST call failed".format(frame)

    print "frames:", dump_json(frames)
    # is the name not there?
    # assert frames['frames'][0]['model_id']['name'] == frame, "/Frames/{0} returned Frame {1} rather than Frame {2}".format(frame, models['models'][0]['key']['name'], frame)

    result = self.do_json_request(
        "/3/ModelMetrics.json/models/" + model + "/frames/" + frame, cmd="post", timeout=timeoutSecs
    )

    mm = result["model_metrics"][0]
    verboseprint("model metrics: " + repr(mm))
    h2o_sandbox.check_sandbox_for_errors()
    return mm
Exemplo n.º 18
0
def runInspect(node=None, key=None, timeoutSecs=30, verbose=False, **kwargs):
    if not key: raise Exception('No key for Inspect')
    if not node: node = h2o_nodes.nodes[0]
    a = node.inspect(key, timeoutSecs=timeoutSecs, **kwargs)
    if verbose:
        print "inspect of %s:" % key, dump_json(a)
    return a
Exemplo n.º 19
0
    def test_simple2(self):
        """
        Import poker-hand-testing.data, touch the expected fields of the
        imported frame and its columns, then parse it.
        """
        # h2o-dev doesn't take ../.. type paths? make find_file return absolute path
        node = h2o.nodes[0]

        # import_result = node.import_files(path=find_file("smalldata/logreg/prostate.csv"))
        dataPath = find_file("smalldata/poker/poker-hand-testing.data")
        import_result = node.import_files(path=dataPath)
        # print dump_json(import_result)

        k = import_result['keys'][0]
        frames_result = node.frames(key=k)

        # The values are not used. The lookups fail if a field is missing.
        frame = frames_result['frames'][0]
        for frameField in ('byteSize', 'rows', 'columns'):
            frame[frameField]
        for column in frame['columns']:
            for columnField in ('label', 'missing', 'type', 'zeros', 'domain'):
                column[columnField]

        # print dump_json(frame)

        # how do you parse multiple files
        parse_result = node.parse(key=k)

        parsedFrame = parse_result['frames'][0]
        hex_key = parsedFrame['key']['name']

        verboseprint(hex_key, ":", dump_json(parse_result))
Exemplo n.º 20
0
    def test_rapids_basic(self):
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'
        hexKey = 'p'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        for execExpr in exprList:
            r = re.match ('\(= \!([a-zA-Z0-9_]+) ', execExpr)
            resultKey = r.group(1)
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)
            if DO_ROLLUP:
                h2o_cmd.runInspect(key=resultKey)
            # rows might be zero!
            if execResult['num_rows'] or execResult['num_cols']:
                keys.append(execExpr)
            else:
                h2p.yellow_print("\nNo key created?\n", dump_json(execResult))

        print "\nExpressions that created keys. Shouldn't all of these expressions create keys"

        for k in keys:
            print k

        h2o.check_sandbox_for_errors()
Exemplo n.º 21
0
    def test_simple2(self):
        # h2o-dev doesn't take ../.. type paths? make find_file return absolute path
        # csvPathname = find_file("bigdata/laptop/poker-hand-testing.data")
        csvPathname = find_file("smalldata/logreg/prostate.csv")
        import_result = h2o.n0.import_files(path=csvPathname)
        # print dump_json(import_result)

        k = import_result['keys'][0]
        frames_result = h2o.n0.frames(key=k)

        frame = frames_result['frames'][0]
        rows = frame['rows']
        columns = frame['columns']
        for c in columns:
            label = c['label']
            missing = c['missing_count']
            stype = c['type']
            domain = c['domain']

        # print dump_json(frame)

        # let's see what ray's util does
        frames = h2o.n0.frames()['frames']
        frames_dict = h2o_util.list_to_dict(frames, 'key/name')
        # print "frames:", dump_json(frames)
        # print "frames_dict:", dump_json(frames_dict)
        for k,v in frames_dict.items():
            print "frames_dict key:", k

        # interesting. we can do dictionary comprehensions
        # { k:v for k,v in my_dict.items() if 'Peter' in k }

        # how do you parse multiple files
        parse_result = h2o.n0.parse(key=k, intermediateResults=DO_INTERMEDIATE_RESULTS)

        frame = parse_result['frames'][0]
        hex_key = frame['key']['name']

        colCount = 9
        rowCount = 380
        # colCount = 11
        # rowCount = 1000000
        start = time.time()
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "Inspect:", hex_key, "took", time.time() - start, "seconds"
        numCols = len(inspect['frames'][0]['columns'])
        numRows = inspect['frames'][0]['rows']
        print "\n" + csvPathname, \
            "    rows:", "{:,}".format(numRows), \
            "    len(columns):", "{:,}".format(numCols)

        # should match # of cols in header or ??
        self.assertEqual(numCols, colCount,
            "parse created result with the wrong number of cols %s %s" % (numCols, colCount))
        self.assertEqual(numRows, rowCount,
            "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
            (numRows, rowCount))

        verboseprint(hex_key, ":", dump_json(parse_result))
Exemplo n.º 22
0
    def test(n, tries=None, timeoutSecs=14.0):
        c = n.get_cloud(noExtraErrorCheck=noExtraErrorCheck, timeoutSecs=timeoutSecs)

        # FIX! unique to h2o-dev for now, because of the port reuse problems (TCP_WAIT) compared to h2o
        # flag them early rather than after timeout
        check_sandbox_for_errors(python_test_name=h2o_args.python_test_name)

        # don't want to check everything. But this will check that the keys are returned!
        consensus = c["consensus"]
        locked = c["locked"]
        cloud_size = c["cloud_size"]
        cloud_name = c["cloud_name"]

        if "nodes" not in c:
            emsg = "\nH2O didn't include a list of nodes in get_cloud response after initial cloud build"
            raise Exception(emsg)

        # only print it when you get consensus
        if cloud_size != node_count:
            print "\nNodes in cloud while building:"
            for i, ci in enumerate(c["nodes"]):
                # 'h2o' disappeared?
                if "h2o" not in ci:
                    print "ci:", dump_json(ci)
                    # apparently this can happen in cases where I didn't join a cloud because
                    # of a different md5 version. We'll eventually exception out?
                    # raise Exception("What happened to the 'h2o' ci dict entry?, not there")
                else:
                    print "node %s" % i, ci["h2o"]
                    ### print "node %s" % i, ci['h2o']['node']

        if cloud_size > node_count:
            emsg = (
                "\n\nERROR: cloud_size: %d reported via json is bigger than we expect: %d" % (cloud_size, node_count)
                + "\nLikely have zombie(s) with the same cloud name on the network."
                + "\nLook at the cloud IP's in 'grep Paxos sandbox/*stdout*' for some IP's you didn't expect."
                + "\n\nYou probably don't have to do anything, as the cloud shutdown in this test should"
                + "\nhave sent a Shutdown.json to all in that cloud (you'll see a kill -2 in the *stdout*)."
                + "\nIf you try again, and it still fails, go to those IPs and kill the zombie h2o's."
                + "\nIf you think you really have an intermittent cloud build, report it."
                + "\n"
                + "\nbuilding cloud size of 2 with 127.0.0.1 may temporarily report 3 incorrectly,"
                + "\nwith no zombie?"
            )
            for ci in c["nodes"]:
                emsg += "\n" + ci["h2o"]["node"]
            raise Exception(emsg)

        a = (cloud_size == node_count) and consensus
        if a:
            verboseprint("\tLocked won't happen until after keys are written")
            verboseprint("\nNodes in final cloud:")
            for ci in c["nodes"]:
                verboseprint("ci", ci)
                # this isn't in there all the time?
                # verboseprint(ci['h2o']['node'])

        return a
Exemplo n.º 23
0
def import_files(self, path, timeoutSecs=180):
    """Send a 3/ImportFiles.json request for `path` and return the JSON reply.

    `path` is forwarded as the 'path' request parameter and `timeoutSecs`
    as the request timeout. The reply is echoed through verboseprint and
    the sandbox is checked for errors before the reply is returned.
    """
    request_params = {"path": path}
    response = self.do_json_request(
        "3/ImportFiles.json",
        timeout=timeoutSecs,
        params=request_params,
    )
    verboseprint("\nimport_files result:", dump_json(response))
    h2o_sandbox.check_sandbox_for_errors()
    return response
Exemplo n.º 24
0
def runInspect(node=None, key=None, verbose=False, **kwargs):
    if not key:
        raise Exception("No key for Inspect")
    if not node:
        node = h2o_nodes.nodes[0]
    a = node.frames(key, **kwargs)
    if verbose:
        print "inspect of %s:" % key, dump_json(a)
    return a
Exemplo n.º 25
0
def verify_cloud_size(nodeList=None, expectedCloudName=None, verbose=False,
    timeoutSecs=10, ignoreHealth=False):

    if not nodeList: nodeList = h2o_nodes.nodes
    expectedSize = len(nodeList)
    # cloud size and consensus have to reflect a single grab of information from a node.
    cloudStatus = [n.get_cloud(timeoutSecs=timeoutSecs) for n in nodeList]

    # get cloud_name from all

    cloudSizes = [c['cloud_size'] for c in cloudStatus]
    cloudConsensus = [c['consensus'] for c in cloudStatus]
    cloudHealthy = [c['cloud_healthy'] for c in cloudStatus]
    cloudName = [c['cloud_name'] for c in cloudStatus]

    if not all(cloudHealthy):
        msg = "Some node reported cloud_healthy not true: %s" % cloudHealthy
        if not ignoreHealth:
            raise Exception(msg)

    # gather up all the node_healthy status too
    for i, c in enumerate(cloudStatus):
        nodesHealthy = [n['node_healthy'] for n in c['nodes']]
        if not all(nodesHealthy):
            print "node %s cloud status: %s" % (i, dump_json(c))
            msg = "node %s says some node is not reporting node_healthy: %s" % (c['node_name'], nodesHealthy)
            if not ignoreHealth:
                raise Exception(msg)

    if expectedSize == 0 or len(cloudSizes) == 0 or len(cloudConsensus) == 0:
        print "\nexpectedSize:", expectedSize
        print "cloudSizes:", cloudSizes
        print "cloudConsensus:", cloudConsensus
        raise Exception("Nothing in cloud. Can't verify size")

    for s in cloudSizes:
        consensusStr = (",".join(map(str, cloudConsensus)))
        sizeStr = (",".join(map(str, cloudSizes)))
        if (s != expectedSize):
            raise Exception("Inconsistent cloud size." +
               "nodeList report size: %s consensus: %s instead of %d." % \
               (sizeStr, consensusStr, expectedSize))

    # check that all cloud_names are right
    if expectedCloudName:
        for i, cn in enumerate(cloudName):
            if cn != expectedCloudName:
                # tear everyone down, in case of zombies. so we don't have to kill -9 manually
                print "node %s has the wrong cloud name: %s expectedCloudName: %s." % (i, cn, expectedCloudName)
                # print "node %s cloud status: %s" % (i, dump_json(cloudStatus[i]))
                print "tearing cloud down"
                tear_down_cloud(nodeList=nodeList, sandboxIgnoreErrors=False)
                raise Exception("node %s has the wrong cloud name: %s expectedCloudName: %s" % \
                    (i, cn, expectedCloudName))

    return (sizeStr, consensusStr, expectedSize)
Exemplo n.º 26
0
def typeahead(self, timeoutSecs=10, **kwargs):
    """Send a 3/Typeahead.json/files request and return the JSON reply.

    The request parameters 'src' and 'limit' start as None and are filled
    from kwargs by check_params_update_kwargs. The reply is echoed through
    verboseprint.
    """
    params_dict = dict.fromkeys(('src', 'limit'))
    check_params_update_kwargs(params_dict, kwargs, 'typeahead', print_params=True)

    # odd ...the endpoint needs the trailing /files
    response = self.do_json_request(
        '3/Typeahead.json/files',
        params=params_dict,
        timeout=timeoutSecs,
    )
    verboseprint("\ntypeahead result:", dump_json(response))
    return response
Exemplo n.º 27
0
    def test_xl_seq_A(self):
        """Exercise h2o_xl Assign on three DF keys and check the AST after each step.

        Creates DF objects 'a1', 'b1', 'c1', asserts their class relationships,
        then performs a fixed sequence of Assign calls (scalar, list, tuple,
        indexed expressions), calling checkAst with the expected AST string
        after each one. Finishes by printing Xbase.keyWriteHistoryList and
        checking the sandbox for errors.

        NOTE(review): checkAst presumably compares its argument against the
        AST produced by the most recent Assign -- confirm against its definition.
        The statement order below is therefore significant.
        """
        # uses h2o_xl to do magic with Rapids
        # does this DFInit to rows=0 now?
        a = DF('a1') # knon_* key
        b = DF('b1')
        c = DF('c1')
        print "lastExecResult:", dump_json(h2o_xl.Xbase.lastExecResult)
        # look at our secret stash in the base class. Should see the DFInit?

        # DF does a kv store init. Key doesn't
        # DF inherits from Key. KeyIndexed inherits from Key
        assert isinstance(a, DF)
        assert isinstance(a, Key)
        assert isinstance(a, Xbase)

        # a plain DF must not be any of the derived expression types
        assert not isinstance(a, KeyIndexed)
        assert not isinstance(a, Fcn)
        assert not isinstance(a, Assign)

        assert isinstance(a, Key)
        assert isinstance(b, Key)
        assert isinstance(c, Key)

        # scalar right-hand sides
        Assign(a, 0)
        checkAst("(= !a1 #0)")
        Assign(b, 0)
        checkAst("(= !b1 #0)")
        Assign(c, 0)
        checkAst("(= !c1 #0)")

        # list right-hand sides of length 1, 2 and 3
        Assign(a, [0])
        checkAst("(= !a1 (c {#0}))")
        Assign(b, [0,1])
        checkAst("(= !b1 (c {#0;#1}))")
        Assign(c, [0,1,2])
        checkAst("(= !c1 (c {#0;#1;#2}))")

        # tuple right-hand sides; the expected AST strings are the same as for lists
        Assign(a, (0,)) # make sure it's a tuple with comma
        checkAst("(= !a1 (c {#0}))")
        Assign(b, (0,1))
        checkAst("(= !b1 (c {#0;#1}))")
        Assign(c, (0,1,2))
        checkAst("(= !c1 (c {#0;#1;#2}))")

        # indexed operands on the right-hand side
        Assign(c, a[0] + b[1])
        checkAst("(= !c1 (+ ([ $a1 #0 #0) ([ $b1 #1 #0)))")

        # indexed target on the left-hand side as well
        Assign(c[0], (a[0] + b[1]))
        checkAst("(= ([ $c1 #0 #0) (+ ([ $a1 #0 #0) ([ $b1 #1 #0)))")

        # print "\nDoes the keyWriteHistoryList work?"
        for k in Xbase.keyWriteHistoryList:
            print k

        h2o.check_sandbox_for_errors()
Exemplo n.º 28
0
    def test_rapids_ddply_with_funs(self):
        if 1==0:
            bucket = 'smalldata'
            csvPathname = 'iris/iris_wheader.csv'
        else:
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.data'

        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        # get rid of the enum response cole
        execExpr2 = '(= !r2 ([ $r1 "null" {#0;#1;#2;#3}))'
        execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, doFuns=False, resultKey=None, timeoutSecs=15)

        keys = []
        for execExpr1 in initList:
            # ddply function can only return one row. Just use expressions above as nose
            # some of the expressions above use $v, but v won't be created as key outside any more with ddply
            funs = '[(def anon {v} %s;;(sum $v $TRUE);;;)]' % execExpr1
            execResult, result = h2e.exec_expr(h2o.nodes[0], funs, doFuns=True, resultKey=None, timeoutSecs=5)

            execExpr2 = '(h2o.ddply $r2 {#2;#3} $anon)'
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, doFuns=False, resultKey=None, timeoutSecs=120)

            # see if the execExpr had a lhs assign. If so, it better be in the storeview
            r = re.search('![a-zA-Z0-9]+', execExpr1)
            if r:
                lhs = r.group(0)[1:]
                print "Found key lhs assign", lhs

                # KeyIndexeds gets too many rollup stats problems. Don't use for now
                if 1==0: 
                    inspect = h2o_cmd.runInspect(key=lhs)
                    missingList, labelList, numRows, numCols = infoFromInspect(inspect)

                    storeview = h2o_cmd.runStoreView()
                    print "\nstoreview:", dump_json(storeview)
                    if not k in storeView['keys']:
                        raise Exception("Expected to find %s in %s", (k, storeView['keys']))
            else: 
                print "No key lhs assign"

            # rows might be zero!
            if execResult['num_rows'] or execResult['num_cols']:
                keys.append(execExpr2)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
Exemplo n.º 29
0
def jobs_admin(self, timeoutSecs=120, **kwargs):
    """Send a Jobs.json request and return the JSON reply.

    All keyword arguments are forwarded unchanged as request parameters.
    Both the parameters and the reply are echoed through verboseprint.
    """
    # copy so the request parameters are independent of the caller's kwargs
    params_dict = dict(kwargs)
    verboseprint("\njobs_admin:", params_dict)

    response = self.do_json_request(
        'Jobs.json',
        timeout=timeoutSecs,
        params=params_dict,
    )
    verboseprint("\njobs_admin result:", dump_json(response))
    return response
Exemplo n.º 30
0
def import_files(self, path, timeoutSecs=180):
    """Send a 2/ImportFiles.json request for `path` and return the JSON reply.

    `path` is forwarded as the 'path' request parameter and `timeoutSecs`
    as the request timeout. The reply is echoed through verboseprint.
    """
    request_params = {"path": path}
    response = self.do_json_request('2/ImportFiles.json', timeout=timeoutSecs, params=request_params)
    verboseprint("\nimport_files result:", dump_json(response))
    return response
Exemplo n.º 31
0
    def test_rapids_basic_with_funs_inc(self):
        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        for i in range(2):
            if i==0:
                # should never see v as a key from the function?
                execExpr1 = '(= !v1 (c {#0;#0}))'
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr1, resultKey='v1', timeoutSecs=5)
                execExpr2 = '(= !v2 (cbind %v1 %v1 ))'
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey='v2', timeoutSecs=5)
            else:
                # adding to v shouldn't hurt, but not required cause function output will update it
                # execExpr1 = '(= !v (+ %v #1))'
                # execExpr1 = '(+ %v #1)'
                # add to itself?
                execExpr1 = '(+ %v %v)'
                funs = '[(def anon { v } %s;;;)]' % execExpr1
                execResult, result = h2e.exec_expr(h2o.nodes[0], funs, resultKey=None, timeoutSecs=5, doFuns=True)
                # execExpr2 = '(= !v2 (anon ([ %v2 "null" #0)))'
                # execExpr2 = '(= !v2 (anon %v2))'
                execExpr2 = '(= !v1 (anon %v1))'
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey=None, timeoutSecs=5)

            print "result:", result

            # see if the execExpr had a lhs assign. If so, it better be in the storeview
            r = re.search('![a-zA-Z0-9]+', execExpr2)
            if r:
                lhs = r.group(0)[1:]
                print "Found key lhs assign", lhs

                # FIX! check if v is ever there.

                # KeyIndexeds gets too many rollup stats problems. Don't use for now
                if 1==0: 
                    inspect = h2o_cmd.runInspect(key=lhs)
                    missingList, labelList, numRows, numCols = infoFromInspect(inspect)

                    storeview = h2o_cmd.runStoreView()
                    print "\nstoreview:", dump_json(storeview)
                    if not k in storeView['keys']:
                        raise Exception("Expected to find %s in %s", (k, storeView['keys']))
            else: 
                print "No key lhs assign"

            # rows might be zero!
            if execResult['num_rows'] or execResult['num_cols']:
                keys.append(execExpr2)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
Exemplo n.º 32
0
    def test_DL_covtype(self):
        h2o.nodes[0].remove_all_keys()
        csvPathname_train = 'standard/covtype.data'
        csvPathname_test = 'standard/covtype.data'
        hex_key = 'covtype.hex'
        validation_key = 'covtype_v.hex'
        timeoutSecs = 60
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname_train,
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs,
                                       doSummary=False,
                                       columnTypeDict={54: 'Enum'})
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        parseResultV = h2i.import_parse(bucket='home-0xdiag-datasets',
                                        path=csvPathname_test,
                                        hex_key=validation_key,
                                        timeoutSecs=timeoutSecs,
                                        doSummary=False)
        pV = h2o_cmd.ParseObj(parseResult)
        iV = h2o_cmd.InspectObj(pV.parse_key)
        parse_keyV = pV.parse_key
        numRowsV = iV.numRows
        numColsV = iV.numCols
        labelListV = iV.labelList

        response = numCols - 1

        #Making random id
        identifier = ''.join(
            random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'deeplearning_' + identifier + '.hex'

        parameters = {
            # loss enum True None [u'MeanSquare', u'CrossEntropy']
            'loss': 'CrossEntropy',
            'validation_frame': validation_key,  # KeyIndexed None
            'ignored_columns': None,  # string[] None
            'response_column': labelList[response],  # string None
            'balance_classes': None,  # boolean false
            'max_after_balance_size': None,  # float Infinity
            'keep_cross_validation_splits': None,  # boolean false
            'checkpoint': None,  # Key None
            'overwrite_with_best_model': None,  # boolean true
            'expert_mode': None,  # boolean false
            'autoencoder': None,  # boolean false
            # 'use_all_factor_levels': None, # boolean true
            # [u'Tanh', u'TanhWithDropout', u'Rectifier', u'RectifierWithDropout', u'Maxout', u'MaxoutWithDropout']
            'activation': 'Tanh',  # enum Rectifier 
            'hidden': '[100,100,100]',  # int[] [200, 200]
            'epochs': 0.7,  # double 10.0
            'train_samples_per_iteration': 100000,  # long -2
            'target_ratio_comm_to_comp': None,  # double 0.02
            'seed': None,  # long 1679194146842485659
            'adaptive_rate': True,  # boolean true
            'rho': None,  # double 0.99
            'epsilon': None,  # double 1.0E-8
            'rate': None,  # double 0.005
            'rate_annealing': None,  # double 1.0E-6
            'rate_decay': None,  # double 1.0
            'momentum_start': None,  # double 0.0
            'momentum_ramp': None,  # double 1000000.0
            'momentum_stable': None,  # double 0.0
            'nesterov_accelerated_gradient': None,  # boolean true
            'input_dropout_ratio': 0.0,  # double 0.0
            'hidden_dropout_ratios': None,  # double[] None
            'l1': 1e-5,  # double 0.0
            'l2': 1e-7,  # double 0.0
            'max_w2': None,  # float Infinity
            'initial_weight_distribution':
            None,  # enum UniformAdaptive [u'UniformAdaptive', u'Uniform', u'Normal']
            'initial_weight_scale': None,  # double 1.0
            'loss':
            'CrossEntropy',  # enum MeanSquare [u'Automatic', u'MeanSquare', u'CrossEntropy']
            'score_interval': None,  # double 5.0
            'score_training_samples': None,  # long 10000
            'score_validation_samples': None,  # long 0
            'score_duty_cycle': None,  # double 0.1
            'classification_stop': None,  # double 0.0
            'regression_stop': None,  # double 1.0E-6
            'quiet_mode': None,  # boolean false
            'max_confusion_matrix_size': None,  # int 20
            'max_hit_ratio_k': None,  # int 10
            'balance_classes': None,  # boolean false
            'class_sampling_factors': None,  # float[] None
            'max_after_balance_size': None,  # float Infinity
            'score_validation_sampling':
            None,  # enum Uniform [u'Uniform', u'Stratified']
            'diagnostics': None,  # boolean true
            'variable_importances': None,  # boolean false
            'fast_mode': None,  # boolean true
            'ignore_const_cols': None,  # boolean true
            'force_load_balance': None,  # boolean true
            'replicate_training_data': None,  # boolean false
            'single_node_mode': None,  # boolean false
            'shuffle_training_data': None,  # boolean false
            'missing_values_handling':
            None,  # enum MeanImputation [u'Skip', u'MeanImputation']
            'sparse': None,  # boolean false
            'col_major': None,  # boolean false
            'average_activation': None,  # double 0.0
            'sparsity_beta': None,  # double 0.0
        }
        expectedErr = 0.20  ## expected validation error for the above model
        relTol = 0.20  ## 15% rel. error tolerance due to Hogwild!

        timeoutSecs = 300
        start = time.time()

        bmResult = h2o.n0.build_model(algo='deeplearning',
                                      model_id=model_key,
                                      training_frame=hex_key,
                                      parameters=parameters,
                                      timeoutSecs=timeoutSecs)
        bm = OutputObj(bmResult, 'bm')

        print 'deep learning took', time.time() - start, 'seconds'

        modelResult = h2o.n0.models(key=model_key)
        model = OutputObj(modelResult['models'][0]['output'], 'model')
        #        print "model:", dump_json(model)

        cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                 frame=validation_key,
                                                 timeoutSecs=60)
        cmm = OutputObj(cmmResult, 'cmm')

        mmResult = h2o.n0.model_metrics(model=model_key,
                                        frame=validation_key,
                                        timeoutSecs=60)
        mm = OutputObj(mmResult['model_metrics'][0], 'mm')

        prResult = h2o.n0.predict(model=model_key,
                                  frame=validation_key,
                                  timeoutSecs=60)
        pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

        ## h2o_cmd.runStoreView()

        # FIX! should be the scored error
        print "model", dump_json(model)
        actualErr = model['validation_metrics']['MSE']
        print "expected error: " + format(expectedErr)
        print "actual   error: " + format(actualErr)

        if actualErr != expectedErr and abs(
            (expectedErr - actualErr) / expectedErr) > relTol:
            raise Exception(
                "error of %s is not within %s %% relative error of %s" %
                (actualErr, float(relTol) * 100, expectedErr))
Exemplo n.º 33
0
def oldSimpleCheckGLM(self,
                      glm,
                      colX,
                      allowFailWarning=False,
                      allowZeroCoeff=False,
                      prettyPrint=False,
                      noPrint=False,
                      maxExpectedIterations=None,
                      doNormalized=False,
                      **kwargs):
    """Sanity-check an old-style GLM JSON response and extract its coefficients.

    Reads glm['glm_model'], stops on 'failed' warnings, selects the submodel
    at 'best_lambda_idx', prints its validation numbers, maps the submodel's
    'idxs' and beta values onto 'coefficients_names', and asserts that the
    intercept (and, unless allowZeroCoeff, the colX coefficient and the sum
    of all coefficients) is not effectively zero.

    Args:
        self: test case providing assertEqual / assertGreater.
        glm: GLM response dict; must contain 'glm_model'.
        colX: name (passed through str()) of a coefficient that must be
            non-zero; None skips that check.
        allowFailWarning: when true, warnings matching 'failed' do not raise.
        allowZeroCoeff: when true, skips the colX check and the
            sum-of-coefficients check. The intercept checks still apply.
        prettyPrint: print one "H2O coefficient" line per coefficient.
        noPrint: suppress the condensed coefficient print (only consulted
            when prettyPrint is false).
        maxExpectedIterations: when not None, raise if the submodel's
            'iteration' exceeds it.
        doNormalized: use the submodel's 'norm_beta' instead of 'beta'.
        **kwargs: must contain 'response' (KeyError otherwise). 'n_folds' is
            added with value None when absent. The kwargs are dumped into
            assertion messages.

    Returns:
        (warnings, cList, intercept): the model's warnings list or None, the
        coefficient values in 'coefficients_names' order (None for names with
        no coefficient), and the intercept value.

    Raises:
        Exception: on a missing model, a non-convergence 'failed' warning,
            an out-of-range best_lambda_idx, too many iterations, a missing
            'validation' entry, or a missing/zero trailing 'Intercept'.
        AssertionError: from the assert statement and self.assert* checks.

    Side effects: the response is modified in place -- the validation dict's
    'null_deviance' and 'residual_deviance' are replaced by cleansed values,
    and the last entry of GLMModel['coefficients_names'] is popped.
    """
    # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter

    # h2o GLM will verboseprint the result and print errors.
    # so don't have to do that
    # different when cross validation  is used? No trainingErrorDetails?
    GLMModel = glm['glm_model']
    if not GLMModel:
        raise Exception("GLMModel didn't exist in the glm response? %s" %
                        dump_json(glm))

    warnings = None
    if 'warnings' in GLMModel and GLMModel['warnings']:
        warnings = GLMModel['warnings']
        # stop on failed
        x = re.compile("failed", re.IGNORECASE)
        # don't stop if fail to converge
        c = re.compile("converge", re.IGNORECASE)
        for w in warnings:
            print "\nwarning:", w
            if re.search(x, w) and not allowFailWarning:
                if re.search(c, w):
                    # ignore the fail to converge warning now
                    pass
                else:
                    # stop on other 'fail' warnings (are there any? fail to solve?
                    raise Exception(w)

    # for key, value in glm.iteritems(): print key
    # not in GLMGrid?

    # FIX! don't get GLMParams if it can't solve?
    GLMParams = GLMModel['glm']
    family = GLMParams["family"]

    # number of submodels = number of lambda
    # min of 2. lambda_max is first
    submodels = GLMModel['submodels']
    # since all our tests?? only use one lambda, the best_lamda_idx should = 1
    best_lambda_idx = GLMModel['best_lambda_idx']
    print "best_lambda_idx:", best_lambda_idx
    lambda_max = GLMModel['lambda_max']
    print "lambda_max:", lambda_max

    # currently lambda_max is not set by tomas. ..i.e.not valid
    # (the `1 == 0` below disables this check entirely)
    if 1 == 0 and (lambda_max <= submodels[best_lambda_idx].lambda_value):
        raise Exception(
            "lambda_max %s should always be > the lambda result %s we're checking"
            % (lambda_max, submodels[best_lambda_idx].lambda_value))

    # submodels0 = submodels[0]
    # submodels1 = submodels[-1] # hackery to make it work when there's just one

    if (best_lambda_idx >= len(submodels)) or (best_lambda_idx < 0):
        raise Exception(
            "best_lambda_idx: %s should point to one of lambdas (which has len %s)"
            % (best_lambda_idx, len(submodels)))

    # NOTE(review): same condition as the check just above, so this branch cannot
    # be reached; only the message text differs.
    if (best_lambda_idx >= len(submodels)) or (best_lambda_idx < 0):
        raise Exception(
            "best_lambda_idx: %s should point to one of submodels (which has len %s)"
            % (best_lambda_idx, len(submodels)))

    submodels1 = submodels[
        best_lambda_idx]  # hackery to make it work when there's just one
    iterations = submodels1['iteration']

    print "GLMModel/iterations:", iterations

    # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter
    if maxExpectedIterations is not None and iterations > maxExpectedIterations:
        raise Exception(
            "Convergence issue? GLM did iterations: %d which is greater than expected: %d"
            % (iterations, maxExpectedIterations))

    if 'validation' not in submodels1:
        raise Exception("Should be a 'validation' key in submodels1: %s" %
                        dump_json(submodels1))
    validationsList = submodels1['validation']
    validations = validationsList

    # xval. compare what we asked for and what we got.
    # setdefault also inserts 'n_folds': None into kwargs when it is absent
    n_folds = kwargs.setdefault('n_folds', None)

    print "GLMModel/validations"
    # the cleansed values are written back into the response dict
    validations['null_deviance'] = h2o_util.cleanseInfNan(
        validations['null_deviance'])
    validations['residual_deviance'] = h2o_util.cleanseInfNan(
        validations['residual_deviance'])
    print "%15s %s" % ("null_deviance:\t", validations['null_deviance'])
    print "%15s %s" % ("residual_deviance:\t",
                       validations['residual_deviance'])

    # threshold only there if binomial?
    # auc only for binomial
    if family == "binomial":
        print "%15s %s" % ("auc:\t", validations['auc'])
        best_threshold = validations['best_threshold']
        thresholds = validations['thresholds']
        print "%15s %s" % ("best_threshold:\t", best_threshold)

        # have to look up the index for the cm, from the thresholds list
        best_index = None

        # first threshold that is >= best_threshold
        for i, t in enumerate(thresholds):
            if t >= best_threshold:  # ends up using next one if not present
                best_index = i
                break

        assert best_index != None, "%s %s" % (best_threshold, thresholds)
        print "Now printing the right 'best_threshold' %s from '_cms" % best_threshold

        # cm = glm['glm_model']['submodels'][0]['validation']['_cms'][-1]
        submodels = glm['glm_model']['submodels']
        # FIX! this isn't right if we have multiple lambdas? different submodels?
        # NOTE(review): reads submodels[0], not the best_lambda_idx submodel used above
        cms = submodels[0]['validation']['_cms']
        self.assertEqual(
            len(thresholds),
            len(cms),
            msg="thresholds %s and cm %s should be lists of the same size. %s"
            % (len(thresholds), len(cms), thresholds))
        # FIX! best_threshold isn't necessarily in the list. jump out if >=
        assert best_index < len(cms), "%s %s" % (best_index, len(cms))
        # if we want 0.5..rounds to int
        # mid = len(cms)/2
        # cm = cms[mid]
        cm = cms[best_index]

        print "cm:", dump_json(cm['_arr'])
        predErr = cm['_predErr']
        classErr = cm['_classErr']
        # compare to predErr
        # pctWrong = h2o_gbm.pp_cm_summary(cm['_arr']);
        # FIX!
        # pctWrong is a placeholder; nothing is computed from the cm here
        pctWrong = 0
        print "predErr:", predErr
        print "calculated pctWrong from cm:", pctWrong
        print "classErr:", classErr

        # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")

        print "\nTrain\n==========\n"
        # print h2o_gbm.pp_cm(cm['_arr'])

    if family == "poisson" or family == "gaussian":
        print "%15s %s" % ("AIC:\t", validations['AIC'])

    coefficients_names = GLMModel['coefficients_names']
    # print "coefficients_names:", coefficients_names
    idxs = submodels1['idxs']
    print "idxs:", idxs
    # NOTE(review): self-assignment, has no effect
    coefficients_names = coefficients_names

    # always check both normalized and normal coefficients
    norm_beta = submodels1['norm_beta']
    # if norm_beta and len(coefficients_names)!=len(norm_beta):
    #    print len(coefficients_names), len(norm_beta)
    #    raise Exception("coefficients_names and normalized_norm_beta from h2o json not same length. coefficients_names: %s normalized_norm_beta: %s" % (coefficients_names, norm_beta))
    #
    beta = submodels1['beta']
    # print "beta:", beta
    # if len(coefficients_names)!=len(beta):
    #    print len(coefficients_names), len(beta)
    #    raise Exception("coefficients_names and beta from h2o json not same length. coefficients_names: %s beta: %s" % (coefficients_names, beta))

    # test wants to use normalized?
    if doNormalized:
        beta_used = norm_beta
    else:
        beta_used = beta

    coefficients = {}
    # create a dictionary with name, beta (including intercept) just like v1

    # pair each index in idxs with a beta value; the last beta (the intercept) is left out here
    for i, b in zip(idxs, beta_used[:-1]):
        name = coefficients_names[i]
        coefficients[name] = b

    print "len(idxs)", len(idxs), "len(beta_used)", len(beta_used)
    print "coefficients:", coefficients
    print "beta:", beta
    print "norm_beta:", norm_beta

    coefficients['Intercept'] = beta_used[-1]
    print "len(coefficients_names)", len(coefficients_names)
    print "len(idxs)", len(idxs)
    print "idxs[-1]", idxs[-1]
    print "intercept demapping info:", \
        "coefficients_names[-i]:", coefficients_names[-1], \
        "idxs[-1]:", idxs[-1], \
        "coefficients_names[idxs[-1]]:", coefficients_names[idxs[-1]], \
        "beta_used[-1]:", beta_used[-1], \
        "coefficients['Intercept']", coefficients['Intercept']

    # last one is intercept
    interceptName = coefficients_names[idxs[-1]]
    # also raises when the intercept value itself is effectively zero
    if interceptName != "Intercept" or abs(beta_used[-1]) < 1e-26:
        raise Exception("'Intercept' should be last in coefficients_names and beta %s %s %s" %\
            (idxs[-1], beta_used[-1], "-"+interceptName+"-"))

    # idxs has the order for non-zero coefficients, it's shorter than beta_used and coefficients_names
    # new 5/28/14. glm can point to zero coefficients
    # for i in idxs:
    #     if beta_used[i]==0.0:
    ##        raise Exception("idxs shouldn't point to any 0 coefficients i: %s %s:" % (i, beta_used[i]))
    if len(idxs) > len(beta_used):
        raise Exception("idxs shouldn't be longer than beta_used %s %s" %
                        (len(idxs), len(beta_used)))
    # take the intercept back out so `coefficients` holds only the column coefficients
    intercept = coefficients.pop('Intercept', None)

    # intercept demapping info: idxs[-1]: 54 coefficients_names[[idxs[-1]]: Intercept beta_used[-1]: -6.6866753099
    # the last one should be 'Intercept' ?
    # this pops from the list object held by GLMModel, i.e. the response is modified
    coefficients_names.pop()

    # have to skip the output col! get it from kwargs
    # better always be there!
    # (y is not used below; the lookup raises KeyError when 'response' is missing)
    y = kwargs['response']

    # the dict keys are column headers if they exist...how to order those? new: use the 'coefficients_names'
    # from the response
    # Tomas created 'coefficients_names which is the coefficient list in order.
    # Just use it to index coefficients! works for header or no-header cases
    # I guess now we won't print the "None" cases for dropped columns (constant columns!)
    # Because Tomas doesn't get everything in 'coefficients_names' if dropped by GLMQuery before
    # he gets it?
    def add_to_coefficient_list_and_string(c, cList, cString):
        # Appends the value for name c (None when absent) to cList and returns
        # cString extended with the formatted "name: value" text.
        if c in coefficients:
            cValue = coefficients[c]
            cValueString = "%s: %.5e   " % (c, cValue)
        else:
            print "Warning: didn't see '" + c + "' in json coefficient response.",\
                  "Inserting 'None' with assumption it was dropped due to constant column)"
            cValue = None
            cValueString = "%s: %s   " % (c, cValue)

        cList.append(cValue)
        # we put each on newline for easy comparison to R..otherwise keep condensed
        if prettyPrint:
            cValueString = "H2O coefficient " + cValueString + "\n"
        # not mutable?
        return cString + cValueString

    # creating both a string for printing and a list of values
    cString = ""
    cList = []
    # print in order using col_names
    # coefficients_names is input only now..same for header or no header, or expanded enums
    for c in coefficients_names:
        cString = add_to_coefficient_list_and_string(c, cList, cString)

    if prettyPrint:
        print "\nH2O intercept:\t\t%.5e" % intercept
        print cString
    else:
        if not noPrint:
            print "\nintercept:", intercept, cString

    print "\nTotal # of coefficients:", len(coefficients_names)

    # pick out the coefficent for the column we enabled for enhanced checking. Can be None.
    # FIX! temporary hack to deal with disappearing/renaming columns in GLM
    if (not allowZeroCoeff) and (colX is not None):
        absXCoeff = abs(float(coefficients[str(colX)]))
        # add kwargs to help debug without looking at console log
        self.assertGreater(
            absXCoeff, 1e-26,
            ("abs. value of GLM coefficients['" + str(colX) + "'] is " +
             str(absXCoeff) + ", not >= 1e-26 for X=" + str(colX) + "\n" +
             "kwargs:" + dump_json(kwargs)))

    # intercept is buried in there too
    # this check runs regardless of allowZeroCoeff
    absIntercept = abs(float(intercept))
    self.assertGreater(absIntercept, 1e-26,
                       ("abs. value of GLM coefficients['Intercept'] is " +
                        str(absIntercept) + ", not >= 1e-26 for Intercept" +
                        "\n" + "kwargs:" + dump_json(kwargs)))

    # this is good if we just want min or max
    # maxCoeff = max(coefficients, key=coefficients.get)
    # for more, just invert the dictionary and ...
    if (len(coefficients) > 0):
        # max/min over (abs value, name) tuples; [1] picks the name
        maxKey = max([(abs(coefficients[x]), x) for x in coefficients])[1]
        print "H2O Largest abs. coefficient value:", maxKey, coefficients[
            maxKey]
        minKey = min([(abs(coefficients[x]), x) for x in coefficients])[1]
        print "H2O Smallest abs. coefficient value:", minKey, coefficients[
            minKey]
    else:
        print "Warning, no coefficients returned. Must be intercept only?"

    # many of the GLM tests aren't single column though.
    # quick and dirty check: if all the coefficients are zero,
    # something is broken
    # intercept is in there too, but this will get it okay
    # just sum the abs value  up..look for greater than 0

    # skip this test if there is just one coefficient. Maybe pointing to a non-important coeff?
    if (not allowZeroCoeff) and (len(coefficients) > 1):
        s = 0.0
        for c in coefficients:
            v = coefficients[c]
            s += abs(float(v))

        self.assertGreater(
            s, 1e-26,
            ("sum of abs. value of GLM coefficients/intercept is " + str(s) +
             ", not >= 1e-26\n" + "kwargs:" + dump_json(kwargs)))

    print "submodels1, run_time (milliseconds):", submodels1['run_time']

    # shouldn't have any errors
    check_sandbox_for_errors()

    return (warnings, cList, intercept)
Exemplo n.º 34
0
def simpleCheckGLM(self,
                   model,
                   parameters,
                   labelList,
                   labelListUsed,
                   allowFailWarning=False,
                   allowZeroCoeff=False,
                   prettyPrint=False,
                   noPrint=False,
                   maxExpectedIterations=None,
                   doNormalized=False,
                   allowNaN=False):
    """Sanity-check a GLM model result. CURRENTLY DISABLED.

    The function returns None immediately (see the bare ``return`` below), so
    none of the checks after it run. The unreachable code is kept as a
    reference for when the checks are ported to the new model structure.

    What the unreachable code would do if re-enabled:
      * read metrics off ``model`` and run check_obj_has_good_numbers() on
        threshold, null_deviance, null_degrees_of_freedom, AIC and the
        coefficient values (``allowNaN`` is forwarded for all but the
        coefficients, which always use allowNaN=False)
      * convert the coefficient table row to floats (None or "" becomes 0)
      * require the last coefficient to be named 'Intercept' and to have an
        absolute value of at least 1e-26
      * unless ``allowZeroCoeff`` is set, require the sum of absolute
        coefficient values to be greater than 1e-26
      * call check_sandbox_for_errors() and return
        (warnings, coeffs, intercept)

    ``labelList``, ``labelListUsed``, ``allowFailWarning``, ``prettyPrint``,
    ``noPrint``, ``maxExpectedIterations`` and ``doNormalized`` are not
    referenced anywhere in the body.
    """

    # FIX! the structure is all different
    # Everything below this return is unreachable.
    return

    warnings = ''
    # binomial = model.binomial
    residual_deviance = model.training_metrics.residual_deviance

    threshold = model.training_metrics.threshold
    check_obj_has_good_numbers(threshold, 'threshold', allowNaN=allowNaN)

    auc = model.AUC
    # NaN if not logistic
    # check_obj_has_good_numbers(auc, 'model.AUC')

    best_lambda_idx = model.best_lambda_idx
    model_category = model.model_category
    name = model.name
    residual_degrees_of_freedom = model.residual_degrees_of_freedom

    # is this no longer used?
    coefficients_magnitude = model.coefficients_magnitude

    null_deviance = model.null_deviance
    check_obj_has_good_numbers(null_deviance,
                               'model.null_deviance',
                               allowNaN=allowNaN)

    null_degrees_of_freedom = model.null_degrees_of_freedom
    check_obj_has_good_numbers(null_degrees_of_freedom,
                               'model.null_degrees_of_freedom',
                               allowNaN=allowNaN)

    domains = model.domains

    # when is it okay for this to be NaN?
    AIC = model.AIC
    check_obj_has_good_numbers(AIC, 'model.AIC', allowNaN=allowNaN)

    names = model.names

    # row 0 of the coefficients table holds the names, row 1 the values
    coeffs_names = model.coefficients_table.data[0]

    # these are returned as quoted strings. Turn them into numbers
    temp = model.coefficients_table.data[1]
    assert len(coeffs_names) == len(temp), "%s %s" % (len(coeffs_names),
                                                      len(temp))

    # we need coefficients to be floats or empty
    check_obj_has_good_numbers(temp, 'model.coeffs', allowNaN=False)
    # print "temp", temp[0:10]
    # print "temp[5489:5500]", temp[5489:5500]

    # UPDATE: None (null json) is legal for coeffs
    # None and "" are mapped to 0; everything else is converted with float()
    coeffs = map(lambda x: float(x)
                 if (x is not None and str(x) != "") else 0, temp)

    # the intercept is expected to be the last entry
    intercept = coeffs[-1]
    interceptName = coeffs_names[-1]
    assert interceptName == 'Intercept'

    assert len(coeffs) == len(coeffs_names), "%s %s" % (len(coeffs),
                                                        len(coeffs_names))
    # FIX! if a coeff is zeroed/ignored, it doesn't show up?
    # get rid of intercept in glm response
    # assert (len(coeffs)-1) == len(labelListUsed, \
    #    "%s %s %s %s" % (len(coeffs), len(labelListUsed), coeffs, labelListUsed)

    # labelList still has the response column?
    # ignored columns aren't in model.names, but output response is.
    # labelListUsed has the response col removed so add 1

    # Hmm..dropped coefficients again? can't do this check?
    # assert len(model.names) == len(labelListUsed), \
    #    "%s %s %s %s" % (len(model.names), len(labelListUsed), model.names, labelList)

    # this is no longer true!
    # assert model.threshold!=0

    print "len(coeffs)", len(coeffs)
    print "coeffs:", coeffs

    # last one is intercept
    # NOTE(review): this also raises when the intercept is (near) zero, although
    # the message only talks about the position of 'Intercept'.
    if interceptName != "Intercept" or abs(intercept) < 1e-26:
        raise Exception("'Intercept' should be last in coeffs_names %s %s" %
                        (interceptName, intercept))

    y = parameters['response_column']

    # build a one-line "name: value" dump of all coefficients
    cString = "\n"
    for i, c in enumerate(coeffs_names):
        cString += "%s: %.5e   " % (coeffs_names[i], coeffs[i])

    print cString
    print "\nH2O intercept:\t\t%.5e" % intercept
    print "\nTotal # of coeffs:", len(coeffs_names)

    # intercept is buried in there too
    absIntercept = abs(float(intercept))
    self.assertGreater(absIntercept, 1e-26,
                       ("abs. value of GLM coeffs['Intercept'] is " +
                        str(absIntercept) + ", not >= 1e-26 for Intercept" +
                        "\n" + "parameters:" + dump_json(parameters)))

    # quick check that not every coefficient is zero (skipped for a single coeff)
    if (not allowZeroCoeff) and (len(coeffs) > 1):
        s = 0.0
        for c in coeffs:
            s += abs(float(c))

        self.assertGreater(
            s, 1e-26,
            ("sum of abs. value of GLM coeffs/intercept is " + str(s) +
             ", not >= 1e-26\n" + "parameters:" + dump_json(parameters)))

    # shouldn't have any errors
    check_sandbox_for_errors()

    return (warnings, coeffs, intercept)
Exemplo n.º 35
0
    def test_xl_ast_assert_ZZ(self):
        """Check the AST strings produced for indexed assignment and comparisons.

        Each h2o_xl statement (Assign(...), ``<<=``, Expr(...)) is followed by
        a checkAst() call carrying the expected AST text. checkAst is
        defined elsewhere; presumably it compares against the AST of the most
        recent execution, so the statement order here matters.

        The last two sections also assert that ``.result`` of an Expr and of
        an Assign to a temp key (lhs None) compares equal to 1.0.
        """
        #*****************************************
        a = DF('a1')  # inits to -1
        checkAst(astForInit(a))
        # I suppose use of the h2o inspect request is deprecated
        # h2o_cmd.runInspect uses Frames?
        # disabled branch, kept for reference
        if 1 == 0:
            inspect = h2o.n0.inspect(
                key=a
            )  # str(a) becomes 'a1'. so this param should take type Key for key=
            print "a/a1:", dump_json(inspect)

        # let's use runSummary for fun..returns OutputObj for the col
        # will get from column 0, since column not specified
        summaryResult = h2o_cmd.runSummary(key=a)
        co = h2o_cmd.infoFromSummary(summaryResult)
        print "co.label:", co.label
        print "co.data:", co.data

        # how can we get a bunch of data?
        b = DF('b1')  # inits to -1
        checkAst(astForInit(b))
        c = DF('c1')  # inits to -1
        checkAst(astForInit(c))
        print "lastExecResult:", dump_json(h2o_xl.Xbase.lastExecResult)

        # explicit Assign() form: arithmetic and comparison against a constant
        h2p.yellow_print("Assign compare1")
        Assign(c[0], c[0] + 0)
        checkAst("(= ([ %c1 #0 #0) (+ ([ %c1 #0 #0) #0))")

        h2p.yellow_print("Assign compare2")
        Assign(c[0], c[0] - 0)
        checkAst("(= ([ %c1 #0 #0) (- ([ %c1 #0 #0) #0))")

        h2p.yellow_print("Assign compare3")
        Assign(c[0], c[0] == 0)
        checkAst("(= ([ %c1 #0 #0) (n ([ %c1 #0 #0) #0))")

        h2p.yellow_print("Assign compare4")
        Assign(c[0], c[0] != 0)
        checkAst("(= ([ %c1 #0 #0) (N ([ %c1 #0 #0) #0))")

        # h2o_xl.debugPrintEnable = True

        #*****************************************
        # same expressions through the <<= operator; expected ASTs match the Assign() ones
        c = DF('c1')

        h2p.yellow_print("<<= compare1")
        c[0] <<= (c[0] + 0)
        checkAst("(= ([ %c1 #0 #0) (+ ([ %c1 #0 #0) #0))")

        h2p.yellow_print("<<= compare2")
        c[0] <<= (c[0] - 0)
        checkAst("(= ([ %c1 #0 #0) (- ([ %c1 #0 #0) #0))")

        h2p.yellow_print("<<= compare3")
        c[0] <<= (c[0] == 0)
        checkAst("(= ([ %c1 #0 #0) (n ([ %c1 #0 #0) #0))")

        #*****************************************
        c = DF('c1')  # inits to -1
        h2p.yellow_print("compare1")
        # doesn't assign result to a key?, gets result if scalar, otherwise gets a list or ???
        # .result can give us scalar, list, Key, None

        # .result could be a property that triggers a csv download, if we didn't cache the scalar/list result because it was small?
        # i.e. check if .result_cached was None, when .result property is used (property to avoid the need for ()
        result = Expr(c[0] == -1).result
        checkAst("(n ([ %c1 #0 #0) #-1)")
        h2p.yellow_print(
            "Expr result..Desire: python datatype/value if scalar or list,.else Key: %s %s"
            % (type(result), result))
        assert result == 1.0, "%s %s" % (type(result), result)  # real result?

        # exercise truthiness of the returned result
        if result:
            print "true for if of result", type(result), result
        else:
            print "else for if of result", type(result), result

        #*****************************************
        # difference is this goes to a temp key, so if not scalar, you can still get the results by looking at the key
        result = Assign(None, c[0] == -1).result
        # NOTE(review): the expected AST embeds one specific temp key name
        # ('knon_0x1a34250'). Presumably checkAst tolerates/normalizes temp key
        # names; otherwise this can only match by coincidence -- confirm.
        checkAst("(= !knon_0x1a34250 (n ([ %c1 #0 #0) #-1))")
        h2p.yellow_print(
            "Assign result..Desire: python datatype/value if scalar or list,.else Key: %s %s"
            % (type(result), result))
        assert result == 1.0, "%s %s" % (type(result), result)  # real result?

        if result:
            print "true if of result", result
        else:
            print "false if of result", result
Exemplo n.º 36
0
    def test_xl_ast_assert_X(self):
        """Check the AST strings produced for whole-key assignments.

        Covers the class relationships of a DF object, scalar assignment,
        unary operators (~, -, abs), list and tuple assignment, and sums of
        indexed keys. Every Assign(...) is followed by a checkAst() call
        with the expected AST text, so statement order matters. Ends by
        printing Xbase.keyWriteHistoryList and checking the sandbox for errors.
        """
        # uses h2o_xl to do magic with Rapids
        # does this DFInit to rows=0 now?
        a = DF('a1')
        checkAst(astForInit(a))
        b = DF('b1')
        checkAst(astForInit(b))
        c = DF('c1')
        checkAst(astForInit(c))
        # look at our secret stash in the base class. Should see the DFInit?
        print "lastExecResult:", dump_json(h2o_xl.Xbase.lastExecResult)

        # DF does a kv store init. Key doesn't
        # DF inherits from Key. KeyIndexed inherits from Key
        assert isinstance(a, DF)
        assert isinstance(a, Key)
        assert isinstance(a, Xbase)

        assert not isinstance(a, KeyIndexed)
        assert not isinstance(a, Fcn)
        assert not isinstance(a, Assign)

        assert isinstance(a, Key)
        assert isinstance(b, Key)
        assert isinstance(c, Key)

        # scalar assignment to a whole key
        Assign(a, 2)
        checkAst("(= !a1 #2)")
        Assign(b, 2)
        checkAst("(= !b1 #2)")
        Assign(c, 2)
        checkAst("(= !c1 #2)")

        # unary operators
        # - doesn't exist? multiply by -1?
        Assign(c, ~c)
        checkAst("(= !c1 (^ %c1 #1))")  # not right if more than 1 col?
        Assign(c, -c)
        checkAst("(= !c1 (_ %c1))")
        Assign(c, abs(c))
        checkAst("(= !c1 (abs %c1))")

        # this needs to be an h2o int? because it expects int return
        # Assign(c, int(c))
        # checkAst("(= !c1 (trunc c1 ))")

        # python lists are expected to become (c {...}) in the AST
        Assign(a, [0])
        checkAst("(= !a1 (c {#0}))")
        Assign(b, [0, 1])
        checkAst("(= !b1 (c {#0;#1}))")
        Assign(c, [0, 1, 2])
        checkAst("(= !c1 (c {#0;#1;#2}))")

        # tuples are expected to produce the same AST as lists
        Assign(a, (0, ))  # make sure it's a tuple with comma
        checkAst("(= !a1 (c {#0}))")
        Assign(b, (0, 1))
        checkAst("(= !b1 (c {#0;#1}))")
        Assign(c, (0, 1, 2))
        checkAst("(= !c1 (c {#0;#1;#2}))")

        # sum of indexed keys, assigned to a whole key and then to an indexed key
        Assign(c, a[0] + b[1])
        checkAst("(= !c1 (+ ([ %a1 #0 #0) ([ %b1 #1 #0)))")

        Assign(c[0], (a[0] + b[1]))
        checkAst("(= ([ %c1 #0 #0) (+ ([ %a1 #0 #0) ([ %b1 #1 #0)))")

        # print "\nDoes the keyWriteHistoryList work?"
        for k in Xbase.keyWriteHistoryList:
            print k

        h2o.check_sandbox_for_errors()
Exemplo n.º 37
0
def simpleCheckRFView(node=None, rfv=None, checkScoringOnly=False, noPrint=False, **kwargs):
    if not node:
        node = h2o_nodes.nodes[0]

    if 'warnings' in rfv:
        warnings = rfv['warnings']
        # catch the 'Failed to converge" for now
        for w in warnings:
            if not noPrint: print "\nwarning:", w
            if ('Failed' in w) or ('failed' in w):
                raise Exception(w)

    #****************************
    # if we are checking after confusion_matrix for predict, the jsonschema is different

    if 'cm' in rfv:
        cm = rfv['cm'] # only one
    else:
        if 'drf_model' in rfv:
            rf_model = rfv['drf_model']
        elif 'speedrf_model' in rfv:
            rf_model = rfv['speedrf_model']
        elif 'rf_model' in rfv:
            rf_model = rfv['rf_model']
        else:
            raise Exception("no rf_model in rfv? %s" % dump_json(rfv))

        cms = rf_model['cms']
        print "number of cms:", len(cms)
        print "FIX! need to add reporting of h2o's _perr per class error"
        # FIX! what if regression. is rf only classification?
        print "cms[-1]['_arr']:", cms[-1]['_arr']
        print "cms[-1]['_predErr']:", cms[-1]['_predErr']
        print "cms[-1]['_classErr']:", cms[-1]['_classErr']

        ## print "cms[-1]:", dump_json(cms[-1])
        ## for i,c in enumerate(cms):
        ##    print "cm %s: %s" % (i, c['_arr'])

        cm = cms[-1]['_arr'] # take the last one

    scoresList = cm

    if not checkScoringOnly:
        used_trees = rf_model['N']
        errs = rf_model['errs']
        print "errs[0]:", errs[0]
        print "errs[-1]:", errs[-1]
        print "errs:", errs
        # if we got the ntree for comparison. Not always there in kwargs though!
        param_ntrees = kwargs.get('ntrees', None)
        if (param_ntrees is not None and used_trees != param_ntrees):
            raise Exception("used_trees should == param_ntree. used_trees: %s"  % used_trees)
        if (used_trees+1)!=len(cms) or (used_trees+1)!=len(errs):
            raise Exception("len(cms): %s and len(errs): %s should be one more than N %s trees" % (len(cms), len(errs), used_trees))


    #****************************
    totalScores = 0
    totalRight = 0
    # individual scores can be all 0 if nothing for that output class
    # due to sampling
    classErrorPctList = []
    predictedClassDict = {} # may be missing some? so need a dict?
    for classIndex,s in enumerate(scoresList):
        classSum = sum(s)
        if classSum == 0 :
            # why would the number of scores for a class be 0? does RF CM have entries for non-existent classes
            # in a range??..in any case, tolerate. (it shows up in test.py on poker100)
            if not noPrint: print "class:", classIndex, "classSum", classSum, "<- why 0?"
        else:
            # H2O should really give me this since it's in the browser, but it doesn't
            classRightPct = ((s[classIndex] + 0.0)/classSum) * 100
            totalRight += s[classIndex]
            classErrorPct = round(100 - classRightPct, 2)
            classErrorPctList.append(classErrorPct)
            ### print "s:", s, "classIndex:", classIndex
            if not noPrint: print "class:", classIndex, "classSum", classSum, "classErrorPct:", "%4.2f" % classErrorPct

            # gather info for prediction summary
            for pIndex,p in enumerate(s):
                if pIndex not in predictedClassDict:
                    predictedClassDict[pIndex] = p
                else:
                    predictedClassDict[pIndex] += p

        totalScores += classSum

    #****************************
    if not noPrint: 
        print "Predicted summary:"
        # FIX! Not sure why we weren't working with a list..hack with dict for now
        for predictedClass,p in predictedClassDict.items():
            print str(predictedClass)+":", p

        # this should equal the num rows in the dataset if full scoring? (minus any NAs)
        print "totalScores:", totalScores
        print "totalRight:", totalRight
        if totalScores != 0:  
            pctRight = 100.0 * totalRight/totalScores
        else: 
            pctRight = 0.0
        pctWrong = 100 - pctRight
        print "pctRight:", "%5.2f" % pctRight
        print "pctWrong:", "%5.2f" % pctWrong

    if checkScoringOnly:
        check_sandbox_for_errors()
        classification_error = pctWrong
        return (round(classification_error,2), classErrorPctList, totalScores)

    # it's legal to get 0's for oobe error # if sample_rate = 1
    sample_rate = kwargs.get('sample_rate', None)
    validation = kwargs.get('validation', None)
    print "kevin:", sample_rate, validation
    if (sample_rate==1 and not validation): 
        pass
    elif (totalScores<=0 or totalScores>5e9):
        raise Exception("scores in RFView seems wrong. scores:", scoresList)

    varimp = rf_model['varimp']

    if 'importance' in kwargs and kwargs['importance']:
        max_var = varimp['max_var']
        variables = varimp['variables']
        varimpSD = varimp['varimpSD']
        varimp2 = varimp['varimp']

        # what is max_var? it's 100 while the length of the others is 54 for covtype
        if not max_var:
            raise Exception("varimp.max_var is None? %s" % max_var)
        # if not variables:
        #     raise Exception("varimp.variables is None? %s" % variables)
        if not varimpSD:
            raise Exception("varimp.varimpSD is None? %s" % varimpSD)
        if not varimp2:
            raise Exception("varimp.varimp is None? %s" % varimp2)

        # check that they all have the same length and that the importance is not all zero
        # if len(varimpSD)!=max_var or len(varimp2)!=max_var or len(variables)!=max_var:
        #    raise Exception("varimp lists seem to be wrong length: %s %s %s" % \
        #        (max_var, len(varimpSD), len(varimp2), len(variables)))

        # not checking maxvar or variables. Don't know what they should be
        if len(varimpSD) != len(varimp2):
            raise Exception("varimp lists seem to be wrong length: %s %s" % \
                (len(varimpSD), len(varimp2)))

        h2o_util.assertApproxEqual(sum(varimp2), 0.0, tol=1e-5, 
            msg="Shouldn't have all 0's in varimp %s" % varimp2)

    treeStats = rf_model['treeStats']
    if not treeStats:
        raise Exception("treeStats not right?: %s" % dump_json(treeStats))
    # print "json:", dump_json(rfv)
    data_key = rf_model['_dataKey']
    model_key = rf_model['_key']
    classification_error = pctWrong

    if not noPrint: 
        if 'minLeaves' not in treeStats or not treeStats['minLeaves']:
            raise Exception("treeStats seems to be missing minLeaves %s" % dump_json(treeStats))
        print """
         Leaves: {0} / {1} / {2}
          Depth: {3} / {4} / {5}
            Err: {6:0.2f} %
        """.format(
                treeStats['minLeaves'],
                treeStats['meanLeaves'],
                treeStats['maxLeaves'],
                treeStats['minDepth'],
                treeStats['meanDepth'],
                treeStats['maxDepth'],
                classification_error,
                )
    
    ### modelInspect = node.inspect(model_key)
    dataInspect = h2o_cmd.runInspect(key=data_key)
    check_sandbox_for_errors()
    return (round(classification_error,2), classErrorPctList, totalScores)
Exemplo n.º 38
0
def log_view(self, timeoutSecs=10, **kwargs):
    """Request 'LogView.json' and return whatever do_json_request returns.

    The response is also echoed through verboseprint. Extra keyword
    arguments are accepted but not used.
    """
    response = self.do_json_request('LogView.json', timeout=timeoutSecs)
    verboseprint("\nlog_view result:", dump_json(response))
    return response
Exemplo n.º 39
0
def parse(self, key, hex_key=None, columnTypeDict=None,
          timeoutSecs=300, retryDelaySecs=0.2, initialDelaySecs=None, pollTimeoutSecs=180,
          noise=None, benchmarkLogging=None, noPoll=False, intermediateResults=False, **kwargs):
    '''
    Parse an imported raw file or files into a Frame.
    '''
    # these should override what parse setup gets below
    params_dict = {
        'source_keys': None,
        'destination_key': hex_key, 
        'parse_type': None, # file type 
        'separator': None,
        'single_quotes': None,
        'check_header': None, # forces first line to be seen as column names 
        'number_columns': None,
        'column_names': None, # a list
        'column_types': None, # a list. or can use columnTypeDict param (see below)
        'na_strings' : None, # a list
        'chunk_size': None,
        # are these two no longer supported?
        'delete_on_done': None,
        'blocking': None,
    }
        
    # if key is a list, create a comma separated string
    # list or tuple but not string
    if not isinstance(key, basestring):
        # it's a list of some kind (tuple ok?)
        # if len(key) > 1:
        #     print "I noticed you're giving me a list of > 1 keys %s to parse:" % len(key), key

        # len 1 is ok here. 0 not. what if None or [None] here
        if not key:
            raise Exception("key seems to be bad in parse. Should be list or string. %s" % key)
        # have to put double quotes around the individual list items (single not legal)
        source_keys = "[" + ",".join(map((lambda x: '"' + x + '"'), key)) + "]"

    else:
        # what if None here
        source_keys = '["' + key + '"]' # quotes required on key

    params_dict['source_keys'] = source_keys

    # merge kwargs into params_dict
    # =None overwrites params_dict

    # columnTypeDict not used here
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse before setup merge', print_params=False)
    # Call ParseSetup?source_keys=[keys] . . .

    # if benchmarkLogging:
    #     cloudPerfH2O.get_log_save(initOnly=True)

    # h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse_setup', print_params=True)
    params_setup = {'source_keys': source_keys}
    setup_result = self.do_json_request(jsonRequest="3/ParseSetup.json", cmd='post', timeout=timeoutSecs, postData=params_setup)
    h2o_sandbox.check_sandbox_for_errors()
    verboseprint("ParseSetup result:", dump_json(setup_result))

    # this should match what we gave as input?
    if setup_result['source_keys']:
        # should these be quoted?
        source_keysStr = "[" + ",".join([('"%s"' % src['name']) for src in setup_result['source_keys'] ]) + "]"
    else:
        source_keysStr = None

    # I suppose we need a way for parameters to parse() to override these
    # should it be an array or a dict?
    if setup_result['column_names']:
        # single quotes not legal..need double quotes
        columnNamesStr = "[" + ",".join(map((lambda x: '"' + x + '"'), setup_result['column_names'])) + "]"
    else:
        columnNamesStr = None

    columnTypes = setup_result['column_types']
    assert columnTypes is not None, "%s %s" % ("column_types:", columnTypes)

    if setup_result['na_strings']:
        # single quotes not legal..need double quotes
        naStrings = "[" + ",".join(map((lambda x: '"' + x + '"' if x != None else '""'), setup_result['na_strings'])) + "]"
    else:
        naStrings = None

    # dict parameter to update columnTypeDict?
    # but we don't pass columnNames like this?
    ct = setup_result['column_types']
    if columnTypeDict: 
        for k,v in columnTypeDict.iteritems():
            if isinstance(k, int):
                # if a column index
                if k>=0 and k<len(ct):
                    ct[k] = v
                else:
                    raise Exception("bad col index %s in columnTypeDict param %s" % (k, columnTypeDict))
            # if a column name
            elif isinstance(k, basestring):
                # find the index
                if k not in columnNames:
                    raise Exception("bad col name %s in columnTypeDict param %s. columnNames: %s" % (k, columnTypeDict, columnNames))
                ci = columnNames.index(k)
                ct[ci] = v
            else:
                raise Exception("%s %s should be int or string" % (k, type(k)))

    columnTypesStr = "[" + ",".join(map((lambda x: '"' + x + '"'), ct)) + "]"


    parse_params = {
        'source_keys': source_keysStr,
        'destination_key': setup_result['destination_key'],
        'parse_type': setup_result['parse_type'],
        'separator': setup_result['separator'],
        'single_quotes': setup_result['single_quotes'],
        'check_header': setup_result['check_header'],
        'number_columns': setup_result['number_columns'],
        'column_names': columnNamesStr,
        'column_types': columnTypesStr,
        'na_strings': naStrings, 
        'chunk_size': setup_result['chunk_size'],
        # No longer supported? how come these aren't in setup_result?
        'delete_on_done': params_dict['delete_on_done'],
        'blocking': params_dict['blocking'],
    }
    # HACK: if there are too many column names..don't print! it is crazy output
    # just check the output of parse setup. Don't worry about columnNames passed as params here. 
    tooManyColNamesToPrint = setup_result['column_names'] and len(setup_result['column_names']) > 2000
    if tooManyColNamesToPrint:
        h2p.yellow_print("Not printing the parameters to Parse because the columnNames are too lengthy.") 
        h2p.yellow_print("See sandbox/commands.log")

    # merge params_dict into parse_params
    # don't want =None to overwrite parse_params
    h2o_methods.check_params_update_kwargs(parse_params, params_dict, 'parse after merge into parse setup', 
        print_params=not tooManyColNamesToPrint, ignoreNone=True)

    print "parse source_keys is length:", len(parse_params['source_keys'])
    # This can be null now? parseSetup doesn't return default colnames?
    # print "parse column_names is length:", len(parse_params['column_names'])

    # none of the kwargs passed to here!
    parse_result = self.do_json_request( jsonRequest="3/Parse.json", cmd='post', postData=parse_params, timeout=timeoutSecs)
    verboseprint("Parse result:", dump_json(parse_result))

    job_key = parse_result['job']['key']['name']
    hex_key = parse_params['destination_key']

    # TODO: dislike having different shapes for noPoll and poll
    if noPoll:
        # ??
        h2o_sandbox.check_sandbox_for_errors()
        # return self.jobs(job_key)
        return parse_result

    # does Frame also, while polling
    if intermediateResults:
        key = hex_key
    else:
        key = None

    job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs, key=key)

    if job_result:
        jobs = job_result['jobs'][0]
        description = jobs['description']
        dest = jobs['dest']
        msec = jobs['msec']
        status = jobs['status']
        progress = jobs['progress']
        dest_key = dest['name']

        # can condition this with a parameter if some FAILED are expected by tests.
        if status=='FAILED':
            print dump_json(job_result)
            raise Exception("Taking exception on parse job status: %s %s %s %s %s" % \
                (status, progress, msec, dest_key, description))

        return self.frames(dest_key)
    else:
        # ? we should always get a job_json result
        raise Exception("parse didn't get a job_result when it expected one")
Exemplo n.º 40
0
    def test_simple2(self):
        # h2o-dev doesn't take ../.. type paths? make find_file return absolute path
        # csvPathname = find_file("bigdata/laptop/poker-hand-testing.data")
        csvPathname = find_file("smalldata/logreg/prostate.csv")
        import_result = h2o.n0.import_files(path=csvPathname)
        # print dump_json(import_result)

        k = import_result['destination_frames'][0]
        frames_result = h2o.n0.frames(key=k)

        frame = frames_result['frames'][0]
        rows = frame['rows']
        columns = frame['columns']
        for c in columns:
            label = c['label']
            missing = c['missing_count']
            stype = c['type']
            domain = c['domain']

        # print dump_json(frame)

        # let's see what ray's util does
        frames = h2o.n0.frames()['frames']
        frames_dict = h2o_util.list_to_dict(frames, 'frame_id/name')
        # print "frames:", dump_json(frames)
        # print "frames_dict:", dump_json(frames_dict)
        for k, v in frames_dict.items():
            print "frames_dict key:", k

        # interesting. we can do dictionary comprehensions
        # { k:v for k,v in my_dict.items() if 'Peter' in k }

        # how do you parse multiple files
        parse_result = h2o.n0.parse(
            key=k, intermediateResults=DO_INTERMEDIATE_RESULTS)

        frame = parse_result['frames'][0]
        hex_key = frame['frame_id']['name']

        colCount = 9
        rowCount = 380
        # colCount = 11
        # rowCount = 1000000
        start = time.time()
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "Inspect:", hex_key, "took", time.time() - start, "seconds"
        numCols = len(inspect['frames'][0]['columns'])
        numRows = inspect['frames'][0]['rows']
        print "\n" + csvPathname, \
            "    rows:", "{:,}".format(numRows), \
            "    len(columns):", "{:,}".format(numCols)

        # should match # of cols in header or ??
        self.assertEqual(
            numCols, colCount,
            "parse created result with the wrong number of cols %s %s" %
            (numCols, colCount))
        self.assertEqual(numRows, rowCount,
            "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
            (numRows, rowCount))

        verboseprint(hex_key, ":", dump_json(parse_result))
Exemplo n.º 41
0
 def test_b_algo_parameters(self):
     for algo in ['kmeans', 'gbm', 'deeplearning', 'glm', 'word2vec', 'example', 'quantile', 'grep']:
         paramResult = h2o.n0.model_builders(algo=algo)
         self.print_params(paramResult)
         mmResult = h2o.n0.model_metrics(algo=algo)
         print "mmResult", dump_json(mmResult)
Exemplo n.º 42
0
    def test_xl_real(self):
        """Exercise h2o_xl DF/Key/Assign objects against a running cloud.

        Imports iris_wheader.csv, then creates several groups of DF keys and
        assigns lists, scalars and arithmetic/bitwise expressions to them
        using both Assign(...) and the ``<<=`` operator. For three of the
        assignments the AST recorded in h2o_xl.Xbase.lastExecResult is
        compared with an expected string; the rest are followed by isinstance
        checks and runInspect calls. Ends by printing
        Xbase.keyWriteHistoryList and checking the sandbox for errors.
        """
        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexDF = 'v'
        # the parse result isn't examined below
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexDF)

        # uses h2o_xl to do magic with Rapids
        # does this DFInit to rows=0 now?
        a = DF('a1')  # knon_* key
        assert isinstance(a, DF)
        assert isinstance(a, Key)
        assert isinstance(a, Xbase)
        assert not isinstance(a, KeyIndexed)
        assert not isinstance(a, Fcn)
        assert not isinstance(a, Assign)

        # look at our secret stash in the base class. Should see the DFInit?
        print "Does the lastExecResult stash work?", dump_json(
            h2o_xl.Xbase.lastExecResult)
        # this should work if str(DF) returns DF.frame
        inspect = h2o_cmd.runInspect(key=a)
        # print "inspect a", dump_json(inspect)

        b = DF('b1')
        assert isinstance(b, DF)
        inspect = h2o_cmd.runInspect(key=b)
        # print "inspect b", dump_json(inspect)

        # list assignment, once via Assign() and once via <<=
        Assign(a, [0.0, 1.0, 2.0])
        assert isinstance(a, Key)
        b <<= [3.1, 4.1, 5.1]
        assert isinstance(b, Key)
        # FIX! how come I have to create c here first for python
        # see here
        # http://eli.thegreenplace.net/2011/05/15/understanding-unboundlocalerror-in-python
        # is it too much to require c to exist first?
        # c = DF()
        # c <<= a + b

        # this will trigger ok?
        c = DF('c1')
        c <<= [6.2, 7.2, 8.2]
        assert isinstance(c, Key)
        # c[0] <<= a + b
        # Assign(lhs=c[0], rhs=(a + b))
        # whole-key sum: compare the recorded AST with the expected string
        rhs = a + b
        Assign(c, rhs)
        ast = h2o_xl.Xbase.lastExecResult['ast']
        astExpected = "(= !c1 (+ %a1 %b1))"
        assert ast == astExpected, "Actual: %s    Expected: %s" % (ast,
                                                                   astExpected)

        # indexed sum assigned to an indexed key
        rhs = a[0] + b[0]
        Assign(c[0], rhs)
        ast = h2o_xl.Xbase.lastExecResult['ast']
        astExpected = "(= ([ %c1 #0 #0) (+ ([ %a1 #0 #0) ([ %b1 #0 #0)))"
        assert ast == astExpected, "Actual: %s    Expected: %s" % (ast,
                                                                   astExpected)

        # same, with different indices on each side
        Assign(c[1], (a[2] + b[2]))
        ast = h2o_xl.Xbase.lastExecResult['ast']
        astExpected = "(= ([ %c1 #1 #0) (+ ([ %a1 #2 #0) ([ %b1 #2 #0)))"
        assert ast == astExpected, "Actual: %s    Expected: %s" % (ast,
                                                                   astExpected)

        # assert ast = "(= !b1 (is.na (c {#0})))"

        assert isinstance(c, Key), type(c)

        inspect = h2o_cmd.runInspect(key=c)
        # # print "inspect c", dump_json(inspect)

        # DF inits the frame
        # if you just want an existing Key, say existing=True
        a = DF('a2')  # named data frame
        assert isinstance(a, DF)
        b = DF('b2')
        c = DF('c2')
        inspect = h2o_cmd.runInspect(key=c)
        # # print "inspect c", dump_json(inspect)

        # scalar assignment through <<=, then an indexed sum
        a <<= 3
        b <<= 3
        c <<= 3
        c[0] <<= a[0] + b[0]
        assert isinstance(c, Key)
        inspect = h2o_cmd.runInspect(key=c)
        # print "inspect c", dump_json(inspect)

        a = DF('a3')  # named data frame
        b = DF('b3')
        c = DF('c3')
        a <<= 4
        b <<= 4
        c <<= 4

        # subtraction and multiplication of indexed keys
        c[0] <<= a[0] - b[0]
        assert isinstance(c, Key)
        c[0] <<= a[0] * b[0]
        assert isinstance(c, Key)

        a = DF('a4')  # named data frame
        b = DF('b4')
        c = DF('c4')
        a <<= 5
        b <<= 5
        c <<= 5
        c[0] <<= (a[0] - b[0])
        assert isinstance(c, Key)
        inspect = h2o_cmd.runInspect(key=c)
        # print "inspect c", dump_json(inspect)

        # bitwise & and | on indexed keys
        c[0] <<= (a[0] & b[0]) | a[0]
        assert isinstance(c, Key)
        inspect = h2o_cmd.runInspect(key=c)
        # print "inspect c", dump_json(inspect)

        # print "\nDoes the keyWriteHistoryList work?"
        for k in Xbase.keyWriteHistoryList:
            print k

        h2o.check_sandbox_for_errors()
Exemplo n.º 43
0
def build_model(self, algo, training_frame, parameters, destination_key=None,
    timeoutSecs=60, noPoll=False, **kwargs):
    '''
    Build a model on the h2o cluster using the given algorithm, training
    Frame and model parameters.

    algo: name of the model builder; must be a key of the 'model_builders'
        dict returned by self.model_builders().
    training_frame: key name of the training frame; checked against the
        result of self.frames(key=training_frame).
    parameters: dict of model parameters, posted to the ModelBuilders endpoint.
        NOTE: it is updated in place with 'training_frame' (and
        'destination_key' when one is given).
    destination_key: optional key name for the resulting model.
    timeoutSecs: timeout passed to the REST calls and to the job polling.
    noPoll: if True, return the initial POST response without polling the job.
    kwargs: accepted but not used here.

    Returns the initial POST response (noPoll, parameter validation error or
    exception_msg cases) or the polled job result, with 'python_elapsed'
    (seconds) added.

    Raises AssertionError when an argument or a REST sanity check fails, and
    Exception when the polled job reports status FAILED or no job result
    comes back.
    '''
    assert algo is not None, '"algo" parameter is null'
    assert training_frame is not None, '"training_frame" parameter is null'
    assert parameters is not None, '"parameters" parameter is null'

    # why always check that the algo is in here?
    model_builders = self.model_builders(timeoutSecs=timeoutSecs)
    assert model_builders is not None, "/ModelBuilders REST call failed"
    assert algo in model_builders['model_builders'], \
        "%s %s" % (algo, list(model_builders['model_builders']))

    # TODO: test this assert, I don't think this is working. . .
    frames = self.frames(key=training_frame)
    assert frames is not None, "/Frames/{0} REST call failed".format(training_frame)

    key_name = frames['frames'][0]['key']['name']
    assert key_name==training_frame, \
        "/Frames/{0} returned Frame {1} rather than Frame {2}".format(training_frame, key_name, training_frame)
    parameters['training_frame'] = training_frame

    if destination_key is not None:
        parameters['destination_key'] = destination_key

    # single-argument print(...) gives the same output under python 2 and also parses under python 3
    print("build_model parameters %s" % (parameters,))
    start = time.time()
    result1 = self.do_json_request('/3/ModelBuilders.json/' + algo, cmd='post',
        timeout=timeoutSecs, postData=parameters)
    # may get overwritten after polling
    elapsed = time.time() - start
    verboseprint("build_model result", dump_json(result1))

    if noPoll:
        result = result1
    elif 'validation_error_count' in result1:
        # parameters validation failure
        # the message used to be printed with its '%s' left unfilled; show the response
        h2p.yellow_print("parameter error in model_builders: %s" % dump_json(result1))
        # TODO: add schema_type and schema_version into all the schemas to make this clean to check
        result = result1
        # don't bother printing a time message
    elif 'exception_msg' in result1:
        h2p.yellow_print("exception msg in model_builders: %s" % result1['exception_msg'])
        result = result1
    else:
        job_result = result1['jobs'][0]
        job_key = job_result['key']['name']
        verboseprint("build_model job_key: " + repr(job_key))

        job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs)
        verboseprint(job_result)

        # report the same elapsed value that is stored in the result below
        elapsed = time.time() - start
        print("ModelBuilders %s end on %s took %s seconds" % (algo, training_frame, elapsed))
        print("%d pct. of timeout" % ((elapsed/timeoutSecs) * 100))

        if not job_result:
            # ? we should always get a job_json result
            raise Exception("build_model didn't get a job_result when it expected one")

        jobs = job_result['jobs'][0]
        description = jobs['description']
        msec = jobs['msec']
        status = jobs['status']
        progress = jobs['progress']

        # can condition this with a parameter if some FAILED are expected by tests.
        if status=='FAILED':
            print(dump_json(job_result))
            raise Exception("Taking exception on build_model job status: %s %s %s %s" % \
                (status, progress, msec, description))

        result = job_result

    verboseprint("result:", result)
    h2o_sandbox.check_sandbox_for_errors()
    result['python_elapsed'] = elapsed
    return result
Exemplo n.º 44
0
def verify_cloud_size(nodeList=None, expectedCloudName=None, expectedLocked=None, verbose=False,
    timeoutSecs=10, ignoreHealth=True):
    '''
    Check that every node in nodeList reports a consistent cloud.

    nodeList: nodes to query with get_cloud(); defaults to h2o_nodes.nodes
        when empty or None.
    expectedCloudName: if truthy, every node must report this cloud_name.
    expectedLocked: if truthy, every node must report this locked value.
    verbose: accepted but not used here.
    timeoutSecs: timeout passed to each get_cloud() call.
    ignoreHealth: if False, missing/false cloud_healthy or an unhealthy node
        entry raises; if True, unhealthy nodes are only printed.

    Returns (sizeStr, consensusStr, expectedSize) where the two strings are the
    comma-joined per-node cloud sizes and consensus values.

    Raises Exception when the cloud is empty, the version doesn't start with
    '0' or differs between nodes, a health check fails (ignoreHealth=False),
    any node reports a cloud size other than len(nodeList), or a cloud
    name / locked value doesn't match (the cloud is torn down first in those
    two cases).
    '''
    if not nodeList: nodeList = h2o_nodes.nodes
    expectedSize = len(nodeList)
    # cloud size and consensus have to reflect a single grab of information from a node.
    cloudStatus = [n.get_cloud(timeoutSecs=timeoutSecs) for n in nodeList]

    cloudSizes = [c['cloud_size'] for c in cloudStatus]
    cloudConsensus = [c['consensus'] for c in cloudStatus]
    cloudName = [c['cloud_name'] for c in cloudStatus]
    cloudLocked = [c['locked'] for c in cloudStatus]
    cloudVersion = [c['version'] for c in cloudStatus]

    # must come before cloudVersion[0] below: with no nodes that lookup raised
    # IndexError and this message was never reached.
    # single-argument print(...) gives the same output under python 2 and also parses under python 3
    if expectedSize == 0 or len(cloudSizes) == 0 or len(cloudConsensus) == 0:
        print("\nexpectedSize: %s" % (expectedSize,))
        print("cloudSizes: %s" % (cloudSizes,))
        print("cloudConsensus: %s" % (cloudConsensus,))
        raise Exception("Nothing in cloud. Can't verify size")

    # all match 0?
    # if "(unknown)" starts appearing in version..go to h2o1 h2o_bc.py/h2o_fc.py/h2o_methods.py and copy allowing.
    expectedVersion = cloudVersion[0]
    # check to see if it's a h2o-dev version? (common problem when mixing h2o1/h2o-dev testing with --usecloud
    # local builds have (unknown) in h2o if you build.sh (instead of make)
    # gradle builds should always be right with version?
    if not expectedVersion.startswith('0'):
        raise Exception("h2o version at node[0] doesn't look like h2o-dev version. (start with 0) %s" %
            expectedVersion)

    for i, v in enumerate(cloudVersion):
        if v != expectedVersion:
            versionStr = (",".join(map(str, cloudVersion)))
            raise Exception("node %s. Inconsistent cloud version. nodeList report version: %s" %
                (i, versionStr))

    if not ignoreHealth:
        for c in cloudStatus:
            if 'cloud_healthy' not in c:
                raise Exception("cloud_healthy missing: %s" % dump_json(c))

        cloudHealthy = [c['cloud_healthy'] for c in cloudStatus]
        if not all(cloudHealthy):
            msg = "Some node reported cloud_healthy not true: %s" % cloudHealthy
            raise Exception(msg)

    # gather up all the node_healthy status too
    for i, c in enumerate(cloudStatus):
        nodesHealthy = [n['healthy'] for n in c['nodes']]
        if not all(nodesHealthy):
            print("node %s cloud status: %s" % (i, dump_json(c)))
            msg = "node %s says some node is not reporting node_healthy: %s" % (c['cloud_name'], nodesHealthy)
            if not ignoreHealth:
                raise Exception(msg)

    consensusStr = (",".join(map(str, cloudConsensus)))
    sizeStr = (",".join(map(str, cloudSizes)))
    for s in cloudSizes:
        if s != expectedSize:
            raise Exception("Inconsistent cloud size. nodeList report size: %s consensus: %s instead of %d." %
               (sizeStr, consensusStr, expectedSize))

    # check that all cloud_names are right
    if expectedCloudName:
        for i, cn in enumerate(cloudName):
            if cn != expectedCloudName:
                print("node %s has the wrong cloud name: %s expectedCloudName: %s." % (i, cn, expectedCloudName))
                # print "node %s cloud status: %s" % (i, dump_json(cloudStatus[i]))
                # tear everyone down, in case of zombies. so we don't have to kill -9 manually
                print("tearing cloud down")
                tear_down_cloud(nodeList=nodeList, sandboxIgnoreErrors=False)
                raise Exception("node %s has the wrong cloud name: %s expectedCloudName: %s" % \
                    (i, cn, expectedCloudName))

    # check that all locked are right
    if expectedLocked:
        for i, cl in enumerate(cloudLocked):
            if cl != expectedLocked:
                print("node %s has the wrong locked: %s expectedLocked: %s." % (i, cl, expectedLocked))
                # print "node %s cloud status: %s" % (i, dump_json(cloudStatus[i]))
                # tear everyone down, in case of zombies. so we don't have to kill -9 manually
                print("tearing cloud down")
                tear_down_cloud(nodeList=nodeList, sandboxIgnoreErrors=False)
                # report the locked value (cl); this used to format the stale/undefined cloud-name variable
                raise Exception("node %s has the wrong locked: %s expectedLocked: %s" % (i, cl, expectedLocked))

    return (sizeStr, consensusStr, expectedSize)