Example #1
File: h2o_rf.py Project: JerryZhong/h2o
def predict_and_compare_csvs(model_key, hex_key, predictHexKey, 
    csvSrcOutputPathname, csvPredictPathname, 
    skipSrcOutputHeader, skipPredictHeader,
    translate=None, y=0):
    # have to slice out col 0 (the output) and feed result to predict
    # cols are 0:784 (1 output plus 784 input features)
    # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30)
    dataKey = "P.hex"
    h2e.exec_expr(execExpr=dataKey+"="+hex_key, timeoutSecs=30) # unneeded but interesting
    if skipSrcOutputHeader:
        print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer"
        print "hack for now, can't chop out col 0 in Exec currently"
        dataKey = hex_key
    else:
        print "No header in dataset, can't chop out cols, since col numbers are used for names"
        dataKey = hex_key

    # +1 col index because R-like
    h2e.exec_expr(execExpr="Z.hex="+hex_key+"[,"+str(y+1)+"]", timeoutSecs=30)

    start = time.time()
    predict = h2o_nodes.nodes[0].generate_predictions(model_key=model_key,
        data_key=hex_key, destination_key=predictHexKey)
    print "generate_predictions end on ", hex_key, " took", time.time() - start, 'seconds'
    check_sandbox_for_errors()
    inspect = h2o_cmd.runInspect(key=predictHexKey)
    h2o_cmd.infoFromInspect(inspect, 'predict.hex')

    h2o_nodes.nodes[0].csv_download(src_key="Z.hex", csvPathname=csvSrcOutputPathname)
    h2o_nodes.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname)
    check_sandbox_for_errors()

    print "Do a check of the original output col against predicted output"
    (rowNum1, originalOutput) = compare_csv_at_one_col(csvSrcOutputPathname,
        msg="Original", colIndex=0, translate=translate, skipHeader=skipSrcOutputHeader)
    (rowNum2, predictOutput)  = compare_csv_at_one_col(csvPredictPathname,
        msg="Predicted", colIndex=0, skipHeader=skipPredictHeader)

    # no header on source
    if ((rowNum1-skipSrcOutputHeader) != (rowNum2-skipPredictHeader)):
        raise Exception("original rowNum1: %s - %d not same as downloaded predict: rowNum2: %s - %d \
            %s" % (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader))

    wrong = 0
    for rowNum,(o,p) in enumerate(zip(originalOutput, predictOutput)):
        # if float(o)!=float(p):
        if str(o)!=str(p):
            if wrong==10:
                print "Not printing any more mismatches\n"
            elif wrong<10:
                msg = "Comparing original output col vs predicted. row %s differs. \
                    original: %s predicted: %s"  % (rowNum, o, p)
                print msg
            wrong += 1

    print "\nTotal wrong:", wrong
    print "Total:", len(originalOutput)
    pctWrong = (100.0 * wrong)/len(originalOutput)
    print "wrong/Total * 100 ", pctWrong
    return pctWrong
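
A minimal usage sketch (not from the source): it assumes a cloud is already built, an RF model was trained on an already-parsed frame, and the harness modules used above (h2e, h2o_nodes, h2o_cmd) are importable. The key names and CSV paths are hypothetical.

# Hypothetical call; the model/frame keys and scratch paths are invented.
pctWrong = predict_and_compare_csvs(
    model_key='rf_model',
    hex_key='covtype.hex',
    predictHexKey='predict.hex',
    csvSrcOutputPathname='/tmp/original_output.csv',
    csvPredictPathname='/tmp/predicted_output.csv',
    skipSrcOutputHeader=1,
    skipPredictHeader=1,
    translate=None,
    y=0)
print "pct. wrong:", pctWrong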
Example #2
def simpleCheckKMeans(self, kmeans, **kwargs):
    warnings = None
    if 'warnings' in kmeans:
        warnings = kmeans['warnings']
        # catch the "Failed to converge" warning for now
        x = re.compile("[Ff]ailed")
        for w in warnings:
            print "\nwarning:", w
            if re.search(x,w): raise Exception(w)

    # Check other things in the json response dictionary 'kmeans' here
    destination_key = kmeans['model']['_key']
    # Exception: rjson error in inspect: Argument 'src_key' error: benign_k.hex:Key is not a Frame

    # can't use inspect on a model key? now?
    kmeansResult = kmeans

    model = kmeansResult['model']
    centers = model["centers"]
    size = model["size"]
    cluster_variances = model["within_cluster_variances"]
    error = model["total_within_SS"]
    iterations = model["iterations"]
    normalized = model["normalized"]
    max_iter = model["max_iter"]

    for i,c in enumerate(centers):
        for n in c:
            if math.isnan(float(n)):
                raise Exception("center", i, "has NaN:", n, "center:", c)

    # shouldn't have any errors
    check_sandbox_for_errors()

    return warnings
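
For reference, a hypothetical skeleton of the KMeans JSON this checker reads (only the fields it touches); every value below is invented, and the call is shown commented out because the function expects to be invoked from a harness test case with a running sandbox.

# Invented response skeleton, listing just the fields simpleCheckKMeans reads.
kmeans_response = {
    'warnings': [],
    'model': {
        '_key': 'kmeans_model.hex',
        'centers': [[0.1, 2.3], [4.5, 6.7]],
        'size': [100, 120],
        'within_cluster_variances': [1.2, 3.4],
        'total_within_SS': 4.6,
        'iterations': 7,
        'normalized': False,
        'max_iter': 100,
    },
}
# warnings = simpleCheckKMeans(self, kmeans_response)   # 'self' is the enclosing test case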
Example #3
    def do_json_request(self, jsonRequest=None, fullUrl=None, timeout=10, params=None, postData=None, returnFast=False,
        cmd='get', extraComment=None, ignoreH2oError=False, noExtraErrorCheck=False, **kwargs):
        # if url param is used, use it as full url. otherwise create from the jsonRequest
        if fullUrl:
            url = fullUrl
        else:
            url = self.url(jsonRequest)

        # remove any params that are 'None'
        # need to copy dictionary, since can't delete while iterating
        if params is not None:
            params2 = params.copy()
            for k in params2:
                if params2[k] is None:
                    del params[k]
            paramsStr = '?' + '&'.join(['%s=%s' % (k, v) for (k, v) in params.items()])
        else:
            paramsStr = ''

        extraComment2 = " " + str(postData)+";" if cmd=='post' else ""
        extraComment2 += extraComment if extraComment else ""

        if len(extraComment2) > 0:
            log('Start ' + url + paramsStr, comment=extraComment2)
        else:
            log('Start ' + url + paramsStr)

        # file get passed thru kwargs here
        if h2o_args.no_timeout:
            timeout = None # infinite
        try:
            if 'post' == cmd:
                # NOTE: for now, since we don't have deserialization from JSON in h2o-dev, we use form-encoded POST.
                # This is temporary.
                # 
                # This following does application/json (aka, posting JSON in the body):
                # r = requests.post(url, timeout=timeout, params=params, data=json.dumps(postData), **kwargs)
                # 
                # This does form-encoded, which doesn't allow POST of nested structures
                r = requests.post(url, timeout=timeout, params=params, data=postData, **kwargs)
            elif 'delete' == cmd:
                r = requests.delete(url, timeout=timeout, params=params, **kwargs)
            elif 'get' == cmd:
                r = requests.get(url, timeout=timeout, params=params, **kwargs)
            else:
                raise ValueError("Unknown HTTP command (expected 'get', 'post' or 'delete'): " + cmd)

        except Exception, e:
            # rethrow the exception after we've checked for stack trace from h2o
            # out of memory errors maybe don't show up right away? so we should wait for h2o
            # to get it out to h2o stdout. We don't want to rely on cloud teardown to check
            # because there's no delay, and we don't want to delay all cloud teardowns by waiting.
            exc_info = sys.exc_info()
            # use this to ignore the initial connection errors during build cloud when h2o is coming up
            if not noExtraErrorCheck: 
                h2p.red_print(
                    "ERROR: got exception on %s to h2o. \nGoing to check sandbox, then rethrow.." % (url + paramsStr))
                time.sleep(2)
                check_sandbox_for_errors(python_test_name=h2o_args.python_test_name);
            raise exc_info[1], None, exc_info[2]
Example #4
    def stabilize(self, test_func, error, timeoutSecs=10, retryDelaySecs=0.5):
        '''Repeatedly test a function waiting for it to return True.

        Arguments:
        test_func      -- A function that will be run repeatedly
        error          -- A function that will be run to produce an error message;
                          it will be called with (node, timeTakenSecs, numberOfRetries)
                    OR
                       -- A string used as the prefix of the failure message;
                          the elapsed seconds and retry count are appended
        timeoutSecs    -- How long in seconds to keep trying before declaring a failure
        retryDelaySecs -- How long to wait between retry attempts
        '''
        start = time.time()
        numberOfRetries = 0
        while h2o_args.no_timeout or (time.time() - start < timeoutSecs):
            if test_func(self, tries=numberOfRetries, timeoutSecs=timeoutSecs):
                break
            time.sleep(retryDelaySecs)
            numberOfRetries += 1
            # hey, check the sandbox if we've been waiting a long time...rather than wait for timeout
            # to find the badness?. can check_sandbox_for_errors at any time
            if ((numberOfRetries % 50) == 0):
                check_sandbox_for_errors(python_test_name=h2o_args.python_test_name)

        else:
            timeTakenSecs = time.time() - start
            if isinstance(error, type('')):
                raise Exception('%s failed after %.2f seconds having retried %d times' % (
                    error, timeTakenSecs, numberOfRetries))
            else:
                msg = error(self, timeTakenSecs, numberOfRetries)
                raise Exception(msg)
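
A usage sketch for stabilize (hypothetical: 'node' stands for any object exposing the method, and the predicate assumes get_cloud() accepts its defaults). It shows both accepted forms of the error argument.

# Illustrative predicate: called as test_func(node, tries=..., timeoutSecs=...), returns True when done.
def cloud_is_up(n, tries=None, timeoutSecs=None):
    return n.get_cloud()['cloud_size'] == 3

# string form: used as the prefix of the timeout message
node.stabilize(cloud_is_up, error='3-node cloud build', timeoutSecs=60, retryDelaySecs=1)

# callable form: builds the whole message from (node, timeTakenSecs, numberOfRetries)
def build_error(n, timeTakenSecs, numberOfRetries):
    return 'cloud build failed after %.2f secs and %d retries' % (timeTakenSecs, numberOfRetries)

node.stabilize(cloud_is_up, error=build_error, timeoutSecs=60, retryDelaySecs=1)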
Example #5
def predict_and_compare_csvs(model_key, hex_key, predictHexKey, 
    csvSrcOutputPathname, csvPredictPathname, 
    skipSrcOutputHeader, skipPredictHeader,
    translate=None, y=0):
    # have to slice out col 0 (the output) and feed result to predict
    # cols are 0:784 (1 output plus 784 input features)
    # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30)
    dataKey = "P.hex"
    h2e.exec_expr(execExpr=dataKey+"="+hex_key, timeoutSecs=30) # unneeded but interesting
    if skipSrcOutputHeader:
        print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer"
        print "hack for now, can't chop out col 0 in Exec currently"
        dataKey = hex_key
    else:
        print "No header in dataset, can't chop out cols, since col numbers are used for names"
        dataKey = hex_key

    # +1 col index because R-like
    h2e.exec_expr(execExpr="Z.hex="+hex_key+"[,"+str(y+1)+"]", timeoutSecs=30)

    start = time.time()
    predict = h2o_nodes.nodes[0].generate_predictions(model_key=model_key,
        data_key=hex_key, destination_key=predictHexKey)
    print "generate_predictions end on ", hex_key, " took", time.time() - start, 'seconds'
    check_sandbox_for_errors()
    inspect = h2o_cmd.runInspect(key=predictHexKey)
    h2o_cmd.infoFromInspect(inspect, 'predict.hex')

    h2o_nodes.nodes[0].csv_download(src_key="Z.hex", csvPathname=csvSrcOutputPathname)
    h2o_nodes.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname)
    check_sandbox_for_errors()

    print "Do a check of the original output col against predicted output"
    (rowNum1, originalOutput) = compare_csv_at_one_col(csvSrcOutputPathname,
        msg="Original", colIndex=0, translate=translate, skipHeader=skipSrcOutputHeader)
    (rowNum2, predictOutput)  = compare_csv_at_one_col(csvPredictPathname,
        msg="Predicted", colIndex=0, skipHeader=skipPredictHeader)

    # no header on source
    if ((rowNum1-skipSrcOutputHeader) != (rowNum2-skipPredictHeader)):
        raise Exception("original rowNum1: %s - %d not same as downloaded predict: rowNum2: %s - %d \
            %s" % (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader))

    wrong = 0
    for rowNum,(o,p) in enumerate(zip(originalOutput, predictOutput)):
        # if float(o)!=float(p):
        if str(o)!=str(p):
            if wrong==10:
                print "Not printing any more mismatches\n"
            elif wrong<10:
                msg = "Comparing original output col vs predicted. row %s differs. \
                    original: %s predicted: %s"  % (rowNum, o, p)
                print msg
            wrong += 1

    print "\nTotal wrong:", wrong
    print "Total:", len(originalOutput)
    pctWrong = (100.0 * wrong)/len(originalOutput)
    print "wrong/Total * 100 ", pctWrong
    return pctWrong
Example #6
File: h2o_glm.py Project: JMR-b/h2o-dev
def simpleCheckGLM(self, model, parameters, labelList, labelListUsed, allowFailWarning=False, allowZeroCoeff=False,
    prettyPrint=False, noPrint=False, maxExpectedIterations=None, doNormalized=False):

    warnings = ''

    intercept = model.global_beta[-1]
    interceptName = model.coefficient_names[-1]

    coeffs = model.global_beta[:-1]
    coeffs_names = model.coefficient_names[:-1]

    assert len(coeffs) == (len(model.coefficient_names)-1)
    assert len(coeffs) == len(labelListUsed), "%s %s" % (coeffs, labelListUsed)
    
    # labelList still has the response column?
    # ignored columns aren't in model.names, but output response is.
    # labelListUsed has the response col removed so add 1
    assert len(model.names) == (len(labelListUsed)+1), "%s %s" % (model.names, labelList)
    assert model.threshold!=0

    print "len(coeffs)", len(coeffs)
    print  "coeffs:", coeffs

    # last one is intercept
    if interceptName != "Intercept" or abs(intercept)<1e-26:
        raise Exception("'Intercept' should be last in coefficient_names and global_beta %s %s" % (interceptName, intercept))

    y = parameters['response_column']

    cString = "\n"
    for i,c in enumerate(coeffs_names):
        cString += "%s: %.5e   " % (coeffs_names[i], coeffs[i])

    print cString
    print "\nH2O intercept:\t\t%.5e" % intercept
    print "\nTotal # of coeffs:", len(coeffs_names)

    # intercept is buried in there too
    absIntercept = abs(float(intercept))
    self.assertGreater(absIntercept, 1e-26, (
        "abs. value of GLM coeffs['Intercept'] is " +
        str(absIntercept) + ", not >= 1e-26 for Intercept" + "\n" +
        "parameters:" + dump_json(parameters)
        ))

    if (not allowZeroCoeff) and (len(coeffs)>1):
        s = 0.0
        for c in coeffs:
            s += abs(float(c))

        self.assertGreater(s, 1e-26, (
            "sum of abs. value of GLM coeffs/intercept is " + str(s) + ", not >= 1e-26\n" +
            "parameters:" + dump_json(parameters)
            ))

    # shouldn't have any errors
    check_sandbox_for_errors()

    return (warnings, coeffs, intercept)
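
To make the checks above concrete, here is an invented stand-in model with just the attributes simpleCheckGLM touches; the column names and coefficients are made up, and the call is commented out because it needs the harness test case and sandbox.

# Invented stand-in objects (illustration only).
class FakeGLMModel(object):
    coefficient_names = ['AGE', 'RACE', 'Intercept']
    global_beta = [0.12, -0.45, 1.7]
    names = ['AGE', 'RACE', 'CAPSULE']    # response column included
    threshold = 0.5

parameters = {'response_column': 'CAPSULE'}
labelList = ['AGE', 'RACE', 'CAPSULE']
labelListUsed = ['AGE', 'RACE']
# (warnings, coeffs, intercept) = simpleCheckGLM(self, FakeGLMModel(), parameters, labelList, labelListUsed)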
Example #7
    def test(n, tries=None, timeoutSecs=14.0):
        c = n.get_cloud(noExtraErrorCheck=noExtraErrorCheck, timeoutSecs=timeoutSecs)

        # FIX! unique to h2o-dev for now, because of the port reuse problems (TCP_WAIT) compared to h2o
        # flag them early rather than after timeout
        check_sandbox_for_errors(python_test_name=h2o_args.python_test_name)

        # don't want to check everything. But this will check that the keys are returned!
        consensus = c['consensus']
        locked = c['locked']
        cloud_size = c['cloud_size']
        cloud_name = c['cloud_name']

        if 'nodes' not in c:
            emsg = "\nH2O didn't include a list of nodes in get_cloud response after initial cloud build"
            raise Exception(emsg)

        # only print it when you get consensus
        if cloud_size != node_count:
            print "\nNodes in cloud while building:"
            for i,ci in enumerate(c['nodes']):
                # 'h2o' disappeared?
                if 'h2o' not in ci:
                    print "ci:", dump_json(ci)
                    # apparently this can happen in cases where I didn't join a cloud because 
                    # of a different md5 version. We'll eventually exception out?
                    # raise Exception("What happened to the 'h2o' ci dict entry?, not there")
                else:
                    print "node %s" % i, ci['h2o']
                    ### print "node %s" % i, ci['h2o']['node']

        if cloud_size > node_count:
            emsg = (
                "\n\nERROR: cloud_size: %d reported via json is bigger than we expect: %d" % \
                    (cloud_size, node_count) +
                "\nLikely have zombie(s) with the same cloud name on the network." +
                "\nLook at the cloud IP's in 'grep Paxos sandbox/*stdout*' for some IP's you didn't expect." +
                "\n\nYou probably don't have to do anything, as the cloud shutdown in this test should" +
                "\nhave sent a Shutdown.json to all in that cloud (you'll see a kill -2 in the *stdout*)." +
                "\nIf you try again, and it still fails, go to those IPs and kill the zombie h2o's." +
                "\nIf you think you really have an intermittent cloud build, report it." +
                "\n" +
                "\nbuilding cloud size of 2 with 127.0.0.1 may temporarily report 3 incorrectly," +
                "\nwith no zombie?"
            )
            for ci in c['nodes']:
                emsg += "\n" + ci['h2o']['node']
            raise Exception(emsg)

        a = (cloud_size == node_count) and consensus
        if a:
            verboseprint("\tLocked won't happen until after keys are written")
            verboseprint("\nNodes in final cloud:")
            for ci in c['nodes']:
                verboseprint("ci", ci)
                # this isn't in there all the time?
                # verboseprint(ci['h2o']['node'])

        return a
Example #8
File: h2o_bc.py Project: RogerKamena/h2o-3
    def test(n, tries=None, timeoutSecs=14.0):
        c = n.get_cloud(noExtraErrorCheck=noExtraErrorCheck, timeoutSecs=timeoutSecs)

        # FIX! unique to h2o-dev for now, because of the port reuse problems (TCP_WAIT) compared to h2o
        # flag them early rather than after timeout
        check_sandbox_for_errors(python_test_name=h2o_args.python_test_name)

        # don't want to check everything. But this will check that the keys are returned!
        consensus = c["consensus"]
        locked = c["locked"]
        cloud_size = c["cloud_size"]
        cloud_name = c["cloud_name"]

        if "nodes" not in c:
            emsg = "\nH2O didn't include a list of nodes in get_cloud response after initial cloud build"
            raise Exception(emsg)

        # only print it when you get consensus
        if cloud_size != node_count:
            print "\nNodes in cloud while building:"
            for i, ci in enumerate(c["nodes"]):
                # 'h2o' disappeared?
                if "h2o" not in ci:
                    print "ci:", dump_json(ci)
                    # apparently this can happen in cases where I didn't join a cloud because
                    # of a different md5 version. We'll eventually exception out?
                    # raise Exception("What happened to the 'h2o' ci dict entry?, not there")
                else:
                    print "node %s" % i, ci["h2o"]
                    ### print "node %s" % i, ci['h2o']['node']

        if cloud_size > node_count:
            emsg = (
                "\n\nERROR: cloud_size: %d reported via json is bigger than we expect: %d" % (cloud_size, node_count)
                + "\nLikely have zombie(s) with the same cloud name on the network."
                + "\nLook at the cloud IP's in 'grep Paxos sandbox/*stdout*' for some IP's you didn't expect."
                + "\n\nYou probably don't have to do anything, as the cloud shutdown in this test should"
                + "\nhave sent a Shutdown.json to all in that cloud (you'll see a kill -2 in the *stdout*)."
                + "\nIf you try again, and it still fails, go to those IPs and kill the zombie h2o's."
                + "\nIf you think you really have an intermittent cloud build, report it."
                + "\n"
                + "\nbuilding cloud size of 2 with 127.0.0.1 may temporarily report 3 incorrectly,"
                + "\nwith no zombie?"
            )
            for ci in c["nodes"]:
                emsg += "\n" + ci["h2o"]["node"]
            raise Exception(emsg)

        a = (cloud_size == node_count) and consensus
        if a:
            verboseprint("\tLocked won't happen until after keys are written")
            verboseprint("\nNodes in final cloud:")
            for ci in c["nodes"]:
                verboseprint("ci", ci)
                # this isn't in there all the time?
                # verboseprint(ci['h2o']['node'])

        return a
Example #9
File: h2o_bc.py Project: RogerKamena/h2o-3
def tear_down_cloud(nodeList=None, sandboxIgnoreErrors=False, force=False):
    if h2o_args.sleep_at_tear_down:
        print "Opening browser to cloud, and sleeping for 3600 secs, before cloud teardown (for debug)"
        import h2o_browse as h2b

        h2b.browseTheCloud()
        sleep(3600)

    if not nodeList:
        nodeList = h2o_nodes.nodes

    # this could fail too. Should this be set by -uc/--usecloud? or command line argument
    if nodeList and nodeList[0].delete_keys_at_teardown:
        start = time.time()
        h2i.delete_keys_at_all_nodes(timeoutSecs=300)
        elapsed = time.time() - start
        print "delete_keys_at_all_nodes(): took", elapsed, "secs"

    # could the nodeList still be empty in some exception cases? Assume not for now

    # FIX! don't send shutdown if we're using an existing cloud
    # also, copy the "delete keys at teardown" from testdir_release
    # Assume there's a last "test" that's run to shutdown the cloud

    # don't tear down with -ccj either
    # FIX! what about usecloud or cloud_cloud_json params from build_cloud time?
    if force or not (h2o_args.usecloud or h2o_args.clone_cloud_json):
        try:
            # update: send a shutdown to all nodes.
            # h2o maybe doesn't propagate well if sent to one node
            # the api watchdog shouldn't complain about this?
            # just send one?

            # for n in nodeList:
            #     n.shutdown_all()
            h2o_nodes.nodes[0].shutdown_all()
        except:
            pass

        # ah subtle. we might get excepts in issuing the shutdown, don't abort out
        # of trying the process kills if we get any shutdown exception (remember we go to all nodes)
        # so we might? nodes are shutting down?
        # FIX! should we wait a bit for a clean shutdown, before we process kill?
        # It can take more than 1 sec though.
        try:
            time.sleep(2)
            for n in nodeList:
                n.terminate()
                verboseprint("tear_down_cloud n:", n)
        except:
            pass

    check_sandbox_for_errors(sandboxIgnoreErrors=sandboxIgnoreErrors, python_test_name=h2o_args.python_test_name)
    # get rid of all those pesky line marker files. Unneeded now
    clean_sandbox_doneToLine()
    nodeList[:] = []
    h2o_nodes.nodes = []
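
A hypothetical teardown hook showing where tear_down_cloud is typically called from; the cloud-build side is omitted since the exact build call varies between harness versions.

import unittest

class TestCloudTeardown(unittest.TestCase):
    # assumes setUpClass (not shown) built the cloud with the harness's usual build call
    @classmethod
    def tearDownClass(cls):
        # force=True would also tear down a cloud attached to with -uc / -ccj
        tear_down_cloud(sandboxIgnoreErrors=False, force=False)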
Example #10
    def do_json_request(self, jsonRequest=None, fullUrl=None, timeout=10, params=None, returnFast=False,
        cmd='get', extraComment=None, ignoreH2oError=False, noExtraErrorCheck=False, **kwargs):
        # if url param is used, use it as full url. otherwise create from the jsonRequest
        if fullUrl:
            url = fullUrl
        else:
            url = self.url(jsonRequest)

        # remove any params that are 'None'
        # need to copy dictionary, since can't delete while iterating
        if params is not None:
            params2 = params.copy()
            for k in params2:
                if params2[k] is None:
                    del params[k]
            paramsStr = '?' + '&'.join(['%s=%s' % (k, v) for (k, v) in params.items()])
        else:
            paramsStr = ''

        if extraComment:
            log('Start ' + url + paramsStr, comment=extraComment)
        else:
            log('Start ' + url + paramsStr)

        log_rest("")
        log_rest("----------------------------------------------------------------------\n")
        if extraComment:
            log_rest("# Extra comment info about this request: " + extraComment)
        if cmd == 'get':
            log_rest("GET")
        else:
            log_rest("POST")
        log_rest(url + paramsStr)

        # file get passed thru kwargs here
        try:
            if cmd == 'post':
                r = requests.post(url, timeout=timeout, params=params, **kwargs)
            else:
                r = requests.get(url, timeout=timeout, params=params, **kwargs)

        except Exception, e:
            # rethrow the exception after we've checked for stack trace from h2o
            # out of memory errors maybe don't show up right away? so we should wait for h2o
            # to get it out to h2o stdout. We don't want to rely on cloud teardown to check
            # because there's no delay, and we don't want to delay all cloud teardowns by waiting.
            # (this is new/experimental)
            exc_info = sys.exc_info()
            # use this to ignore the initial connection errors during build cloud when h2o is coming up
            if not noExtraErrorCheck: 
                h2p.red_print(
                    "ERROR: got exception on %s to h2o. \nGoing to check sandbox, then rethrow.." % (url + paramsStr))
                time.sleep(2)
                check_sandbox_for_errors(python_test_name=h2o_args.python_test_name);
            log_rest("")
            log_rest("EXCEPTION CAUGHT DOING REQUEST: " + str(e.message))
            raise exc_info[1], None, exc_info[2]
Example #11
def import_parse(node=None,
                 schema='local',
                 bucket=None,
                 path=None,
                 src_key=None,
                 hex_key=None,
                 timeoutSecs=30,
                 retryDelaySecs=0.1,
                 initialDelaySecs=0,
                 pollTimeoutSecs=180,
                 noise=None,
                 benchmarkLogging=None,
                 noPoll=False,
                 doSummary=True,
                 noPrint=True,
                 importParentDir=True,
                 **kwargs):

    if not node: node = h2o_nodes.nodes[0]

    (importResult,
     importPattern) = import_only(node, schema, bucket, path, timeoutSecs,
                                  retryDelaySecs, initialDelaySecs,
                                  pollTimeoutSecs, noise, benchmarkLogging,
                                  noPoll, doSummary, src_key, noPrint,
                                  importParentDir, **kwargs)

    verboseprint("importPattern:", importPattern)
    verboseprint("importResult", dump_json(importResult))

    parseResult = parse_only(node, importPattern, hex_key, timeoutSecs,
                             retryDelaySecs, initialDelaySecs, pollTimeoutSecs,
                             noise, benchmarkLogging, noPoll, **kwargs)
    verboseprint("parseResult:", dump_json(parseResult))

    # do SummaryPage here too, just to get some coverage
    # only if not noPoll. otherwise parse isn't done
    if doSummary and not noPoll:
        # if parse blows up, we want error isolation ..i.e. find stack traces here, rather than the next guy blowing up
        check_sandbox_for_errors()
        inspect = node.inspect(parseResult['destination_key'],
                               timeoutSecs=timeoutSecs)
        numRows = inspect['numRows']
        numCols = inspect['numCols']
        # we pass numCols, for detecting whether the na cnt means a col is all NAs, (for ignoring min/max/mean/sigma)
        node.summary_page(parseResult['destination_key'],
                          timeoutSecs=timeoutSecs,
                          noPrint=noPrint,
                          numRows=numRows,
                          numCols=numCols)
        # for now, don't worry about error isolating summary
    else:
        # isolate a parse from the next thing
        check_sandbox_for_errors()

    return parseResult
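
A hypothetical call illustrating the usual flow: import a file, parse it into a named hex key, then optionally run the summary. The bucket and path are invented.

# Invented bucket/path; hex_key names the parsed frame for later lookups.
parseResult = import_parse(bucket='smalldata', path='iris/iris.csv',
    schema='local', hex_key='iris.hex', timeoutSecs=60, doSummary=True)
print "parsed key:", parseResult['destination_key']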
Example #12
def tear_down_cloud(nodeList=None, sandboxIgnoreErrors=False, force=False):
    if h2o_args.sleep_at_tear_down:
        print "Opening browser to cloud, and sleeping for 3600 secs, before cloud teardown (for debug)"
        import h2o_browse as h2b
        h2b.browseTheCloud()
        sleep(3600)

    if not nodeList: nodeList = h2o_nodes.nodes

    # this could fail too. Should this be set by -uc/--usecloud? or command line argument
    if nodeList and nodeList[0].delete_keys_at_teardown:
        start = time.time()
        h2i.delete_keys_at_all_nodes(timeoutSecs=300)
        elapsed = time.time() - start
        print "delete_keys_at_all_nodes(): took", elapsed, "secs"

    # could the nodeList still be empty in some exception cases? Assume not for now

    # FIX! don't send shutdown if we're using an existing cloud
    # also, copy the "delete keys at teardown" from testdir_release
    # Assume there's a last "test" that's run to shutdown the cloud

    # don't tear down with -ccj either
    # FIX! what about usecloud or cloud_cloud_json params from build_cloud time?
    if force or not (h2o_args.usecloud or h2o_args.clone_cloud_json):
        try:
            # update: send a shutdown to all nodes. 
            # h2o maybe doesn't propagate well if sent to one node
            # the api watchdog shouldn't complain about this?
            # just send one?

            # for n in nodeList:
            #     n.shutdown_all()
            h2o_nodes.nodes[0].shutdown_all()
        except:
            pass

        # ah subtle. we might get excepts in issuing the shutdown, don't abort out
        # of trying the process kills if we get any shutdown exception (remember we go to all nodes)
        # so we might? nodes are shutting down?
        # FIX! should we wait a bit for a clean shutdown, before we process kill?
        # It can take more than 1 sec though.
        try:
            time.sleep(2)
            for n in nodeList:
                n.terminate()
                verboseprint("tear_down_cloud n:", n)
        except:
            pass

    check_sandbox_for_errors(sandboxIgnoreErrors=sandboxIgnoreErrors, python_test_name=h2o_args.python_test_name)
    # get rid of all those pesky line marker files. Unneeded now
    clean_sandbox_doneToLine()
    nodeList[:] = []
    h2o_nodes.nodes = []
Example #13
File: h2o_nn.py Project: 100star/h2o
def checkScoreResult(self, result, expectedErr, relTol, **kwargs):
    print "Expected score error: " + format(expectedErr)
    print "Actual   score error: " + format(result['classification_error'])
    if result['classification_error'] != expectedErr and abs((expectedErr - result['classification_error'])/expectedErr) > relTol:
        raise Exception("Scored classification error of %s is not within %s %% relative error of %s" % (result['classification_error'], float(relTol)*100, expectedErr))

    warnings = None

    # shouldn't have any errors
    check_sandbox_for_errors()

    return (warnings)
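
The pass/fail rule above accepts an exact match or any result within relTol relative error of the expected value. A standalone sketch of the same arithmetic (illustration only, not harness code):

# Standalone illustration of the relative-tolerance rule used above.
def within_rel_tol(actual, expected, relTol):
    if actual == expected:
        return True
    return abs((expected - actual) / expected) <= relTol

print within_rel_tol(0.105, 0.10, relTol=0.1)   # True:  5% relative error
print within_rel_tol(0.125, 0.10, relTol=0.1)   # False: 25% relative error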
Example #14
def import_parse(node=None, schema='local', bucket=None, path=None,
    src_key=None, hex_key=None, 
    timeoutSecs=30, retryDelaySecs=0.1, initialDelaySecs=0, pollTimeoutSecs=180, noise=None,
    benchmarkLogging=None, noPoll=False, doSummary=True, noPrint=True, 
    importParentDir=True, **kwargs):

    # FIX! hack all put to local, since h2o-dev doesn't have put yet?
    # multi-machine put will fail as a result.
    # if schema=='put':
    #    h2p.yellow_print("WARNING: hacking schema='put' to 'local'..h2o-dev doesn't support upload." +  
    #        "\nMeans multi-machine with 'put' will fail")
    #    schema = 'local'

    if not node: node = h2o_nodes.nodes[0]
    (importResult, importPattern) = import_only(node, schema, bucket, path,
        timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise, 
        benchmarkLogging, noPoll, doSummary, src_key, noPrint, importParentDir, **kwargs)

    verboseprint("importPattern:", importPattern)
    verboseprint("importResult", dump_json(importResult))

    assert len(importResult['keys']) >= 1, "No keys imported, maybe bad bucket %s or path %s" % (bucket, path)
    # print "importResult:", importResult

    # get rid of parse timing in tests now
    start = time.time()
    parseResult = parse_only(node, importPattern, hex_key, importResult['keys'],
        timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise, 
        benchmarkLogging, noPoll, **kwargs)
    elapsed = time.time() - start
    print importPattern, "parsed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs), "\n"
    parseResult['python_elapsed'] = elapsed

    verboseprint("parseResult:", dump_json(parseResult))

    # do SummaryPage here too, just to get some coverage
    # only if not noPoll. otherwise parse isn't done
    if doSummary and not noPoll:
        # if parse blows up, we want error isolation ..i.e. find stack traces here, rather than the next guy blowing up
        check_sandbox_for_errors()
        print "WARNING: not doing inspect/summary for now after parse"
        ## inspect = node.inspect(parseResult['destination_key'], timeoutSecs=timeoutSecs)
        ## numRows = inspect['numRows']
        ## numCols = inspect['numCols']
        # we pass numCols, for detecting whether the na cnt means a col is all NAs, (for ignoring min/max/mean/sigma)
        ## node.summary_page(parseResult['destination_key'], timeoutSecs=timeoutSecs, noPrint=noPrint, numRows=numRows, numCols=numCols)
        # for now, don't worry about error isolating summary 
    else:
        # isolate a parse from the next thing
        check_sandbox_for_errors()

    return parseResult
Example #15
def checkScoreResult(self, result, expectedErr, relTol, **kwargs):
    print "Expected score error: " + format(expectedErr)
    print "Actual   score error: " + format(result['classification_error'])
    if result['classification_error'] != expectedErr and abs(
        (expectedErr - result['classification_error']) / expectedErr) > relTol:
        raise Exception(
            "Scored classification error of %s is not within %s %% relative error of %s"
            %
            (result['classification_error'], float(relTol) * 100, expectedErr))

    warnings = None

    # shouldn't have any errors
    check_sandbox_for_errors()

    return (warnings)
Example #16
def checkH2OLogs(timeoutSecs=3, expectedMinLines=12, suffix="-1-trace"):
    # download logs from node 0 (this will overwrite)
    h2o_nodes.nodes[0].log_download(timeoutSecs=timeoutSecs)

    # I guess we really don't need to get the list of nodes names from get_cloud any more
    # h2o_172.16.2.222_54321-1-trace.log
    # h2o_172.16.2.222_54321-2-debug.log
    # h2o_172.16.2.222_54321-3-info.log
    # h2o_172.16.2.222_54321-4-warn.log
    # h2o_172.16.2.222_54321-5-error.log
    # h2o_172.16.2.222_54321-6-fatal.log
    def checkit(suffix, expectedMinLines):
        logNameList = [
            "h2o_" + str(n.http_addr) + "_" + str(n.port) + suffix + ".log"
            for n in h2o_nodes.nodes
        ]
        lineCountList = []
        for logName in logNameList:
            lineCount = h2o_util.file_line_count(get_sandbox_name() + "/" +
                                                 logName)
            print logName, "lineCount:", lineCount
            lineCountList.append(lineCount)

        print logNameList

        if len(h2o_nodes.nodes) != len(logNameList):
            raise Exception("Should be %d logs, are %d" % len(h2o_nodes.nodes),
                            len(logNameList))
        # line counts seem to vary..check for "too small"
        # variance in polling (cloud building and status)?
        for i, l in enumerate(lineCountList):
            if l < expectedMinLines:
                raise Exception("node %d %s log is too small" %
                                (i, logNameList[i]))
        return (logNameList, lineCountList)

    # just assume the main ones meet the min requirement, and the error ones are min 0
    (logNameList, lineCountList) = checkit("-1-trace", expectedMinLines)
    checkit("-2-debug", expectedMinLines)
    checkit("-3-info", expectedMinLines)
    checkit("-4-warn", 0)
    checkit("-5-error", 0)
    checkit("-6-fatal", 0)
    # now that all the logs are there
    check_sandbox_for_errors()
    return (logNameList, lineCountList)
Example #17
File: h2o_nn.py Project: 100star/h2o
def checkLastValidationError(self, model, rows, expectedErr, relTol, **kwargs):
    errsLast = model['validation_errors'][-1] # last scoring result
    verboseprint("Deep Learning 'Last scoring on test set:'", dump_json(errsLast))
    expectedSamples = rows * kwargs['epochs']
    print 'Expecting ' + format(expectedSamples) + ' training samples'
    if errsLast['training_samples'] != expectedSamples:
        raise Exception("Number of training samples should be equal to %s" % expectedSamples)

    print "Expected test set error: " + format(expectedErr)
    print "Actual   test set error: " + format(errsLast['classification'])
    if errsLast['classification'] != expectedErr and abs((expectedErr - errsLast['classification'])/expectedErr) > relTol:
        raise Exception("Test set classification error of %s is not within %s %% relative error of %s" % (errsLast['classification'], float(relTol)*100, expectedErr))

    warnings = None

    # shouldn't have any errors
    check_sandbox_for_errors()

    return (warnings)
Example #18
def do_json_request(addr=None,
                    port=None,
                    jsonRequest=None,
                    params=None,
                    timeout=7,
                    **kwargs):
    if params is not None:
        paramsStr = '?' + '&'.join(
            ['%s=%s' % (k, v) for (k, v) in params.items()])
    else:
        paramsStr = ''

    url = create_url(addr, port, jsonRequest)
    print 'Start ' + url + paramsStr
    try:
        r = requests.get(url, timeout=timeout, params=params, **kwargs)
        # the requests json decoder might fail if we didn't get something good
        rjson = r.json()
        emsg = "ERROR: Probing claimed existing cloud with Cloud.json"
        if not isinstance(rjson, (list, dict)):
            # probably good
            raise Exception(emsg + "h2o json responses should always be lists or dicts. Got %s" %\
                dump_json(rjson))
        elif r.status_code != requests.codes.ok:
            rjson = None
            raise Exception(emsg +
                            "Couldn't decode. Status: %s" % r.status_code)

    except requests.ConnectionError, e:
        rjson = None
        emsg = "ERROR: json got ConnectionError or other exception"
        # Rethrow the exception after we've checked for stack trace from h2o.
        # Out of memory errors maybe don't show up right away?
        # so we should wait for h2o to get it out to h2o stdout.
        # Don't want to rely on cloud teardown to check because there's no delay,
        # and we don't want to delay all cloud teardowns by waiting.
        exc_info = sys.exc_info()
        # we don't expect to have connection errors, so any exception is a bad thing.
        h2p.red_print("%s\n %s\n %s\nGoing to check sandbox, then rethrow.." %
                      (emsg, exc_info, url + paramsStr))
        time.sleep(2)
        check_sandbox_for_errors()
        raise exc_info[1], None, exc_info[2]
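
A hypothetical probe of a possibly pre-existing cloud; the address and port are invented, and 'Cloud.json' matches the probe named in the error message above. The excerpt ends inside the exception handler, so only the request itself is shown.

# Invented address/port for a local single-node cloud.
do_json_request(addr='127.0.0.1', port=54321, jsonRequest='Cloud.json', timeout=5)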
Example #19
File: h2o_log.py Project: 100star/h2o
def checkH2OLogs(timeoutSecs=3, expectedMinLines=12, suffix="-1-trace"):
    # download logs from node 0 (this will overwrite)
    h2o_nodes.nodes[0].log_download(timeoutSecs=timeoutSecs)

    # I guess we really don't need to get the list of nodes names from get_cloud any more
    # h2o_172.16.2.222_54321-1-trace.log
    # h2o_172.16.2.222_54321-2-debug.log
    # h2o_172.16.2.222_54321-3-info.log
    # h2o_172.16.2.222_54321-4-warn.log
    # h2o_172.16.2.222_54321-5-error.log
    # h2o_172.16.2.222_54321-6-fatal.log
    def checkit(suffix, expectedMinLines):
        logNameList = ["h2o_" + str(n.http_addr) + "_" + str(n.port) + suffix + ".log" for n in h2o_nodes.nodes]
        lineCountList = []
        for logName in logNameList:
            lineCount = h2o_util.file_line_count(get_sandbox_name() + "/" + logName)
            print logName, "lineCount:", lineCount
            lineCountList.append(lineCount)

        print logNameList

        if len(h2o_nodes.nodes) != len(logNameList):
            raise Exception("Should be %d logs, are %d" % len(h2o_nodes.nodes), len(logNameList))
        # line counts seem to vary..check for "too small"
        # variance in polling (cloud building and status)?
        for i, l in enumerate(lineCountList):
            if l < expectedMinLines:
                raise Exception("node %d %s log is too small" % (i, logNameList[i]))
        return (logNameList, lineCountList)

    # just assume the main ones meet the min requirement, and the error ones are min 0
    (logNameList, lineCountList) = checkit("-1-trace", expectedMinLines)
    checkit("-2-debug", expectedMinLines)
    checkit("-3-info", expectedMinLines)
    checkit("-4-warn", 0)
    checkit("-5-error", 0)
    checkit("-6-fatal", 0)
    # now that all the logs are there
    check_sandbox_for_errors()
    return (logNameList, lineCountList)
Example #20
def import_parse(node=None, schema='local', bucket=None, path=None,
    src_key=None, hex_key=None, 
    timeoutSecs=30, retryDelaySecs=0.1, initialDelaySecs=0, pollTimeoutSecs=180, noise=None,
    benchmarkLogging=None, noPoll=False, doSummary=True, noPrint=True, 
    importParentDir=True, **kwargs):

    if not node: node = h2o_nodes.nodes[0]

    (importResult, importPattern) = import_only(node, schema, bucket, path,
        timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise, 
        benchmarkLogging, noPoll, doSummary, src_key, noPrint, importParentDir, **kwargs)

    verboseprint("importPattern:", importPattern)
    verboseprint("importResult", dump_json(importResult))

    parseResult = parse_only(node, importPattern, hex_key,
        timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise, 
        benchmarkLogging, noPoll, **kwargs)
    verboseprint("parseResult:", dump_json(parseResult))

    # do SummaryPage here too, just to get some coverage
    # only if not noPoll. otherwise parse isn't done
    if doSummary and not noPoll:
        # if parse blows up, we want error isolation ..i.e. find stack traces here, rather than the next guy blowing up
        check_sandbox_for_errors()
        print "WARNING: not doing inspect/summary for now after parse"
        ## inspect = node.inspect(parseResult['destination_key'], timeoutSecs=timeoutSecs)
        ## numRows = inspect['numRows']
        ## numCols = inspect['numCols']
        # we pass numCols, for detecting whether the na cnt means a col is all NAs, (for ignoring min/max/mean/sigma)
        ## node.summary_page(parseResult['destination_key'], timeoutSecs=timeoutSecs, noPrint=noPrint, numRows=numRows, numCols=numCols)
        # for now, don't worry about error isolating summary 
    else:
        # isolate a parse from the next thing
        check_sandbox_for_errors()

    return parseResult
Example #21
def do_json_request(addr=None, port=None,  jsonRequest=None, params=None, timeout=7, **kwargs):
    if params is not None:
        paramsStr =  '?' + '&'.join(['%s=%s' % (k,v) for (k,v) in params.items()])
    else:
        paramsStr = ''

    url = create_url(addr, port, jsonRequest)
    print 'Start ' + url + paramsStr
    try:
        r = requests.get(url, timeout=timeout, params=params, **kwargs)
        # the requests json decoder might fail if we didn't get something good
        rjson = r.json()
        emsg = "ERROR: Probing claimed existing cloud with Cloud.json"
        if not isinstance(rjson, (list,dict)):
            # probably good
            raise Exception(emsg + "h2o json responses should always be lists or dicts. Got %s" %\
                dump_json(rjson))
        elif r.status_code != requests.codes.ok:
            rjson = None
            raise Exception(emsg + "Couldn't decode. Status: %s" % r.status_code)

    except requests.ConnectionError, e:
        rjson = None
        emsg = "ERROR: json got ConnectionError or other exception"
        # Rethrow the exception after we've checked for stack trace from h2o.
        # Out of memory errors maybe don't show up right away? 
        # so we should wait for h2o to get it out to h2o stdout. 
        # Don't want to rely on cloud teardown to check because there's no delay, 
        # and we don't want to delay all cloud teardowns by waiting.
        exc_info = sys.exc_info()
        # we don't expect to have connection errors, so any exception is a bad thing.
        h2p.red_print(
            "%s\n %s\n %s\nGoing to check sandbox, then rethrow.." % (emsg, exc_info, url + paramsStr))
        time.sleep(2)
        check_sandbox_for_errors()
        raise exc_info[1], None, exc_info[2]
Example #22
def checkLastValidationError(self, model, rows, expectedErr, relTol, **kwargs):
    errsLast = model['validation_errors'][-1]  # last scoring result
    verboseprint("Deep Learning 'Last scoring on test set:'",
                 dump_json(errsLast))
    expectedSamples = rows * kwargs['epochs']
    print 'Expecting ' + format(expectedSamples) + ' training samples'
    if errsLast['training_samples'] != expectedSamples:
        raise Exception("Number of training samples should be equal to %s" %
                        expectedSamples)

    print "Expected test set error: " + format(expectedErr)
    print "Actual   test set error: " + format(errsLast['classification'])
    if errsLast['classification'] != expectedErr and abs(
        (expectedErr - errsLast['classification']) / expectedErr) > relTol:
        raise Exception(
            "Test set classification error of %s is not within %s %% relative error of %s"
            % (errsLast['classification'], float(relTol) * 100, expectedErr))

    warnings = None

    # shouldn't have any errors
    check_sandbox_for_errors()

    return (warnings)
Example #23
File: h2o_bc.py Project: AI-Cdrone/h2o
def tear_down_cloud(nodeList=None, sandboxIgnoreErrors=False):
    if h2o_args.sleep_at_tear_down:
        print "Opening browser to cloud, and sleeping for 3600 secs, before cloud teardown (for debug)"
        import h2o_browse as h2b
        h2b.browseTheCloud()
        sleep(3600)

    # we keep a copy of whatever was built here too, just in case!
    # we can't refer to h2o.nodes[] because of circular import?
    if not nodeList: nodeList = h2o_nodes.nodes
    # could the nodeList still be empty in some exception cases? Assume not for now
    try:
        # update: send a shutdown to all nodes. h2o maybe doesn't propagate well if sent to one node
        # the api watchdog shouldn't complain about this?
        for n in nodeList:
            n.shutdown_all()
    except:
        pass
    # ah subtle. we might get excepts in issuing the shutdown, don't abort out
    # of trying the process kills if we get any shutdown exception (remember we go to all nodes)
    # so we might? nodes are shutting down?
    # FIX! should we wait a bit for a clean shutdown, before we process kill?
    # It can take more than 1 sec though.
    try:
        time.sleep(2)
        for n in nodeList:
            n.terminate()
            verboseprint("tear_down_cloud n:", n)
    except:
        pass

    check_sandbox_for_errors(sandboxIgnoreErrors=sandboxIgnoreErrors, python_test_name=h2o_args.python_test_name)
    # get rid of all those pesky line marker files. Unneeded now
    clean_sandbox_doneToLine()
    nodeList[:] = []
    h2o_nodes.nodes = []
Example #24
def exec_expr_list_rand(lenNodes, exprList, keyX, 
    # exec2 uses R "start with 1" behavior?
    minCol=1, maxCol=55, 
    minRow=1, maxRow=400000, 
    maxTrials=200, 
    timeoutSecs=10, ignoreH2oError=False, allowEmptyResult=False, nanOkay=False):

    trial = 0
    while trial < maxTrials: 
        exprTemplate = random.choice(exprList)

        # UPDATE: all execs are to a single node. No mixed node streams
        # eliminates some store/store race conditions that caused problems.
        # always go to node 0 (forever?)
        if lenNodes is None:
            execNode = 0
        else:
            # execNode = random.randint(0,lenNodes-1)
            execNode = 0
        ## print "execNode:", execNode

        colX = random.randint(minCol,maxCol)

        # FIX! should tune this for covtype20x vs 200x vs covtype.data..but for now
        row = str(random.randint(minRow,maxRow))

        execExpr = fill_in_expr_template(exprTemplate, colX, ((trial+1)%4)+1, row, keyX)
        (resultExec, result) = exec_expr(h2o_nodes.nodes[execNode], execExpr, None, 
            timeoutSecs, ignoreH2oError)

        checkScalarResult(resultExec, None, allowEmptyResult=allowEmptyResult, nanOkay=nanOkay)

        if keyX:
            inspect = h2o_cmd.runInspect(key=keyX)
            print keyX, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

        sys.stdout.write('.')
        sys.stdout.flush()

        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        # slows things down to check every iteration, but good for isolation
        if check_sandbox_for_errors():
            raise Exception(
                "Found errors in sandbox stdout or stderr, on trial #%s." % trial)
        trial += 1
        print "Trial #", trial, "completed\n"
Example #25
def exec_expr_list_rand(lenNodes, exprList, keyX, 
    # exec2 uses R "start with 1" behavior?
    minCol=1, maxCol=55, 
    minRow=1, maxRow=400000, 
    maxTrials=200, 
    timeoutSecs=10, ignoreH2oError=False, allowEmptyResult=False, nanOkay=False):

    trial = 0
    while trial < maxTrials: 
        exprTemplate = random.choice(exprList)

        # UPDATE: all execs are to a single node. No mixed node streams
        # eliminates some store/store race conditions that caused problems.
        # always go to node 0 (forever?)
        if lenNodes is None:
            execNode = 0
        else:
            # execNode = random.randint(0,lenNodes-1)
            execNode = 0
        ## print "execNode:", execNode

        colX = random.randint(minCol,maxCol)

        # FIX! should tune this for covtype20x vs 200x vs covtype.data..but for now
        row = str(random.randint(minRow,maxRow))

        execExpr = fill_in_expr_template(exprTemplate, colX, ((trial+1)%4)+1, row, keyX)
        (resultExec, result) = exec_expr(h2o_nodes.nodes[execNode], execExpr, None, 
            timeoutSecs, ignoreH2oError)

        checkScalarResult(resultExec, None, allowEmptyResult=allowEmptyResult, nanOkay=nanOkay)

        if keyX:
            inspect = h2o_cmd.runInspect(key=keyX)
            print keyX, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

        sys.stdout.write('.')
        sys.stdout.flush()

        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        # slows things down to check every iteration, but good for isolation
        if check_sandbox_for_errors():
            raise Exception(
                "Found errors in sandbox stdout or stderr, on trial #%s." % trial)
        trial += 1
        print "Trial #", trial, "completed\n"
Example #26
def exec_expr_list_across_cols(lenNodes,
                               exprList,
                               keyX,
                               minCol=0,
                               maxCol=55,
                               timeoutSecs=10,
                               incrementingResult=True,
                               **kwargs):
    colResultList = []
    for colX in range(minCol, maxCol):
        for i, exprTemplate in enumerate(exprList):

            # do each expression at a random node, to facilitate key movement
            # UPDATE: all execs are to a single node. No mixed node streams
            # eliminates some store/store race conditions that caused problems.
            # always go to node 0 (forever?)
            if lenNodes is None:
                execNode = 0
            else:
                ### execNode = random.randint(0,lenNodes-1)
                ### print execNode
                execNode = 0

            execExpr = fill_in_expr_template(exprTemplate, colX, colX, 0, keyX)
            if incrementingResult:  # the Result<col> pattern
                resultKey = "Result" + str(colX)
            else:  # assume it's a re-assign to self
                resultKey = keyX

            # v2
            (resultExec, result) = exec_expr(h2o_nodes.nodes[execNode],
                                             execExpr, None, timeoutSecs,
                                             **kwargs)
            # print "\nexecResult:", dump_json(resultExec)

            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            # slows things down to check every iteration, but good for isolation
            if check_sandbox_for_errors():
                raise Exception(
                    "Found errors in sandbox stdout or stderr, on trial #%s." %
                    trial)

        ### print "Column #", colX, "completed\n"
        colResultList.append(result)

    return colResultList
Example #27
def exec_expr_list_across_cols(lenNodes, exprList, keyX, 
    minCol=0, maxCol=54, timeoutSecs=10, incrementingResult=True):
    colResultList = []
    for colX in range(minCol, maxCol):
        for i, exprTemplate in enumerate(exprList):

            # do each expression at a random node, to facilitate key movement
            # UPDATE: all execs are to a single node. No mixed node streams
            # eliminates some store/store race conditions that caused problems.
            # always go to node 0 (forever?)
            if lenNodes is None:
                execNode = 0
            else:
                ### execNode = random.randint(0,lenNodes-1)
                ### print execNode
                execNode = 0

            execExpr = fill_in_expr_template(exprTemplate, colX, colX, 0, keyX)
            if incrementingResult: # the Result<col> pattern
                resultKey = "Result"+str(colX)
            else: # assume it's a re-assign to self
                resultKey = keyX

            # v2
            (resultExec, result) = exec_expr(h2o_nodes.nodes[execNode], execExpr, None, timeoutSecs)
            print "\nexecResult:", dump_json(resultExec)

            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            # slows things down to check every iteration, but good for isolation
            if check_sandbox_for_errors():
                raise Exception(
                    "Found errors in sandbox stdout or stderr, on trial #%s." % trial)

        print "Column #", colX, "completed\n"
        colResultList.append(result)

    return colResultList
Example #28
File: h2o_glm.py Project: smarthomekit/h2o
def simpleCheckGLM(self, glm, colX, allowFailWarning=False, allowZeroCoeff=False,
    prettyPrint=False, noPrint=False, maxExpectedIterations=None, doNormalized=False, **kwargs):
    # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter

    # h2o GLM will verboseprint the result and print errors. 
    # so don't have to do that
    # different when cross validation  is used? No trainingErrorDetails?
    GLMModel = glm['glm_model']
    if not GLMModel:
        raise Exception("GLMModel didn't exist in the glm response? %s" % dump_json(glm))

    warnings = None
    if 'warnings' in GLMModel and GLMModel['warnings']:
        warnings = GLMModel['warnings']
        # stop on failed
        x = re.compile("failed", re.IGNORECASE)
        # don't stop if fail to converge
        c = re.compile("converge", re.IGNORECASE)
        for w in warnings:
            print "\nwarning:", w
            if re.search(x,w) and not allowFailWarning: 
                if re.search(c,w):
                    # ignore the fail to converge warning now
                    pass
                else: 
                    # stop on other 'fail' warnings (are there any? fail to solve?
                    raise Exception(w)

    # for key, value in glm.iteritems(): print key
    # not in GLMGrid?

    # FIX! don't get GLMParams if it can't solve?
    GLMParams = GLMModel['glm']
    family = GLMParams["family"]

    # number of submodels = number of lambda
    # min of 2. lambda_max is first
    submodels = GLMModel['submodels']
    # since all our tests only use one lambda, the best_lambda_idx should be 1
    best_lambda_idx = GLMModel['best_lambda_idx']
    print "best_lambda_idx:", best_lambda_idx
    lambda_max = GLMModel['lambda_max']
    print "lambda_max:", lambda_max

    # currently lambda_max is not set by tomas. ..i.e.not valid
    if 1==0 and (lambda_max <= submodels[best_lambda_idx].lambda_value):
        raise Exception("lambda_max %s should always be > the lambda result %s we're checking" % (lambda_max, submodels[best_lambda_idx].lambda_value))

    # submodels0 = submodels[0]
    # submodels1 = submodels[-1] # hackery to make it work when there's just one

    if (best_lambda_idx >= len(submodels)) or (best_lambda_idx < 0):
        raise Exception("best_lambda_idx: %s should point to one of lambdas (which has len %s)" % (best_lambda_idx, len(submodels)))

    if (best_lambda_idx >= len(submodels)) or (best_lambda_idx < 0):
        raise Exception("best_lambda_idx: %s should point to one of submodels (which has len %s)" % (best_lambda_idx, len(submodels)))

    submodels1 = submodels[best_lambda_idx] # hackery to make it work when there's just one
    iterations = submodels1['iteration']


    print "GLMModel/iterations:", iterations

    # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter
    if maxExpectedIterations is not None and iterations > maxExpectedIterations:
        raise Exception("Convergence issue? GLM did iterations: %d which is greater than expected: %d" % (iterations, maxExpectedIterations))

    if 'validation' not in submodels1:
        raise Exception("Should be a 'validation' key in submodels1: %s" % dump_json(submodels1))
    validationsList = submodels1['validation']
    validations = validationsList
        
    # xval. compare what we asked for and what we got.
    n_folds = kwargs.setdefault('n_folds', None)

    print "GLMModel/validations"        
    validations['null_deviance'] = h2o_util.cleanseInfNan(validations['null_deviance'])
    validations['residual_deviance'] = h2o_util.cleanseInfNan(validations['residual_deviance'])        
    print "%15s %s" % ("null_deviance:\t", validations['null_deviance'])
    print "%15s %s" % ("residual_deviance:\t", validations['residual_deviance'])

    # threshold only there if binomial?
    # auc only for binomial
    if family=="binomial":
        print "%15s %s" % ("auc:\t", validations['auc'])
        best_threshold = validations['best_threshold']
        thresholds = validations['thresholds']
        print "%15s %s" % ("best_threshold:\t", best_threshold)

        # have to look up the index for the cm, from the thresholds list
        best_index = None

        for i,t in enumerate(thresholds):
            if t >= best_threshold: # ends up using next one if not present
                best_index = i
                break
            
        assert best_index is not None, "%s %s" % (best_threshold, thresholds)
        print "Now printing the right 'best_threshold' %s from '_cms'" % best_threshold

        # cm = glm['glm_model']['submodels'][0]['validation']['_cms'][-1]
        submodels = glm['glm_model']['submodels']
        # FIX! this isn't right if we have multiple lambdas? different submodels?
        cms = submodels[0]['validation']['_cms']
        self.assertEqual(len(thresholds), len(cms), 
            msg="thresholds %s and cm %s should be lists of the same size. %s" % (len(thresholds), len(cms), thresholds))
        # FIX! best_threshold isn't necessarily in the list. jump out if >=
        assert best_index<len(cms), "%s %s" % (best_index, len(cms))
        # if we want 0.5..rounds to int
        # mid = len(cms)/2
        # cm = cms[mid]
        cm = cms[best_index]

        print "cm:", dump_json(cm['_arr'])
        predErr = cm['_predErr']
        classErr = cm['_classErr']
        # compare to predErr
        pctWrong = h2o_gbm.pp_cm_summary(cm['_arr'])
        print "predErr:", predErr
        print "calculated pctWrong from cm:", pctWrong
        print "classErr:", classErr

        # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")

        print "\nTrain\n==========\n"
        print h2o_gbm.pp_cm(cm['_arr'])


    if family=="poisson" or family=="gaussian":
        print "%15s %s" % ("aic:\t", validations['aic'])

    coefficients_names = GLMModel['coefficients_names']
    # print "coefficients_names:", coefficients_names
    idxs = submodels1['idxs']
    print "idxs:", idxs
    coefficients_names = coefficients_names

    # always check both normalized and normal coefficients
    norm_beta = submodels1['norm_beta']
    # if norm_beta and len(coefficients_names)!=len(norm_beta):
    #    print len(coefficients_names), len(norm_beta)
    #    raise Exception("coefficients_names and normalized_norm_beta from h2o json not same length. coefficients_names: %s normalized_norm_beta: %s" % (coefficients_names, norm_beta))
#
    beta = submodels1['beta']
    # print "beta:", beta
    # if len(coefficients_names)!=len(beta):
    #    print len(coefficients_names), len(beta)
    #    raise Exception("coefficients_names and beta from h2o json not same length. coefficients_names: %s beta: %s" % (coefficients_names, beta))


    # test wants to use normalized?
    if doNormalized:
        beta_used = norm_beta
    else:
        beta_used = beta

    coefficients = {}
    # create a dictionary with name, beta (including intercept) just like v1

    for i,b in zip(idxs, beta_used[:-1]):
        name = coefficients_names[i]
        coefficients[name] = b

    print "len(idxs)", len(idxs), "len(beta_used)", len(beta_used)
    print  "coefficients:", coefficients
    print  "beta:", beta
    print  "norm_beta:", norm_beta

    coefficients['Intercept'] = beta_used[-1]
    print "len(coefficients_names)", len(coefficients_names)
    print "len(idxs)", len(idxs)
    print "idxs[-1]", idxs[-1]
    print "intercept demapping info:", \
        "coefficients_names[-i]:", coefficients_names[-1], \
        "idxs[-1]:", idxs[-1], \
        "coefficients_names[idxs[-1]]:", coefficients_names[idxs[-1]], \
        "beta_used[-1]:", beta_used[-1], \
        "coefficients['Intercept']", coefficients['Intercept']

    # last one is intercept
    interceptName = coefficients_names[idxs[-1]]
    if interceptName != "Intercept" or abs(beta_used[-1])<1e-26:
        raise Exception("'Intercept' should be last in coefficients_names and beta %s %s %s" %\
            (idxs[-1], beta_used[-1], "-"+interceptName+"-"))

    # idxs has the order for non-zero coefficients, it's shorter than beta_used and coefficients_names
    # new 5/28/14. glm can point to zero coefficients
    # for i in idxs:
    #     if beta_used[i]==0.0:
    ##        raise Exception("idxs shouldn't point to any 0 coefficients i: %s %s:" % (i, beta_used[i]))
    if len(idxs) > len(beta_used):
        raise Exception("idxs shouldn't be longer than beta_used %s %s" % (len(idxs), len(beta_used)))
    intercept = coefficients.pop('Intercept', None)

    # intercept demapping info: idxs[-1]: 54 coefficients_names[idxs[-1]]: Intercept beta_used[-1]: -6.6866753099
    # the last one should be 'Intercept'?
    coefficients_names.pop()

    # have to skip the output col! get it from kwargs
    # better always be there!
    y = kwargs['response']

    # the dict keys are column headers if they exist...how to order those? new: use the 'coefficients_names'
    # from the response
    # Tomas created 'coefficients_names which is the coefficient list in order.
    # Just use it to index coefficients! works for header or no-header cases
    # I guess now we won't print the "None" cases for dropped columns (constant columns!)
    # because 'coefficients_names' may not include columns that were dropped by GLMQuery
    # before the model builder saw them.
    def add_to_coefficient_list_and_string(c, cList, cString):
        if c in coefficients:
            cValue = coefficients[c]
            cValueString = "%s: %.5e   " % (c, cValue)
        else:
            print "Warning: didn't see '" + c + "' in json coefficient response.",\
                  "Inserting 'None' with assumption it was dropped due to constant column)"
            cValue = None
            cValueString = "%s: %s   " % (c, cValue)

        cList.append(cValue)
        # we put each on newline for easy comparison to R..otherwise keep condensed
        if prettyPrint: 
            cValueString = "H2O coefficient " + cValueString + "\n"
        # strings aren't mutable, so return the extended string
        return cString + cValueString

    # creating both a string for printing and a list of values
    cString = ""
    cList = []
    # print in order using col_names
    # coefficients_names is input only now..same for header or no header, or expanded enums
    for c in coefficients_names:
        cString = add_to_coefficient_list_and_string(c, cList, cString)

    if prettyPrint: 
        print "\nH2O intercept:\t\t%.5e" % intercept
        print cString
    else:
        if not noPrint:
            print "\nintercept:", intercept, cString

    print "\nTotal # of coefficients:", len(coefficients_names)

    # pick out the coefficient for the column we enabled for enhanced checking. Can be None.
    # FIX! temporary hack to deal with disappearing/renaming columns in GLM
    if (not allowZeroCoeff) and (colX is not None):
        absXCoeff = abs(float(coefficients[str(colX)]))
        # add kwargs to help debug without looking at console log
        self.assertGreater(absXCoeff, 1e-26, (
            "abs. value of GLM coefficients['" + str(colX) + "'] is " +
            str(absXCoeff) + ", not >= 1e-26 for X=" + str(colX) +  "\n" +
            "kwargs:" + dump_json(kwargs)
            ))

    # intercept is buried in there too
    absIntercept = abs(float(intercept))
    self.assertGreater(absIntercept, 1e-26, (
        "abs. value of GLM coefficients['Intercept'] is " +
        str(absIntercept) + ", not >= 1e-26 for Intercept" + "\n" +
        "kwargs:" + dump_json(kwargs)
        ))

    # this is good if we just want min or max
    # maxCoeff = max(coefficients, key=coefficients.get)
    # for more, just invert the dictionary and ...
    if (len(coefficients)>0):
        maxKey = max([(abs(coefficients[x]),x) for x in coefficients])[1]
        print "H2O Largest abs. coefficient value:", maxKey, coefficients[maxKey]
        minKey = min([(abs(coefficients[x]),x) for x in coefficients])[1]
        print "H2O Smallest abs. coefficient value:", minKey, coefficients[minKey]
    else: 
        print "Warning, no coefficients returned. Must be intercept only?"

    # many of the GLM tests aren't single column though.
    # quick and dirty check: if all the coefficients are zero, 
    # something is broken
    # intercept is in there too, but this will get it okay
    # just sum the abs value  up..look for greater than 0

    # skip this test if there is just one coefficient. Maybe pointing to a non-important coeff?
    if (not allowZeroCoeff) and (len(coefficients)>1):
        s = 0.0
        for c in coefficients:
            v = coefficients[c]
            s += abs(float(v))

        self.assertGreater(s, 1e-26, (
            "sum of abs. value of GLM coefficients/intercept is " + str(s) + ", not >= 1e-26\n" +
            "kwargs:" + dump_json(kwargs)
            ))

    print "submodels1, run_time (milliseconds):", submodels1['run_time']

    # shouldn't have any errors
    check_sandbox_for_errors()

    return (warnings, cList, intercept)
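A standalone sketch of the idxs/beta de-mapping that simpleCheckGLM performs above. The names and values below are made up for illustration; in the real flow they come from the GLM JSON response.

# idxs holds the column indices (into coefficients_names) for each beta entry;
# the last beta entry is the intercept.
coefficients_names = ["AGE", "RACE", "PSA", "Intercept"]
idxs = [0, 2, 3]
beta_used = [0.31, -1.72, 4.5]  # last one is the intercept

coefficients = {}
for i, b in zip(idxs, beta_used[:-1]):
    coefficients[coefficients_names[i]] = b
coefficients['Intercept'] = beta_used[-1]

intercept = coefficients.pop('Intercept')
maxKey = max([(abs(coefficients[x]), x) for x in coefficients])[1]
print "intercept:", intercept
print "coefficients:", coefficients
print "largest abs. coefficient:", maxKey, coefficients[maxKey]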
Example #29
def pollWaitJobs(pattern=None,
                 errorIfCancelled=False,
                 timeoutSecs=60,
                 pollTimeoutSecs=60,
                 retryDelaySecs=5,
                 benchmarkLogging=None,
                 stallForNJobs=None):
    wait = True
    waitTime = 0
    ignoredJobs = set()
    while (wait):
        a = h2o_nodes.nodes[0].jobs(timeoutSecs=pollTimeoutSecs)
        verboseprint("jobs():", dump_json(a))
        jobs = a['jobs']
        busy = 0
        for j in jobs:
            cancelled = j['status'] == 'CANCELLED'
            description = j['description']
            key = j['key']
            jobKey = key['name']
            jobKeyType = key['type']

            #          "key": {
            #            "URL": "/3/Jobs.json/$0301c0a8002232d4ffffffff$_95036c2ef3f74468c63861fd826149c2",
            #            "__meta": {
            #              "schema_name": "JobKeyV1",
            #              "schema_type": "Key<Job>",
            #              "schema_version": 1
            #            },
            #            "name": "$0301c0a8002232d4ffffffff$_95036c2ef3f74468c63861fd826149c2",
            #            "type": "Key<Job>"
            #
            progress = j['progress']
            progress_msg = j['progress_msg']

            # has exception and val?
            start_time = j['start_time']
            end_time = j.get('end_time', None)
            dest = j['dest']
            description = j['description']
            msec = j.get('msec', None)

            # for now, don't ignore any exceptions

            # FIX! what do exceptions look like now?
            if 'exception' in j and j['exception']:
                check_sandbox_for_errors()
                msg = "ERROR: pollWaitJobs found a job with a exception result when it shouldn't have:\n %s" % dump_json(
                    j)
                raise Exception(msg)

            if errorIfCancelled and cancelled:
                check_sandbox_for_errors()
                print(
                    "ERROR: not stopping, but: pollWaitJobs found a cancelled job when it shouldn't have:\n %s"
                    % dump_json(j))
                print(
                    "Continuing so maybe a json response will give more info")

            ### verboseprint(j)
            # don't include cancelled jobs here
            elif j['status'] != 'DONE':
                if not pattern:
                    # always print progress if busy job (no pattern used)
                    print "time:", time.strftime(
                        "%I:%M:%S"), "progress:", progress, dest
                    verboseprint("description:", description, "end_time:",
                                 end_time)
                    busy += 1
                    verboseprint("pollWaitJobs: found a busy job, now: %s" %
                                 busy)
                else:
                    if (pattern in key) or (pattern
                                            in dest) or (pattern
                                                         in description):
                        ## print "description:", description, "end_time:", end_time
                        busy += 1
                        verboseprint(
                            "pollWaitJobs: found a pattern-matched busy job, now %s"
                            % busy)
                        # always print progress if pattern is used and matches
                        print "time:", time.strftime(
                            "%I:%M:%S"), "progress:", progress, dest
                    # we only want to print the warning message once
                    elif key not in ignoredJobs:
                        jobMsg = "%s %s %s" % (key, description, dest)
                        verboseprint(
                            " %s job in progress but we're ignoring it. Doesn't match pattern."
                            % jobMsg)
                        # I guess "key" is supposed to be unique over all time for a job id?
                        ignoredJobs.add(key)

        if stallForNJobs:
            waitFor = stallForNJobs
        else:
            waitFor = 0

        print " %s jobs in progress." % busy, "Waiting until %s in progress." % waitFor
        wait = busy > waitFor
        if not wait:
            break

        ### h2b.browseJsonHistoryAsUrlLastMatch("Jobs")
        if not h2o_args.no_timeout and (wait and waitTime > timeoutSecs):
            print dump_json(jobs)
            raise Exception("Some queued jobs haven't completed after",
                            timeoutSecs, "seconds")

        sys.stdout.write('.')
        sys.stdout.flush()
        time.sleep(retryDelaySecs)
        waitTime += retryDelaySecs

        # any time we're sitting around polling we might want to save logging info (cpu/disk/jstack)
        # test would pass ['cpu','disk','jstack'] kind of list
        if benchmarkLogging:
            h2o.cloudPerfH2O.get_log_save(benchmarkLogging)

        # check the sandbox for stack traces! just like we do when polling normally
        check_sandbox_for_errors()

    patternKeys = []
    for j in jobs:
        # save the destination keys in progress that match pattern (for returning)
        if pattern and pattern in j['dest']:
            patternKeys.append(j['dest'])

    return patternKeys
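The busy-count loop above, reduced to a self-contained sketch against a fake jobs list (no cluster or harness needed; fake_jobs is made up for illustration).

import time

def fake_jobs(poll):
    # pretend one job finishes per poll; three jobs total
    statuses = ['DONE'] * poll + ['RUNNING'] * max(0, 3 - poll)
    return [{'status': s, 'dest': 'job_%s' % i} for i, s in enumerate(statuses)]

stallForNJobs = 0  # wait until no more than this many jobs are busy
poll = 0
while True:
    jobs = fake_jobs(poll)
    busy = sum(1 for j in jobs if j['status'] != 'DONE')
    print " %s jobs in progress. Waiting until %s in progress." % (busy, stallForNJobs)
    if busy <= stallForNJobs:
        break
    time.sleep(0.1)
    poll += 1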
Example #30
def import_parse(node=None,
                 schema='local',
                 bucket=None,
                 path=None,
                 src_key=None,
                 hex_key=None,
                 timeoutSecs=30,
                 retryDelaySecs=0.1,
                 initialDelaySecs=0,
                 pollTimeoutSecs=180,
                 noise=None,
                 benchmarkLogging=None,
                 noPoll=False,
                 doSummary=True,
                 noPrint=True,
                 importParentDir=True,
                 **kwargs):

    # FIX! hack all put to local, since h2o-dev doesn't have put yet?
    # multi-machine put will fail as a result.
    # if schema=='put':
    #    h2p.yellow_print("WARNING: hacking schema='put' to 'local'..h2o-dev doesn't support upload." +
    #        "\nMeans multi-machine with 'put' will fail")
    #    schema = 'local'

    if not node: node = h2o_nodes.nodes[0]
    (importResult,
     importPattern) = import_only(node, schema, bucket, path, timeoutSecs,
                                  retryDelaySecs, initialDelaySecs,
                                  pollTimeoutSecs, noise, benchmarkLogging,
                                  noPoll, doSummary, src_key, noPrint,
                                  importParentDir, **kwargs)

    verboseprint("importPattern:", importPattern)
    verboseprint("importResult", dump_json(importResult))

    assert len(
        importResult['keys']
    ) >= 1, "No keys imported, maybe bad bucket %s or path %s" % (bucket, path)
    # print "importResult:", importResult

    # get rid of parse timing in tests now
    start = time.time()
    parseResult = parse_only(node, importPattern, hex_key,
                             importResult['keys'], timeoutSecs, retryDelaySecs,
                             initialDelaySecs, pollTimeoutSecs, noise,
                             benchmarkLogging, noPoll, **kwargs)
    elapsed = time.time() - start
    print importPattern, "parsed in", elapsed, "seconds.", "%d pct. of timeout" % (
        (elapsed * 100) / timeoutSecs), "\n"
    parseResult['python_elapsed'] = elapsed

    verboseprint("parseResult:", dump_json(parseResult))

    # do SummaryPage here too, just to get some coverage
    # only if not noPoll. otherwise parse isn't done
    if doSummary and not noPoll:
        # if parse blows up, we want error isolation ..i.e. find stack traces here, rather than the next guy blowing up
        check_sandbox_for_errors()
        print "WARNING: not doing inspect/summary for now after parse"
        ## inspect = node.inspect(parseResult['destination_key'], timeoutSecs=timeoutSecs)
        ## numRows = inspect['numRows']
        ## numCols = inspect['numCols']
        # we pass numCols, for detecting whether the na cnt means a col is all NAs, (for ignoring min/max/mean/sigma)
        ## node.summary_page(parseResult['destination_key'], timeoutSecs=timeoutSecs, noPrint=noPrint, numRows=numRows, numCols=numCols)
        # for now, don't worry about error isolating summary
    else:
        # isolate a parse from the next thing
        check_sandbox_for_errors()

    return parseResult
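A hedged usage sketch for import_parse. It assumes a cloud is already up (e.g. via build_cloud) and the harness modules are importable; the bucket, path, and hex_key values are placeholders.

parseResult = import_parse(bucket='smalldata', path='iris/iris2.csv',
    schema='local', hex_key='iris2.hex', timeoutSecs=60)
print "parsed key:", parseResult.get('destination_key'), \
    "python_elapsed:", parseResult['python_elapsed']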
Example #31
File: h2o_bc.py Project: RogerKamena/h2o-3
def build_cloud_with_json(h2o_nodes_json="h2o-nodes.json"):

    # local sandbox may not exist. Don't clean if it does, just append
    if not os.path.exists(LOG_DIR):
        os.mkdir(LOG_DIR)

    log("#*********************************************************************")
    log("Starting new test: " + h2o_args.python_test_name + " at build_cloud_with_json()")
    log("#*********************************************************************")

    print "This only makes sense if h2o is running as defined by", h2o_nodes_json
    print "For now, assuming it's a cloud on this machine, and here's info on h2o processes running here"
    print "No output means no h2o here! Some other info about stuff on the system is printed first though."
    import h2o_os_util

    if not os.path.exists(h2o_nodes_json):
        raise Exception("build_cloud_with_json: Can't find " + h2o_nodes_json + " file")

    ## h2o_os_util.show_h2o_processes()

    with open(h2o_nodes_json, "rb") as f:
        cloneJson = json.load(f)

    # These are supposed to be in the file.
    # Just check the first one. if not there, the file must be wrong
    if not "cloud_start" in cloneJson:
        raise Exception("Can't find 'cloud_start' in %s, wrong file? h2o-nodes.json?" % h2o_nodes_json)
    else:
        cs = cloneJson["cloud_start"]
        print "Info on the how the cloud we're cloning was started (info from %s)" % h2o_nodes_json
        # required/legal values in 'cloud_start'. A robust check is good for easy debug when we add stuff
        valList = ["time", "cwd", "python_test_name", "python_cmd_line", "config_json", "username", "ip"]
        for v in valList:
            if v not in cs:
                raise Exception("Can't find %s in %s, wrong file or version change?" % (v, h2o_nodes_json))
            print "cloud_start['%s']: %s" % (v, cs[v])

        # this is the internal node state for python..nodes rebuild
        nodeStateList = cloneJson["h2o_nodes"]

    nodeList = []
    if not nodeStateList:
        raise Exception("nodeStateList is empty. %s file must be empty/corrupt" % h2o_nodes_json)

    try:
        for nodeState in nodeStateList:
            print "Cloning state for node", nodeState["node_id"], "from", h2o_nodes_json

            newNode = ExternalH2O(nodeState)
            nodeList.append(newNode)

        # If it's an existing cloud, it may already be locked. so never check.
        # we don't have the cloud name in the -ccj since it may change (and the file be static?)
        # so don't check expectedCloudName
        verify_cloud_size(nodeList, expectedCloudName=None, expectedLocked=None)

        # best to check for any errors right away?
        # (we won't report errors from prior tests due to marker stuff?
        ## check_sandbox_for_errors(python_test_name=h2o_args.python_test_name)

        # put the test start message in the h2o log, to create a marker
        nodeList[0].h2o_log_msg()

    except:
        # nodeList might be empty in some exception cases?
        # no shutdown issued first, though

        ## if cleanup and nodeList:
        ##     for n in nodeList: n.terminate()
        check_sandbox_for_errors(python_test_name=h2o_args.python_test_name)
        raise

    # like cp -p. Save the config file, to sandbox
    print "Saving the ", h2o_nodes_json, "we used to", LOG_DIR
    shutil.copy(h2o_nodes_json, LOG_DIR + "/" + os.path.basename(h2o_nodes_json))

    print ""
    h2p.red_print("Ingested from json:", nodeList[0].java_heap_GB, "GB java heap(s) with", len(nodeList), "total nodes")
    print ""

    # save it to a global copy, in case it's needed for tearDown
    h2o_nodes.nodes[:] = nodeList
    return nodeList
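Typical use, assuming an h2o-nodes.json that was written earlier by a build_cloud(create_json=True) run against the same cloud:

nodeList = build_cloud_with_json(h2o_nodes_json="h2o-nodes.json")
print "cloned", len(nodeList), "nodes; first node heap:", nodeList[0].java_heap_GB, "GB"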
Example #32
def poll_url(self, response,
             timeoutSecs=10, retryDelaySecs=0.5, initialDelaySecs=0, pollTimeoutSecs=180,
             noise=None, benchmarkLogging=None, noPoll=False, reuseFirstPollUrl=False, noPrint=False):
    verboseprint('poll_url input: response:', dump_json(response))
    ### print "poll_url: pollTimeoutSecs", pollTimeoutSecs
    ### print "at top of poll_url, timeoutSecs: ", timeoutSecs

    # for the rev 2 stuff..the job_key, destination_key and redirect_url are just in the response
    # look for 'response'..if not there, assume the rev 2

    def get_redirect_url(response):
        url = None
        params = None
        # StoreView has the old 'response' style, while beta_features (v2) use 'response_info'
        if 'response_info' in response: 
            response_info = response['response_info']

            if 'redirect_url' not in response_info:
                raise Exception("Response during polling must have 'redirect_url'\n%s" % dump_json(response))

            if response_info['status'] != 'done':
                redirect_url = response_info['redirect_url']
                if redirect_url:
                    url = self.url(redirect_url)
                    params = None
                else:
                    if response_info['status'] != 'done':
                        raise Exception(
                            "'redirect_url' during polling is null but status!='done': \n%s" % dump_json(response))
        else:
            if 'response' not in response:
                raise Exception("'response' not in response.\n%s" % dump_json(response))

            if response['response']['status'] != 'done':
                if 'redirect_request' not in response['response']:
                    raise Exception("'redirect_request' not in response. \n%s" % dump_json(response))

                url = self.url(response['response']['redirect_request'])
                params = response['response']['redirect_request_args']

        return (url, params)

    # if we never poll
    msgUsed = None

    if 'response_info' in response: # trigger v2 for GBM always?
        status = response['response_info']['status']
        progress = response.get('progress', "")
    else:
        r = response['response']
        status = r['status']
        progress = r.get('progress', "")

    doFirstPoll = status != 'done'
    (url, params) = get_redirect_url(response)
    # no need to recreate the string for messaging, in the loop..
    if params:
        paramsStr = '&'.join(['%s=%s' % (k, v) for (k, v) in params.items()])
    else:
        paramsStr = ''

    # FIX! don't do JStack noise for tests that ask for it. JStack seems to have problems
    noise_enable = noise and noise != ("JStack", None)
    if noise_enable:
        print "Using noise during poll_url:", noise
        # noise_json should be like "Storeview"
        (noise_json, noiseParams) = noise
        noiseUrl = self.url(noise_json + ".json")
        if noiseParams is None:
            noiseParamsStr = ""
        else:
            noiseParamsStr = '&'.join(['%s=%s' % (k, v) for (k, v) in noiseParams.items()])

    start = time.time()
    count = 0
    if initialDelaySecs:
        time.sleep(initialDelaySecs)

    # can end with status = 'redirect' or 'done'
    # Update: on DRF2, the first RF redirects to progress. So we should follow that, and follow any redirect to view?
    # so for v2, we'll always follow redirects?
    # For v1, we're not forcing the first status to be 'poll' now..so it could be redirect or done?(NN score? if blocking)

    # Don't follow the Parse redirect to Inspect, because we want parseResult['destination_key'] to be the end.
    # note this doesn't affect polling with Inspect (since it doesn't redirect?)
    while status == 'poll' or doFirstPoll or (status == 'redirect' and 'Inspect' not in url):
        count += 1
        if ((time.time() - start) > timeoutSecs):
            # show what we're polling with
            emsg = "Exceeded timeoutSecs: %d secs while polling." % timeoutSecs + \
                   "status: %s, url: %s?%s" % (status, urlUsed, paramsUsedStr)
            raise Exception(emsg)

        if benchmarkLogging:
            import h2o
            h2o.cloudPerfH2O.get_log_save(benchmarkLogging)

        # every other one?
        create_noise = noise_enable and ((count % 2) == 0)
        if create_noise:
            urlUsed = noiseUrl
            paramsUsed = noiseParams
            paramsUsedStr = noiseParamsStr
            msgUsed = "\nNoise during polling with"
        else:
            urlUsed = url
            paramsUsed = params
            paramsUsedStr = paramsStr
            msgUsed = "\nPolling with"

        print status, progress, urlUsed
        time.sleep(retryDelaySecs)

        response = self.do_json_request(fullUrl=urlUsed, timeout=pollTimeoutSecs, params=paramsUsed)
        verboseprint(msgUsed, urlUsed, paramsUsedStr, "Response:", dump_json(response))
        # hey, check the sandbox if we've been waiting a long time...rather than wait for timeout
        if ((count % 6) == 0):
            check_sandbox_for_errors(python_test_name=h2o_args.python_test_name)

        if (create_noise):
            # this guarantees the loop is done, so we don't need to worry about
            # a 'return r' being interpreted from a noise response
            status = 'poll'
            progress = ''
        else:
            doFirstPoll = False
            status = response['response_info']['status']
            progress = response.get('progress', "")
            # get the redirect url
            if not reuseFirstPollUrl: # reuse url for all v1 stuff
                (url, params) = get_redirect_url(response)

            if noPoll:
                return response

    # won't print if we didn't poll
    if msgUsed:
        verboseprint(msgUsed, urlUsed, paramsUsedStr, "Response:", dump_json(response))
    return response
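A self-contained illustration of the two response shapes the redirect dispatch above handles. Both dicts are made up; only the key layout matters.

# v2 style: 'response_info' carries status and redirect_url
v2_response = {
    'response_info': {'status': 'poll', 'redirect_url': '/2/Progress.json?key=abc'}
}
# v1 style: 'response' carries status and redirect_request(_args)
v1_response = {
    'response': {'status': 'poll', 'redirect_request': 'Progress.json',
                 'redirect_request_args': {'key': 'abc'}}
}

for r in (v2_response, v1_response):
    if 'response_info' in r:
        print "v2:", r['response_info']['status'], r['response_info']['redirect_url']
    else:
        print "v1:", r['response']['status'], r['response']['redirect_request_args']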
Example #33
    def __init__(self, kmeansResult, parameters, numRows, numCols, labels, noPrint=False, **kwargs):
        super(KMeansObj, self).__init__(kmeansResult['models'][0]['output'], "KMeans", noPrint=noPrint)

        print self.withinmse # per cluster
        print self.avgss
        print self.avgwithinss
        print self.avgbetweenss

        # should model builder add this to the kmeansResult?
        if 'python_elapsed' in kmeansResult:
            self.python_elapsed = kmeansResult['python_elapsed']

        rows = self.rows # [78, 5, 41, 76]
        model_category = self.model_category # Clustering
        iters = self.iters # 11.0
        domains = self.domains # [None, None, None, None, None, None, None, None, None, None, None, None, None, None]
        names = self.names 
        # [u'STR', u'OBS', u'AGMT', u'FNDX', u'HIGD', u'DEG', u'CHK', u'AGP1', u'AGMN', u'NLV', u'LIV', u'WT', u'AGLP', u'MST']
        ncats = self.ncats # 0
        clusters = self.clusters # [ 4 lists of centers ]
        withinmse = self.withinmse
        avgss = self.avgss

        if numRows:
            assert numRows==sum(rows)

        if 'k' in parameters:
            k = parameters['k']
            assert len(clusters) == k
            assert len(rows) == k

        if numCols:
            assert len(names) == numCols, \
                "Need to pass correct numCols after ignored columns decrement %s %s" % (len(names), numCols)
            for c in clusters:
                assert len(c) == numCols, "%s %s" % (len(c), numCols)

        # this should be true 
        if labels:
            assert len(labels) == numCols, \
                "Need to pass correct labels and numCols after ignored columns removal %s %s" % (len(labels), numCols)
            assert len(labels) == len(names), \
                "Need to pass correct labels after ignored columns removal %s %s" % (len(labels), len(names))
            assert labels == names

        if 'max_iters' in parameters:
            max_iters = parameters['max_iters']
            assert max_iters >= iters

        # we could check the centers are within the min/max of each column
        for i,c in enumerate(clusters):
            for n in c:
                if math.isnan(float(n)):
                    raise Exception("cluster", i, "has NaN:", n, "center:", c)

        # shouldn't have any errors
        check_sandbox_for_errors()

        # create a tuple for each cluster result, then sort on the cluster centers for easy comparison
        # maybe should sort by rows instead?
        # put a cluster index in there too, (leftmost) so we don't lose track
        tuples = zip(range(len(clusters)), withinmse, rows, clusters)
        self.tuplesSorted = sorted(tuples, key=itemgetter(3))

        # undo for printing what the caller will see
        ids, withinmse, rows, clusters = zip(*self.tuplesSorted)
        print "iters:", iters
        print "ids:", ids
        print "withinmse:", withinmse
        print "rows:", rows
        for i,c in enumerate(clusters):
            print "cluster id %s (2 places):" % ids[i], h2o_util.twoDecimals(c)
        
        print "KMeansObj created for:", "???"# vars(self)
Example #34
def build_cloud_with_json(h2o_nodes_json='h2o-nodes.json'):

    # local sandbox may not exist. Don't clean if it does, just append
    if not os.path.exists(LOG_DIR):
        os.mkdir(LOG_DIR)

    log("#*********************************************************************")
    log("Starting new test: " + h2o_args.python_test_name + " at build_cloud_with_json()")
    log("#*********************************************************************")

    print "This only makes sense if h2o is running as defined by", h2o_nodes_json
    print "For now, assuming it's a cloud on this machine, and here's info on h2o processes running here"
    print "No output means no h2o here! Some other info about stuff on the system is printed first though."
    import h2o_os_util

    if not os.path.exists(h2o_nodes_json):
        raise Exception("build_cloud_with_json: Can't find " + h2o_nodes_json + " file")

    ## h2o_os_util.show_h2o_processes()

    with open(h2o_nodes_json, 'rb') as f:
        cloneJson = json.load(f)

    # These are supposed to be in the file.
    # Just check the first one. if not there, the file must be wrong
    if not 'cloud_start' in cloneJson:
        raise Exception("Can't find 'cloud_start' in %s, wrong file? h2o-nodes.json?" % h2o_nodes_json)
    else:
        cs = cloneJson['cloud_start']
        print "Info on the how the cloud we're cloning was started (info from %s)" % h2o_nodes_json
        # required/legal values in 'cloud_start'. A robust check is good for easy debug when we add stuff
        valList = ['time', 'cwd', 'python_test_name', 'python_cmd_line', 'config_json', 'username', 'ip']
        for v in valList:
            if v not in cs:
                raise Exception("Can't find %s in %s, wrong file or version change?" % (v, h2o_nodes_json))
            print "cloud_start['%s']: %s" % (v, cs[v])

        # this is the internal node state for python..nodes rebuild
        nodeStateList = cloneJson['h2o_nodes']

    nodeList = []
    if not nodeStateList:
        raise Exception("nodeStateList is empty. %s file must be empty/corrupt" % h2o_nodes_json)

    try:
        for nodeState in nodeStateList:
            print "Cloning state for node", nodeState['node_id'], 'from', h2o_nodes_json

            newNode = ExternalH2O(nodeState)
            nodeList.append(newNode)

        # If it's an existing cloud, it may already be locked. so never check.
        # we don't have the cloud name in the -ccj since it may change (and the file be static?)
        # so don't check expectedCloudName
        verify_cloud_size(nodeList, expectedCloudName=None, expectedLocked=None)

        # best to check for any errors right away?
        # (we won't report errors from prior tests due to marker stuff?
        ## check_sandbox_for_errors(python_test_name=h2o_args.python_test_name)

        # put the test start message in the h2o log, to create a marker
        nodeList[0].h2o_log_msg()

    except:
        # nodeList might be empty in some exception cases?
        # no shutdown issued first, though

        ## if cleanup and nodeList:
        ##     for n in nodeList: n.terminate()
        check_sandbox_for_errors(python_test_name=h2o_args.python_test_name)
        raise

    # like cp -p. Save the config file, to sandbox
    print "Saving the ", h2o_nodes_json, "we used to", LOG_DIR
    shutil.copy(h2o_nodes_json, LOG_DIR + "/" + os.path.basename(h2o_nodes_json))

    print ""
    h2p.red_print("Ingested from json:",
        nodeList[0].java_heap_GB, "GB java heap(s) with",
        len(nodeList), "total nodes")
    print ""

    # save it to a global copy, in case it's needed for tearDown
    h2o_nodes.nodes[:] = nodeList
    return nodeList
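The required-keys validation used above, as a minimal standalone sketch over a made-up cloud_start dict:

cs = {'time': '2014-05-28 12:00:00', 'cwd': '/tmp', 'username': 'jenkins'}
valList = ['time', 'cwd', 'python_test_name', 'username', 'ip']
for v in valList:
    if v not in cs:
        print "missing key:", v  # the real code raises an Exception here
    else:
        print "cloud_start['%s']: %s" % (v, cs[v])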
Example #35
    def __init__(self, kmeansResult, parameters, numRows, numCols, labels, noPrint=False, **kwargs):
        super(KMeansObj, self).__init__(kmeansResult['models'][0]['output'], "KMeans", noPrint=noPrint)

        print self.withinss # per cluster
        print self.totss
        print self.tot_withinss
        print self.betweenss

        # should model builder add this to the kmeansResult?
        if 'python_elapsed' in kmeansResult:
            self.python_elapsed = kmeansResult['python_elapsed']

        size = self.size # [78, 5, 41, 76]
        model_category = self.model_category # Clustering
        iterations = self.iterations # 11.0
        domains = self.domains 
        names = self.names 
        categorical_column_count = self.categorical_column_count # 0
        centers_data = self.centers.data # [ 4 lists of centers ]
        # h2o returns it sliced across centers. transpose the list of lists, drop 0 which is the cluster id?
        # gotta turn the strings into numbers
        centersStr = [list(x) for x in zip(*centers_data[1:])]
        centers = [map(float, c) for c in centersStr]

        withinss = self.withinss
        totss = self.totss

        if numRows:
            assert numRows==sum(size)

        if 'k' in parameters:
            k = parameters['k']
            assert len(centers) == k
            assert len(size) == k

        if numCols:
            assert len(names) == numCols, \
                "Need to pass correct numCols after ignored columns decrement %s %s %s" % (len(names), numCols, names)
            for c in centers:
                assert len(c) == numCols, "%s %s" % (len(c), numCols)

        # this should be true 
        if labels:
            assert len(labels) == numCols, \
                "Need to pass correct labels and numCols after ignored columns removal %s %s" % (len(labels), numCols)
            assert len(labels) == len(names), \
                "Need to pass correct labels after ignored columns removal %s %s" % (len(labels), len(names))
            assert labels == names

        if 'max_iterations' in parameters:
            max_iterations = parameters['max_iterations']
            assert max_iterations >= iterations

        # we could check the centers are within the min/max of each column
        for i,c in enumerate(centers):
            for n in c:
                if math.isnan(float(n)):
                    raise Exception("cluster", i, "has NaN:", n, "center:", c)

        # create a tuple for each cluster result, then sort by rows for easy comparison
        # maybe should sort by centers?
        # put a cluster index in there too, (leftmost) so we don't lose track
        tuples = zip(range(len(centers)), centers, size, withinss)
        # print "tuples:", dump_json(tuples)
        # can we sort on the sum of the centers?
        self.tuplesSorted = sorted(tuples, key=lambda tup: sum(tup[1]))

        print "iterations:", iterations
        # undo for printing what the caller will see
        ids, centers, size, withinss = zip(*self.tuplesSorted)
        for i,c in enumerate(centers):
            print "cluster id %s (2 places):" % ids[i], h2o_util.twoDecimals(c)
            print "rows_per_cluster[%s]: " % i, size[i]
            print "withinss[%s]: " % i, withinss[i]
            print "size[%s]:" % i, size[i]

        print "KMeansObj created for:", "???"# vars(self)

        # shouldn't have any errors
        check_sandbox_for_errors()
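The center-matrix handling above (drop the cluster-id row, transpose, convert strings to floats), as a standalone sketch with made-up data:

# h2o returns the centers sliced across clusters; row 0 is the cluster id (as strings)
centers_data = [['0', '1', '2'], ['5.0', '6.6', '5.9'], ['3.4', '3.0', '2.7']]
centersStr = [list(x) for x in zip(*centers_data[1:])]
centers = [map(float, c) for c in centersStr]
print centers  # [[5.0, 3.4], [6.6, 3.0], [5.9, 2.7]]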
Example #36
File: h2o_pca.py Project: smarthomekit/h2o
def simpleCheckPCA(self, pca, **kwargs):
    #print dump_json(pca)
    warnings = None
    if 'warnings' in pca:
        warnings = pca['warnings']
        # catch the 'Failed to converge' warning for now
        x = re.compile("[Ff]ailed")
        for w in warnings:
            print "\nwarning:", w
            if re.search(x, w): raise Exception(w)

    # Check other things in the json response dictionary 'pca' here
    pcaResult = pca
    verboseprint('pcaResult Inspect:', dump_json(pcaResult))

    #Check no NaN in sdevs, propVars, or in PCs
    print "Checking sdevs..."
    sdevs = pcaResult["pca_model"]["sdev"]
    verboseprint("pca sdevs:", dump_json(sdevs))

    # sdevs is supposed to be a list sorted by s
    # sFirst = sdevs[0].s
    for PC, s in enumerate(sdevs):
        if math.isnan(s):
            raise Exception("sdev %s is NaN: %s" % (PC, s))
        # anqi says the list should be sorted, i.e. largest first
        ## if s < sFirst:
        ##     raise Exception("sdev %s %s is > sFirst %s. Supposed to be sorted?" % (PC, s, sFirst))

    print "Checking propVars...",
    propVars = pcaResult["pca_model"]["propVar"]
    verboseprint("pca propVars:", dump_json(propVars))
    for PC, propvar in enumerate(propVars):
        if math.isnan(propvar):
            raise Exception("propVar %s is NaN: %s", (PC, propvar))
    print " Good!"
    print "Checking eigVec...",
    pcs = pcaResult["pca_model"]["eigVec"]
    verboseprint("pca eigVec:", dump_json(pcs))
    for i, s in enumerate(pcs):
        for r, e in enumerate(s):
            if math.isnan(e):
                raise Exception("Component %s has NaN: %s eigenvector %s",
                                (i, e, s))
    print " Good!"

    print "How many components did we get? (after enum col dropping): %s" % len(
        pcs)

    # now print the top ten. Sorting by the value...getting key,value tuples (so we can see the column)
    # it should match the column numbering..even if it skips cols due to enums
    import operator
    print "Just look at the sort for the first row in pca eigVec"
    i = 0
    s = pcs[i]
    # print "s:", s
    unsorted_s = [(i, j) for i, j in enumerate(s)]
    sorted_s = sorted(unsorted_s, key=lambda t: abs(t[1]), reverse=True)
    print "\n%s First (larger). sorted_s: %s\n" % (i, sorted_s)
    print "The last entry from the eigenvector, should have the largest std dev, because it's sorted"
    print "Rule of thumb is we can then look at the sorted values, and guess it's related to column importance"
    print "The sort should be on the abs(), since the signs can be + or -"

    # shouldn't have any errors
    check_sandbox_for_errors()
    return warnings
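The abs-value sort on one eigenvector, as a standalone sketch with made-up loadings:

s = [0.12, -0.83, 0.05, 0.54]  # loadings for one principal component
unsorted_s = [(i, j) for i, j in enumerate(s)]
sorted_s = sorted(unsorted_s, key=lambda t: abs(t[1]), reverse=True)
print sorted_s  # [(1, -0.83), (3, 0.54), (0, 0.12), (2, 0.05)]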
Example #37
def runExec(node=None, timeoutSecs=20, **kwargs):
    if not node: node = h2o_nodes.nodes[0]
    # no such thing as GLMView..don't use retryDelaySecs
    a = node.exec_query(timeoutSecs, **kwargs)
    check_sandbox_for_errors()
    return a
Example #38
def pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=15, retryDelaySecs=5):
    busy = True
    trials = 0

    start = time.time()
    polls = 0
    statSum = {}
    # just init for worst case 64 nodes?
    lastUsedMemBytes = [1 for i in range(64)]
    while busy:
        polls += 1
        # get utilization and print it
        # any busy jobs
        a = h2o_nodes.nodes[0].jobs(timeoutSecs=60)
        busy = False
        for j in a['jobs']:
            msec = j.get('msec', None)
            if j['status']!='DONE':
                busy = True
                verboseprint("Still busy")
                break

        cloudStatus = h2o_nodes.nodes[0].get_cloud(timeoutSecs=timeoutSecs)
        nodes = cloudStatus['nodes']
        for i,n in enumerate(nodes):

            # check for drop in tot_mem_bytes, and report as "probably post GC"
            totMemBytes = n['tot_mem_bytes']
            maxMemBytes = n['max_mem_bytes']
            freeMemBytes = n['free_mem_bytes']

            usedMemBytes = totMemBytes - freeMemBytes
            availMemBytes = maxMemBytes - usedMemBytes
            print 'Node %s:' % i, \
                'num_cpus:', n['num_cpus'],\
                'my_cpu_%:', n['my_cpu_%'],\
                'sys_cpu_%:', n['sys_cpu_%'],\
                'system_load:', n['system_load'],\
                'tot_mem_bytes: {:,}'.format(totMemBytes),\
                'max_mem_bytes: {:,}'.format(maxMemBytes),\
                'free_mem_bytes: {:,}'.format(freeMemBytes),\
                'usedMemBytes: {:,}'.format(usedMemBytes)

            decrease = round((0.0 + lastUsedMemBytes[i] - usedMemBytes) / lastUsedMemBytes[i], 3)
            if decrease > .05:
                print
                print "\nProbably GC at Node {:}: usedMemBytes decreased by {:f} pct.. {:,} {:,}".format(i, 100 * decrease, lastUsedMemBytes[i], usedMemBytes)
                lastUsedMemBytes[i] = usedMemBytes
            # don't update lastUsedMemBytes if we're decreasing
            if usedMemBytes > lastUsedMemBytes[i]:
                lastUsedMemBytes[i] = usedMemBytes
            
            # sum all individual stats
            for stat in n:
                if stat in statSum:
                    try: 
                        statSum[stat] += n[stat]
                    except TypeError:
                        # raise Exception("statSum[stat] should be number %s %s" % (statSum[stat], stat, n[stat]))
                        print "ERROR: statSum[stat] should be number %s %s %s" % (statSum[stat], stat, n[stat])
                        # do nothing
                else:
                    try: 
                        statSum[stat] = n[stat] + 0.0
                    except TypeError:
                        pass # ignore non-numbers

        trials += 1
        if trials%5 == 0:
            check_sandbox_for_errors()

        time.sleep(retryDelaySecs)
        if not h2o_args.no_timeout and ((time.time() - start) > timeoutSecs):
            raise Exception("Timeout while polling in pollStatsWhileBusy: %s seconds" % timeoutSecs)
    

    # now print the per-poll means
    print "Did %s polls" % polls
    statMean = {}
    print "Values are summed across all nodes (cloud members), so divide by node count"
    for s in statSum:
        statMean[s] = round((statSum[s] + 0.0) / polls, 2)
        print "per poll mean", s + ':', statMean[s]

    return  statMean
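The 'probable GC' detection above, reduced to a standalone sketch over a made-up series of used-memory samples:

lastUsed = 1  # init to 1 to avoid divide-by-zero on the first sample
for used in [500, 900, 1200, 600, 800]:
    decrease = round((0.0 + lastUsed - used) / lastUsed, 3)
    if decrease > .05:
        print "probable GC: used mem dropped by %.1f pct (%s -> %s)" % (100 * decrease, lastUsed, used)
        lastUsed = used
    # don't update lastUsed if we're decreasing
    if used > lastUsed:
        lastUsed = used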
Example #39
def pollWaitJobs(pattern=None, errorIfCancelled=False, timeoutSecs=60, pollTimeoutSecs=60, retryDelaySecs=5, benchmarkLogging=None, stallForNJobs=None):
    wait = True
    waitTime = 0
    ignoredJobs = set()
    while (wait):
        a = h2o_nodes.nodes[0].jobs(timeoutSecs=pollTimeoutSecs)
        verboseprint("jobs():", dump_json(a))
        jobs = a['jobs']
        busy = 0
        for j in jobs:
            cancelled = j['status']=='CANCELLED'
            description = j['description']
            key = j['key']
            jobKey = key['name']
            jobKeyType = key['type']

#          "key": {
#            "URL": "/3/Jobs.json/$0301c0a8002232d4ffffffff$_95036c2ef3f74468c63861fd826149c2", 
#            "__meta": {
#              "schema_name": "JobKeyV1", 
#              "schema_type": "Key<Job>", 
#              "schema_version": 1
#            }, 
#            "name": "$0301c0a8002232d4ffffffff$_95036c2ef3f74468c63861fd826149c2", 
#            "type": "Key<Job>"
#    
            progress = j['progress']
            progress_msg = j['progress_msg']

            # has exception and val?
            start_time = j['start_time']
            end_time = j.get('end_time', None)
            dest = j['dest']
            description = j['description']
            msec = j.get('msec', None)

            # for now, don't ignore any exceptions

            # FIX! what do exceptions look like now?
            if 'exception' in j and j['exception']:
                check_sandbox_for_errors()
                msg = "ERROR: pollWaitJobs found a job with a exception result when it shouldn't have:\n %s" % dump_json(j)
                raise Exception(msg)

            if errorIfCancelled and cancelled:
                check_sandbox_for_errors()
                print ("ERROR: not stopping, but: pollWaitJobs found a cancelled job when it shouldn't have:\n %s" % dump_json(j))
                print ("Continuing so maybe a json response will give more info")
                
            ### verboseprint(j)
            # don't include cancelled jobs here
            elif j['status']!='DONE':
                if not pattern: 
                    # always print progress if busy job (no pattern used)
                    print "time:", time.strftime("%I:%M:%S"), "progress:",  progress, dest
                    verboseprint("description:", description, "end_time:", end_time)
                    busy +=1
                    verboseprint("pollWaitJobs: found a busy job, now: %s" % busy)
                else:
                    if (pattern in key) or (pattern in dest) or (pattern in description):
                        ## print "description:", description, "end_time:", end_time
                        busy += 1
                        verboseprint("pollWaitJobs: found a pattern-matched busy job, now %s" % busy)
                        # always print progress if pattern is used and matches
                        print "time:", time.strftime("%I:%M:%S"), "progress:",  progress, dest
                    # we only want to print the warning message once
                    elif key not in ignoredJobs:
                        jobMsg = "%s %s %s" % (key, description, dest)
                        verboseprint(" %s job in progress but we're ignoring it. Doesn't match pattern." % jobMsg)
                        # I guess "key" is supposed to be unique over all time for a job id?
                        ignoredJobs.add(key)

        if stallForNJobs:
            waitFor = stallForNJobs
        else:
            waitFor = 0

        print " %s jobs in progress." % busy, "Waiting until %s in progress." % waitFor
        wait = busy > waitFor
        if not wait:
            break

        ### h2b.browseJsonHistoryAsUrlLastMatch("Jobs")
        if not h2o_args.no_timeout and (wait and waitTime > timeoutSecs):
            print dump_json(jobs)
            raise Exception("Some queued jobs haven't completed after", timeoutSecs, "seconds")

        sys.stdout.write('.')
        sys.stdout.flush()
        time.sleep(retryDelaySecs)
        waitTime += retryDelaySecs

        # any time we're sitting around polling we might want to save logging info (cpu/disk/jstack)
        # test would pass ['cpu','disk','jstack'] kind of list
        if benchmarkLogging:
            h2o.cloudPerfH2O.get_log_save(benchmarkLogging)

        # check the sandbox for stack traces! just like we do when polling normally
        check_sandbox_for_errors()

    patternKeys = []
    for j in jobs:
        # save the destination keys in progress that match pattern (for returning)
        if pattern and pattern in j['dest']:
            patternKeys.append(j['dest'])

    return patternKeys
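A hedged usage sketch for pollWaitJobs (assumes a cloud with at least one running job; the pattern string is a placeholder):

# wait for any job whose key/dest/description contains 'GLM' to finish
patternKeys = pollWaitJobs(pattern='GLM', timeoutSecs=300, retryDelaySecs=5)
print "destination keys that matched the pattern:", patternKeys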
Example #40
File: h2o_bc.py Project: AI-Cdrone/h2o
def build_cloud(node_count=1, base_port=None, hosts=None,
    timeoutSecs=30, retryDelaySecs=1, cleanup=True, rand_shuffle=True,
    conservative=False, create_json=False, clone_cloud=None, 
    init_sandbox=True, usecloud=False, usecloud_size=None, **kwargs):

    # expectedSize is only used if usecloud

    # usecloud can be passed thru build_cloud param, or command line 
    # not in config json though so no build_cloud_with_hosts path.

    # redirect to build_cloud_with_json if a command line arg
    # wants to force a test to ignore its build_cloud/build_cloud_with_hosts
    # (both come thru here)
    # clone_cloud is just another way to get the effect (maybe ec2 config file thru
    # build_cloud_with_hosts?)
    global stdout_wrapped
    if not h2o_args.disable_time_stamp and not stdout_wrapped:
        sys.stdout = OutWrapper(sys.stdout)
        stdout_wrapped = True

    if h2o_args.usecloud or usecloud:
        # for now, just have fixed name in local file.  (think of this as a temp or debug file)
        # eventually we'll pass the json object instead  for speed?
        nodesJsonPathname = "h2o_fc-nodes.json"

    elif h2o_args.clone_cloud_json:
        nodesJsonPathname = h2o_args.clone_cloud_json

    elif clone_cloud:
        nodesJsonPathname = clone_cloud

    else:
        # normal build_cloud() doesn't use
        nodesJsonPathname = None

    # usecloud dominates over all
    if (h2o_args.clone_cloud_json or clone_cloud) or (h2o_args.usecloud or usecloud):
        # then build_cloud_with_json with json object
        # we don't need to specify these defaults, but leave here to show that we can pass
        # I suppose kwargs will have it
        if h2o_args.usecloud:
            ip_port = h2o_args.usecloud
        elif usecloud:
            ip_port = usecloud
        else:
            ip_port = None

        # h2o_args dominates
        if h2o_args.usecloud_size:
            # only used for expected size
            useCloudExpectedSize = h2o_args.usecloud_size
        else: 
            useCloudExpectedSize = usecloud_size

        nodesJsonObject = h2o_fc.find_cloud(ip_port=ip_port,
            expectedSize=useCloudExpectedSize, nodesJsonPathname=nodesJsonPathname, **kwargs)
            # potentially passed in kwargs
            # hdfs_version='cdh4', hdfs_config=None, hdfs_name_node='172.16.1.176', 

        nodeList = build_cloud_with_json(h2o_nodes_json=nodesJsonPathname)
        return nodeList

    # else
    # moved to here from unit_main. so will run with nosetests too!
    # Normally do this.
    # Don't if build_cloud_with_hosts() did and put a flatfile in there already!
    if init_sandbox:
        clean_sandbox()

    log("#*********************************************************************")
    log("Starting new test: " + h2o_args.python_test_name + " at build_cloud() ")
    log("#*********************************************************************")

    # start up h2o to report the java version (once). output to python stdout
    # only do this for regression testing

    # temporarily disable this, to go a little faster
    #    if getpass.getuser() == 'jenkins':
    #        check_h2o_version()

    ports_per_node = 2
    nodeList = []
    # shift the port used to run groups of tests on the same machine at the same time?
    base_port  = get_base_port(base_port)

    try:
        # if no hosts list, use psutil method on local host.
        totalNodes = 0
        # doing this list outside the loops so we can shuffle for better test variation
        # this jvm startup shuffle is independent from the flatfile shuffle
        portList = [base_port + ports_per_node * i for i in range(node_count)]
        if hosts is None:
            # if use_flatfile, we should create it
            # because tests will just call build_cloud with use_flatfile=True
            # best to just create it all the time..may or may not be used
            write_flatfile(node_count=node_count, base_port=base_port)
            hostCount = 1
            if rand_shuffle:
                random.shuffle(portList)
            for p in portList:
                verboseprint("psutil starting node", i)
                newNode = LocalH2O(port=p, node_id=totalNodes, **kwargs)
                nodeList.append(newNode)
                totalNodes += 1
        else:
            # if hosts, the flatfile was created and uploaded to hosts already
            # I guess don't recreate it, don't overwrite the one that was copied beforehand.
            # we don't always use the flatfile (use_flatfile=False)
            # Suppose we could dispatch from the flatfile to match its contents
            # but sometimes we want to test with a bad/different flatfile than the one we invoke h2o with?
            hostCount = len(hosts)
            hostPortList = []
            for h in hosts:
                for port in portList:
                    hostPortList.append((h, port))
            if rand_shuffle: random.shuffle(hostPortList)
            for (h, p) in hostPortList:
                verboseprint('ssh starting node', totalNodes, 'via', h)
                newNode = h.remote_h2o(port=p, node_id=totalNodes, **kwargs)
                nodeList.append(newNode)
                totalNodes += 1

        verboseprint("Attempting Cloud stabilize of", totalNodes, "nodes on", hostCount, "hosts")
        start = time.time()
        # UPDATE: best to stabilize on the last node!
        stabilize_cloud(nodeList[0], nodeList,
            timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, noSandboxErrorCheck=True)
        verboseprint(len(nodeList), "Last added node stabilized in ", time.time() - start, " secs")
        verboseprint("Built cloud: %d nodes on %d hosts, in %d s" % \
            (len(nodeList), hostCount, (time.time() - start)))
        h2p.red_print("Built cloud:", nodeList[0].java_heap_GB, "GB java heap(s) with",
            len(nodeList), "total nodes")

        # FIX! using "consensus" in node[-1] should mean this is unnecessary?
        # maybe there's a bug. For now do this. long term: don't want?
        # UPDATE: do it for all cases now 2/14/13
        if conservative: # still needed?
            for n in nodeList:
                stabilize_cloud(n, nodeList, timeoutSecs=timeoutSecs, noSandboxErrorCheck=True)

        # this does some extra checking now
        # verifies cloud name too if param is not None
        verify_cloud_size(nodeList, expectedCloudName=nodeList[0].cloud_name)

        # best to check for any errors due to cloud building right away?
        check_sandbox_for_errors(python_test_name=h2o_args.python_test_name)

    except:
        # nodeList might be empty in some exception cases?
        # no shutdown issued first, though
        if cleanup and nodeList:
            for n in nodeList: n.terminate()
        check_sandbox_for_errors(python_test_name=h2o_args.python_test_name)
        raise

    print len(nodeList), "total jvms in H2O cloud"
    # put the test start message in the h2o log, to create a marker
    nodeList[0].h2o_log_msg()

    if h2o_args.config_json:
        LOG_DIR = get_sandbox_name()
        # like cp -p. Save the config file, to sandbox
        print "Saving the ", h2o_args.config_json, "we used to", LOG_DIR
        shutil.copy(h2o_args.config_json, LOG_DIR + "/" + os.path.basename(h2o_args.config_json))

    # Figure out some stuff about how this test was run
    cs_time = str(datetime.datetime.now())
    cs_cwd = os.getcwd()
    cs_python_cmd_line = "python %s %s" % (h2o_args.python_test_name, h2o_args.python_cmd_args)
    cs_python_test_name = h2o_args.python_test_name
    if h2o_args.config_json:
        cs_config_json = os.path.abspath(h2o_args.config_json)
    else:
        cs_config_json = None
    cs_username = h2o_args.python_username
    cs_ip = h2o_args.python_cmd_ip

    # dump the nodes state to a json file # include enough extra info to have someone
    # rebuild the cloud if a test fails that was using that cloud.
    if create_json:
        q = {
            'cloud_start':
                {
                    'time': cs_time,
                    'cwd': cs_cwd,
                    'python_test_name': cs_python_test_name,
                    'python_cmd_line': cs_python_cmd_line,
                    'config_json': cs_config_json,
                    'username': cs_username,
                    'ip': cs_ip,
                },
            'h2o_nodes': h2o_util.json_repr(nodeList),
        }

        with open('h2o-nodes.json', 'w+') as f:
            f.write(json.dumps(q, indent=4))

    # save it to a local global copy, in case it's needed for tearDown
    h2o_nodes.nodes[:] = nodeList
    return nodeList
Example #41
0
def build_cloud(node_count=1, base_port=None, hosts=None,
    timeoutSecs=30, retryDelaySecs=1, cleanup=True, rand_shuffle=True,
    conservative=False, create_json=False, clone_cloud=None,
    init_sandbox=True, usecloud=False, usecloud_size=None, **kwargs):

    # expectedSize is only used if usecloud

    # usecloud can be passed thru build_cloud param, or command line
    # not in config json though so no build_cloud_with_hosts path.

    # redirect to build_cloud_with_json if a command line arg
    # wants to force a test to ignore its build_cloud/build_cloud_with_hosts
    # (both come thru here)
    # clone_cloud is just another way to get the effect (maybe an ec2 config file thru
    # build_cloud_with_hosts?)
    global stdout_wrapped
    if not h2o_args.disable_time_stamp and not stdout_wrapped:
        sys.stdout = OutWrapper(sys.stdout)
        stdout_wrapped = True

    if h2o_args.usecloud or usecloud:
        # for now, just have fixed name in local file.  (think of this as a temp or debug file)
        # eventually we'll pass the json object instead  for speed?
        nodesJsonPathname = "h2o_fc-nodes.json"

    elif h2o_args.clone_cloud_json:
        nodesJsonPathname = h2o_args.clone_cloud_json

    elif clone_cloud:
        nodesJsonPathname = clone_cloud

    else:
        # normal build_cloud() doesn't use a nodes json file
        nodesJsonPathname = None

    # usecloud dominates over all
    if (h2o_args.clone_cloud_json or clone_cloud) or (h2o_args.usecloud or usecloud):
        # then build_cloud_with_json with json object
        # we don't need to specify these defaults, but leave here to show that we can pass
        # I suppose kwargs will have it
        if h2o_args.usecloud:
            ip_port = h2o_args.usecloud
        elif usecloud:
            ip_port = usecloud
        else:
            ip_port = None

        # h2o_args dominates
        if h2o_args.usecloud_size:
            # only used for expected size
            useCloudExpectedSize = h2o_args.usecloud_size
        else:
            useCloudExpectedSize = usecloud_size

        if (h2o_args.usecloud or usecloud):
            nodesJsonObject = h2o_fc.find_cloud(ip_port=ip_port,
                expectedSize=useCloudExpectedSize, nodesJsonPathname=nodesJsonPathname, **kwargs)
                # potentially passed in kwargs
                # hdfs_version='cdh4', hdfs_config=None, hdfs_name_node='172.16.1.176',
        else:
            if h2o_args.clone_cloud_json:
                nodesJsonPathname = h2o_args.clone_cloud_json
            else:
                nodesJsonPathname = clone_cloud

        nodeList = build_cloud_with_json(h2o_nodes_json=nodesJsonPathname)
        return nodeList

    # else
    # moved to here from unit_main. so will run with nosetests too!
    # Normally do this.
    # Don't if build_cloud_with_hosts() did and put a flatfile in there already!
    if init_sandbox:
        clean_sandbox()

    log("#*********************************************************************")
    log("Starting new test: " + h2o_args.python_test_name + " at build_cloud() ")
    log("#*********************************************************************")

    # start up h2o to report the java version (once). output to python stdout
    # only do this for regression testing

    # temporarily disable this, to go a little faster
    #    if getpass.getuser() == 'jenkins':
    #        check_h2o_version()

    ports_per_node = 2
    nodeList = []
    # shift the port used to run groups of tests on the same machine at the same time?
    base_port  = get_base_port(base_port)

    try:
        # if no hosts list, use psutil method on local host.
        totalNodes = 0
        # doing this list outside the loops so we can shuffle for better test variation
        # this jvm startup shuffle is independent from the flatfile shuffle
        portList = [base_port + ports_per_node * i for i in range(node_count)]
        if hosts is None:
            # if use_flatfile, we should create it
            # because tests will just call build_cloud with use_flatfile=True
            # best to just create it all the time..may or may not be used
            write_flatfile(node_count=node_count, base_port=base_port)
            hostCount = 1
            if rand_shuffle:
                random.shuffle(portList)
            for p in portList:
                verboseprint("psutil starting node", totalNodes)
                newNode = LocalH2O(port=p, node_id=totalNodes, **kwargs)
                nodeList.append(newNode)
                totalNodes += 1
        else:
            # if hosts, the flatfile was created and uploaded to hosts already
            # I guess don't recreate it, don't overwrite the one that was copied beforehand.
            # we don't always use the flatfile (use_flatfile=False)
            # Suppose we could dispatch from the flatfile to match its contents
            # but sometimes we want to test with a bad/different flatfile than the one we invoke h2o with?
            hostCount = len(hosts)
            hostPortList = []
            for h in hosts:
                for port in portList:
                    hostPortList.append((h, port))
            if rand_shuffle: random.shuffle(hostPortList)
            for (h, p) in hostPortList:
                verboseprint('ssh starting node', totalNodes, 'via', h)
                newNode = h.remote_h2o(port=p, node_id=totalNodes, **kwargs)
                nodeList.append(newNode)
                totalNodes += 1

        verboseprint("Attempting Cloud stabilize of", totalNodes, "nodes on", hostCount, "hosts")
        start = time.time()
        # UPDATE: best to stabilize on the last node!
        # FIX! for now, always check sandbox, because h2oddev has TIME_WAIT port problems
        stabilize_cloud(nodeList[0], nodeList,
            timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, noExtraErrorCheck=False)
        stabilizeTime = time.time() - start
        verboseprint("Cloud of", len(nodeList), "nodes stabilized in", stabilizeTime, "secs")

        # assume all the heap sizes are the same as node zero's
        if nodeList[0].java_heap_GB:
            heapSize = str(nodeList[0].java_heap_GB) + " GB"
        elif nodeList[0].java_heap_MB:
            heapSize = str(nodeList[0].java_heap_MB) + " MB"
        else:
            heapSize = "(unknown)"

        h2p.red_print("Built cloud: %s java heap(s) with %d nodes on %d hosts, stabilizing in %d secs" % \
            (heapSize, len(nodeList), hostCount, stabilizeTime))

        # FIX! using "consensus" in node[-1] should mean this is unnecessary?
        # maybe there's a bug. For now do this. long term: don't want?
        # UPDATE: do it for all cases now 2/14/13
        if conservative: # still needed?
            for n in nodeList:
                # FIX! for now, always check sandbox, because h2oddev has TIME_WAIT port problems
                stabilize_cloud(n, nodeList, timeoutSecs=timeoutSecs, noExtraErrorCheck=False)

        # this does some extra checking now
        # verifies cloud name too if param is not None
        verify_cloud_size(nodeList, expectedCloudName=nodeList[0].cloud_name, expectedLocked=0)

        # FIX! should probably check that the cloud's lock=0. It will go to 1 later.
        # but if it's an existing cloud, it may already be locked.
        # That will be in build_cloud_with_json, though

        # best to check for any errors due to cloud building right away?
        check_sandbox_for_errors(python_test_name=h2o_args.python_test_name)

        # put the test start message in the h2o log, to create a marker
        nodeList[0].h2o_log_msg()

    except:
        # nodeList might be empty in some exception cases?
        # no shutdown issued first, though
        if cleanup and nodeList:
            for n in nodeList: n.terminate()
        check_sandbox_for_errors(python_test_name=h2o_args.python_test_name)
        raise

    print len(nodeList), "total jvms in H2O cloud"

    if h2o_args.config_json:
        # like cp -p. Save the config file, to sandbox
        print "Saving the ", h2o_args.config_json, "we used to", LOG_DIR
        shutil.copy(h2o_args.config_json, LOG_DIR + "/" + os.path.basename(h2o_args.config_json))

    if create_json:
        # Figure out some stuff about how this test was run
        cs_time = str(datetime.datetime.now())
        cs_cwd = os.getcwd()
        cs_python_cmd_line = "python %s %s" % (h2o_args.python_test_name, h2o_args.python_cmd_args)
        cs_python_test_name = h2o_args.python_test_name
        if h2o_args.config_json:
            cs_config_json = os.path.abspath(h2o_args.config_json)
        else:
            cs_config_json = None
        cs_username = h2o_args.python_username
        cs_ip = h2o_args.python_cmd_ip

        # dump the nodes state to a json file # include enough extra info to have someone
        # rebuild the cloud if a test fails that was using that cloud.
        q = {
            'cloud_start':
                {
                    'time': cs_time,
                    'cwd': cs_cwd,
                    'python_test_name': cs_python_test_name,
                    'python_cmd_line': cs_python_cmd_line,
                    'config_json': cs_config_json,
                    'username': cs_username,
                    'ip': cs_ip,
                },
            'h2o_nodes': h2o_util.json_repr(nodeList),
        }

        with open('h2o-nodes.json', 'w+') as f:
            f.write(json.dumps(q, indent=4))

    # save it to a local global copy, in case it's needed for tearDown
    h2o_nodes.nodes[:] = nodeList
    return nodeList
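As a usage sketch only (not from the source): a test might build and tear down the cloud roughly like this, assuming this harness module is importable as `h2o` and that a `tear_down_cloud()` helper exists alongside `build_cloud()`.

import unittest
import h2o   # assumption: the harness module shown above

class TestCloudBasic(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # start 3 local JVMs; build_cloud() also saves the list in h2o_nodes.nodes for tearDown
        cls.nodes = h2o.build_cloud(node_count=3, create_json=True)

    @classmethod
    def tearDownClass(cls):
        # assumption: the harness provides a tear_down_cloud() that terminates the JVMs
        h2o.tear_down_cloud()

    def test_cloud_came_up(self):
        self.assertEqual(len(self.nodes), 3)

if __name__ == '__main__':
    unittest.main()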
Example #42
0
    def do_json_request(self,
                        jsonRequest=None,
                        fullUrl=None,
                        timeout=10,
                        params=None,
                        returnFast=False,
                        cmd='get',
                        extraComment=None,
                        ignoreH2oError=False,
                        noExtraErrorCheck=False,
                        **kwargs):
        # if the fullUrl param is used, use it as the full url. otherwise create it from the jsonRequest
        if fullUrl:
            url = fullUrl
        else:
            url = self.url(jsonRequest)

        # remove any params that are 'None'
        # need to copy dictionary, since can't delete while iterating
        if params is not None:
            params2 = params.copy()
            for k in params2:
                if params2[k] is None:
                    del params[k]
            paramsStr = '?' + '&'.join(
                ['%s=%s' % (k, v) for (k, v) in params.items()])
        else:
            paramsStr = ''

        if extraComment:
            log('Start ' + url + paramsStr, comment=extraComment)
        else:
            log('Start ' + url + paramsStr)

        log_rest("")
        log_rest(
            "----------------------------------------------------------------------\n"
        )
        if extraComment:
            log_rest("# Extra comment info about this request: " +
                     extraComment)
        if cmd == 'get':
            log_rest("GET")
        else:
            log_rest("POST")
        log_rest(url + paramsStr)

        # file get passed thru kwargs here
        try:
            if cmd == 'post':
                r = requests.post(url,
                                  timeout=timeout,
                                  params=params,
                                  **kwargs)
            else:
                r = requests.get(url, timeout=timeout, params=params, **kwargs)

        except Exception, e:
            # rethrow the exception after we've checked for stack trace from h2o
            # out of memory errors maybe don't show up right away? so we should wait for h2o
            # to get it out to h2o stdout. We don't want to rely on cloud teardown to check
            # because there's no delay, and we don't want to delay all cloud teardowns by waiting.
            exc_info = sys.exc_info()
            # use this to ignore the initial connection errors during build cloud when h2o is coming up
            if not noExtraErrorCheck:
                h2p.red_print(
                    "ERROR: got exception on %s to h2o. \nGoing to check sandbox, then rethrow.."
                    % (url + paramsStr))
                time.sleep(2)
                check_sandbox_for_errors(
                    python_test_name=h2o_args.python_test_name)
            log_rest("")
            log_rest("EXCEPTION CAUGHT DOING REQUEST: " + str(e.message))
            raise exc_info[1], None, exc_info[2]
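The None-stripping and query-string construction above can be shown in isolation. A minimal, self-contained sketch of the idiom (not harness code; the param names are made up):

def params_to_query_string(params):
    # copy the dict, since we can't delete from a dict while iterating over it
    params2 = params.copy()
    for k in params2:
        if params2[k] is None:
            del params[k]
    return '?' + '&'.join(['%s=%s' % (k, v) for (k, v) in params.items()])

print params_to_query_string({'source': 'p.hex', 'separator': None, 'header': 1})
# prints something like '?header=1&source=p.hex' (dict ordering is arbitrary in Python 2)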
Example #43
0
def simpleCheckKMeans(self, modelResult, parameters, numRows, numCols, labels):
    # labels should have the ignored columns removed
    # numCols should be decremented by the ignored columns
    # the names order should then match the labels order

    output = modelResult['models'][0]['output']
    # print "model output:", dump_json(output)
    # find out what results we get
    ko = KMeansOutput(output)
    if 1==0:
        for attr, value in ko.__dict__.iteritems():
            # create some python prints to use
            print "%s = ko.%s # %s" % (attr, attr, value)

    # these should sum to the rows in the dataset
    rows = ko.rows # [78, 5, 41, 76]
    model_category = ko.model_category # Clustering
    iters = ko.iters # 11.0
    schema_version = ko.schema_version # 2
    domains = ko.domains # [None, None, None, None, None, None, None, None, None, None, None, None, None, None]
    # 
    names = ko.names # [u'STR', u'OBS', u'AGMT', u'FNDX', u'HIGD', u'DEG', u'CHK', u'AGP1', u'AGMN', u'NLV', u'LIV', u'WT', u'AGLP', u'MST']
    schema_name = ko.schema_name # KMeansModelOutputV2
    schema_type = ko.schema_type # KMeansOutput
    ncats = ko.ncats # 0
    clusters = ko.clusters # [ 4 lists of centers ]
    mse = ko.mse # 505.632581773
    mses = ko.mses # [476.37866653867707, 563.7343365736649, 606.3007046232348, 477.5260498976912]

    if numRows:
        assert numRows==sum(rows)

    if 'K' in parameters:
        K = parameters['K']
        assert len(mses) == K
        assert len(clusters) == K
        assert len(rows) == K

    if numCols:
        assert len(names) == numCols, \
            "Need to pass correct numCols after ignored columns decrement %s %s" % (len(names), numCols)
        for c in clusters:
            assert len(c) == numCols, "%s %s" % (len(c), numCols)

    # this should be true 
    if labels:
        assert len(labels) == numCols, \
            "Need to pass correct labels and numCols after ignored columns removal %s %s" % (len(labels), numCols)
        assert len(labels) == len(names), \
            "Need to pass correct labels after ignored columns removal %s %s" % (len(labels), len(names))
        assert labels == names

    if 'max_iters' in parameters:
        max_iters = parameters['max_iters']
        assert max_iters >= iters

    # we could check the centers are within the min/max of each column
    for i,c in enumerate(clusters):
        for n in c:
            if math.isnan(float(n)):
                raise Exception("cluster", i, "has NaN:", n, "center:", c)

    # shouldn't have any errors
    check_sandbox_for_errors()

    # create a tuple for each cluster result, then sort by the cluster centers for easy comparison
    # put a cluster index in there too (leftmost) so we don't lose track of the original order
    tuples = zip(range(len(clusters)), mses, rows, clusters)
    tuplesSorted = sorted(tuples, key=itemgetter(3))

    # undo for printing what the caller will see
    ids, mses, rows, clusters = zip(*tuplesSorted)

    print "\nmse:", mse
    print "iters:", iters
    print "ids:", ids
    print "mses:", mses
    print "rows:", rows
    for i,c in enumerate(clusters):
        print "cluster id %s (2 places):" % ids[i], h2o_util.twoDecimals(c)

    
    # to unzip the tuplesSorted. zip with *
    # ids, mses, rows, clusters = zip(*tuplesSorted)
    return tuplesSorted, iters, mse, names
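The zip/sort/unzip pattern used above to put clusters in a deterministic order (so two runs can be compared) looks like this on made-up values:

from operator import itemgetter

mses     = [563.7, 476.4, 606.3]
rows     = [5, 78, 41]
clusters = [[9.1, 2.2], [1.0, 3.5], [4.4, 0.7]]   # made-up centers

tuples = zip(range(len(clusters)), mses, rows, clusters)
tuplesSorted = sorted(tuples, key=itemgetter(3))   # sorts by the center lists

ids, mses, rows, clusters = zip(*tuplesSorted)     # unzip back into parallel tuples
print "ids:", ids      # (1, 2, 0) -- cluster order is now deterministic across runs
print "rows:", rows    # (78, 41, 5)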
Example #44
0
    def __init__(self,
                 kmeansResult,
                 parameters,
                 numRows,
                 numCols,
                 labels,
                 noPrint=False,
                 **kwargs):
        super(KMeansObj, self).__init__(kmeansResult['models'][0]['output'],
                                        "KMeans",
                                        noPrint=noPrint)

        print self.withinss  # per cluster
        print self.totss
        print self.tot_withinss
        print self.betweenss

        # should model builder add this to the kmeansResult?
        if 'python_elapsed' in kmeansResult:
            self.python_elapsed = kmeansResult['python_elapsed']

        size = self.size  # [78, 5, 41, 76]
        model_category = self.model_category  # Clustering
        iterations = self.iterations  # 11.0
        domains = self.domains
        names = self.names
        categorical_column_count = self.categorical_column_count  # 0
        centers_data = self.centers.data  # [ 4 lists of centers ]
        # h2o returns it sliced across centers. transpose the list of lists, drop 0 which is the cluster id?
        # gotta turn the strings into numbers
        centersStr = [list(x) for x in zip(*centers_data[1:])]
        centers = [map(float, c) for c in centersStr]

        withinss = self.withinss
        totss = self.totss

        if numRows:
            assert numRows == sum(size)

        if 'k' in parameters:
            k = parameters['k']
            assert len(centers) == k
            assert len(size) == k

        if numCols:
            assert len(names) == numCols, \
                "Need to pass correct numCols after ignored columns decrement %s %s %s" % (len(names), numCols, names)
            for c in centers:
                assert len(c) == numCols, "%s %s" % (len(c), numCols)

        # this should be true
        if labels:
            assert len(labels) == numCols, \
                "Need to pass correct labels and numCols after ignored columns removal %s %s" % (len(labels), numCols)
            assert len(labels) == len(names), \
                "Need to pass correct labels after ignored columns removal %s %s" % (len(labels), len(names))
            assert labels == names

        if 'max_iterations' in parameters:
            max_iterations = parameters['max_iterations']
            assert max_iterations >= iterations

        # we could check the centers are within the min/max of each column
        for i, c in enumerate(centers):
            for n in c:
                if math.isnan(float(n)):
                    raise Exception("cluster", i, "has NaN:", n, "center:", c)

        # create a tuple for each cluster result, then sort by rows for easy comparison
        # maybe should sort by centers?
        # put a cluster index in there too, (leftmost) so we don't lose track
        tuples = zip(range(len(centers)), centers, size, withinss)
        # print "tuples:", dump_json(tuples)
        # can we sort on the sum of the centers?
        self.tuplesSorted = sorted(tuples, key=lambda tup: sum(tup[1]))

        print "iterations:", iterations
        # undo for printing what the caller will see
        ids, centers, size, withinss = zip(*self.tuplesSorted)
        for i, c in enumerate(centers):
            print "cluster id %s (2 places):" % ids[i], h2o_util.twoDecimals(c)
            print "rows_per_cluster[%s]: " % i, size[i]
            print "withinss[%s]: " % i, withinss[i]
            print "size[%s]:" % i, size[i]

        print "KMeansObj created for:", "???"  # vars(self)

        # shouldn't have any errors
        check_sandbox_for_errors()
Example #45
0
def simpleCheckGLM(
    self,
    model,
    parameters,
    labelList,
    labelListUsed,
    allowFailWarning=False,
    allowZeroCoeff=False,
    prettyPrint=False,
    noPrint=False,
    maxExpectedIterations=None,
    doNormalized=False,
    allowNaN=False,
):

    # FIX! the structure is all different
    return

    warnings = ""
    # binomial = model.binomial
    residual_deviance = model.training_metrics.residual_deviance

    threshold = model.training_metrics.threshold
    check_obj_has_good_numbers(threshold, "threshold", allowNaN=allowNaN)

    auc = model.AUC
    # NaN if not logistic
    # check_obj_has_good_numbers(auc, 'model.AUC')

    best_lambda_idx = model.best_lambda_idx
    model_category = model.model_category
    name = model.name
    residual_degrees_of_freedom = model.residual_degrees_of_freedom

    # is this no longer used?
    coefficients_magnitude = model.coefficients_magnitude

    null_deviance = model.null_deviance
    check_obj_has_good_numbers(null_deviance, "model.null_deviance", allowNaN=allowNaN)

    null_degrees_of_freedom = model.null_degrees_of_freedom
    check_obj_has_good_numbers(null_degrees_of_freedom, "model.null_degrees_of_freedom", allowNaN=allowNaN)

    domains = model.domains

    # when is this okay to be NaN?
    AIC = model.AIC
    check_obj_has_good_numbers(AIC, "model.AIC", allowNaN=allowNaN)

    names = model.names

    coeffs_names = model.coefficients_table.data[0]

    # these are returned as quoted strings. Turn them into numbers
    temp = model.coefficients_table.data[1]
    assert len(coeffs_names) == len(temp), "%s %s" % (len(coeffs_names), len(temp))

    # we need coefficients to be floats or empty
    check_obj_has_good_numbers(temp, "model.coeffs", allowNaN=False)
    # print "temp", temp[0:10]
    # print "temp[5489:5500]", temp[5489:5500]

    # UPDATE: None (null json) is legal for coeffs
    coeffs = map(lambda x: float(x) if (x is not None and str(x) != "") else 0, temp)

    intercept = coeffs[-1]
    interceptName = coeffs_names[-1]
    assert interceptName == "Intercept"

    assert len(coeffs) == len(coeffs_names), "%s %s" % (len(coeffs), len(coeffs_names))
    # FIX! if a coeff is zeroed/ignored, it doesn't show up?
    # get rid of intercept in glm response
    # assert (len(coeffs)-1) == len(labelListUsed, \
    #    "%s %s %s %s" % (len(coeffs), len(labelListUsed), coeffs, labelListUsed)

    # labelList still has the response column?
    # ignored columns aren't in model.names, but output response is.
    # labelListUsed has the response col removed so add 1

    # Hmm..dropped coefficients again? can't do this check?
    # assert len(model.names) == len(labelListUsed), \
    #    "%s %s %s %s" % (len(model.names), len(labelListUsed), model.names, labelList)

    # this is no longer true!
    # assert model.threshold!=0

    print "len(coeffs)", len(coeffs)
    print "coeffs:", coeffs

    # last one is intercept
    if interceptName != "Intercept" or abs(intercept) < 1e-26:
        raise Exception("'Intercept' should be last in coeffs_names %s %s" % (interceptName, intercept))

    y = parameters["response_column"]

    cString = "\n"
    for i, c in enumerate(coeffs_names):
        cString += "%s: %.5e   " % (coeffs_names[i], coeffs[i])

    print cString
    print "\nH2O intercept:\t\t%.5e" % intercept
    print "\nTotal # of coeffs:", len(coeffs_names)

    # intercept is buried in there too
    absIntercept = abs(float(intercept))
    self.assertGreater(
        absIntercept,
        1e-26,
        (
            "abs. value of GLM coeffs['Intercept'] is "
            + str(absIntercept)
            + ", not >= 1e-26 for Intercept"
            + "\n"
            + "parameters:"
            + dump_json(parameters)
        ),
    )

    if (not allowZeroCoeff) and (len(coeffs) > 1):
        s = 0.0
        for c in coeffs:
            s += abs(float(c))

        self.assertGreater(
            s,
            1e-26,
            (
                "sum of abs. value of GLM coeffs/intercept is "
                + str(s)
                + ", not >= 1e-26\n"
                + "parameters:"
                + dump_json(parameters)
            ),
        )

    # shouldn't have any errors
    check_sandbox_for_errors()

    return (warnings, coeffs, intercept)
Example #46
0
def simpleCheckGLM(self,
                   model,
                   parameters,
                   labelList,
                   labelListUsed,
                   allowFailWarning=False,
                   allowZeroCoeff=False,
                   prettyPrint=False,
                   noPrint=False,
                   maxExpectedIterations=None,
                   doNormalized=False):

    warnings = ''

    intercept = model.global_beta[-1]
    interceptName = model.coefficient_names[-1]

    coeffs = model.global_beta[:-1]
    coeffs_names = model.coefficient_names[:-1]

    assert len(coeffs) == (len(model.coefficient_names) - 1)
    assert len(coeffs) == len(labelListUsed), "%s %s" % (coeffs, labelListUsed)

    # labelList still has the response column?
    # ignored columns aren't in model.names, but output response is.
    # labelListUsed has the response col removed so add 1
    assert len(model.names) == (len(labelListUsed) +
                                1), "%s %s" % (model.names, labelList)
    assert model.threshold != 0

    print "len(coeffs)", len(coeffs)
    print "coeffs:", coeffs

    # last one is intercept
    if interceptName != "Intercept" or abs(intercept) < 1e-26:
        raise Exception(
            "'Intercept' should be last in coefficient_names and global_beta %s %s"
            % (interceptName, intercept))

    y = parameters['response_column']

    cString = "\n"
    for i, c in enumerate(coeffs_names):
        cString += "%s: %.5e   " % (coeffs_names[i], coeffs[i])

    print cString
    print "\nH2O intercept:\t\t%.5e" % intercept
    print "\nTotal # of coeffs:", len(coeffs_names)

    # intercept is buried in there too
    absIntercept = abs(float(intercept))
    self.assertGreater(absIntercept, 1e-26,
                       ("abs. value of GLM coeffs['Intercept'] is " +
                        str(absIntercept) + ", not >= 1e-26 for Intercept" +
                        "\n" + "parameters:" + dump_json(parameters)))

    if (not allowZeroCoeff) and (len(coeffs) > 1):
        s = 0.0
        for c in coeffs:
            s += abs(float(c))

        self.assertGreater(
            s, 1e-26,
            ("sum of abs. value of GLM coeffs/intercept is " + str(s) +
             ", not >= 1e-26\n" + "parameters:" + dump_json(parameters)))

    # shouldn't have any errors
    check_sandbox_for_errors()

    return (warnings, coeffs, intercept)
Example #47
0
File: h2o_glm.py Project: JMR-b/h2o-dev
def oldSimpleCheckGLM(self, glm, colX, allowFailWarning=False, allowZeroCoeff=False,
    prettyPrint=False, noPrint=False, maxExpectedIterations=None, doNormalized=False, **kwargs):
    # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter

    # h2o GLM will verboseprint the result and print errors. 
    # so don't have to do that
    # different when cross validation  is used? No trainingErrorDetails?
    GLMModel = glm['glm_model']
    if not GLMModel:
        raise Exception("GLMModel didn't exist in the glm response? %s" % dump_json(glm))

    warnings = None
    if 'warnings' in GLMModel and GLMModel['warnings']:
        warnings = GLMModel['warnings']
        # stop on failed
        x = re.compile("failed", re.IGNORECASE)
        # don't stop if fail to converge
        c = re.compile("converge", re.IGNORECASE)
        for w in warnings:
            print "\nwarning:", w
            if re.search(x,w) and not allowFailWarning: 
                if re.search(c,w):
                    # ignore the fail to converge warning now
                    pass
                else: 
                    # stop on other 'fail' warnings (are there any? fail to solve?
                    raise Exception(w)

    # for key, value in glm.iteritems(): print key
    # not in GLMGrid?

    # FIX! don't get GLMParams if it can't solve?
    GLMParams = GLMModel['glm']
    family = GLMParams["family"]

    # number of submodels = number of lambda
    # min of 2. lambda_max is first
    submodels = GLMModel['submodels']
    # since all our tests?? only use one lambda, the best_lambda_idx should = 1
    best_lambda_idx = GLMModel['best_lambda_idx']
    print "best_lambda_idx:", best_lambda_idx
    lambda_max = GLMModel['lambda_max']
    print "lambda_max:", lambda_max

    # currently lambda_max is not set by tomas, i.e. it's not valid
    if 1==0 and (lambda_max <= submodels[best_lambda_idx].lambda_value):
        raise Exception("lambda_max %s should always be > the lambda result %s we're checking" % (lambda_max, submodels[best_lambda_idx].lambda_value))

    # submodels0 = submodels[0]
    # submodels1 = submodels[-1] # hackery to make it work when there's just one

    if (best_lambda_idx >= len(submodels)) or (best_lambda_idx < 0):
        raise Exception("best_lambda_idx: %s should point to one of lambdas (which has len %s)" % (best_lambda_idx, len(submodels)))

    if (best_lambda_idx >= len(submodels)) or (best_lambda_idx < 0):
        raise Exception("best_lambda_idx: %s should point to one of submodels (which has len %s)" % (best_lambda_idx, len(submodels)))

    submodels1 = submodels[best_lambda_idx] # hackery to make it work when there's just one
    iterations = submodels1['iteration']


    print "GLMModel/iterations:", iterations

    # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter
    if maxExpectedIterations is not None and iterations > maxExpectedIterations:
        raise Exception("Convergence issue? GLM did iterations: %d which is greater than expected: %d" % (iterations, maxExpectedIterations))

    if 'validation' not in submodels1:
        raise Exception("Should be a 'validation' key in submodels1: %s" % dump_json(submodels1))
    validationsList = submodels1['validation']
    validations = validationsList
        
    # xval. compare what we asked for and what we got.
    n_folds = kwargs.setdefault('n_folds', None)

    print "GLMModel/validations"        
    validations['null_deviance'] = h2o_util.cleanseInfNan(validations['null_deviance'])
    validations['residual_deviance'] = h2o_util.cleanseInfNan(validations['residual_deviance'])        
    print "%15s %s" % ("null_deviance:\t", validations['null_deviance'])
    print "%15s %s" % ("residual_deviance:\t", validations['residual_deviance'])

    # threshold only there if binomial?
    # auc only for binomial
    if family=="binomial":
        print "%15s %s" % ("auc:\t", validations['auc'])
        best_threshold = validations['best_threshold']
        thresholds = validations['thresholds']
        print "%15s %s" % ("best_threshold:\t", best_threshold)

        # have to look up the index for the cm, from the thresholds list
        best_index = None

        for i,t in enumerate(thresholds):
            if t >= best_threshold: # ends up using next one if not present
                best_index = i
                break
            
        assert best_index!=None, "%s %s" % (best_threshold, thresholds)
        print "Now printing the right 'best_threshold' %s from '_cms'" % best_threshold

        # cm = glm['glm_model']['submodels'][0]['validation']['_cms'][-1]
        submodels = glm['glm_model']['submodels']
        # FIX! this isn't right if we have multiple lambdas? different submodels?
        cms = submodels[0]['validation']['_cms']
        self.assertEqual(len(thresholds), len(cms), 
            msg="thresholds %s and cm %s should be lists of the same size. %s" % (len(thresholds), len(cms), thresholds))
        # FIX! best_threshold isn't necessarily in the list. jump out if >=
        assert best_index<len(cms), "%s %s" % (best_index, len(cms))
        # if we want 0.5..rounds to int
        # mid = len(cms)/2
        # cm = cms[mid]
        cm = cms[best_index]

        print "cm:", dump_json(cm['_arr'])
        predErr = cm['_predErr']
        classErr = cm['_classErr']
        # compare to predErr
        # pctWrong = h2o_gbm.pp_cm_summary(cm['_arr']);
        # FIX!
        pctWrong = 0
        print "predErr:", predErr
        print "calculated pctWrong from cm:", pctWrong
        print "classErr:", classErr

        # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")

        print "\nTrain\n==========\n"
        # print h2o_gbm.pp_cm(cm['_arr'])


    if family=="poisson" or family=="gaussian":
        print "%15s %s" % ("aic:\t", validations['aic'])

    coefficients_names = GLMModel['coefficients_names']
    # print "coefficients_names:", coefficients_names
    idxs = submodels1['idxs']
    print "idxs:", idxs
    coefficients_names = coefficients_names

    # always check both normalized and normal coefficients
    norm_beta = submodels1['norm_beta']
    # if norm_beta and len(coefficients_names)!=len(norm_beta):
    #    print len(coefficients_names), len(norm_beta)
    #    raise Exception("coefficients_names and normalized_norm_beta from h2o json not same length. coefficients_names: %s normalized_norm_beta: %s" % (coefficients_names, norm_beta))
#
    beta = submodels1['beta']
    # print "beta:", beta
    # if len(coefficients_names)!=len(beta):
    #    print len(coefficients_names), len(beta)
    #    raise Exception("coefficients_names and beta from h2o json not same length. coefficients_names: %s beta: %s" % (coefficients_names, beta))


    # test wants to use normalized?
    if doNormalized:
        beta_used = norm_beta
    else:
        beta_used = beta

    coefficients = {}
    # create a dictionary with name, beta (including intercept) just like v1

    for i,b in zip(idxs, beta_used[:-1]):
        name = coefficients_names[i]
        coefficients[name] = b

    print "len(idxs)", len(idxs), "len(beta_used)", len(beta_used)
    print  "coefficients:", coefficients
    print  "beta:", beta
    print  "norm_beta:", norm_beta

    coefficients['Intercept'] = beta_used[-1]
    print "len(coefficients_names)", len(coefficients_names)
    print "len(idxs)", len(idxs)
    print "idxs[-1]", idxs[-1]
    print "intercept demapping info:", \
        "coefficients_names[-i]:", coefficients_names[-1], \
        "idxs[-1]:", idxs[-1], \
        "coefficients_names[idxs[-1]]:", coefficients_names[idxs[-1]], \
        "beta_used[-1]:", beta_used[-1], \
        "coefficients['Intercept']", coefficients['Intercept']

    # last one is intercept
    interceptName = coefficients_names[idxs[-1]]
    if interceptName != "Intercept" or abs(beta_used[-1])<1e-26:
        raise Exception("'Intercept' should be last in coefficients_names and beta %s %s %s" %\
            (idxs[-1], beta_used[-1], "-"+interceptName+"-"))

    # idxs has the order for non-zero coefficients, it's shorter than beta_used and coefficients_names
    # new 5/28/14. glm can point to zero coefficients
    # for i in idxs:
    #     if beta_used[i]==0.0:
    ##        raise Exception("idxs shouldn't point to any 0 coefficients i: %s %s:" % (i, beta_used[i]))
    if len(idxs) > len(beta_used):
        raise Exception("idxs shouldn't be longer than beta_used %s %s" % (len(idxs), len(beta_used)))
    intercept = coefficients.pop('Intercept', None)

    # intercept demapping info: idxs[-1]: 54 coefficients_names[idxs[-1]]: Intercept beta_used[-1]: -6.6866753099
    # the last one should be 'Intercept' ?
    coefficients_names.pop()

    # have to skip the output col! get it from kwargs
    # better always be there!
    y = kwargs['response']

    # the dict keys are column headers if they exist...how to order those? new: use the 'coefficients_names'
    # from the response
    # Tomas created 'coefficients_names which is the coefficient list in order.
    # Just use it to index coefficients! works for header or no-header cases
    # I guess now we won't print the "None" cases for dropped columns (constant columns!)
    # Because Tomas doesn't get everything in 'coefficients_names' if dropped by GLMQuery before
    # he gets it? 
    def add_to_coefficient_list_and_string(c, cList, cString):
        if c in coefficients:
            cValue = coefficients[c]
            cValueString = "%s: %.5e   " % (c, cValue)
        else:
            print "Warning: didn't see '" + c + "' in json coefficient response.",\
                  "Inserting 'None' with assumption it was dropped due to constant column)"
            cValue = None
            cValueString = "%s: %s   " % (c, cValue)

        cList.append(cValue)
        # we put each on newline for easy comparison to R..otherwise keep condensed
        if prettyPrint: 
            cValueString = "H2O coefficient " + cValueString + "\n"
        # not mutable?
        return cString + cValueString

    # creating both a string for printing and a list of values
    cString = ""
    cList = []
    # print in order using col_names
    # coefficients_names is input only now..same for header or no header, or expanded enums
    for c in coefficients_names:
        cString = add_to_coefficient_list_and_string(c, cList, cString)

    if prettyPrint: 
        print "\nH2O intercept:\t\t%.5e" % intercept
        print cString
    else:
        if not noPrint:
            print "\nintercept:", intercept, cString

    print "\nTotal # of coefficients:", len(coefficients_names)

    # pick out the coefficient for the column we enabled for enhanced checking. Can be None.
    # FIX! temporary hack to deal with disappearing/renaming columns in GLM
    if (not allowZeroCoeff) and (colX is not None):
        absXCoeff = abs(float(coefficients[str(colX)]))
        # add kwargs to help debug without looking at console log
        self.assertGreater(absXCoeff, 1e-26, (
            "abs. value of GLM coefficients['" + str(colX) + "'] is " +
            str(absXCoeff) + ", not >= 1e-26 for X=" + str(colX) +  "\n" +
            "kwargs:" + dump_json(kwargs)
            ))

    # intercept is buried in there too
    absIntercept = abs(float(intercept))
    self.assertGreater(absIntercept, 1e-26, (
        "abs. value of GLM coefficients['Intercept'] is " +
        str(absIntercept) + ", not >= 1e-26 for Intercept" + "\n" +
        "kwargs:" + dump_json(kwargs)
        ))

    # this is good if we just want min or max
    # maxCoeff = max(coefficients, key=coefficients.get)
    # for more, just invert the dictionary and ...
    if (len(coefficients)>0):
        maxKey = max([(abs(coefficients[x]),x) for x in coefficients])[1]
        print "H2O Largest abs. coefficient value:", maxKey, coefficients[maxKey]
        minKey = min([(abs(coefficients[x]),x) for x in coefficients])[1]
        print "H2O Smallest abs. coefficient value:", minKey, coefficients[minKey]
    else: 
        print "Warning, no coefficients returned. Must be intercept only?"

    # many of the GLM tests aren't single column though.
    # quick and dirty check: if all the coefficients are zero, 
    # something is broken
    # intercept is in there too, but this will get it okay
    # just sum the abs value  up..look for greater than 0

    # skip this test if there is just one coefficient. Maybe pointing to a non-important coeff?
    if (not allowZeroCoeff) and (len(coefficients)>1):
        s = 0.0
        for c in coefficients:
            v = coefficients[c]
            s += abs(float(v))

        self.assertGreater(s, 1e-26, (
            "sum of abs. value of GLM coefficients/intercept is " + str(s) + ", not >= 1e-26\n" +
            "kwargs:" + dump_json(kwargs)
            ))

    print "submodels1, run_time (milliseconds):", submodels1['run_time']

    # shouldn't have any errors
    check_sandbox_for_errors()

    return (warnings, cList, intercept)
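A toy sketch of the idxs/beta demapping done above: idxs holds the indices of the non-zero coefficients into the full coefficients_names list, and the intercept rides along as the last beta. All values below are made up.

coefficients_names = ['AGE', 'SEX', 'BP', 'CHOL', 'Intercept']
idxs = [0, 2, 4]                   # AGE, BP and the Intercept survived (non-zero)
beta_used = [0.31, -1.27, -6.69]   # same order as idxs; the last entry is the intercept

coefficients = {}
for i, b in zip(idxs, beta_used[:-1]):   # zip stops before the intercept
    coefficients[coefficients_names[i]] = b
coefficients['Intercept'] = beta_used[-1]

print coefficients    # e.g. {'AGE': 0.31, 'BP': -1.27, 'Intercept': -6.69} (key order may vary)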
Example #48
0
def pollWaitJobs(pattern=None, errorIfCancelled=False, timeoutSecs=60, pollTimeoutSecs=60, retryDelaySecs=5, benchmarkLogging=None, stallForNJobs=None):
    wait = True
    waitTime = 0
    ignoredJobs = set()
    while (wait):
        a = h2o_nodes.nodes[0].jobs(timeoutSecs=pollTimeoutSecs)
        verboseprint("jobs():", dump_json(a))
        jobs = a['jobs']
        busy = 0
        for j in jobs:
            cancelled = j['cancelled'] or (j['result'].get('val', None)=='CANCELLED')
            description = j['description']
            destination_key = j['destination_key']
            end_time = j['end_time']
            key = j['key']
            progress = j['progress']
            # has exception and val?
            result = j['result']
            start_time = j['start_time']

            # for now, don't ignore any exceptions
            if 'exception' in result and result['exception']:
                check_sandbox_for_errors()
                msg = "ERROR: pollWaitJobs found a job with an exception result when it shouldn't have:\n %s" % dump_json(j)
                raise Exception(msg)

            if result:
                # ignore if 'val' is 'OK'
                if 'val' in result and result['val'] == 'OK':
                    pass
                else:
                    print "non-empty result: %s for %s" % (result, key)

            if errorIfCancelled and cancelled:
                check_sandbox_for_errors()
                print ("ERROR: not stopping, but: pollWaitJobs found a cancelled job when it shouldn't have:\n %s" % dump_json(j))
                print ("Continuing so maybe a json response will give more info")
                
            ### verboseprint(j)
            # don't include cancelled jobs here
            elif end_time=='' and not cancelled:
                if not pattern: 
                    # always print progress if busy job (no pattern was given)
                    print "time:", time.strftime("%I:%M:%S"), "progress:",  progress, destination_key
                    verboseprint("description:", description, "end_time:", end_time)
                    busy +=1
                    verboseprint("pollWaitJobs: found a busy job, now: %s" % busy)
                else:
                    if (pattern in key) or (pattern in destination_key) or (pattern in description):
                        ## print "description:", description, "end_time:", end_time
                        busy += 1
                        verboseprint("pollWaitJobs: found a pattern-matched busy job, now %s" % busy)
                        # always print progress if pattern is used and matches
                        print "time:", time.strftime("%I:%M:%S"), "progress:",  progress, destination_key
                    # we only want to print the warning message once
                    elif key not in ignoredJobs:
                        jobMsg = "%s %s %s" % (key, description, destination_key)
                        verboseprint(" %s job in progress but we're ignoring it. Doesn't match pattern." % jobMsg)
                        # I guess "key" is supposed to be unique over all time for a job id?
                        ignoredJobs.add(key)

        if stallForNJobs:
            waitFor = stallForNJobs
        else:
            waitFor = 0

        print " %s jobs in progress." % busy, "Waiting until %s in progress." % waitFor
        wait = busy > waitFor
        if not wait:
            break

        ### h2b.browseJsonHistoryAsUrlLastMatch("Jobs")
        if not h2o_args.no_timeout and (wait and waitTime > timeoutSecs):
            print dump_json(jobs)
            raise Exception("Some queued jobs haven't completed after", timeoutSecs, "seconds")

        sys.stdout.write('.')
        sys.stdout.flush()
        time.sleep(retryDelaySecs)
        waitTime += retryDelaySecs

        # any time we're sitting around polling we might want to save logging info (cpu/disk/jstack)
        # test would pass ['cpu','disk','jstack'] kind of list
        if benchmarkLogging:
            h2o.cloudPerfH2O.get_log_save(benchmarkLogging)

        # check the sandbox for stack traces! just like we do when polling normally
        check_sandbox_for_errors()

    patternKeys = []
    for j in jobs:
        # save the destination keys in progress that match pattern (for returning)
        if pattern and pattern in j['destination_key']:
            patternKeys.append(j['destination_key'])

    return patternKeys
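The structure of the polling loop above, reduced to a self-contained sketch with the jobs() REST call replaced by a stub. None of these names come from the harness; this only illustrates the wait-until-idle-or-timeout shape.

import time

def poll_until_done(get_busy_count, stallForNJobs=0, timeoutSecs=60, retryDelaySecs=5):
    # keep polling until the number of busy jobs drops to the allowed level, or we time out
    waitTime = 0
    while True:
        busy = get_busy_count()
        print " %s jobs in progress. Waiting until %s in progress." % (busy, stallForNJobs)
        if busy <= stallForNJobs:
            return
        if waitTime > timeoutSecs:
            raise Exception("Some queued jobs haven't completed after %s seconds" % timeoutSecs)
        time.sleep(retryDelaySecs)
        waitTime += retryDelaySecs

state = {'busy': 3}
def fake_busy_count():
    # pretend one job finishes per poll
    state['busy'] = max(0, state['busy'] - 1)
    return state['busy']

poll_until_done(fake_busy_count, retryDelaySecs=0)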
Example #49
0
def simpleCheckGBMView(node=None, gbmv=None, noPrint=False, **kwargs):
    if not node:
        node = h2o_nodes.nodes[0]

    if 'warnings' in gbmv:
        warnings = gbmv['warnings']
        # catch the 'Failed to converge" for now
        for w in warnings:
            if not noPrint: print "\nwarning:", w
            if ('Failed' in w) or ('failed' in w):
                raise Exception(w)

    if 'cm' in gbmv:
        cm = gbmv['cm'] # only one
    else:
        if 'gbm_model' in gbmv:
            gbm_model = gbmv['gbm_model']
        else:
            raise Exception("no gbm_model in gbmv? %s" % dump_json(gbmv))

        cms = gbm_model['cms']
        print "number of cms:", len(cms)
        print "FIX! need to add reporting of h2o's _perr per class error"
        # FIX! what if regression. is rf only classification?
        print "cms[-1]['_arr']:", cms[-1]['_arr']
        print "cms[-1]['_predErr']:", cms[-1]['_predErr']
        print "cms[-1]['_classErr']:", cms[-1]['_classErr']

        ## print "cms[-1]:", dump_json(cms[-1])
        ## for i,c in enumerate(cms):
        ##    print "cm %s: %s" % (i, c['_arr'])

        cm = cms[-1]['_arr'] # take the last one

    scoresList = cm

    used_trees = gbm_model['N']
    errs = gbm_model['errs']
    print "errs[0]:", errs[0]
    print "errs[-1]:", errs[-1]
    print "errs:", errs
    # if we got the ntree for comparison. Not always there in kwargs though!
    param_ntrees = kwargs.get('ntrees',None)
    if (param_ntrees is not None and used_trees != param_ntrees):
        raise Exception("used_trees should == param_ntrees. used_trees: %s param_ntrees: %s" % (used_trees, param_ntrees))
    if (used_trees+1)!=len(cms) or (used_trees+1)!=len(errs):
        raise Exception("len(cms): %s and len(errs): %s should be one more than N %s trees" % (len(cms), len(errs), used_trees))

    totalScores = 0
    totalRight = 0
    # individual scores can be all 0 if nothing for that output class
    # due to sampling
    classErrorPctList = []
    predictedClassDict = {} # may be missing some? so need a dict?
    for classIndex,s in enumerate(scoresList):
        classSum = sum(s)
        if classSum == 0 :
            # why would the number of scores for a class be 0? does GBM CM have entries for non-existent classes
            # in a range??..in any case, tolerate. (it shows up in test.py on poker100)
            if not noPrint: print "class:", classIndex, "classSum", classSum, "<- why 0?"
        else:
            # H2O should really give me this since it's in the browser, but it doesn't
            classRightPct = ((s[classIndex] + 0.0)/classSum) * 100
            totalRight += s[classIndex]
            classErrorPct = round(100 - classRightPct, 2)
            classErrorPctList.append(classErrorPct)
            ### print "s:", s, "classIndex:", classIndex
            if not noPrint: print "class:", classIndex, "classSum", classSum, "classErrorPct:", "%4.2f" % classErrorPct

            # gather info for prediction summary
            for pIndex,p in enumerate(s):
                if pIndex not in predictedClassDict:
                    predictedClassDict[pIndex] = p
                else:
                    predictedClassDict[pIndex] += p

        totalScores += classSum

    #****************************
    # compute these outside the noPrint guard; pctWrong is needed below for classification_error
    if totalScores != 0:
        pctRight = 100.0 * totalRight/totalScores
    else:
        pctRight = 0.0
    pctWrong = 100 - pctRight

    if not noPrint: 
        print "Predicted summary:"
        # FIX! Not sure why we weren't working with a list..hack with dict for now
        for predictedClass,p in predictedClassDict.items():
            print str(predictedClass)+":", p

        # this should equal the num rows in the dataset if full scoring? (minus any NAs)
        print "totalScores:", totalScores
        print "totalRight:", totalRight
        print "pctRight:", "%5.2f" % pctRight
        print "pctWrong:", "%5.2f" % pctWrong

    #****************************
    # more testing for GBMView
    # it's legal to get 0's for oobe error # if sample_rate = 1

    sample_rate = kwargs.get('sample_rate', None)
    validation = kwargs.get('validation', None)
    if (sample_rate==1 and not validation): 
        pass
    elif (totalScores<=0 or totalScores>5e9):
        raise Exception("scores in GBMView seems wrong. scores:", scoresList)

    varimp = gbm_model['varimp']
    treeStats = gbm_model['treeStats']
    if not treeStats:
        raise Exception("treeStats not right?: %s" % dump_json(treeStats))
    # print "json:", dump_json(gbmv)
    data_key = gbm_model['_dataKey']
    model_key = gbm_model['_key']
    classification_error = pctWrong

    if not noPrint: 
        if 'minLeaves' not in treeStats or not treeStats['minLeaves']:
            raise Exception("treeStats seems to be missing minLeaves %s" % dump_json(treeStats))
        print """
         Leaves: {0} / {1} / {2}
          Depth: {3} / {4} / {5}
            Err: {6:0.2f} %
        """.format(
                treeStats['minLeaves'],
                treeStats['meanLeaves'],
                treeStats['maxLeaves'],
                treeStats['minDepth'],
                treeStats['meanDepth'],
                treeStats['maxDepth'],
                classification_error,
                )
    
    ### modelInspect = node.inspect(model_key)
    dataInspect = h2o_cmd.runInspect(key=data_key)
    check_sandbox_for_errors()
    return (round(classification_error,2), classErrorPctList, totalScores)
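The per-class error arithmetic above can be checked by hand on a small confusion matrix (rows = actual class, columns = predicted class; the numbers are made up):

# hand-made 3x3 confusion matrix; diagonal entries are the correctly predicted counts
cm = [[50,  2,  3],
      [ 4, 40,  6],
      [ 1,  5, 44]]

totalScores = 0
totalRight = 0
classErrorPctList = []
for classIndex, s in enumerate(cm):
    classSum = sum(s)
    classRightPct = (s[classIndex] + 0.0) / classSum * 100
    classErrorPctList.append(round(100 - classRightPct, 2))
    totalRight += s[classIndex]
    totalScores += classSum

print "per-class error %:", classErrorPctList                      # [9.09, 20.0, 12.0]
print "overall error %:", 100 - 100.0 * totalRight / totalScores   # about 13.5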
Example #50
0
def pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=15, retryDelaySecs=5):
    busy = True
    trials = 0

    start = time.time()
    polls = 0
    statSum = {}
    # just init for worst case 64 nodes?
    lastUsedMemBytes = [1 for i in range(64)]
    while busy:
        polls += 1
        # get utilization and print it
        # any busy jobs
        a = h2o_nodes.nodes[0].jobs(timeoutSecs=60)
        busy = False
        for j in a['jobs']:
            if j['end_time']=='' and not (j['cancelled'] or (j['result'].get('val', None)=='CANCELLED')):
                busy = True
                verboseprint("Still busy")
                break

        cloudStatus = h2o_nodes.nodes[0].get_cloud(timeoutSecs=timeoutSecs)
        nodes = cloudStatus['nodes']
        for i,n in enumerate(nodes):

            # check for drop in tot_mem_bytes, and report as "probably post GC"
            totMemBytes = n['tot_mem_bytes']
            maxMemBytes = n['max_mem_bytes']
            freeMemBytes = n['free_mem_bytes']

            usedMemBytes = totMemBytes - freeMemBytes
            availMemBytes = maxMemBytes - usedMemBytes
            print 'Node %s:' % i, \
                'num_cpus:', n['num_cpus'],\
                'my_cpu_%:', n['my_cpu_%'],\
                'sys_cpu_%:', n['sys_cpu_%'],\
                'system_load:', n['system_load'],\
                'tot_mem_bytes: {:,}'.format(totMemBytes),\
                'max_mem_bytes: {:,}'.format(maxMemBytes),\
                'free_mem_bytes: {:,}'.format(freeMemBytes),\
                'usedMemBytes: {:,}'.format(usedMemBytes)

            decrease = round((0.0 + lastUsedMemBytes[i] - usedMemBytes) / lastUsedMemBytes[i], 3)
            if decrease > .05:
                print
                print "\nProbably GC at Node {:}: usedMemBytes decreased by {:f} pct.. {:,} {:,}".format(i, 100 * decrease, lastUsedMemBytes[i], usedMemBytes)
                lastUsedMemBytes[i] = usedMemBytes
            # don't update lastUsedMemBytes if we're decreasing
            if usedMemBytes > lastUsedMemBytes[i]:
                lastUsedMemBytes[i] = usedMemBytes
            
            # sum all individual stats
            for stat in n:
                if stat in statSum:
                    try: 
                        statSum[stat] += n[stat]
                    except TypeError:
                        # raise Exception("statSum[stat] should be number %s %s" % (statSum[stat], stat, n[stat]))
                        print "ERROR: statSum[stat] should be number %s %s %s" % (statSum[stat], stat, n[stat])
                        # do nothing
                else:
                    try: 
                        statSum[stat] = n[stat] + 0.0
                    except TypeError:
                        pass # ignore non-numbers

        trials += 1
        if trials%5 == 0:
            check_sandbox_for_errors()

        time.sleep(retryDelaySecs)
        if not h2o_args.no_timeout and ((time.time() - start) > timeoutSecs):
            raise Exception("Timeout while polling in pollStatsWhileBusy: %s seconds" % timeoutSecs)
    

    # now print the per-poll means
    print "Did %s polls" % polls
    statMean = {}
    print "Values are summed across all nodes (cloud members), so divide by node count"
    for s in statSum:
        statMean[s] = round((statSum[s] + 0.0) / polls, 2)
        print "per poll mean", s + ':', statMean[s]

    return  statMean
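The "probably GC" heuristic above (flag a node whose used heap shrinks by more than 5% between polls, otherwise just track the high-water mark) isolated into a small sketch with made-up samples:

def update_and_check_gc(lastUsedMemBytes, nodeIndex, usedMemBytes):
    # fraction by which used heap dropped since the last poll (negative means it grew)
    decrease = round((0.0 + lastUsedMemBytes[nodeIndex] - usedMemBytes) / lastUsedMemBytes[nodeIndex], 3)
    if decrease > .05:
        print "Probably GC at Node %s: used bytes dropped %.1f pct (%s -> %s)" % \
            (nodeIndex, 100 * decrease, lastUsedMemBytes[nodeIndex], usedMemBytes)
        lastUsedMemBytes[nodeIndex] = usedMemBytes
    elif usedMemBytes > lastUsedMemBytes[nodeIndex]:
        # don't update the reference value while memory is shrinking slightly
        lastUsedMemBytes[nodeIndex] = usedMemBytes

last = [1 for _ in range(2)]           # worst-case init, as in the harness
for used in [500, 900, 1200, 400]:     # made-up usedMemBytes samples for node 0
    update_and_check_gc(last, 0, used) # only the 1200 -> 400 drop gets flagged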
Example #51
0
def simpleCheckGLM(self,
                   model,
                   parameters,
                   labelList,
                   labelListUsed,
                   allowFailWarning=False,
                   allowZeroCoeff=False,
                   prettyPrint=False,
                   noPrint=False,
                   maxExpectedIterations=None,
                   doNormalized=False,
                   allowNaN=False):

    warnings = ''
    rank = model.rank
    binomial = model.binomial
    residual_deviance = model.residual_deviance

    threshold = model.threshold
    check_obj_has_good_numbers(threshold, 'threshold', allowNaN=allowNaN)

    auc = model.AUC
    # NaN if not logistic
    # check_obj_has_good_numbers(auc, 'model.AUC')

    best_lambda_idx = model.best_lambda_idx
    model_category = model.model_category
    name = model.name
    residual_degrees_of_freedom = model.residual_degrees_of_freedom

    # is this no longer used?
    coefficients_magnitude = model.coefficients_magnitude

    null_deviance = model.null_deviance
    check_obj_has_good_numbers(null_deviance,
                               'model.null_deviance',
                               allowNaN=allowNaN)

    null_degrees_of_freedom = model.null_degrees_of_freedom
    check_obj_has_good_numbers(null_degrees_of_freedom,
                               'model.null_degrees_of_freedom',
                               allowNaN=allowNaN)

    domains = model.domains

    # when is this okay to be NaN?
    AIC = model.AIC
    check_obj_has_good_numbers(AIC, 'model.AIC', allowNaN=allowNaN)

    names = model.names

    coeffs_names = model.coefficients_table.data[0]

    # these are returned as quoted strings. Turn them into numbers
    temp = model.coefficients_table.data[1]
    assert len(coeffs_names) == len(temp), "%s %s" % (len(coeffs_names),
                                                      len(temp))

    # we need coefficients to be floats or empty
    check_obj_has_good_numbers(temp, 'model.coeffs', allowNaN=False)
    # print "temp", temp[0:10]
    # print "temp[5489:5500]", temp[5489:5500]

    # UPDATE: None (null json) is legal for coeffs
    coeffs = map(lambda x: float(x)
                 if (x is not None and str(x) != "") else 0, temp)
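    # note: in Python 2 map() returns a list here; under Python 3 this would need
    # to be wrapped in list(...) before the negative indexing below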

    intercept = coeffs[-1]
    interceptName = coeffs_names[-1]
    assert interceptName == 'Intercept'

    assert len(coeffs) == len(coeffs_names), "%s %s" % (len(coeffs),
                                                        len(coeffs_names))
    # FIX! if a coeff is zeroed/ignored, it doesn't show up?
    # get rid of intercept in glm response
    # assert (len(coeffs)-1) == len(labelListUsed, \
    #    "%s %s %s %s" % (len(coeffs), len(labelListUsed), coeffs, labelListUsed)

    # labelList still has the response column?
    # ignored columns aren't in model.names, but output response is.
    # labelListUsed has the response col removed so add 1

    # Hmm..dropped coefficients again? can't do this check?
    # assert len(model.names) == len(labelListUsed), \
    #    "%s %s %s %s" % (len(model.names), len(labelListUsed), model.names, labelList)

    # this is no longer true!
    # assert model.threshold!=0

    print "len(coeffs)", len(coeffs)
    print "coeffs:", coeffs

    # the last coefficient is the intercept; it should be named 'Intercept' and be non-zero
    if interceptName != "Intercept" or abs(intercept) < 1e-26:
        raise Exception("'Intercept' should be last in coeffs_names and non-zero: %s %s" %
                        (interceptName, intercept))

    y = parameters['response_column']

    cString = "\n"
    for i, c in enumerate(coeffs_names):
        cString += "%s: %.5e   " % (c, coeffs[i])

    print cString
    print "\nH2O intercept:\t\t%.5e" % intercept
    print "\nTotal # of coeffs:", len(coeffs_names)

    # intercept is buried in there too
    absIntercept = abs(float(intercept))
    self.assertGreater(absIntercept, 1e-26,
                       ("abs. value of GLM coeffs['Intercept'] is " +
                        str(absIntercept) + ", not > 1e-26 for Intercept" +
                        "\n" + "parameters:" + dump_json(parameters)))

    if (not allowZeroCoeff) and (len(coeffs) > 1):
        s = 0.0
        for c in coeffs:
            s += abs(float(c))

        self.assertGreater(
            s, 1e-26,
            ("sum of abs. value of GLM coeffs/intercept is " + str(s) +
             ", not > 1e-26\n" + "parameters:" + dump_json(parameters)))

    # shouldn't have any errors
    check_sandbox_for_errors()

    return (warnings, coeffs, intercept)
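
A hedged usage sketch (not part of the original file): how simpleCheckGLM might be driven from inside a test method, assuming `model` is the GLM model object returned by an earlier build and `parameters` is the dict passed to that build. The column names are illustrative.

parameters = {'response_column': 'C55'}
labelList = ['C1', 'C2', 'C3', 'C55']
labelListUsed = ['C1', 'C2', 'C3']
(warnings, coeffs, intercept) = simpleCheckGLM(self, model, parameters,
    labelList, labelListUsed, allowNaN=True)
print "intercept: %.5e  number of coeffs: %s" % (intercept, len(coeffs))
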
Example #52
0
def simpleCheckRFView(node=None, rfv=None, checkScoringOnly=False, noPrint=False, **kwargs):
    if not node:
        node = h2o_nodes.nodes[0]

    if 'warnings' in rfv:
        warnings = rfv['warnings']
        # catch the 'Failed to converge' warning for now
        for w in warnings:
            if not noPrint: print "\nwarning:", w
            if ('Failed' in w) or ('failed' in w):
                raise Exception(w)

    #****************************
    # if we are checking after confusion_matrix for predict, the jsonschema is different

    if 'cm' in rfv:
        cm = rfv['cm'] # only one
    else:
        if 'drf_model' in rfv:
            rf_model = rfv['drf_model']
        elif 'speedrf_model' in rfv:
            rf_model = rfv['speedrf_model']
        elif 'rf_model' in rfv:
            rf_model = rfv['rf_model']
        else:
            raise Exception("no rf_model in rfv? %s" % dump_json(rfv))

        cms = rf_model['cms']
        print "number of cms:", len(cms)
        print "FIX! need to add reporting of h2o's _perr per class error"
        # FIX! what if regression. is rf only classification?
        print "cms[-1]['_arr']:", cms[-1]['_arr']
        print "cms[-1]['_predErr']:", cms[-1]['_predErr']
        print "cms[-1]['_classErr']:", cms[-1]['_classErr']

        ## print "cms[-1]:", dump_json(cms[-1])
        ## for i,c in enumerate(cms):
        ##    print "cm %s: %s" % (i, c['_arr'])

        cm = cms[-1]['_arr'] # take the last one

    scoresList = cm
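    # each row of scoresList is one actual class; the diagonal entry s[classIndex]
    # counts correct predictions, the other columns are mispredictions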

    if not checkScoringOnly:
        used_trees = rf_model['N']
        errs = rf_model['errs']
        print "errs[0]:", errs[0]
        print "errs[-1]:", errs[-1]
        print "errs:", errs
        # compare against ntrees if it was passed in kwargs (it isn't always there)
        param_ntrees = kwargs.get('ntrees', None)
        if (param_ntrees is not None and used_trees != param_ntrees):
            raise Exception("used_trees should == param_ntrees. used_trees: %s param_ntrees: %s" % (used_trees, param_ntrees))
        if (used_trees+1)!=len(cms) or (used_trees+1)!=len(errs):
            raise Exception("len(cms): %s and len(errs): %s should be one more than N %s trees" % (len(cms), len(errs), used_trees))


    #****************************
    totalScores = 0
    totalRight = 0
    # individual scores can be all 0 if nothing for that output class
    # due to sampling
    classErrorPctList = []
    predictedClassDict = {} # may be missing some? so need a dict?
    for classIndex,s in enumerate(scoresList):
        classSum = sum(s)
        if classSum == 0:
            # a class can have zero scores if the CM carries entries for classes that
            # never appear in the sample; tolerate it (shows up in test.py on poker100)
            if not noPrint: print "class:", classIndex, "classSum", classSum, "<- why 0?"
        else:
            # H2O should really give me this since it's in the browser, but it doesn't
            classRightPct = ((s[classIndex] + 0.0)/classSum) * 100
            totalRight += s[classIndex]
            classErrorPct = round(100 - classRightPct, 2)
            classErrorPctList.append(classErrorPct)
            ### print "s:", s, "classIndex:", classIndex
            if not noPrint: print "class:", classIndex, "classSum", classSum, "classErrorPct:", "%4.2f" % classErrorPct

            # gather info for prediction summary
            for pIndex,p in enumerate(s):
                if pIndex not in predictedClassDict:
                    predictedClassDict[pIndex] = p
                else:
                    predictedClassDict[pIndex] += p

        totalScores += classSum

    #****************************
    # compute overall accuracy up front so pctWrong exists even when noPrint is set
    if totalScores != 0:
        pctRight = 100.0 * totalRight / totalScores
    else:
        pctRight = 0.0
    pctWrong = 100 - pctRight

    if not noPrint:
        print "Predicted summary:"
        # FIX! Not sure why we weren't working with a list..hack with dict for now
        for predictedClass,p in predictedClassDict.items():
            print str(predictedClass)+":", p

        # this should equal the num rows in the dataset if full scoring? (minus any NAs)
        print "totalScores:", totalScores
        print "totalRight:", totalRight
        print "pctRight:", "%5.2f" % pctRight
        print "pctWrong:", "%5.2f" % pctWrong

    if checkScoringOnly:
        check_sandbox_for_errors()
        classification_error = pctWrong
        return (round(classification_error,2), classErrorPctList, totalScores)

    # it's legal to get 0's for oobe error if sample_rate == 1
    sample_rate = kwargs.get('sample_rate', None)
    validation = kwargs.get('validation', None)
    print "sample_rate:", sample_rate, "validation:", validation
    if (sample_rate==1 and not validation): 
        pass
    elif (totalScores<=0 or totalScores>5e9):
        raise Exception("scores in RFView seem wrong. scores: %s" % scoresList)

    varimp = rf_model['varimp']

    if 'importance' in kwargs and kwargs['importance']:
        max_var = varimp['max_var']
        variables = varimp['variables']
        varimpSD = varimp['varimpSD']
        varimp2 = varimp['varimp']

        # what is max_var? it's 100 while the length of the others is 54 for covtype
        if not max_var:
            raise Exception("varimp.max_var is None? %s" % max_var)
        # if not variables:
        #     raise Exception("varimp.variables is None? %s" % variables)
        if not varimpSD:
            raise Exception("varimp.varimpSD is None? %s" % varimpSD)
        if not varimp2:
            raise Exception("varimp.varimp is None? %s" % varimp2)

        # check that they all have the same length and that the importance is not all zero
        # if len(varimpSD)!=max_var or len(varimp2)!=max_var or len(variables)!=max_var:
        #    raise Exception("varimp lists seem to be wrong length: %s %s %s" % \
        #        (max_var, len(varimpSD), len(varimp2), len(variables)))

        # not checking maxvar or variables. Don't know what they should be
        if len(varimpSD) != len(varimp2):
            raise Exception("varimp lists seem to be wrong length: %s %s" % \
                (len(varimpSD), len(varimp2)))

        # the variable importances shouldn't all be (near) zero
        if abs(sum(varimp2)) <= 1e-5:
            raise Exception("Shouldn't have all 0's in varimp %s" % varimp2)

    treeStats = rf_model['treeStats']
    if not treeStats:
        raise Exception("treeStats not right?: %s" % dump_json(treeStats))
    # print "json:", dump_json(rfv)
    data_key = rf_model['_dataKey']
    model_key = rf_model['_key']
    classification_error = pctWrong

    if not noPrint: 
        if 'minLeaves' not in treeStats or not treeStats['minLeaves']:
            raise Exception("treeStats seems to be missing minLeaves %s" % dump_json(treeStats))
        print """
         Leaves: {0} / {1} / {2}
          Depth: {3} / {4} / {5}
            Err: {6:0.2f} %
        """.format(
                treeStats['minLeaves'],
                treeStats['meanLeaves'],
                treeStats['maxLeaves'],
                treeStats['minDepth'],
                treeStats['meanDepth'],
                treeStats['maxDepth'],
                classification_error,
                )
    
    ### modelInspect = node.inspect(model_key)
    dataInspect = h2o_cmd.runInspect(key=data_key)
    check_sandbox_for_errors()
    return (round(classification_error,2), classErrorPctList, totalScores)
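
A hedged usage sketch (illustrative, not from the original file): `rfView` is assumed to be the parsed JSON response from an earlier RF build or RFView call in this harness, and the 20% error threshold is arbitrary.

(classification_error, classErrorPctList, totalScores) = \
    simpleCheckRFView(rfv=rfView, ntrees=50)
if classification_error > 20.0:
    raise Exception("RF classification error too high: %s" % classification_error)
print "per-class error pcts:", classErrorPctList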