def predict_and_compare_csvs(model_key, hex_key, predictHexKey,
                             csvSrcOutputPathname, csvPredictPathname,
                             skipSrcOutputHeader, skipPredictHeader, translate=None, y=0):
    # have to slice out col 0 (the output) and feed result to predict
    # cols are 0:784 (1 output plus 784 input features)
    # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30)
    dataKey = "P.hex"
    h2e.exec_expr(execExpr=dataKey+"="+hex_key, timeoutSecs=30)  # unneeded but interesting
    if skipSrcOutputHeader:
        print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer"
        print "hack for now, can't chop out col 0 in Exec currently"
        dataKey = hex_key
    else:
        print "No header in dataset, can't chop out cols, since col numbers are used for names"
        dataKey = hex_key

    # +1 col index because R-like
    h2e.exec_expr(execExpr="Z.hex="+hex_key+"[,"+str(y+1)+"]", timeoutSecs=30)

    start = time.time()
    predict = h2o_nodes.nodes[0].generate_predictions(model_key=model_key,
        data_key=hex_key, destination_key=predictHexKey)
    print "generate_predictions end on ", hex_key, " took", time.time() - start, 'seconds'
    check_sandbox_for_errors()
    inspect = h2o_cmd.runInspect(key=predictHexKey)
    h2o_cmd.infoFromInspect(inspect, 'predict.hex')

    h2o_nodes.nodes[0].csv_download(src_key="Z.hex", csvPathname=csvSrcOutputPathname)
    h2o_nodes.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname)
    check_sandbox_for_errors()

    print "Do a check of the original output col against predicted output"
    (rowNum1, originalOutput) = compare_csv_at_one_col(csvSrcOutputPathname,
        msg="Original", colIndex=0, translate=translate, skipHeader=skipSrcOutputHeader)
    (rowNum2, predictOutput) = compare_csv_at_one_col(csvPredictPathname,
        msg="Predicted", colIndex=0, skipHeader=skipPredictHeader)

    # no header on source
    if (rowNum1 - skipSrcOutputHeader) != (rowNum2 - skipPredictHeader):
        raise Exception("original rowNum1: %s - %d not same as downloaded predict: rowNum2: %s - %d" %
                        (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader))

    wrong = 0
    for rowNum, (o, p) in enumerate(zip(originalOutput, predictOutput)):
        # if float(o)!=float(p):
        if str(o) != str(p):
            if wrong == 10:
                print "Not printing any more mismatches\n"
            elif wrong < 10:
                msg = "Comparing original output col vs predicted. row %s differs. original: %s predicted: %s" % \
                      (rowNum, o, p)
                print msg
            wrong += 1

    print "\nTotal wrong:", wrong
    print "Total:", len(originalOutput)
    pctWrong = (100.0 * wrong) / len(originalOutput)
    print "wrong/Total * 100 ", pctWrong
    return pctWrong
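# A minimal usage sketch (illustrative only): the model/frame keys, file paths and column
# index below are hypothetical, and assume a model was already trained on a parsed frame
# whose first column is the response.
def _example_predict_and_compare():
    pctWrong = predict_and_compare_csvs(
        model_key='mnist_model',                          # hypothetical model key
        hex_key='mnist_test.hex',                         # hypothetical parsed test frame
        predictHexKey='predict.hex',
        csvSrcOutputPathname='syn_datasets/original_output.csv',
        csvPredictPathname='syn_datasets/predict_output.csv',
        skipSrcOutputHeader=1,                            # 1 if the source csv has a header row
        skipPredictHeader=1,                              # downloaded predict csv has a header
        y=0)                                              # response is column 0
    assert pctWrong < 10, "Expected less than 10%% misprediction, got %s" % pctWrong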
def simpleCheckKMeans(self, kmeans, **kwargs):
    warnings = None
    if 'warnings' in kmeans:
        warnings = kmeans['warnings']
        # catch the 'Failed to converge' for now
        x = re.compile("[Ff]ailed")
        for w in warnings:
            print "\nwarning:", w
            if re.search(x, w):
                raise Exception(w)

    # Check other things in the json response dictionary 'kmeans' here
    destination_key = kmeans['model']['_key']
    # Exception: rjson error in inspect: Argument 'src_key' error: benign_k.hex:Key is not a Frame
    # can't use inspect on a model key? now?
    kmeansResult = kmeans
    model = kmeansResult['model']
    centers = model["centers"]
    size = model["size"]
    cluster_variances = model["within_cluster_variances"]
    error = model["total_within_SS"]
    iterations = model["iterations"]
    normalized = model["normalized"]
    max_iter = model["max_iter"]

    for i, c in enumerate(centers):
        for n in c:
            if math.isnan(float(n)):
                raise Exception("center", i, "has NaN:", n, "center:", c)

    # shouldn't have any errors
    check_sandbox_for_errors()
    return warnings
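# Illustrative sketch only: how the KMeans checker might be driven from a test. The
# dataset path, k, destination key, and the runKMeans wrapper call are assumptions,
# not taken from this file.
def _example_check_kmeans(self):
    parseResult = import_parse(bucket='smalldata', path='logreg/benign.csv', hex_key='benign.hex')
    kmeans = h2o_cmd.runKMeans(parseResult=parseResult, k=3,
                               destination_key='benign_k.hex', timeoutSecs=60)
    # raises on 'Failed to converge' warnings or NaN centers
    warnings = simpleCheckKMeans(self, kmeans, k=3)
    if warnings:
        print "KMeans warnings:", warnings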
def do_json_request(self, jsonRequest=None, fullUrl=None, timeout=10, params=None, postData=None, returnFast=False, cmd='get', extraComment=None, ignoreH2oError=False, noExtraErrorCheck=False, **kwargs): # if url param is used, use it as full url. otherwise create from the jsonRequest if fullUrl: url = fullUrl else: url = self.url(jsonRequest) # remove any params that are 'None' # need to copy dictionary, since can't delete while iterating if params is not None: params2 = params.copy() for k in params2: if params2[k] is None: del params[k] paramsStr = '?' + '&'.join(['%s=%s' % (k, v) for (k, v) in params.items()]) else: paramsStr = '' extraComment2 = " " + str(postData)+";" if cmd=='post' else "" extraComment2 += extraComment if extraComment else "" if len(extraComment2) > 0: log('Start ' + url + paramsStr, comment=extraComment2) else: log('Start ' + url + paramsStr) # file get passed thru kwargs here if h2o_args.no_timeout: timeout = None # infinite try: if 'post' == cmd: # NOTE == cmd: for now, since we don't have deserialization from JSON in h2o-dev, we use form-encoded POST. # This is temporary. # # This following does application/json (aka, posting JSON in the body): # r = requests.post(url, timeout=timeout, params=params, data=json.dumps(postData), **kwargs) # # This does form-encoded, which doesn't allow POST of nested structures r = requests.post(url, timeout=timeout, params=params, data=postData, **kwargs) elif 'delete' == cmd: r = requests.delete(url, timeout=timeout, params=params, **kwargs) elif 'get' == cmd: r = requests.get(url, timeout=timeout, params=params, **kwargs) else: raise ValueError("Unknown HTTP command (expected 'get', 'post' or 'delete'): " + cmd) except Exception, e: # rethrow the exception after we've checked for stack trace from h2o # out of memory errors maybe don't show up right away? so we should wait for h2o # to get it out to h2o stdout. We don't want to rely on cloud teardown to check # because there's no delay, and we don't want to delay all cloud teardowns by waiting. exc_info = sys.exc_info() # use this to ignore the initial connection errors during build cloud when h2o is coming up if not noExtraErrorCheck: h2p.red_print( "ERROR: got exception on %s to h2o. \nGoing to check sandbox, then rethrow.." % (url + paramsStr)) time.sleep(2) check_sandbox_for_errors(python_test_name=h2o_args.python_test_name); raise exc_info[1], None, exc_info[2]
def stabilize(self, test_func, error, timeoutSecs=10, retryDelaySecs=0.5):
    '''Repeatedly test a function waiting for it to return True.

    Arguments:
    test_func      -- A function that will be run repeatedly
    error          -- A function that will be run to produce an error message
                      it will be called with (node, timeTakenSecs, numberOfRetries)
                   OR
                   -- A string that will be interpolated with a dictionary of
                      { 'timeTakenSecs', 'numberOfRetries' }
    timeoutSecs    -- How long in seconds to keep trying before declaring a failure
    retryDelaySecs -- How long to wait between retry attempts
    '''
    start = time.time()
    numberOfRetries = 0
    while h2o_args.no_timeout or (time.time() - start < timeoutSecs):
        if test_func(self, tries=numberOfRetries, timeoutSecs=timeoutSecs):
            break
        time.sleep(retryDelaySecs)
        numberOfRetries += 1
        # hey, check the sandbox if we've been waiting a long time...rather than wait for timeout
        # to find the badness?. can check_sandbox_for_errors at any time
        if (numberOfRetries % 50) == 0:
            check_sandbox_for_errors(python_test_name=h2o_args.python_test_name)
    else:
        timeTakenSecs = time.time() - start
        if isinstance(error, type('')):
            raise Exception('%s failed after %.2f seconds having retried %d times' % (
                error, timeTakenSecs, numberOfRetries))
        else:
            msg = error(self, timeTakenSecs, numberOfRetries)
            raise Exception(msg)
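# Illustrative sketch only: stabilize() is typically handed a predicate (like the cloud
# consensus test() function further below) plus an error string. The cloud size and
# timeouts here are hypothetical.
def _example_stabilize(node):
    def ready(n, tries=None, timeoutSecs=14.0):
        c = n.get_cloud(timeoutSecs=timeoutSecs)
        return c['cloud_size'] == 1 and c['consensus']

    node.stabilize(ready, error='Cloud of size 1 failed to form',
                   timeoutSecs=30, retryDelaySecs=0.5)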
def simpleCheckGLM(self, model, parameters, labelList, labelListUsed, allowFailWarning=False,
                   allowZeroCoeff=False, prettyPrint=False, noPrint=False,
                   maxExpectedIterations=None, doNormalized=False):

    warnings = ''
    intercept = model.global_beta[-1]
    interceptName = model.coefficient_names[-1]

    coeffs = model.global_beta[:-1]
    coeffs_names = model.coefficient_names[:-1]

    assert len(coeffs) == (len(model.coefficient_names) - 1)
    assert len(coeffs) == len(labelListUsed), "%s %s" % (coeffs, labelListUsed)

    # labelList still has the response column?
    # ignored columns aren't in model.names, but output response is.
    # labelListUsed has the response col removed so add 1
    assert len(model.names) == (len(labelListUsed) + 1), "%s %s" % (model.names, labelList)
    assert model.threshold != 0

    print "len(coeffs)", len(coeffs)
    print "coeffs:", coeffs

    # last one is intercept
    if interceptName != "Intercept" or abs(intercept) < 1e-26:
        raise Exception("'Intercept' should be last in coefficient_names and global_beta %s %s" %
                        (interceptName, intercept))

    y = parameters['response_column']

    cString = "\n"
    for i, c in enumerate(coeffs_names):
        cString += "%s: %.5e   " % (coeffs_names[i], coeffs[i])

    print cString
    print "\nH2O intercept:\t\t%.5e" % intercept
    print "\nTotal # of coeffs:", len(coeffs_names)

    # intercept is buried in there too
    absIntercept = abs(float(intercept))
    self.assertGreater(absIntercept, 1e-26, (
        "abs. value of GLM coeffs['Intercept'] is " + str(absIntercept) + ", not >= 1e-26 for Intercept" + "\n" +
        "parameters:" + dump_json(parameters)
    ))

    if (not allowZeroCoeff) and (len(coeffs) > 1):
        s = 0.0
        for c in coeffs:
            s += abs(float(c))

        self.assertGreater(s, 1e-26, (
            "sum of abs. value of GLM coeffs/intercept is " + str(s) + ", not >= 1e-26\n" +
            "parameters:" + dump_json(parameters)
        ))

    # shouldn't have any errors
    check_sandbox_for_errors()
    return (warnings, coeffs, intercept)
def test(n, tries=None, timeoutSecs=14.0): c = n.get_cloud(noExtraErrorCheck=noExtraErrorCheck, timeoutSecs=timeoutSecs) # FIX! unique to h2o-dev for now, because of the port reuse problems (TCP_WAIT) compared to h2o # flag them early rather than after timeout check_sandbox_for_errors(python_test_name=h2o_args.python_test_name) # don't want to check everything. But this will check that the keys are returned! consensus = c['consensus'] locked = c['locked'] cloud_size = c['cloud_size'] cloud_name = c['cloud_name'] if 'nodes' not in c: emsg = "\nH2O didn't include a list of nodes in get_cloud response after initial cloud build" raise Exception(emsg) # only print it when you get consensus if cloud_size != node_count: print "\nNodes in cloud while building:" for i,ci in enumerate(c['nodes']): # 'h2o' disappeared? if 'h2o' not in ci: print "ci:", dump_json(ci) # apparently this can happen in cases where I didn't join a cloud because # of a different md5 version. We'll eventually exception out? # raise Exception("What happened to the 'h2o' ci dict entry?, not there") else: print "node %s" % i, ci['h2o'] ### print "node %s" % i, ci['h2o']['node'] if cloud_size > node_count: emsg = ( "\n\nERROR: cloud_size: %d reported via json is bigger than we expect: %d" % \ (cloud_size, node_count) + "\nLikely have zombie(s) with the same cloud name on the network." + "\nLook at the cloud IP's in 'grep Paxos sandbox/*stdout*' for some IP's you didn't expect." + "\n\nYou probably don't have to do anything, as the cloud shutdown in this test should" + "\nhave sent a Shutdown.json to all in that cloud (you'll see a kill -2 in the *stdout*)." + "\nIf you try again, and it still fails, go to those IPs and kill the zombie h2o's." + "\nIf you think you really have an intermittent cloud build, report it." + "\n" + "\nbuilding cloud size of 2 with 127.0.0.1 may temporarily report 3 incorrectly," + "\nwith no zombie?" ) for ci in c['nodes']: emsg += "\n" + ci['h2o']['node'] raise Exception(emsg) a = (cloud_size == node_count) and consensus if a: verboseprint("\tLocked won't happen until after keys are written") verboseprint("\nNodes in final cloud:") for ci in c['nodes']: verboseprint("ci", ci) # this isn't in there all the time? # verboseprint(ci['h2o']['node']) return a
def tear_down_cloud(nodeList=None, sandboxIgnoreErrors=False, force=False): if h2o_args.sleep_at_tear_down: print "Opening browser to cloud, and sleeping for 3600 secs, before cloud teardown (for debug)" import h2o_browse as h2b h2b.browseTheCloud() sleep(3600) if not nodeList: nodeList = h2o_nodes.nodes # this could fail too. Should this be set by -uc/--usecloud? or command line argument if nodeList and nodeList[0].delete_keys_at_teardown: start = time.time() h2i.delete_keys_at_all_nodes(timeoutSecs=300) elapsed = time.time() - start print "delete_keys_at_all_nodes(): took", elapsed, "secs" # could the nodeList still be empty in some exception cases? Assume not for now # FIX! don't send shutdown if we're using an existing cloud # also, copy the "delete keys at teardown from testdir_release # Assume there's a last "test" that's run to shutdown the cloud # don't tear down with -ccj either # FIX! what about usecloud or cloud_cloud_json params from build_cloud time? if force or not (h2o_args.usecloud or h2o_args.clone_cloud_json): try: # update: send a shutdown to all nodes. # h2o maybe doesn't progagate well if sent to one node # the api watchdog shouldn't complain about this? # just send one? # for n in nodeList: # n.shutdown_all() h2o_nodes.nodes[0].shutdown_all() except: pass # ah subtle. we might get excepts in issuing the shutdown, don't abort out # of trying the process kills if we get any shutdown exception (remember we go to all nodes) # so we might? nodes are shutting down? # FIX! should we wait a bit for a clean shutdown, before we process kill? # It can take more than 1 sec though. try: time.sleep(2) for n in nodeList: n.terminate() verboseprint("tear_down_cloud n:", n) except: pass check_sandbox_for_errors(sandboxIgnoreErrors=sandboxIgnoreErrors, python_test_name=h2o_args.python_test_name) # get rid of all those pesky line marker files. Unneeded now clean_sandbox_doneToLine() nodeList[:] = [] h2o_nodes.nodes = []
def do_json_request(self, jsonRequest=None, fullUrl=None, timeout=10, params=None, returnFast=False, cmd='get', extraComment=None, ignoreH2oError=False, noExtraErrorCheck=False, **kwargs): # if url param is used, use it as full url. otherwise crate from the jsonRequest if fullUrl: url = fullUrl else: url = self.url(jsonRequest) # remove any params that are 'None' # need to copy dictionary, since can't delete while iterating if params is not None: params2 = params.copy() for k in params2: if params2[k] is None: del params[k] paramsStr = '?' + '&'.join(['%s=%s' % (k, v) for (k, v) in params.items()]) else: paramsStr = '' if extraComment: log('Start ' + url + paramsStr, comment=extraComment) else: log('Start ' + url + paramsStr) log_rest("") log_rest("----------------------------------------------------------------------\n") if extraComment: log_rest("# Extra comment info about this request: " + extraComment) if cmd == 'get': log_rest("GET") else: log_rest("POST") log_rest(url + paramsStr) # file get passed thru kwargs here try: if cmd == 'post': r = requests.post(url, timeout=timeout, params=params, **kwargs) else: r = requests.get(url, timeout=timeout, params=params, **kwargs) except Exception, e: # rethrow the exception after we've checked for stack trace from h2o # out of memory errors maybe don't show up right away? so we should wait for h2o # to get it out to h2o stdout. We don't want to rely on cloud teardown to check # because there's no delay, and we don't want to delay all cloud teardowns by waiting. # (this is new/experimental) exc_info = sys.exc_info() # use this to ignore the initial connection errors during build cloud when h2o is coming up if not noExtraErrorCheck: h2p.red_print( "ERROR: got exception on %s to h2o. \nGoing to check sandbox, then rethrow.." % (url + paramsStr)) time.sleep(2) check_sandbox_for_errors(python_test_name=h2o_args.python_test_name); log_rest("") log_rest("EXCEPTION CAUGHT DOING REQUEST: " + str(e.message)) raise exc_info[1], None, exc_info[2]
def import_parse(node=None, schema='local', bucket=None, path=None,
                 src_key=None, hex_key=None,
                 timeoutSecs=30, retryDelaySecs=0.1, initialDelaySecs=0, pollTimeoutSecs=180, noise=None,
                 benchmarkLogging=None, noPoll=False, doSummary=True, noPrint=True,
                 importParentDir=True, **kwargs):

    if not node:
        node = h2o_nodes.nodes[0]

    (importResult, importPattern) = import_only(node, schema, bucket, path,
        timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise,
        benchmarkLogging, noPoll, doSummary, src_key, noPrint, importParentDir, **kwargs)

    verboseprint("importPattern:", importPattern)
    verboseprint("importResult", dump_json(importResult))

    parseResult = parse_only(node, importPattern, hex_key,
        timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise,
        benchmarkLogging, noPoll, **kwargs)
    verboseprint("parseResult:", dump_json(parseResult))

    # do SummaryPage here too, just to get some coverage
    # only if not noPoll. otherwise parse isn't done
    if doSummary and not noPoll:
        # if parse blows up, we want error isolation ..i.e. find stack traces here,
        # rather than the next guy blowing up
        check_sandbox_for_errors()
        inspect = node.inspect(parseResult['destination_key'], timeoutSecs=timeoutSecs)
        numRows = inspect['numRows']
        numCols = inspect['numCols']
        # we pass numCols, for detecting whether the na cnt means a col is all NAs,
        # (for ignoring min/max/mean/sigma)
        node.summary_page(parseResult['destination_key'], timeoutSecs=timeoutSecs,
                          noPrint=noPrint, numRows=numRows, numCols=numCols)
        # for now, don't worry about error isolating summary
    else:
        # isolate a parse from the next thing
        check_sandbox_for_errors()

    return parseResult
def checkScoreResult(self, result, expectedErr, relTol, **kwargs):
    print "Expected score error: " + format(expectedErr)
    print "Actual score error: " + format(result['classification_error'])
    if result['classification_error'] != expectedErr and \
            abs((expectedErr - result['classification_error']) / expectedErr) > relTol:
        raise Exception("Scored classification error of %s is not within %s %% relative error of %s" %
                        (result['classification_error'], float(relTol) * 100, expectedErr))

    warnings = None
    # shouldn't have any errors
    check_sandbox_for_errors()
    return (warnings)
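# Small illustration of the relative-tolerance rule used above: an exact match always passes,
# otherwise |expected - actual| / expected must be <= relTol. The numbers are made up.
def _example_rel_tol():
    expectedErr = 0.082
    actualErr = 0.085
    relTol = 0.10                      # allow 10% relative error
    ok = (actualErr == expectedErr) or abs((expectedErr - actualErr) / expectedErr) <= relTol
    print "within tolerance?", ok      # True: 0.003/0.082 is roughly 3.7% relative error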
def import_parse(node=None, schema='local', bucket=None, path=None, src_key=None, hex_key=None, timeoutSecs=30, retryDelaySecs=0.1, initialDelaySecs=0, pollTimeoutSecs=180, noise=None, benchmarkLogging=None, noPoll=False, doSummary=True, noPrint=True, importParentDir=True, **kwargs): # FIX! hack all put to local, since h2o-dev doesn't have put yet? # multi-machine put will fail as a result. # if schema=='put': # h2p.yellow_print("WARNING: hacking schema='put' to 'local'..h2o-dev doesn't support upload." + # "\nMeans multi-machine with 'put' will fail") # schema = 'local' if not node: node = h2o_nodes.nodes[0] (importResult, importPattern) = import_only(node, schema, bucket, path, timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise, benchmarkLogging, noPoll, doSummary, src_key, noPrint, importParentDir, **kwargs) verboseprint("importPattern:", importPattern) verboseprint("importResult", dump_json(importResult)) assert len(importResult['keys']) >= 1, "No keys imported, maybe bad bucket %s or path %s" % (bucket, path) # print "importResult:", importResult # get rid of parse timing in tests now start = time.time() parseResult = parse_only(node, importPattern, hex_key, importResult['keys'], timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise, benchmarkLogging, noPoll, **kwargs) elapsed = time.time() - start print importPattern, "parsed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs), "\n" parseResult['python_elapsed'] = elapsed verboseprint("parseResult:", dump_json(parseResult)) # do SummaryPage here too, just to get some coverage # only if not noPoll. otherwise parse isn't done if doSummary and not noPoll: # if parse blows up, we want error isolation ..i.e. find stack traces here, rather than the next guy blowing up check_sandbox_for_errors() print "WARNING: not doing inspect/summary for now after parse" ## inspect = node.inspect(parseResult['destination_key'], timeoutSecs=timeoutSecs) ## numRows = inspect['numRows'] ## numCols = inspect['numCols'] # we pass numCols, for detecting whether the na cnt means a col is all NAs, (for ignoring min/max/mean/sigma) ## node.summary_page(parseResult['destination_key'], timeoutSecs=timeoutSecs, noPrint=noPrint, numRows=numRows, numCols=numCols) # for now, don't worry about error isolating summary else: # isolate a parse from the next thing check_sandbox_for_errors() return parseResult
def checkH2OLogs(timeoutSecs=3, expectedMinLines=12, suffix="-1-trace"):
    # download logs from node 0 (this will overwrite)
    h2o_nodes.nodes[0].log_download(timeoutSecs=timeoutSecs)

    # I guess we really don't need to get the list of nodes names from get_cloud any more
    # h2o_172.16.2.222_54321-1-trace.log
    # h2o_172.16.2.222_54321-2-debug.log
    # h2o_172.16.2.222_54321-3-info.log
    # h2o_172.16.2.222_54321-4-warn.log
    # h2o_172.16.2.222_54321-5-error.log
    # h2o_172.16.2.222_54321-6-fatal.log
    def checkit(suffix, expectedMinLines):
        logNameList = ["h2o_" + str(n.http_addr) + "_" + str(n.port) + suffix + ".log" for n in h2o_nodes.nodes]
        lineCountList = []
        for logName in logNameList:
            lineCount = h2o_util.file_line_count(get_sandbox_name() + "/" + logName)
            print logName, "lineCount:", lineCount
            lineCountList.append(lineCount)

        print logNameList
        if len(h2o_nodes.nodes) != len(logNameList):
            raise Exception("Should be %d logs, are %d" % (len(h2o_nodes.nodes), len(logNameList)))
        # line counts seem to vary..check for "too small"
        # variance in polling (cloud building and status)?
        for i, l in enumerate(lineCountList):
            if l < expectedMinLines:
                raise Exception("node %d %s log is too small" % (i, logNameList[i]))
        return (logNameList, lineCountList)

    # just assume the main ones meet the min requirement..and the error ones are min 0
    (logNameList, lineCountList) = checkit("-1-trace", expectedMinLines)
    checkit("-2-debug", expectedMinLines)
    checkit("-3-info", expectedMinLines)
    checkit("-4-warn", 0)
    checkit("-5-error", 0)
    checkit("-6-fatal", 0)

    # now that all the logs are there
    check_sandbox_for_errors()
    return (logNameList, lineCountList)
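# h2o_util.file_line_count isn't shown in this file; a minimal sketch of what such a
# helper might look like (an assumption, not the project's actual implementation):
def _example_file_line_count(pathname):
    count = 0
    with open(pathname, 'r') as f:
        for _ in f:
            count += 1
    return count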
def checkLastValidationError(self, model, rows, expectedErr, relTol, **kwargs):
    errsLast = model['validation_errors'][-1]  # last scoring result
    verboseprint("Deep Learning 'Last scoring on test set:'", dump_json(errsLast))

    expectedSamples = rows * kwargs['epochs']
    print 'Expecting ' + format(expectedSamples) + ' training samples'
    if errsLast['training_samples'] != expectedSamples:
        raise Exception("Number of training samples should be equal to %s" % expectedSamples)

    print "Expected test set error: " + format(expectedErr)
    print "Actual test set error: " + format(errsLast['classification'])
    if errsLast['classification'] != expectedErr and \
            abs((expectedErr - errsLast['classification']) / expectedErr) > relTol:
        raise Exception("Test set classification error of %s is not within %s %% relative error of %s" %
                        (errsLast['classification'], float(relTol) * 100, expectedErr))

    warnings = None
    # shouldn't have any errors
    check_sandbox_for_errors()
    return (warnings)
def do_json_request(addr=None, port=None, jsonRequest=None, params=None, timeout=7, **kwargs):
    if params is not None:
        paramsStr = '?' + '&'.join(['%s=%s' % (k, v) for (k, v) in params.items()])
    else:
        paramsStr = ''

    url = create_url(addr, port, jsonRequest)
    print 'Start ' + url + paramsStr

    try:
        r = requests.get(url, timeout=timeout, params=params, **kwargs)
        # the requests json decoder might fail if we didn't get something good
        rjson = r.json()
        emsg = "ERROR: Probing claimed existing cloud with Cloud.json"
        if not isinstance(rjson, (list, dict)):  # probably good
            raise Exception(emsg + "h2o json responses should always be lists or dicts. Got %s" %
                            dump_json(rjson))
        elif r.status_code != requests.codes.ok:
            rjson = None
            raise Exception(emsg + "Couldn't decode. Status: %s" % r.status_code)

    except requests.ConnectionError, e:
        rjson = None
        emsg = "ERROR: json got ConnectionError or other exception"
        # Rethrow the exception after we've checked for stack trace from h2o.
        # Out of memory errors maybe don't show up right away?
        # so we should wait for h2o to get it out to h2o stdout.
        # Don't want to rely on cloud teardown to check because there's no delay,
        # and we don't want to delay all cloud teardowns by waiting.
        exc_info = sys.exc_info()
        # we don't expect to have connection errors, so any exception is a bad thing.
        h2p.red_print("%s\n %s\n %s\nGoing to check sandbox, then rethrow.." % (emsg, exc_info, url + paramsStr))
        time.sleep(2)
        check_sandbox_for_errors()
        raise exc_info[1], None, exc_info[2]
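# Illustrative sketch only: probing a possibly-already-running h2o at a known ip/port with
# the Cloud.json endpoint; a ConnectionError (nothing listening) is rethrown after the
# sandbox check. The ip and port values are hypothetical.
def _example_probe_existing_cloud():
    try:
        do_json_request(addr='127.0.0.1', port=54321, jsonRequest='Cloud.json', timeout=7)
        print "Something is answering Cloud.json at 127.0.0.1:54321"
    except Exception:
        print "No h2o cloud answering at 127.0.0.1:54321"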
def import_parse(node=None, schema='local', bucket=None, path=None,
                 src_key=None, hex_key=None,
                 timeoutSecs=30, retryDelaySecs=0.1, initialDelaySecs=0, pollTimeoutSecs=180, noise=None,
                 benchmarkLogging=None, noPoll=False, doSummary=True, noPrint=True,
                 importParentDir=True, **kwargs):
    if not node:
        node = h2o_nodes.nodes[0]

    (importResult, importPattern) = import_only(node, schema, bucket, path,
        timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise,
        benchmarkLogging, noPoll, doSummary, src_key, noPrint, importParentDir, **kwargs)

    verboseprint("importPattern:", importPattern)
    verboseprint("importResult", dump_json(importResult))

    parseResult = parse_only(node, importPattern, hex_key,
        timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise,
        benchmarkLogging, noPoll, **kwargs)
    verboseprint("parseResult:", dump_json(parseResult))

    # do SummaryPage here too, just to get some coverage
    # only if not noPoll. otherwise parse isn't done
    if doSummary and not noPoll:
        # if parse blows up, we want error isolation ..i.e. find stack traces here,
        # rather than the next guy blowing up
        check_sandbox_for_errors()
        print "WARNING: not doing inspect/summary for now after parse"
        ## inspect = node.inspect(parseResult['destination_key'], timeoutSecs=timeoutSecs)
        ## numRows = inspect['numRows']
        ## numCols = inspect['numCols']
        # we pass numCols, for detecting whether the na cnt means a col is all NAs,
        # (for ignoring min/max/mean/sigma)
        ## node.summary_page(parseResult['destination_key'], timeoutSecs=timeoutSecs,
        ##     noPrint=noPrint, numRows=numRows, numCols=numCols)
        # for now, don't worry about error isolating summary
    else:
        # isolate a parse from the next thing
        check_sandbox_for_errors()

    return parseResult
def tear_down_cloud(nodeList=None, sandboxIgnoreErrors=False):
    if h2o_args.sleep_at_tear_down:
        print "Opening browser to cloud, and sleeping for 3600 secs, before cloud teardown (for debug)"
        import h2o_browse as h2b
        h2b.browseTheCloud()
        sleep(3600)

    # we keep a copy of whatever was built here too, just in case!
    # we can't refer to h2o.nodes[] because of circular import?
    if not nodeList:
        nodeList = h2o_nodes.nodes

    # could the nodeList still be empty in some exception cases? Assume not for now
    try:
        # update: send a shutdown to all nodes. h2o maybe doesn't propagate well if sent to one node
        # the api watchdog shouldn't complain about this?
        for n in nodeList:
            n.shutdown_all()
    except:
        pass

    # ah subtle. we might get excepts in issuing the shutdown, don't abort out
    # of trying the process kills if we get any shutdown exception (remember we go to all nodes)
    # so we might? nodes are shutting down?
    # FIX! should we wait a bit for a clean shutdown, before we process kill?
    # It can take more than 1 sec though.
    try:
        time.sleep(2)
        for n in nodeList:
            n.terminate()
            verboseprint("tear_down_cloud n:", n)
    except:
        pass

    check_sandbox_for_errors(sandboxIgnoreErrors=sandboxIgnoreErrors, python_test_name=h2o_args.python_test_name)
    # get rid of all those pesky line marker files. Unneeded now
    clean_sandbox_doneToLine()
    nodeList[:] = []
    h2o_nodes.nodes = []
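# Typical teardown usage (sketch): a unittest-based test module that built its cloud in
# setUpClass tears it down once after all tests, so the sandbox logs get a final error scan.
# The class name is hypothetical.
import unittest

class _ExampleCloudTest(unittest.TestCase):
    @classmethod
    def tearDownClass(cls):
        tear_down_cloud(sandboxIgnoreErrors=False)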
def exec_expr_list_rand(lenNodes, exprList, keyX,
                        # exec2 uses R "start with 1" behavior?
                        minCol=1, maxCol=55,
                        minRow=1, maxRow=400000,
                        maxTrials=200,
                        timeoutSecs=10, ignoreH2oError=False, allowEmptyResult=False, nanOkay=False):

    trial = 0
    while trial < maxTrials:
        exprTemplate = random.choice(exprList)

        # UPDATE: all execs are to a single node. No mixed node streams
        # eliminates some store/store race conditions that caused problems.
        # always go to node 0 (forever?)
        if lenNodes is None:
            execNode = 0
        else:
            # execNode = random.randint(0,lenNodes-1)
            execNode = 0
        ## print "execNode:", execNode

        colX = random.randint(minCol, maxCol)

        # FIX! should tune this for covtype20x vs 200x vs covtype.data..but for now
        row = str(random.randint(minRow, maxRow))

        execExpr = fill_in_expr_template(exprTemplate, colX, ((trial + 1) % 4) + 1, row, keyX)
        (resultExec, result) = exec_expr(h2o_nodes.nodes[execNode], execExpr, None,
                                         timeoutSecs, ignoreH2oError)

        checkScalarResult(resultExec, None, allowEmptyResult=allowEmptyResult, nanOkay=nanOkay)

        if keyX:
            inspect = h2o_cmd.runInspect(key=keyX)
            print keyX, \
                " numRows:", "{:,}".format(inspect['numRows']), \
                " numCols:", "{:,}".format(inspect['numCols'])

        sys.stdout.write('.')
        sys.stdout.flush()

        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        # slows things down to check every iteration, but good for isolation
        if check_sandbox_for_errors():
            raise Exception("Found errors in sandbox stdout or stderr, on trial #%s." % trial)

        trial += 1
        print "Trial #", trial, "completed\n"
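# Illustrative sketch only: exec_expr_list_rand() is normally driven by a list of Exec
# expression templates whose placeholders fill_in_expr_template() substitutes. The
# placeholder names (<keyX>, <col1>, <row>), the expressions, and the frame key here are
# assumptions about the template convention, not taken from this file.
def _example_exec_expr_list_rand():
    exprList = [
        "a.hex = <keyX>[,<col1>]",
        "b.hex = min(<keyX>[,<col1>])",
        "c.hex = <keyX>[<row>,]",
    ]
    exec_expr_list_rand(lenNodes=1, exprList=exprList, keyX='covtype.hex',
                        minCol=1, maxCol=54, minRow=1, maxRow=400000,
                        maxTrials=20, timeoutSecs=10)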
def exec_expr_list_across_cols(lenNodes, exprList, keyX,
                               minCol=0, maxCol=55,
                               timeoutSecs=10, incrementingResult=True, **kwargs):
    colResultList = []
    for colX in range(minCol, maxCol):
        for i, exprTemplate in enumerate(exprList):

            # do each expression at a random node, to facilitate key movement
            # UPDATE: all execs are to a single node. No mixed node streams
            # eliminates some store/store race conditions that caused problems.
            # always go to node 0 (forever?)
            if lenNodes is None:
                execNode = 0
            else:
                ### execNode = random.randint(0,lenNodes-1)
                ### print execNode
                execNode = 0

            execExpr = fill_in_expr_template(exprTemplate, colX, colX, 0, keyX)
            if incrementingResult:  # the Result<col> pattern
                resultKey = "Result" + str(colX)
            else:  # assume it's a re-assign to self
                resultKey = keyX

            # v2
            (resultExec, result) = exec_expr(h2o_nodes.nodes[execNode], execExpr, None, timeoutSecs, **kwargs)
            # print "\nexecResult:", dump_json(resultExec)

            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            # slows things down to check every iteration, but good for isolation
            if check_sandbox_for_errors():
                raise Exception("Found errors in sandbox stdout or stderr, on col #%s." % colX)

        ### print "Column #", colX, "completed\n"
        colResultList.append(result)

    return colResultList
def exec_expr_list_across_cols(lenNodes, exprList, keyX,
                               minCol=0, maxCol=54,
                               timeoutSecs=10, incrementingResult=True):
    colResultList = []
    for colX in range(minCol, maxCol):
        for i, exprTemplate in enumerate(exprList):

            # do each expression at a random node, to facilitate key movement
            # UPDATE: all execs are to a single node. No mixed node streams
            # eliminates some store/store race conditions that caused problems.
            # always go to node 0 (forever?)
            if lenNodes is None:
                execNode = 0
            else:
                ### execNode = random.randint(0,lenNodes-1)
                ### print execNode
                execNode = 0

            execExpr = fill_in_expr_template(exprTemplate, colX, colX, 0, keyX)
            if incrementingResult:  # the Result<col> pattern
                resultKey = "Result" + str(colX)
            else:  # assume it's a re-assign to self
                resultKey = keyX

            # v2
            (resultExec, result) = exec_expr(h2o_nodes.nodes[execNode], execExpr, None, timeoutSecs)
            print "\nexecResult:", dump_json(resultExec)

            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            # slows things down to check every iteration, but good for isolation
            if check_sandbox_for_errors():
                raise Exception("Found errors in sandbox stdout or stderr, on col #%s." % colX)

        print "Column #", colX, "completed\n"
        colResultList.append(result)

    return colResultList
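# Illustrative sketch only: sweeping one expression template across each column of a parsed
# frame and collecting the per-column results. The frame key and the placeholder convention
# in the template are assumptions.
def _example_exec_across_cols():
    exprList = ["Result<col1> = sum(<keyX>[,<col1>])"]
    colResults = exec_expr_list_across_cols(lenNodes=1, exprList=exprList, keyX='covtype.hex',
                                            minCol=0, maxCol=10, timeoutSecs=10)
    print "per-column results:", colResults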
def simpleCheckGLM(self, glm, colX, allowFailWarning=False, allowZeroCoeff=False, prettyPrint=False, noPrint=False, maxExpectedIterations=None, doNormalized=False, **kwargs): # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter # h2o GLM will verboseprint the result and print errors. # so don't have to do that # different when cross validation is used? No trainingErrorDetails? GLMModel = glm['glm_model'] if not GLMModel: raise Exception("GLMModel didn't exist in the glm response? %s" % dump_json(glm)) warnings = None if 'warnings' in GLMModel and GLMModel['warnings']: warnings = GLMModel['warnings'] # stop on failed x = re.compile("failed", re.IGNORECASE) # don't stop if fail to converge c = re.compile("converge", re.IGNORECASE) for w in warnings: print "\nwarning:", w if re.search(x,w) and not allowFailWarning: if re.search(c,w): # ignore the fail to converge warning now pass else: # stop on other 'fail' warnings (are there any? fail to solve? raise Exception(w) # for key, value in glm.iteritems(): print key # not in GLMGrid? # FIX! don't get GLMParams if it can't solve? GLMParams = GLMModel['glm'] family = GLMParams["family"] # number of submodels = number of lambda # min of 2. lambda_max is first submodels = GLMModel['submodels'] # since all our tests?? only use one lambda, the best_lamda_idx should = 1 best_lambda_idx = GLMModel['best_lambda_idx'] print "best_lambda_idx:", best_lambda_idx lambda_max = GLMModel['lambda_max'] print "lambda_max:", lambda_max # currently lambda_max is not set by tomas. ..i.e.not valid if 1==0 and (lambda_max <= submodels[best_lambda_idx].lambda_value): raise Exception("lambda_max %s should always be > the lambda result %s we're checking" % (lambda_max, submodels[best_lambda_idx].lambda_value)) # submodels0 = submodels[0] # submodels1 = submodels[-1] # hackery to make it work when there's just one if (best_lambda_idx >= len(submodels)) or (best_lambda_idx < 0): raise Exception("best_lambda_idx: %s should point to one of lambdas (which has len %s)" % (best_lambda_idx, len(submodels))) if (best_lambda_idx >= len(submodels)) or (best_lambda_idx < 0): raise Exception("best_lambda_idx: %s should point to one of submodels (which has len %s)" % (best_lambda_idx, len(submodels))) submodels1 = submodels[best_lambda_idx] # hackery to make it work when there's just one iterations = submodels1['iteration'] print "GLMModel/iterations:", iterations # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter if maxExpectedIterations is not None and iterations > maxExpectedIterations: raise Exception("Convergence issue? GLM did iterations: %d which is greater than expected: %d" % (iterations, maxExpectedIterations) ) if 'validation' not in submodels1: raise Exception("Should be a 'validation' key in submodels1: %s" % dump_json(submodels1)) validationsList = submodels1['validation'] validations = validationsList # xval. compare what we asked for and what we got. n_folds = kwargs.setdefault('n_folds', None) print "GLMModel/validations" validations['null_deviance'] = h2o_util.cleanseInfNan(validations['null_deviance']) validations['residual_deviance'] = h2o_util.cleanseInfNan(validations['residual_deviance']) print "%15s %s" % ("null_deviance:\t", validations['null_deviance']) print "%15s %s" % ("residual_deviance:\t", validations['residual_deviance']) # threshold only there if binomial? 
# auc only for binomial if family=="binomial": print "%15s %s" % ("auc:\t", validations['auc']) best_threshold = validations['best_threshold'] thresholds = validations['thresholds'] print "%15s %s" % ("best_threshold:\t", best_threshold) # have to look up the index for the cm, from the thresholds list best_index = None for i,t in enumerate(thresholds): if t >= best_threshold: # ends up using next one if not present best_index = i break assert best_index!=None, "%s %s" % (best_threshold, thresholds) print "Now printing the right 'best_threshold' %s from '_cms" % best_threshold # cm = glm['glm_model']['submodels'][0]['validation']['_cms'][-1] submodels = glm['glm_model']['submodels'] # FIX! this isn't right if we have multiple lambdas? different submodels? cms = submodels[0]['validation']['_cms'] self.assertEqual(len(thresholds), len(cms), msg="thresholds %s and cm %s should be lists of the same size. %s" % (len(thresholds), len(cms), thresholds)) # FIX! best_threshold isn't necessarily in the list. jump out if >= assert best_index<len(cms), "%s %s" % (best_index, len(cms)) # if we want 0.5..rounds to int # mid = len(cms)/2 # cm = cms[mid] cm = cms[best_index] print "cm:", dump_json(cm['_arr']) predErr = cm['_predErr'] classErr = cm['_classErr'] # compare to predErr pctWrong = h2o_gbm.pp_cm_summary(cm['_arr']); print "predErr:", predErr print "calculated pctWrong from cm:", pctWrong print "classErr:", classErr # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm['_arr']) if family=="poisson" or family=="gaussian": print "%15s %s" % ("aic:\t", validations['aic']) coefficients_names = GLMModel['coefficients_names'] # print "coefficients_names:", coefficients_names idxs = submodels1['idxs'] print "idxs:", idxs coefficients_names = coefficients_names # always check both normalized and normal coefficients norm_beta = submodels1['norm_beta'] # if norm_beta and len(coefficients_names)!=len(norm_beta): # print len(coefficients_names), len(norm_beta) # raise Exception("coefficients_names and normalized_norm_beta from h2o json not same length. coefficients_names: %s normalized_norm_beta: %s" % (coefficients_names, norm_beta)) # beta = submodels1['beta'] # print "beta:", beta # if len(coefficients_names)!=len(beta): # print len(coefficients_names), len(beta) # raise Exception("coefficients_names and beta from h2o json not same length. coefficients_names: %s beta: %s" % (coefficients_names, beta)) # test wants to use normalized? 
if doNormalized: beta_used = norm_beta else: beta_used = beta coefficients = {} # create a dictionary with name, beta (including intercept) just like v1 for i,b in zip(idxs, beta_used[:-1]): name = coefficients_names[i] coefficients[name] = b print "len(idxs)", len(idxs), "len(beta_used)", len(beta_used) print "coefficients:", coefficients print "beta:", beta print "norm_beta:", norm_beta coefficients['Intercept'] = beta_used[-1] print "len(coefficients_names)", len(coefficients_names) print "len(idxs)", len(idxs) print "idxs[-1]", idxs[-1] print "intercept demapping info:", \ "coefficients_names[-i]:", coefficients_names[-1], \ "idxs[-1]:", idxs[-1], \ "coefficients_names[idxs[-1]]:", coefficients_names[idxs[-1]], \ "beta_used[-1]:", beta_used[-1], \ "coefficients['Intercept']", coefficients['Intercept'] # last one is intercept interceptName = coefficients_names[idxs[-1]] if interceptName != "Intercept" or abs(beta_used[-1])<1e-26: raise Exception("'Intercept' should be last in coefficients_names and beta %s %s %s" %\ (idxs[-1], beta_used[-1], "-"+interceptName+"-")) # idxs has the order for non-zero coefficients, it's shorter than beta_used and coefficients_names # new 5/28/14. glm can point to zero coefficients # for i in idxs: # if beta_used[i]==0.0: ## raise Exception("idxs shouldn't point to any 0 coefficients i: %s %s:" % (i, beta_used[i])) if len(idxs) > len(beta_used): raise Exception("idxs shouldn't be longer than beta_used %s %s" % (len(idxs), len(beta_used))) intercept = coefficients.pop('Intercept', None) # intercept demapping info: idxs[-1]: 54 coefficients_names[[idxs[-1]]: Intercept beta_used[-1]: -6.6866753099 # the last one shoudl be 'Intercept' ? coefficients_names.pop() # have to skip the output col! get it from kwargs # better always be there! y = kwargs['response'] # the dict keys are column headers if they exist...how to order those? new: use the 'coefficients_names' # from the response # Tomas created 'coefficients_names which is the coefficient list in order. # Just use it to index coefficients! works for header or no-header cases # I guess now we won't print the "None" cases for dropped columns (constant columns!) # Because Tomas doesn't get everything in 'coefficients_names' if dropped by GLMQuery before # he gets it? def add_to_coefficient_list_and_string(c, cList, cString): if c in coefficients: cValue = coefficients[c] cValueString = "%s: %.5e " % (c, cValue) else: print "Warning: didn't see '" + c + "' in json coefficient response.",\ "Inserting 'None' with assumption it was dropped due to constant column)" cValue = None cValueString = "%s: %s " % (c, cValue) cList.append(cValue) # we put each on newline for easy comparison to R..otherwise keep condensed if prettyPrint: cValueString = "H2O coefficient " + cValueString + "\n" # not mutable? return cString + cValueString # creating both a string for printing and a list of values cString = "" cList = [] # print in order using col_names # coefficients_names is input only now..same for header or no header, or expanded enums for c in coefficients_names: cString = add_to_coefficient_list_and_string(c, cList, cString) if prettyPrint: print "\nH2O intercept:\t\t%.5e" % intercept print cString else: if not noPrint: print "\nintercept:", intercept, cString print "\nTotal # of coefficients:", len(coefficients_names) # pick out the coefficent for the column we enabled for enhanced checking. Can be None. # FIX! 
temporary hack to deal with disappearing/renaming columns in GLM if (not allowZeroCoeff) and (colX is not None): absXCoeff = abs(float(coefficients[str(colX)])) # add kwargs to help debug without looking at console log self.assertGreater(absXCoeff, 1e-26, ( "abs. value of GLM coefficients['" + str(colX) + "'] is " + str(absXCoeff) + ", not >= 1e-26 for X=" + str(colX) + "\n" + "kwargs:" + dump_json(kwargs) )) # intercept is buried in there too absIntercept = abs(float(intercept)) self.assertGreater(absIntercept, 1e-26, ( "abs. value of GLM coefficients['Intercept'] is " + str(absIntercept) + ", not >= 1e-26 for Intercept" + "\n" + "kwargs:" + dump_json(kwargs) )) # this is good if we just want min or max # maxCoeff = max(coefficients, key=coefficients.get) # for more, just invert the dictionary and ... if (len(coefficients)>0): maxKey = max([(abs(coefficients[x]),x) for x in coefficients])[1] print "H2O Largest abs. coefficient value:", maxKey, coefficients[maxKey] minKey = min([(abs(coefficients[x]),x) for x in coefficients])[1] print "H2O Smallest abs. coefficient value:", minKey, coefficients[minKey] else: print "Warning, no coefficients returned. Must be intercept only?" # many of the GLM tests aren't single column though. # quick and dirty check: if all the coefficients are zero, # something is broken # intercept is in there too, but this will get it okay # just sum the abs value up..look for greater than 0 # skip this test if there is just one coefficient. Maybe pointing to a non-important coeff? if (not allowZeroCoeff) and (len(coefficients)>1): s = 0.0 for c in coefficients: v = coefficients[c] s += abs(float(v)) self.assertGreater(s, 1e-26, ( "sum of abs. value of GLM coefficients/intercept is " + str(s) + ", not >= 1e-26\n" + "kwargs:" + dump_json(kwargs) )) print "submodels1, run_time (milliseconds):", submodels1['run_time'] # shouldn't have any errors check_sandbox_for_errors() return (warnings, cList, intercept)
def pollWaitJobs(pattern=None, errorIfCancelled=False, timeoutSecs=60, pollTimeoutSecs=60, retryDelaySecs=5, benchmarkLogging=None, stallForNJobs=None): wait = True waitTime = 0 ignoredJobs = set() while (wait): a = h2o_nodes.nodes[0].jobs(timeoutSecs=pollTimeoutSecs) verboseprint("jobs():", dump_json(a)) jobs = a['jobs'] busy = 0 for j in jobs: cancelled = j['status'] == 'CANCELLED' description = j['description'] key = j['key'] jobKey = key['name'] jobKeyType = key['type'] # "key": { # "URL": "/3/Jobs.json/$0301c0a8002232d4ffffffff$_95036c2ef3f74468c63861fd826149c2", # "__meta": { # "schema_name": "JobKeyV1", # "schema_type": "Key<Job>", # "schema_version": 1 # }, # "name": "$0301c0a8002232d4ffffffff$_95036c2ef3f74468c63861fd826149c2", # "type": "Key<Job>" # progress = j['progress'] progress_msg = j['progress_msg'] # has exception and val? start_time = j['start_time'] end_time = j.get('end_time', None) dest = j['dest'] description = j['description'] msec = j.get('msec', None) # for now, don't ignore any exceptions # FIX! what do exceptions look like now? if 'exception' in j and j['exception']: check_sandbox_for_errors() msg = "ERROR: pollWaitJobs found a job with a exception result when it shouldn't have:\n %s" % dump_json( j) raise Exception(msg) if errorIfCancelled and cancelled: check_sandbox_for_errors() print( "ERROR: not stopping, but: pollWaitJobs found a cancelled job when it shouldn't have:\n %s" % dump_json(j)) print( "Continuing so maybe a json response will give more info") ### verboseprint(j) # don't include cancelled jobs here elif j['status'] != 'DONE': if not pattern: # always print progress if busy job (no pattern used print "time:", time.strftime( "%I:%M:%S"), "progress:", progress, dest verboseprint("description:", description, "end_time:", end_time) busy += 1 verboseprint("pollWaitJobs: found a busy job, now: %s" % busy) else: if (pattern in key) or (pattern in dest) or (pattern in description): ## print "description:", description, "end_time:", end_time busy += 1 verboseprint( "pollWaitJobs: found a pattern-matched busy job, now %s" % busy) # always print progress if pattern is used and matches print "time:", time.strftime( "%I:%M:%S"), "progress:", progress, dest # we only want to print the warning message once elif key not in ignoredJobs: jobMsg = "%s %s %s" % (key, description, dest) verboseprint( " %s job in progress but we're ignoring it. Doesn't match pattern." % jobMsg) # I guess "key" is supposed to be unique over all time for a job id? ignoredJobs.add(key) if stallForNJobs: waitFor = stallForNJobs else: waitFor = 0 print " %s jobs in progress." % busy, "Waiting until %s in progress." % waitFor wait = busy > waitFor if not wait: break ### h2b.browseJsonHistoryAsUrlLastMatch("Jobs") if not h2o_args.no_timeout and (wait and waitTime > timeoutSecs): print dump_json(jobs) raise Exception("Some queued jobs haven't completed after", timeoutSecs, "seconds") sys.stdout.write('.') sys.stdout.flush() time.sleep(retryDelaySecs) waitTime += retryDelaySecs # any time we're sitting around polling we might want to save logging info (cpu/disk/jstack) # test would pass ['cpu','disk','jstack'] kind of list if benchmarkLogging: h2o.cloudPerfH2O.get_log_save(benchmarkLogging) # check the sandbox for stack traces! just like we do when polling normally check_sandbox_for_errors() patternKeys = [] for j in jobs: # save the destination keys in progress that match pattern (for returning) if pattern and pattern in j['dest']: patternKeys.append(j['dest']) return patternKeys
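# Illustrative sketch only: kicking off a long-running job with noPoll=True so the REST
# call returns immediately, then blocking on pollWaitJobs() until nothing whose key,
# destination, or description matches the pattern is still busy. The pattern and logging
# options are hypothetical.
def _example_poll_wait_jobs():
    # ... start a model build with noPoll=True here ...
    pollWaitJobs(pattern='rf_model', timeoutSecs=300, pollTimeoutSecs=60,
                 retryDelaySecs=5, benchmarkLogging=['cpu', 'disk'])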
def build_cloud_with_json(h2o_nodes_json="h2o-nodes.json"): # local sandbox may not exist. Don't clean if it does, just append if not os.path.exists(LOG_DIR): os.mkdir(LOG_DIR) log("#*********************************************************************") log("Starting new test: " + h2o_args.python_test_name + " at build_cloud_with_json()") log("#*********************************************************************") print "This only makes sense if h2o is running as defined by", h2o_nodes_json print "For now, assuming it's a cloud on this machine, and here's info on h2o processes running here" print "No output means no h2o here! Some other info about stuff on the system is printed first though." import h2o_os_util if not os.path.exists(h2o_nodes_json): raise Exception("build_cloud_with_json: Can't find " + h2o_nodes_json + " file") ## h2o_os_util.show_h2o_processes() with open(h2o_nodes_json, "rb") as f: cloneJson = json.load(f) # These are supposed to be in the file. # Just check the first one. if not there, the file must be wrong if not "cloud_start" in cloneJson: raise Exception("Can't find 'cloud_start' in %s, wrong file? h2o-nodes.json?" % h2o_nodes_json) else: cs = cloneJson["cloud_start"] print "Info on the how the cloud we're cloning was started (info from %s)" % h2o_nodes_json # required/legal values in 'cloud_start'. A robust check is good for easy debug when we add stuff valList = ["time", "cwd", "python_test_name", "python_cmd_line", "config_json", "username", "ip"] for v in valList: if v not in cs: raise Exception("Can't find %s in %s, wrong file or version change?" % (v, h2o_nodes_json)) print "cloud_start['%s']: %s" % (v, cs[v]) # this is the internal node state for python..nodes rebuild nodeStateList = cloneJson["h2o_nodes"] nodeList = [] if not nodeStateList: raise Exception("nodeStateList is empty. %s file must be empty/corrupt" % h2o_nodes_json) try: for nodeState in nodeStateList: print "Cloning state for node", nodeState["node_id"], "from", h2o_nodes_json newNode = ExternalH2O(nodeState) nodeList.append(newNode) # If it's an existing cloud, it may already be locked. so never check. # we don't have the cloud name in the -ccj since it may change (and the file be static?) # so don't check expectedCloudName verify_cloud_size(nodeList, expectedCloudName=None, expectedLocked=None) # best to check for any errors right away? # (we won't report errors from prior tests due to marker stuff? ## check_sandbox_for_errors(python_test_name=h2o_args.python_test_name) # put the test start message in the h2o log, to create a marker nodeList[0].h2o_log_msg() except: # nodeList might be empty in some exception cases? # no shutdown issued first, though ## if cleanup and nodeList: ## for n in nodeList: n.terminate() check_sandbox_for_errors(python_test_name=h2o_args.python_test_name) raise # like cp -p. Save the config file, to sandbox print "Saving the ", h2o_nodes_json, "we used to", LOG_DIR shutil.copy(h2o_nodes_json, LOG_DIR + "/" + os.path.basename(h2o_nodes_json)) print "" h2p.red_print("Ingested from json:", nodeList[0].java_heap_GB, "GB java heap(s) with", len(nodeList), "total nodes") print "" # save it to a global copy, in case it's needed for tearDown h2o_nodes.nodes[:] = nodeList return nodeList
def poll_url(self, response, timeoutSecs=10, retryDelaySecs=0.5, initialDelaySecs=0, pollTimeoutSecs=180, noise=None, benchmarkLogging=None, noPoll=False, reuseFirstPollUrl=False, noPrint=False): verboseprint('poll_url input: response:', dump_json(response)) ### print "poll_url: pollTimeoutSecs", pollTimeoutSecs ### print "at top of poll_url, timeoutSecs: ", timeoutSecs # for the rev 2 stuff..the job_key, destination_key and redirect_url are just in the response # look for 'response'..if not there, assume the rev 2 def get_redirect_url(response): url = None params = None # StoreView has old style, while beta_features if 'response_info' in response: response_info = response['response_info'] if 'redirect_url' not in response_info: raise Exception("Response during polling must have 'redirect_url'\n%s" % dump_json(response)) if response_info['status'] != 'done': redirect_url = response_info['redirect_url'] if redirect_url: url = self.url(redirect_url) params = None else: if response_info['status'] != 'done': raise Exception( "'redirect_url' during polling is null but status!='done': \n%s" % dump_json(response)) else: if 'response' not in response: raise Exception("'response' not in response.\n%s" % dump_json(response)) if response['response']['status'] != 'done': if 'redirect_request' not in response['response']: raise Exception("'redirect_request' not in response. \n%s" % dump_json(response)) url = self.url(response['response']['redirect_request']) params = response['response']['redirect_request_args'] return (url, params) # if we never poll msgUsed = None if 'response_info' in response: # trigger v2 for GBM always? status = response['response_info']['status'] progress = response.get('progress', "") else: r = response['response'] status = r['status'] progress = r.get('progress', "") doFirstPoll = status != 'done' (url, params) = get_redirect_url(response) # no need to recreate the string for messaging, in the loop.. if params: paramsStr = '&'.join(['%s=%s' % (k, v) for (k, v) in params.items()]) else: paramsStr = '' # FIX! don't do JStack noise for tests that ask for it. JStack seems to have problems noise_enable = noise and noise != ("JStack", None) if noise_enable: print "Using noise during poll_url:", noise # noise_json should be like "Storeview" (noise_json, noiseParams) = noise noiseUrl = self.url(noise_json + ".json") if noiseParams is None: noiseParamsStr = "" else: noiseParamsStr = '&'.join(['%s=%s' % (k, v) for (k, v) in noiseParams.items()]) start = time.time() count = 0 if initialDelaySecs: time.sleep(initialDelaySecs) # can end with status = 'redirect' or 'done' # Update: on DRF2, the first RF redirects to progress. So we should follow that, and follow any redirect to view? # so for v2, we'll always follow redirects? # For v1, we're not forcing the first status to be 'poll' now..so it could be redirect or done?(NN score? if blocking) # Don't follow the Parse redirect to Inspect, because we want parseResult['destination_key'] to be the end. # note this doesn't affect polling with Inspect? (since it doesn't redirect ? while status == 'poll' or doFirstPoll or (status == 'redirect' and 'Inspect' not in url): count += 1 if ((time.time() - start) > timeoutSecs): # show what we're polling with emsg = "Exceeded timeoutSecs: %d secs while polling." % timeoutSecs + \ "status: %s, url: %s?%s" % (status, urlUsed, paramsUsedStr) raise Exception(emsg) if benchmarkLogging: import h2o h2o.cloudPerfH2O.get_log_save(benchmarkLogging) # every other one? 
create_noise = noise_enable and ((count % 2) == 0) if create_noise: urlUsed = noiseUrl paramsUsed = noiseParams paramsUsedStr = noiseParamsStr msgUsed = "\nNoise during polling with" else: urlUsed = url paramsUsed = params paramsUsedStr = paramsStr msgUsed = "\nPolling with" print status, progress, urlUsed time.sleep(retryDelaySecs) response = self.do_json_request(fullUrl=urlUsed, timeout=pollTimeoutSecs, params=paramsUsed) verboseprint(msgUsed, urlUsed, paramsUsedStr, "Response:", dump_json(response)) # hey, check the sandbox if we've been waiting a long time...rather than wait for timeout if ((count % 6) == 0): check_sandbox_for_errors(python_test_name=h2o_args.python_test_name) if (create_noise): # this guarantees the loop is done, so we don't need to worry about # a 'return r' being interpreted from a noise response status = 'poll' progress = '' else: doFirstPoll = False status = response['response_info']['status'] progress = response.get('progress', "") # get the redirect url if not reuseFirstPollUrl: # reuse url for all v1 stuff (url, params) = get_redirect_url(response) if noPoll: return response # won't print if we didn't poll if msgUsed: verboseprint(msgUsed, urlUsed, paramsUsedStr, "Response:", dump_json(response)) return response
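# Hedged, simplified restatement of the poll loop above: keep re-issuing the redirect url until
# response_info.status is 'done', with a hard timeout. It drops the v1 path, the noise requests and
# the periodic sandbox checks that poll_url() layers on top; node.url() and node.do_json_request()
# are the same methods used elsewhere in this module.
def simple_poll(node, response, timeoutSecs=60, retryDelaySecs=0.5):
    start = time.time()
    while response['response_info']['status'] != 'done':
        if (time.time() - start) > timeoutSecs:
            raise Exception("Exceeded timeoutSecs: %d secs while polling" % timeoutSecs)
        time.sleep(retryDelaySecs)
        redirect_url = response['response_info']['redirect_url']
        response = node.do_json_request(fullUrl=node.url(redirect_url), timeout=timeoutSecs)
    return response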
def __init__(self, kmeansResult, parameters, numRows, numCols, labels, noPrint=False, **kwargs): super(KMeansObj, self).__init__(kmeansResult['models'][0]['output'], "KMeans", noPrint=noPrint) print self.withinmse # per cluster print self.avgss print self.avgwithinss print self.avgbetweenss # should model builder add this to the kmeansResult? if 'python_elapsed' in kmeansResult: self.python_elapsed = kmeansResult['python_elapsed'] rows = self.rows # [78, 5, 41, 76] model_category = self.model_category # Clustering iters = self.iters # 11.0 domains = self.domains # [None, None, None, None, None, None, None, None, None, None, None, None, None, None] names = self.names # [u'STR', u'OBS', u'AGMT', u'FNDX', u'HIGD', u'DEG', u'CHK', u'AGP1', u'AGMN', u'NLV', u'LIV', u'WT', u'AGLP', u'MST'] ncats = self.ncats # 0 clusters = self.clusters # [ 4 lists of centers ] withinmse = self.withinmse avgss = self.avgss if numRows: assert numRows==sum(rows) if 'k' in parameters: k = parameters['k'] assert len(clusters) == k assert len(rows) == k if numCols: assert len(names) == numCols, \ "Need to pass correct numCols after ignored columns decrement %s %s" % (len(names), numCols) for c in clusters: assert len(c) == numCols, "%s %s" % (len(c), numCols) # this should be true if labels: assert len(labels) == numCols, \ "Need to pass correct labels and numCols after ignored columns removal %s %s" % (len(labels), numCols) assert len(labels) == len(names), \ "Need to pass correct labels after ignored columns removal %s %s" % (len(labels), len(names)) assert labels == names if 'max_iters' in parameters: max_iters = parameters['max_iters'] assert max_iters >= iters # we could check the centers are within the min/max of each column for i,c in enumerate(clusters): for n in c: if math.isnan(float(n)): raise Exception("cluster", i, "has NaN:", n, "center:", c) # shouldn't have any errors check_sandbox_for_errors() # create a tuple for each cluster result, then sort by rows for easy comparison # maybe should sort by centers? # put a cluster index in there too, (leftmost) so we don't lose track tuples = zip(range(len(clusters)), withinmse, rows, clusters) self.tuplesSorted = sorted(tuples, key=itemgetter(3)) # undo for printing what the caller will see ids, withinmse, rows, clusters = zip(*self.tuplesSorted) print "iters:", iters print "ids:", ids print "withinmse:", withinmse print "rows:", rows for i,c in enumerate(clusters): print "cluster id %s (2 places):" % ids[i], h2o_util.twoDecimals(c) print "KMeansObj created for:", "???"# vars(self)
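# Hedged illustration of the sort-for-comparison trick used above: cluster ids come back in
# arbitrary order between runs, so zip them with their stats, sort on a deterministic key, then
# unzip for printing. All values below are made up.
from operator import itemgetter
example_tuples = zip([0, 1, 2], [4.1, 2.7, 3.3], [40, 25, 35],
                     [[9.0, 1.0], [1.0, 2.0], [5.0, 5.0]])
example_sorted = sorted(example_tuples, key=itemgetter(3))   # sort by the centers themselves
ids_ex, withinmse_ex, rows_ex, centers_ex = zip(*example_sorted)
print ids_ex        # (1, 2, 0)
print centers_ex    # ([1.0, 2.0], [5.0, 5.0], [9.0, 1.0])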
def __init__(self, kmeansResult, parameters, numRows, numCols, labels, noPrint=False, **kwargs): super(KMeansObj, self).__init__(kmeansResult['models'][0]['output'], "KMeans", noPrint=noPrint) print self.withinss # per cluster print self.totss print self.tot_withinss print self.betweenss # should model builder add this to the kmeansResult? if 'python_elapsed' in kmeansResult: self.python_elapsed = kmeansResult['python_elapsed'] size = self.size # [78, 5, 41, 76] model_category = self.model_category # Clustering iterations = self.iterations # 11.0 domains = self.domains names = self.names categorical_column_count = self.categorical_column_count # 0 centers_data = self.centers.data # [ 4 lists of centers ] # h2o returns it sliced across centers. transpose the list of lists, drop 0 which is the cluster id? # gotta turn the strings into numbers centersStr = [list(x) for x in zip(*centers_data[1:])] centers = [map(float, c) for c in centersStr] withinss = self.withinss totss = self.totss if numRows: assert numRows==sum(size) if 'k' in parameters: k = parameters['k'] assert len(centers) == k assert len(size) == k if numCols: assert len(names) == numCols, \ "Need to pass correct numCols after ignored columns decrement %s %s %s" % (len(names), numCols, names) for c in centers: assert len(c) == numCols, "%s %s" % (len(c), numCols) # this should be true if labels: assert len(labels) == numCols, \ "Need to pass correct labels and numCols after ignored columns removal %s %s" % (len(labels), numCols) assert len(labels) == len(names), \ "Need to pass correct labels after ignored columns removal %s %s" % (len(labels), len(names)) assert labels == names if 'max_iterations' in parameters: max_iterations = parameters['max_iterations'] assert max_iterations >= iterations # we could check the centers are within the min/max of each column for i,c in enumerate(centers): for n in c: if math.isnan(float(n)): raise Exception("cluster", i, "has NaN:", n, "center:", c) # create a tuple for each cluster result, then sort by rows for easy comparison # maybe should sort by centers? # put a cluster index in there too, (leftmost) so we don't lose track tuples = zip(range(len(centers)), centers, size, withinss) # print "tuples:", dump_json(tuples) # can we sort on the sum of the centers? self.tuplesSorted = sorted(tuples, key=lambda tup: sum(tup[1])) print "iterations:", iterations # undo for printing what the caller will see ids, centers, size, withinss = zip(*self.tuplesSorted) for i,c in enumerate(centers): print "cluster id %s (2 places):" % ids[i], h2o_util.twoDecimals(c) print "rows_per_cluster[%s]: " % i, size[i] print "withinss[%s]: " % i, withinss[i] print "size[%s]:" % i, size[i] print "KMeansObj created for:", "???"# vars(self) # shouldn't have any errors check_sandbox_for_errors()
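# Hedged illustration of the transpose above: h2o returns centers.data with the cluster-id column
# first and each remaining entry holding one feature's value per cluster, all as strings. Drop the
# id column, transpose with zip(*...), and convert to floats to get one row of numbers per cluster.
# The values below are made up.
example_centers_data = [
    ["0", "1"],        # cluster ids (dropped by the [1:] slice)
    ["1.5", "4.5"],    # feature 0, one value per cluster
    ["2.0", "6.0"],    # feature 1, one value per cluster
]
example_centersStr = [list(x) for x in zip(*example_centers_data[1:])]
example_centers = [map(float, c) for c in example_centersStr]
print example_centers    # [[1.5, 2.0], [4.5, 6.0]]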
def simpleCheckPCA(self, pca, **kwargs): #print dump_json(pca) warnings = None if 'warnings' in pca: warnings = pca['warnings'] # catch the 'Failed to converge" for now x = re.compile("[Ff]ailed") for w in warnings: print "\nwarning:", w if re.search(x, w): raise Exception(w) # Check other things in the json response dictionary 'pca' here pcaResult = pca verboseprint('pcaResult Inspect:', dump_json(pcaResult)) #Check no NaN in sdevs, propVars, or in PCs print "Checking sdevs..." sdevs = pcaResult["pca_model"]["sdev"] verboseprint("pca sdevs:", dump_json(sdevs)) # sdevs is supposed to be a list sorted by s # sFirst = sdevs[0].s for PC, s in enumerate(sdevs): if math.isnan(s): raise Exception("sdev %s is NaN: %s" % (PC, s)) # anqi says the list should be sorted..i.e. first first ## if s < sFirst: ## raise Exception("sdev %s %s is > sFirst %s. Supposed to be sorted?" % (PC, s, sFirst)) print "Checking propVars...", propVars = pcaResult["pca_model"]["propVar"] verboseprint("pca propVars:", dump_json(propVars)) for PC, propvar in enumerate(propVars): if math.isnan(propvar): raise Exception("propVar %s is NaN: %s", (PC, propvar)) print " Good!" print "Checking eigVec...", pcs = pcaResult["pca_model"]["eigVec"] verboseprint("pca eigVec:", dump_json(pcs)) for i, s in enumerate(pcs): for r, e in enumerate(s): if math.isnan(e): raise Exception("Component %s has NaN: %s eigenvector %s", (i, e, s)) print " Good!" print "How many components did we get? (after enum col dropping): %s" % len( pcs) # now print the top ten. Sorting by the value...getting key,value tuples (so we can see the column) # it should match the column numbering..even if it skips cols due to enums import operator print "Just look at the sort for the first row in pca eigVec" i = 0 s = pcs[i] # print "s:", s unsorted_s = [(i, j) for i, j in enumerate(s)] sorted_s = sorted(unsorted_s, key=lambda t: abs(t[1]), reverse=True) print "\n%s First (larger). sorted_s: %s\n" % (i, sorted_s) print "The last entry from the eigenvector, should have the largest std dev, because it's sorted" print "Rule of thumb is we can then look at the sorted values, and guess it's related to column importance" print "The sort should be on the abs(), since the signs can be + or -" # shouldn't have any errors check_sandbox_for_errors() return warnings
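# Hedged illustration of the eigenvector sort above: pair each loading with its column index, then
# sort by absolute value so the sign doesn't matter. Values below are made up.
example_s = [0.1, -0.9, 0.3, -0.05]
example_unsorted = [(i, j) for i, j in enumerate(example_s)]
example_sorted_s = sorted(example_unsorted, key=lambda t: abs(t[1]), reverse=True)
print example_sorted_s    # [(1, -0.9), (2, 0.3), (0, 0.1), (3, -0.05)]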
def runExec(node=None, timeoutSecs=20, **kwargs): if not node: node = h2o_nodes.nodes[0] # no such thing as GLMView..don't use retryDelaySecs a = node.exec_query(timeoutSecs, **kwargs) check_sandbox_for_errors() return a
def pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=15, retryDelaySecs=5): busy = True trials = 0 start = time.time() polls = 0 statSum = {} # just init for worst case 64 nodes? lastUsedMemBytes = [1 for i in range(64)] while busy: polls += 1 # get utilization and print it # any busy jobs a = h2o_nodes.nodes[0].jobs(timeoutSecs=60) busy = False for j in a['jobs']: msec = j.get('msec', None) if j['status']!='DONE': busy = True verboseprint("Still busy") break cloudStatus = h2o_nodes.nodes[0].get_cloud(timeoutSecs=timeoutSecs) nodes = cloudStatus['nodes'] for i,n in enumerate(nodes): # check for drop in tot_mem_bytes, and report as "probably post GC" totMemBytes = n['tot_mem_bytes'] maxMemBytes = n['max_mem_bytes'] freeMemBytes = n['free_mem_bytes'] usedMemBytes = totMemBytes - freeMemBytes availMemBytes = maxMemBytes - usedMemBytes print 'Node %s:' % i, \ 'num_cpus:', n['num_cpus'],\ 'my_cpu_%:', n['my_cpu_%'],\ 'sys_cpu_%:', n['sys_cpu_%'],\ 'system_load:', n['system_load'],\ 'tot_mem_bytes: {:,}'.format(totMemBytes),\ 'max_mem_bytes: {:,}'.format(maxMemBytes),\ 'free_mem_bytes: {:,}'.format(freeMemBytes),\ 'usedMemBytes: {:,}'.format(usedMemBytes) decrease = round((0.0 + lastUsedMemBytes[i] - usedMemBytes) / lastUsedMemBytes[i], 3) if decrease > .05: print print "\nProbably GC at Node {:}: usedMemBytes decreased by {:f} pct.. {:,} {:,}".format(i, 100 * decrease, lastUsedMemBytes[i], usedMemBytes) lastUsedMemBytes[i] = usedMemBytes # don't update lastUsedMemBytes if we're decreasing if usedMemBytes > lastUsedMemBytes[i]: lastUsedMemBytes[i] = usedMemBytes # sum all individual stats for stat in n: if stat in statSum: try: statSum[stat] += n[stat] except TypeError: # raise Exception("statSum[stat] should be number %s %s" % (statSum[stat], stat, n[stat])) print "ERROR: statSum[stat] should be number %s %s %s" % (statSum[stat], stat, n[stat]) # do nothing else: try: statSum[stat] = n[stat] + 0.0 except TypeError: pass # ignore non-numbers trials += 1 if trials%5 == 0: check_sandbox_for_errors() time.sleep(retryDelaySecs) if not h2o_args.no_timeout and ((time.time() - start) > timeoutSecs): raise Exception("Timeout while polling in pollStatsWhileBusy: %s seconds" % timeoutSecs) # now print man print "Did %s polls" % polls statMean = {} print "Values are summed across all nodes (cloud members), so divide by node count" for s in statSum: statMean[s] = round((statSum[s] + 0.0) / polls, 2) print "per poll mean", s + ':', statMean[s] return statMean
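# Hedged illustration of the "probably GC" heuristic above: a node's usedMemBytes dropping by more
# than 5% between polls is reported as a likely GC. The byte counts below are made up.
lastUsedExample = 8 * 1024 * 1024 * 1024       # 8 GB used at the previous poll
usedExample = 6 * 1024 * 1024 * 1024           # 6 GB used now
decreaseExample = round((0.0 + lastUsedExample - usedExample) / lastUsedExample, 3)
print decreaseExample                          # 0.25
if decreaseExample > .05:
    print "Probably GC: usedMemBytes decreased by %s pct" % (100 * decreaseExample)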
def pollWaitJobs(pattern=None, errorIfCancelled=False, timeoutSecs=60, pollTimeoutSecs=60, retryDelaySecs=5, benchmarkLogging=None, stallForNJobs=None): wait = True waitTime = 0 ignoredJobs = set() while (wait): a = h2o_nodes.nodes[0].jobs(timeoutSecs=pollTimeoutSecs) verboseprint("jobs():", dump_json(a)) jobs = a['jobs'] busy = 0 for j in jobs: cancelled = j['status']=='CANCELLED' description = j['description'] key = j['key'] jobKey = key['name'] jobKeyType = key['type'] # "key": { # "URL": "/3/Jobs.json/$0301c0a8002232d4ffffffff$_95036c2ef3f74468c63861fd826149c2", # "__meta": { # "schema_name": "JobKeyV1", # "schema_type": "Key<Job>", # "schema_version": 1 # }, # "name": "$0301c0a8002232d4ffffffff$_95036c2ef3f74468c63861fd826149c2", # "type": "Key<Job>" # progress = j['progress'] progress_msg = j['progress_msg'] # has exception and val? start_time = j['start_time'] end_time = j.get('end_time', None) dest = j['dest'] description = j['description'] msec = j.get('msec', None) # for now, don't ignore any exceptions # FIX! what do exceptions look like now? if 'exception' in j and j['exception']: check_sandbox_for_errors() msg = "ERROR: pollWaitJobs found a job with a exception result when it shouldn't have:\n %s" % dump_json(j) raise Exception(msg) if errorIfCancelled and cancelled: check_sandbox_for_errors() print ("ERROR: not stopping, but: pollWaitJobs found a cancelled job when it shouldn't have:\n %s" % dump_json(j)) print ("Continuing so maybe a json response will give more info") ### verboseprint(j) # don't include cancelled jobs here elif j['status']!='DONE': if not pattern: # always print progress if busy job (no pattern used print "time:", time.strftime("%I:%M:%S"), "progress:", progress, dest verboseprint("description:", description, "end_time:", end_time) busy +=1 verboseprint("pollWaitJobs: found a busy job, now: %s" % busy) else: if (pattern in key) or (pattern in dest) or (pattern in description): ## print "description:", description, "end_time:", end_time busy += 1 verboseprint("pollWaitJobs: found a pattern-matched busy job, now %s" % busy) # always print progress if pattern is used and matches print "time:", time.strftime("%I:%M:%S"), "progress:", progress, dest # we only want to print the warning message once elif key not in ignoredJobs: jobMsg = "%s %s %s" % (key, description, dest) verboseprint(" %s job in progress but we're ignoring it. Doesn't match pattern." % jobMsg) # I guess "key" is supposed to be unique over all time for a job id? ignoredJobs.add(key) if stallForNJobs: waitFor = stallForNJobs else: waitFor = 0 print " %s jobs in progress." % busy, "Waiting until %s in progress." % waitFor wait = busy > waitFor if not wait: break ### h2b.browseJsonHistoryAsUrlLastMatch("Jobs") if not h2o_args.no_timeout and (wait and waitTime > timeoutSecs): print dump_json(jobs) raise Exception("Some queued jobs haven't completed after", timeoutSecs, "seconds") sys.stdout.write('.') sys.stdout.flush() time.sleep(retryDelaySecs) waitTime += retryDelaySecs # any time we're sitting around polling we might want to save logging info (cpu/disk/jstack) # test would pass ['cpu','disk','jstack'] kind of list if benchmarkLogging: h2o.cloudPerfH2O.get_log_save(benchmarkLogging) # check the sandbox for stack traces! just like we do when polling normally check_sandbox_for_errors() patternKeys = [] for j in jobs: # save the destination keys in progress that match pattern (for returning) if pattern and pattern in j['dest']: patternKeys.append(j['dest']) return patternKeys
def build_cloud(node_count=1, base_port=None, hosts=None, timeoutSecs=30, retryDelaySecs=1, cleanup=True, rand_shuffle=True, conservative=False, create_json=False, clone_cloud=None, init_sandbox=True, usecloud=False, usecloud_size=None, **kwargs): # expectedSize is only used if usecloud # usecloud can be passed thru build_cloud param, or command line # not in config json though so no build_cloud_with_hosts path. # redirect to build_cloud_with_json if a command line arg # wants to force a test to ignore it's build_cloud/build_cloud_with_hosts # (both come thru here) # clone_cloud is just another way to get the effect (maybe ec2 config file thru # build_cloud_with_hosts? global stdout_wrapped if not h2o_args.disable_time_stamp and not stdout_wrapped: sys.stdout = OutWrapper(sys.stdout) stdout_wrapped = True if h2o_args.usecloud or usecloud: # for now, just have fixed name in local file. (think of this as a temp or debug file) # eventually we'll pass the json object instead for speed? nodesJsonPathname = "h2o_fc-nodes.json" elif h2o_args.clone_cloud_json: nodesJsonPathname = h2o_args.clone_cloud_json elif clone_cloud: nodesJsonPathname = clone_cloud else: # normal build_cloud() doesn't use nodesJsonPathname = None # usecloud dominates over all if (h2o_args.clone_cloud_json or clone_cloud) or (h2o_args.usecloud or usecloud): # then build_cloud_with_json with json object # we don't need to specify these defaults, but leave here to show that we can pass # I suppose kwargs will have it if h2o_args.usecloud: ip_port = h2o_args.usecloud elif usecloud: ip_port = usecloud else: ip_port = None # h2o_args dominates if h2o_args.usecloud_size: # only used for expected size useCloudExpectedSize = h2o_args.usecloud_size else: useCloudExpectedSize = usecloud_size nodesJsonObject = h2o_fc.find_cloud(ip_port=ip_port, expectedSize=useCloudExpectedSize, nodesJsonPathname=nodesJsonPathname, **kwargs) # potentially passed in kwargs # hdfs_version='cdh4', hdfs_config=None, hdfs_name_node='172.16.1.176', nodeList = build_cloud_with_json(h2o_nodes_json=nodesJsonPathname) return nodeList # else # moved to here from unit_main. so will run with nosetests too! # Normally do this. # Don't if build_cloud_with_hosts() did and put a flatfile in there already! if init_sandbox: clean_sandbox() log("#*********************************************************************") log("Starting new test: " + h2o_args.python_test_name + " at build_cloud() ") log("#*********************************************************************") # start up h2o to report the java version (once). output to python stdout # only do this for regression testing # temporarily disable this, to go a little faster # if getpass.getuser() == 'jenkins': # check_h2o_version() ports_per_node = 2 nodeList = [] # shift the port used to run groups of tests on the same machine at the same time? base_port = get_base_port(base_port) try: # if no hosts list, use psutil method on local host. 
totalNodes = 0 # doing this list outside the loops so we can shuffle for better test variation # this jvm startup shuffle is independent from the flatfile shuffle portList = [base_port + ports_per_node * i for i in range(node_count)] if hosts is None: # if use_flatfile, we should create it # because tests will just call build_cloud with use_flatfile=True # best to just create it all the time..may or may not be used write_flatfile(node_count=node_count, base_port=base_port) hostCount = 1 if rand_shuffle: random.shuffle(portList) for p in portList: verboseprint("psutil starting node", i) newNode = LocalH2O(port=p, node_id=totalNodes, **kwargs) nodeList.append(newNode) totalNodes += 1 else: # if hosts, the flatfile was created and uploaded to hosts already # I guess don't recreate it, don't overwrite the one that was copied beforehand. # we don't always use the flatfile (use_flatfile=False) # Suppose we could dispatch from the flatfile to match it's contents # but sometimes we want to test with a bad/different flatfile then we invoke h2o? hostCount = len(hosts) hostPortList = [] for h in hosts: for port in portList: hostPortList.append((h, port)) if rand_shuffle: random.shuffle(hostPortList) for (h, p) in hostPortList: verboseprint('ssh starting node', totalNodes, 'via', h) newNode = h.remote_h2o(port=p, node_id=totalNodes, **kwargs) nodeList.append(newNode) totalNodes += 1 verboseprint("Attempting Cloud stabilize of", totalNodes, "nodes on", hostCount, "hosts") start = time.time() # UPDATE: best to stabilize on the last node! stabilize_cloud(nodeList[0], nodeList, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, noSandboxErrorCheck=True) verboseprint(len(nodeList), "Last added node stabilized in ", time.time() - start, " secs") verboseprint("Built cloud: %d nodes on %d hosts, in %d s" % \ (len(nodeList), hostCount, (time.time() - start))) h2p.red_print("Built cloud:", nodeList[0].java_heap_GB, "GB java heap(s) with", len(nodeList), "total nodes") # FIX! using "consensus" in node[-1] should mean this is unnecessary? # maybe there's a bug. For now do this. long term: don't want? # UPDATE: do it for all cases now 2/14/13 if conservative: # still needed? for n in nodeList: stabilize_cloud(n, nodeList, timeoutSecs=timeoutSecs, noSandboxErrorCheck=True) # this does some extra checking now # verifies cloud name too if param is not None verify_cloud_size(nodeList, expectedCloudName=nodeList[0].cloud_name) # best to check for any errors due to cloud building right away? check_sandbox_for_errors(python_test_name=h2o_args.python_test_name) except: # nodeList might be empty in some exception cases? # no shutdown issued first, though if cleanup and nodeList: for n in nodeList: n.terminate() check_sandbox_for_errors(python_test_name=h2o_args.python_test_name) raise print len(nodeList), "total jvms in H2O cloud" # put the test start message in the h2o log, to create a marker nodeList[0].h2o_log_msg() if h2o_args.config_json: LOG_DIR = get_sandbox_name() # like cp -p. 
Save the config file, to sandbox print "Saving the ", h2o_args.config_json, "we used to", LOG_DIR shutil.copy(h2o_args.config_json, LOG_DIR + "/" + os.path.basename(h2o_args.config_json)) # Figure out some stuff about how this test was run cs_time = str(datetime.datetime.now()) cs_cwd = os.getcwd() cs_python_cmd_line = "python %s %s" % (h2o_args.python_test_name, h2o_args.python_cmd_args) cs_python_test_name = h2o_args.python_test_name if h2o_args.config_json: cs_config_json = os.path.abspath(h2o_args.config_json) else: cs_config_json = None cs_username = h2o_args.python_username cs_ip = h2o_args.python_cmd_ip # dump the nodes state to a json file # include enough extra info to have someone # rebuild the cloud if a test fails that was using that cloud. if create_json: q = { 'cloud_start': { 'time': cs_time, 'cwd': cs_cwd, 'python_test_name': cs_python_test_name, 'python_cmd_line': cs_python_cmd_line, 'config_json': cs_config_json, 'username': cs_username, 'ip': cs_ip, }, 'h2o_nodes': h2o_util.json_repr(nodeList), } with open('h2o-nodes.json', 'w+') as f: f.write(json.dumps(q, indent=4)) # save it to a local global copy, in case it's needed for tearDown h2o_nodes.nodes[:] = nodeList return nodeList
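# Hedged usage sketch for build_cloud(): build a small local cloud, poke at it, then terminate the
# jvms and check the sandbox. java_heap_GB is assumed to be accepted via **kwargs (it is what the
# node objects report as java_heap_GB above); a real test would normally use the framework's own
# teardown helper rather than the explicit terminate() loop shown here.
def example_build_cloud_usage():
    nodeList = build_cloud(node_count=2, java_heap_GB=1, create_json=True)
    try:
        print len(nodeList), "nodes,", nodeList[0].java_heap_GB, "GB java heap each"
    finally:
        for n in nodeList:
            n.terminate()
        check_sandbox_for_errors(python_test_name=h2o_args.python_test_name)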
def build_cloud(node_count=1, base_port=None, hosts=None, timeoutSecs=30, retryDelaySecs=1, cleanup=True, rand_shuffle=True, conservative=False, create_json=False, clone_cloud=None, init_sandbox=True, usecloud=False, usecloud_size=None, **kwargs): # expectedSize is only used if usecloud # usecloud can be passed thru build_cloud param, or command line # not in config json though so no build_cloud_with_hosts path. # redirect to build_cloud_with_json if a command line arg # wants to force a test to ignore it's build_cloud/build_cloud_with_hosts # (both come thru here) # clone_cloud is just another way to get the effect (maybe ec2 config file thru # build_cloud_with_hosts? global stdout_wrapped if not h2o_args.disable_time_stamp and not stdout_wrapped: sys.stdout = OutWrapper(sys.stdout) stdout_wrapped = True if h2o_args.usecloud or usecloud: # for now, just have fixed name in local file. (think of this as a temp or debug file) # eventually we'll pass the json object instead for speed? nodesJsonPathname = "h2o_fc-nodes.json" elif h2o_args.clone_cloud_json: nodesJsonPathname = h2o_args.clone_cloud_json elif clone_cloud: nodesJsonPathname = clone_cloud else: # normal build_cloud() doesn't use nodesJsonPathname = None # usecloud dominates over all if (h2o_args.clone_cloud_json or clone_cloud) or (h2o_args.usecloud or usecloud): # then build_cloud_with_json with json object # we don't need to specify these defaults, but leave here to show that we can pass # I suppose kwargs will have it if h2o_args.usecloud: ip_port = h2o_args.usecloud elif usecloud: ip_port = usecloud else: ip_port = None # h2o_args dominates if h2o_args.usecloud_size: # only used for expected size useCloudExpectedSize = h2o_args.usecloud_size else: useCloudExpectedSize = usecloud_size if (h2o_args.usecloud or usecloud): nodesJsonObject = h2o_fc.find_cloud(ip_port=ip_port, expectedSize=useCloudExpectedSize, nodesJsonPathname=nodesJsonPathname, **kwargs) # potentially passed in kwargs # hdfs_version='cdh4', hdfs_config=None, hdfs_name_node='172.16.1.176', else: if h2o_args.clone_cloud_json: nodesJsonPathname = h2o_args.clone_cloud_json else: nodesJsonPathname = clone_cloud nodeList = build_cloud_with_json(h2o_nodes_json=nodesJsonPathname) return nodeList # else # moved to here from unit_main. so will run with nosetests too! # Normally do this. # Don't if build_cloud_with_hosts() did and put a flatfile in there already! if init_sandbox: clean_sandbox() log("#*********************************************************************") log("Starting new test: " + h2o_args.python_test_name + " at build_cloud() ") log("#*********************************************************************") # start up h2o to report the java version (once). output to python stdout # only do this for regression testing # temporarily disable this, to go a little faster # if getpass.getuser() == 'jenkins': # check_h2o_version() ports_per_node = 2 nodeList = [] # shift the port used to run groups of tests on the same machine at the same time? base_port = get_base_port(base_port) try: # if no hosts list, use psutil method on local host. 
totalNodes = 0 # doing this list outside the loops so we can shuffle for better test variation # this jvm startup shuffle is independent from the flatfile shuffle portList = [base_port + ports_per_node * i for i in range(node_count)] if hosts is None: # if use_flatfile, we should create it # because tests will just call build_cloud with use_flatfile=True # best to just create it all the time..may or may not be used write_flatfile(node_count=node_count, base_port=base_port) hostCount = 1 if rand_shuffle: random.shuffle(portList) for p in portList: verboseprint("psutil starting node", i) newNode = LocalH2O(port=p, node_id=totalNodes, **kwargs) nodeList.append(newNode) totalNodes += 1 else: # if hosts, the flatfile was created and uploaded to hosts already # I guess don't recreate it, don't overwrite the one that was copied beforehand. # we don't always use the flatfile (use_flatfile=False) # Suppose we could dispatch from the flatfile to match it's contents # but sometimes we want to test with a bad/different flatfile then we invoke h2o? hostCount = len(hosts) hostPortList = [] for h in hosts: for port in portList: hostPortList.append((h, port)) if rand_shuffle: random.shuffle(hostPortList) for (h, p) in hostPortList: verboseprint('ssh starting node', totalNodes, 'via', h) newNode = h.remote_h2o(port=p, node_id=totalNodes, **kwargs) nodeList.append(newNode) totalNodes += 1 verboseprint("Attempting Cloud stabilize of", totalNodes, "nodes on", hostCount, "hosts") start = time.time() # UPDATE: best to stabilize on the last node! # FIX! for now, always check sandbox, because h2oddev has TIME_WAIT port problems stabilize_cloud(nodeList[0], nodeList, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, noExtraErrorCheck=False) stabilizeTime = time.time() - start verboseprint(len(nodeList), "Last added node stabilized in ", stabilizeTime, " secs") # assume all the heap sizes are the same as zero if nodeList[0].java_heap_GB: heapSize = str(nodeList[0].java_heap_GB) + " GB" elif nodeList[0].java_heap_GB: heapSize = str(nodeList[0].java_heap_MB) + " MB" else: heapSize = "(unknown)" h2p.red_print("Built cloud: %s java heap(s) with %d nodes on %d hosts, stabilizing in %d secs" % \ (heapSize, len(nodeList), hostCount, stabilizeTime)) # FIX! using "consensus" in node[-1] should mean this is unnecessary? # maybe there's a bug. For now do this. long term: don't want? # UPDATE: do it for all cases now 2/14/13 if conservative: # still needed? for n in nodeList: # FIX! for now, always check sandbox, because h2oddev has TIME_WAIT port problems stabilize_cloud(n, nodeList, timeoutSecs=timeoutSecs, noExtraErrorCheck=False) # this does some extra checking now # verifies cloud name too if param is not None verify_cloud_size(nodeList, expectedCloudName=nodeList[0].cloud_name, expectedLocked=0) # FIX! should probably check that the cloud's lock=0. It will go to 1 later. # but if it's an existing cloud, it may already be locked. # That will be in build_cloud_with_json, though # best to check for any errors due to cloud building right away? check_sandbox_for_errors(python_test_name=h2o_args.python_test_name) # put the test start message in the h2o log, to create a marker nodeList[0].h2o_log_msg() except: # nodeList might be empty in some exception cases? # no shutdown issued first, though if cleanup and nodeList: for n in nodeList: n.terminate() check_sandbox_for_errors(python_test_name=h2o_args.python_test_name) raise print len(nodeList), "total jvms in H2O cloud" if h2o_args.config_json: # like cp -p. 
Save the config file, to sandbox print "Saving the ", h2o_args.config_json, "we used to", LOG_DIR shutil.copy(h2o_args.config_json, LOG_DIR + "/" + os.path.basename(h2o_args.config_json)) if create_json: # Figure out some stuff about how this test was run cs_time = str(datetime.datetime.now()) cs_cwd = os.getcwd() cs_python_cmd_line = "python %s %s" % (h2o_args.python_test_name, h2o_args.python_cmd_args) cs_python_test_name = h2o_args.python_test_name if h2o_args.config_json: cs_config_json = os.path.abspath(h2o_args.config_json) else: cs_config_json = None cs_username = h2o_args.python_username cs_ip = h2o_args.python_cmd_ip # dump the nodes state to a json file # include enough extra info to have someone # rebuild the cloud if a test fails that was using that cloud. q = { 'cloud_start': { 'time': cs_time, 'cwd': cs_cwd, 'python_test_name': cs_python_test_name, 'python_cmd_line': cs_python_cmd_line, 'config_json': cs_config_json, 'username': cs_username, 'ip': cs_ip, }, 'h2o_nodes': h2o_util.json_repr(nodeList), } with open('h2o-nodes.json', 'w+') as f: f.write(json.dumps(q, indent=4)) # save it to a local global copy, in case it's needed for tearDown h2o_nodes.nodes[:] = nodeList return nodeList
def do_json_request(self, jsonRequest=None, fullUrl=None, timeout=10, params=None, returnFast=False, cmd='get', extraComment=None, ignoreH2oError=False, noExtraErrorCheck=False, **kwargs): # if url param is used, use it as full url. otherwise crate from the jsonRequest if fullUrl: url = fullUrl else: url = self.url(jsonRequest) # remove any params that are 'None' # need to copy dictionary, since can't delete while iterating if params is not None: params2 = params.copy() for k in params2: if params2[k] is None: del params[k] paramsStr = '?' + '&'.join( ['%s=%s' % (k, v) for (k, v) in params.items()]) else: paramsStr = '' if extraComment: log('Start ' + url + paramsStr, comment=extraComment) else: log('Start ' + url + paramsStr) log_rest("") log_rest( "----------------------------------------------------------------------\n" ) if extraComment: log_rest("# Extra comment info about this request: " + extraComment) if cmd == 'get': log_rest("GET") else: log_rest("POST") log_rest(url + paramsStr) # file get passed thru kwargs here try: if cmd == 'post': r = requests.post(url, timeout=timeout, params=params, **kwargs) else: r = requests.get(url, timeout=timeout, params=params, **kwargs) except Exception, e: # rethrow the exception after we've checked for stack trace from h2o # out of memory errors maybe don't show up right away? so we should wait for h2o # to get it out to h2o stdout. We don't want to rely on cloud teardown to check # because there's no delay, and we don't want to delay all cloud teardowns by waiting. exc_info = sys.exc_info() # use this to ignore the initial connection errors during build cloud when h2o is coming up if not noExtraErrorCheck: h2p.red_print( "ERROR: got exception on %s to h2o. \nGoing to check sandbox, then rethrow.." % (url + paramsStr)) time.sleep(2) check_sandbox_for_errors( python_test_name=h2o_args.python_test_name) log_rest("") log_rest("EXCEPTION CAUGHT DOING REQUEST: " + str(e.message)) raise exc_info[1], None, exc_info[2]
def simpleCheckKMeans(self, modelResult, parameters, numRows, numCols, labels): # labels should have the ignored columns removed # numCols should be decremented by the ignored columns # the names order should then match the labels order output = modelResult['models'][0]['output'] # print "model output:", dump_json(output) # find out what results we get ko = KMeansOutput(output) if 1==0: for attr, value in ko.__dict__.iteritems(): # create some python prints to use print "%s = ko.%s # %s" % (attr, attr, value) # these should sum to the rows in the dataset rows = ko.rows # [78, 5, 41, 76] model_category = ko.model_category # Clustering iters = ko.iters # 11.0 schema_version = ko.schema_version # 2 domains = ko.domains # [None, None, None, None, None, None, None, None, None, None, None, None, None, None] names = ko.names # [u'STR', u'OBS', u'AGMT', u'FNDX', u'HIGD', u'DEG', u'CHK', u'AGP1', u'AGMN', u'NLV', u'LIV', u'WT', u'AGLP', u'MST'] schema_name = ko.schema_name # KMeansModelOutputV2 schema_type = ko.schema_type # KMeansOutput ncats = ko.ncats # 0 clusters = ko.clusters # [ 4 lists of centers ] mse = ko.mse # 505.632581773 mses = ko.mses # [476.37866653867707, 563.7343365736649, 606.3007046232348, 477.5260498976912] if numRows: assert numRows==sum(rows) if 'K' in parameters: K = parameters['K'] assert len(mses) == K assert len(clusters) == K assert len(rows) == K if numCols: assert len(names) == numCols, \ "Need to pass correct numCols after ignored columns decrement %s %s" % (len(names), numCols) for c in clusters: assert len(c) == numCols, "%s %s" % (len(c), numCols) # this should be true if labels: assert len(labels) == numCols, \ "Need to pass correct labels and numCols after ignored columns removal %s %s" % (len(labels), numCols) assert len(labels) == len(names), \ "Need to pass correct labels after ignored columns removal %s %s" % (len(labels), len(names)) assert labels == names if 'max_iters' in parameters: max_iters = parameters['max_iters'] assert max_iters >= iters # we could check the centers are within the min/max of each column for i,c in enumerate(clusters): for n in c: if math.isnan(float(n)): raise Exception("cluster", i, "has NaN:", n, "center:", c) # shouldn't have any errors check_sandbox_for_errors() # create a tuple for each cluster result, then sort by rows for easy comparison # maybe should sort by centers? # put a cluster index in there too, (leftmost) so we don't lose track tuples = zip(range(len(clusters)), mses, rows, clusters) tuplesSorted = sorted(tuples, key=itemgetter(3)) # undo for printing what the caller will see ids, mses, rows, clusters = zip(*tuplesSorted) print "\nmse:", mse print "iters:", iters print "ids:", ids print "mses:", mses print "rows:", rows for i,c in enumerate(clusters): print "cluster id %s (2 places):" % ids[i], h2o_util.twoDecimals(c) # to unzip the tuplesSorted. zip with * # ids, mses, rows, clusters = zip(*tuplesSorted) return tuplesSorted, iters, mse, names
def simpleCheckGLM( self, model, parameters, labelList, labelListUsed, allowFailWarning=False, allowZeroCoeff=False, prettyPrint=False, noPrint=False, maxExpectedIterations=None, doNormalized=False, allowNaN=False, ): # FIX! the structure is all different return warnings = "" # binomial = model.binomial residual_deviance = model.training_metrics.residual_deviance threshold = model.training_metrics.threshold check_obj_has_good_numbers(threshold, "threshold", allowNaN=allowNaN) auc = model.AUC # NaN if not logistic # check_obj_has_good_numbers(auc, 'model.AUC') best_lambda_idx = model.best_lambda_idx model_category = model.model_category name = model.name residual_degrees_of_freedom = model.residual_degrees_of_freedom # is this no longer used? coefficients_magnitude = model.coefficients_magnitude null_deviance = model.null_deviance check_obj_has_good_numbers(null_deviance, "model.null_deviance", allowNaN=allowNaN) null_degrees_of_freedom = model.null_degrees_of_freedom check_obj_has_good_numbers(null_degrees_of_freedom, "model.null_degrees_of_freedom", allowNaN=allowNaN) domains = model.domains # when is is this okay to be NaN? AIC = model.AIC check_obj_has_good_numbers(AIC, "model.AIC", allowNaN=allowNaN) names = model.names coeffs_names = model.coefficients_table.data[0] # these are returned as quoted strings. Turn them into numbers temp = model.coefficients_table.data[1] assert len(coeffs_names) == len(temp), "%s %s" % (len(coeffs_names), len(temp)) # we need coefficients to be floats or empty check_obj_has_good_numbers(temp, "model.coeffs", allowNaN=False) # print "temp", temp[0:10] # print "temp[5489:5500]", temp[5489:5500] # UPDATE: None (null json) is legal for coeffs coeffs = map(lambda x: float(x) if (x is not None and str(x) != "") else 0, temp) intercept = coeffs[-1] interceptName = coeffs_names[-1] assert interceptName == "Intercept" assert len(coeffs) == len(coeffs_names), "%s %s" % (len(coeffs), len(coeffs_names)) # FIX! if a coeff is zeroed/ignored, it doesn't show up? # get rid of intercept in glm response # assert (len(coeffs)-1) == len(labelListUsed, \ # "%s %s %s %s" % (len(coeffs), len(labelListUsed), coeffs, labelListUsed) # labelList still has the response column? # ignored columns aren't in model.names, but output response is. # labelListUsed has the response col removed so add 1 # Hmm..dropped coefficients again? can't do this check? # assert len(model.names) == len(labelListUsed), \ # "%s %s %s %s" % (len(model.names), len(labelListUsed), model.names, labelList) # this is no longer true! # assert model.threshold!=0 print "len(coeffs)", len(coeffs) print "coeffs:", coeffs # last one is intercept if interceptName != "Intercept" or abs(intercept) < 1e-26: raise Exception("'Intercept' should be last in coeffs_names %s %s" % (interceptName, intercept)) y = parameters["response_column"] cString = "\n" for i, c in enumerate(coeffs_names): cString += "%s: %.5e " % (coeffs_names[i], coeffs[i]) print cString print "\nH2O intercept:\t\t%.5e" % intercept print "\nTotal # of coeffs:", len(coeffs_names) # intercept is buried in there too absIntercept = abs(float(intercept)) self.assertGreater( absIntercept, 1e-26, ( "abs. value of GLM coeffs['Intercept'] is " + str(absIntercept) + ", not >= 1e-26 for Intercept" + "\n" + "parameters:" + dump_json(parameters) ), ) if (not allowZeroCoeff) and (len(coeffs) > 1): s = 0.0 for c in coeffs: s += abs(float(c)) self.assertGreater( s, 1e-26, ( "sum of abs. 
value of GLM coeffs/intercept is " + str(s) + ", not >= 1e-26\n" + "parameters:" + dump_json(parameters) ), ) # shouldn't have any errors check_sandbox_for_errors() return (warnings, coeffs, intercept)
def simpleCheckGLM(self, model, parameters, labelList, labelListUsed, allowFailWarning=False, allowZeroCoeff=False, prettyPrint=False, noPrint=False, maxExpectedIterations=None, doNormalized=False): warnings = '' intercept = model.global_beta[-1] interceptName = model.coefficient_names[-1] coeffs = model.global_beta[:-1] coeffs_names = model.coefficient_names[:-1] assert len(coeffs) == (len(model.coefficient_names) - 1) assert len(coeffs) == len(labelListUsed), "%s %s" % (coeffs, labelListUsed) # labelList still has the response column? # ignored columns aren't in model.names, but output response is. # labelListUsed has the response col removed so add 1 assert len(model.names) == (len(labelListUsed) + 1), "%s %s" % (model.names, labelList) assert model.threshold != 0 print "len(coeffs)", len(coeffs) print "coeffs:", coeffs # last one is intercept if interceptName != "Intercept" or abs(intercept) < 1e-26: raise Exception( "'Intercept' should be last in coefficient_names and global_beta %s %s" % (interceptName, intercept)) y = parameters['response_column'] cString = "\n" for i, c in enumerate(coeffs_names): cString += "%s: %.5e " % (coeffs_names[i], coeffs[i]) print cString print "\nH2O intercept:\t\t%.5e" % intercept print "\nTotal # of coeffs:", len(coeffs_names) # intercept is buried in there too absIntercept = abs(float(intercept)) self.assertGreater(absIntercept, 1e-26, ("abs. value of GLM coeffs['Intercept'] is " + str(absIntercept) + ", not >= 1e-26 for Intercept" + "\n" + "parameters:" + dump_json(parameters))) if (not allowZeroCoeff) and (len(coeffs) > 1): s = 0.0 for c in coeffs: s += abs(float(c)) self.assertGreater( s, 1e-26, ("sum of abs. value of GLM coeffs/intercept is " + str(s) + ", not >= 1e-26\n" + "parameters:" + dump_json(parameters))) # shouldn't have any errors check_sandbox_for_errors() return (warnings, coeffs, intercept)
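# Hedged illustration of the intercept handling above: GLM reports the intercept as the last entry
# of both global_beta and coefficient_names, so slice it off the end. The names and numbers below
# are made up.
example_global_beta = [0.3, -1.2, 0.0, 2.5]
example_coefficient_names = ["sepal_len", "sepal_wid", "petal_len", "Intercept"]
example_intercept = example_global_beta[-1]            # 2.5
example_coeffs = example_global_beta[:-1]              # [0.3, -1.2, 0.0]
example_coeffs_names = example_coefficient_names[:-1]
assert example_coefficient_names[-1] == "Intercept"
for name, c in zip(example_coeffs_names, example_coeffs):
    print "%s: %.5e" % (name, c)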
def oldSimpleCheckGLM(self, glm, colX, allowFailWarning=False, allowZeroCoeff=False, prettyPrint=False, noPrint=False, maxExpectedIterations=None, doNormalized=False, **kwargs): # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter # h2o GLM will verboseprint the result and print errors. # so don't have to do that # different when cross validation is used? No trainingErrorDetails? GLMModel = glm['glm_model'] if not GLMModel: raise Exception("GLMModel didn't exist in the glm response? %s" % dump_json(glm)) warnings = None if 'warnings' in GLMModel and GLMModel['warnings']: warnings = GLMModel['warnings'] # stop on failed x = re.compile("failed", re.IGNORECASE) # don't stop if fail to converge c = re.compile("converge", re.IGNORECASE) for w in warnings: print "\nwarning:", w if re.search(x,w) and not allowFailWarning: if re.search(c,w): # ignore the fail to converge warning now pass else: # stop on other 'fail' warnings (are there any? fail to solve? raise Exception(w) # for key, value in glm.iteritems(): print key # not in GLMGrid? # FIX! don't get GLMParams if it can't solve? GLMParams = GLMModel['glm'] family = GLMParams["family"] # number of submodels = number of lambda # min of 2. lambda_max is first submodels = GLMModel['submodels'] # since all our tests?? only use one lambda, the best_lamda_idx should = 1 best_lambda_idx = GLMModel['best_lambda_idx'] print "best_lambda_idx:", best_lambda_idx lambda_max = GLMModel['lambda_max'] print "lambda_max:", lambda_max # currently lambda_max is not set by tomas. ..i.e.not valid if 1==0 and (lambda_max <= submodels[best_lambda_idx].lambda_value): raise Exception("lambda_max %s should always be > the lambda result %s we're checking" % (lambda_max, submodels[best_lambda_idx].lambda_value)) # submodels0 = submodels[0] # submodels1 = submodels[-1] # hackery to make it work when there's just one if (best_lambda_idx >= len(submodels)) or (best_lambda_idx < 0): raise Exception("best_lambda_idx: %s should point to one of lambdas (which has len %s)" % (best_lambda_idx, len(submodels))) if (best_lambda_idx >= len(submodels)) or (best_lambda_idx < 0): raise Exception("best_lambda_idx: %s should point to one of submodels (which has len %s)" % (best_lambda_idx, len(submodels))) submodels1 = submodels[best_lambda_idx] # hackery to make it work when there's just one iterations = submodels1['iteration'] print "GLMModel/iterations:", iterations # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter if maxExpectedIterations is not None and iterations > maxExpectedIterations: raise Exception("Convergence issue? GLM did iterations: %d which is greater than expected: %d" % (iterations, maxExpectedIterations) ) if 'validation' not in submodels1: raise Exception("Should be a 'validation' key in submodels1: %s" % dump_json(submodels1)) validationsList = submodels1['validation'] validations = validationsList # xval. compare what we asked for and what we got. n_folds = kwargs.setdefault('n_folds', None) print "GLMModel/validations" validations['null_deviance'] = h2o_util.cleanseInfNan(validations['null_deviance']) validations['residual_deviance'] = h2o_util.cleanseInfNan(validations['residual_deviance']) print "%15s %s" % ("null_deviance:\t", validations['null_deviance']) print "%15s %s" % ("residual_deviance:\t", validations['residual_deviance']) # threshold only there if binomial? 
    # auc only for binomial
    if family == "binomial":
        print "%15s %s" % ("auc:\t", validations['auc'])
        best_threshold = validations['best_threshold']
        thresholds = validations['thresholds']
        print "%15s %s" % ("best_threshold:\t", best_threshold)

        # have to look up the index for the cm, from the thresholds list
        best_index = None
        for i, t in enumerate(thresholds):
            if t >= best_threshold: # ends up using the next one if not present
                best_index = i
                break
        assert best_index is not None, "%s %s" % (best_threshold, thresholds)
        print "Now printing the right 'best_threshold' %s from '_cms'" % best_threshold

        # cm = glm['glm_model']['submodels'][0]['validation']['_cms'][-1]
        submodels = glm['glm_model']['submodels']
        # FIX! this isn't right if we have multiple lambdas? different submodels?
        cms = submodels[0]['validation']['_cms']
        self.assertEqual(len(thresholds), len(cms),
            msg="thresholds %s and cm %s should be lists of the same size. %s" % (len(thresholds), len(cms), thresholds))
        # FIX! best_threshold isn't necessarily in the list. jump out if >=
        assert best_index < len(cms), "%s %s" % (best_index, len(cms))
        # if we want 0.5..rounds to int
        # mid = len(cms)/2
        # cm = cms[mid]
        cm = cms[best_index]

        print "cm:", dump_json(cm['_arr'])
        predErr = cm['_predErr']
        classErr = cm['_classErr']
        # compare to predErr
        # pctWrong = h2o_gbm.pp_cm_summary(cm['_arr'])
        # FIX!
        pctWrong = 0
        print "predErr:", predErr
        print "calculated pctWrong from cm:", pctWrong
        print "classErr:", classErr

        # self.assertLess(pctWrong, 9, "Should see less than 9% error (class = 4)")

        print "\nTrain\n==========\n"
        # print h2o_gbm.pp_cm(cm['_arr'])

    if family == "poisson" or family == "gaussian":
        print "%15s %s" % ("aic:\t", validations['aic'])

    coefficients_names = GLMModel['coefficients_names']
    # print "coefficients_names:", coefficients_names
    idxs = submodels1['idxs']
    print "idxs:", idxs
    coefficients_names = coefficients_names

    # always check both normalized and normal coefficients
    norm_beta = submodels1['norm_beta']
    # if norm_beta and len(coefficients_names) != len(norm_beta):
    #     print len(coefficients_names), len(norm_beta)
    #     raise Exception("coefficients_names and normalized_norm_beta from h2o json not same length. coefficients_names: %s normalized_norm_beta: %s" % (coefficients_names, norm_beta))
    #
    beta = submodels1['beta']
    # print "beta:", beta
    # if len(coefficients_names) != len(beta):
    #     print len(coefficients_names), len(beta)
    #     raise Exception("coefficients_names and beta from h2o json not same length. coefficients_names: %s beta: %s" % (coefficients_names, beta))

    # test wants to use normalized?
    if doNormalized:
        beta_used = norm_beta
    else:
        beta_used = beta

    coefficients = {}
    # create a dictionary with name, beta (including intercept) just like v1
    for i, b in zip(idxs, beta_used[:-1]):
        name = coefficients_names[i]
        coefficients[name] = b

    print "len(idxs)", len(idxs), "len(beta_used)", len(beta_used)
    print "coefficients:", coefficients
    print "beta:", beta
    print "norm_beta:", norm_beta

    coefficients['Intercept'] = beta_used[-1]

    print "len(coefficients_names)", len(coefficients_names)
    print "len(idxs)", len(idxs)
    print "idxs[-1]", idxs[-1]
    print "intercept demapping info:", \
        "coefficients_names[-1]:", coefficients_names[-1], \
        "idxs[-1]:", idxs[-1], \
        "coefficients_names[idxs[-1]]:", coefficients_names[idxs[-1]], \
        "beta_used[-1]:", beta_used[-1], \
        "coefficients['Intercept']", coefficients['Intercept']

    # last one is intercept
    interceptName = coefficients_names[idxs[-1]]
    if interceptName != "Intercept" or abs(beta_used[-1]) < 1e-26:
        raise Exception("'Intercept' should be last in coefficients_names and beta %s %s %s" % \
            (idxs[-1], beta_used[-1], "-" + interceptName + "-"))

    # idxs has the order for non-zero coefficients, it's shorter than beta_used and coefficients_names
    # new 5/28/14. glm can point to zero coefficients
    # for i in idxs:
    #     if beta_used[i]==0.0:
    #         raise Exception("idxs shouldn't point to any 0 coefficients i: %s %s:" % (i, beta_used[i]))
    if len(idxs) > len(beta_used):
        raise Exception("idxs shouldn't be longer than beta_used %s %s" % (len(idxs), len(beta_used)))
    intercept = coefficients.pop('Intercept', None)

    # intercept demapping info: idxs[-1]: 54 coefficients_names[idxs[-1]]: Intercept beta_used[-1]: -6.6866753099
    # the last one should be 'Intercept' ?
    coefficients_names.pop()

    # have to skip the output col! get it from kwargs
    # better always be there!
    y = kwargs['response']

    # the dict keys are column headers if they exist...how to order those? new: use the 'coefficients_names'
    # from the response
    # Tomas created 'coefficients_names' which is the coefficient list in order.
    # Just use it to index coefficients! works for header or no-header cases
    # I guess now we won't print the "None" cases for dropped columns (constant columns!)
    # Because Tomas doesn't get everything in 'coefficients_names' if dropped by GLMQuery before
    # he gets it?
    def add_to_coefficient_list_and_string(c, cList, cString):
        if c in coefficients:
            cValue = coefficients[c]
            cValueString = "%s: %.5e " % (c, cValue)
        else:
            print "Warning: didn't see '" + c + "' in json coefficient response.", \
                "Inserting 'None' with assumption it was dropped due to constant column."
            cValue = None
            cValueString = "%s: %s " % (c, cValue)

        cList.append(cValue)
        # we put each on newline for easy comparison to R..otherwise keep condensed
        if prettyPrint:
            cValueString = "H2O coefficient " + cValueString + "\n"
        # not mutable?
        return cString + cValueString

    # creating both a string for printing and a list of values
    cString = ""
    cList = []
    # print in order using col_names
    # coefficients_names is input only now..same for header or no header, or expanded enums
    for c in coefficients_names:
        cString = add_to_coefficient_list_and_string(c, cList, cString)

    if prettyPrint:
        print "\nH2O intercept:\t\t%.5e" % intercept
        print cString
    else:
        if not noPrint:
            print "\nintercept:", intercept, cString

    print "\nTotal # of coefficients:", len(coefficients_names)

    # pick out the coefficient for the column we enabled for enhanced checking. Can be None.
    # FIX! temporary hack to deal with disappearing/renaming columns in GLM
    if (not allowZeroCoeff) and (colX is not None):
        absXCoeff = abs(float(coefficients[str(colX)]))
        # add kwargs to help debug without looking at console log
        self.assertGreater(absXCoeff, 1e-26, (
            "abs. value of GLM coefficients['" + str(colX) + "'] is " +
            str(absXCoeff) + ", not >= 1e-26 for X=" + str(colX) + "\n" +
            "kwargs:" + dump_json(kwargs)
            ))

    # intercept is buried in there too
    absIntercept = abs(float(intercept))
    self.assertGreater(absIntercept, 1e-26, (
        "abs. value of GLM coefficients['Intercept'] is " +
        str(absIntercept) + ", not >= 1e-26 for Intercept" + "\n" +
        "kwargs:" + dump_json(kwargs)
        ))

    # this is good if we just want min or max
    # maxCoeff = max(coefficients, key=coefficients.get)
    # for more, just invert the dictionary and ...
    if (len(coefficients) > 0):
        maxKey = max([(abs(coefficients[x]), x) for x in coefficients])[1]
        print "H2O Largest abs. coefficient value:", maxKey, coefficients[maxKey]
        minKey = min([(abs(coefficients[x]), x) for x in coefficients])[1]
        print "H2O Smallest abs. coefficient value:", minKey, coefficients[minKey]
    else:
        print "Warning, no coefficients returned. Must be intercept only?"

    # many of the GLM tests aren't single column though.
    # quick and dirty check: if all the coefficients are zero,
    # something is broken
    # intercept is in there too, but this will get it okay
    # just sum the abs value up..look for greater than 0
    # skip this test if there is just one coefficient. Maybe pointing to a non-important coeff?
    if (not allowZeroCoeff) and (len(coefficients) > 1):
        s = 0.0
        for c in coefficients:
            v = coefficients[c]
            s += abs(float(v))

        self.assertGreater(s, 1e-26, (
            "sum of abs. value of GLM coefficients/intercept is " + str(s) + ", not >= 1e-26\n" +
            "kwargs:" + dump_json(kwargs)
            ))

    print "submodels1, run_time (milliseconds):", submodels1['run_time']

    # shouldn't have any errors
    check_sandbox_for_errors()

    return (warnings, cList, intercept)
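# --- Illustrative sketch (not called by the harness) ---
# A minimal, self-contained example of the idxs/beta "demapping" done above:
# 'idxs' lists the positions of the returned (possibly sparse) coefficients within
# 'coefficients_names', 'beta' carries the values, and the last beta entry is the
# intercept. The column names and numbers below are made up purely for illustration.
def _sketch_demap_coefficients():
    coefficients_names = ['AGE', 'RACE', 'PSA', 'GLEASON', 'Intercept']
    idxs = [0, 2, 3, 4]             # RACE dropped (e.g. a constant column)
    beta = [0.03, -1.2, 0.7, -6.7]  # last entry is the intercept
    coefficients = {}
    for i, b in zip(idxs, beta[:-1]):
        coefficients[coefficients_names[i]] = b
    coefficients['Intercept'] = beta[-1]
    # contains AGE: 0.03, PSA: -1.2, GLEASON: 0.7, Intercept: -6.7 (dict order may vary)
    print "demapped coefficients:", coefficients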
def pollWaitJobs(pattern=None, errorIfCancelled=False, timeoutSecs=60, pollTimeoutSecs=60, retryDelaySecs=5,
    benchmarkLogging=None, stallForNJobs=None):

    wait = True
    waitTime = 0
    ignoredJobs = set()
    while (wait):
        a = h2o_nodes.nodes[0].jobs(timeoutSecs=pollTimeoutSecs)
        verboseprint("jobs():", dump_json(a))
        jobs = a['jobs']
        busy = 0
        for j in jobs:
            cancelled = j['cancelled'] or (j['result'].get('val', None) == 'CANCELLED')
            description = j['description']
            destination_key = j['destination_key']
            end_time = j['end_time']
            key = j['key']
            progress = j['progress']
            # has exception and val?
            result = j['result']
            start_time = j['start_time']

            # for now, don't ignore any exceptions
            if 'exception' in result and result['exception']:
                check_sandbox_for_errors()
                msg = "ERROR: pollWaitJobs found a job with an exception result when it shouldn't have:\n %s" % dump_json(j)
                raise Exception(msg)

            if result:
                # ignore if 'val' is 'OK'
                if 'val' in result and result['val'] == 'OK':
                    pass
                else:
                    print "non-empty result: %s for %s" % (result, key)

            if errorIfCancelled and cancelled:
                check_sandbox_for_errors()
                print ("ERROR: not stopping, but: pollWaitJobs found a cancelled job when it shouldn't have:\n %s" % dump_json(j))
                print ("Continuing so maybe a json response will give more info")
                ### verboseprint(j)

            # don't include cancelled jobs here
            elif end_time == '' and not cancelled:
                if not pattern:
                    # always print progress if busy job (no pattern used)
                    print "time:", time.strftime("%I:%M:%S"), "progress:", progress, destination_key
                    verboseprint("description:", description, "end_time:", end_time)
                    busy += 1
                    verboseprint("pollWaitJobs: found a busy job, now: %s" % busy)
                else:
                    if (pattern in key) or (pattern in destination_key) or (pattern in description):
                        ## print "description:", description, "end_time:", end_time
                        busy += 1
                        verboseprint("pollWaitJobs: found a pattern-matched busy job, now %s" % busy)
                        # always print progress if pattern is used and matches
                        print "time:", time.strftime("%I:%M:%S"), "progress:", progress, destination_key
                    # we only want to print the warning message once
                    elif key not in ignoredJobs:
                        jobMsg = "%s %s %s" % (key, description, destination_key)
                        verboseprint(" %s job in progress but we're ignoring it. Doesn't match pattern." % jobMsg)
                        # I guess "key" is supposed to be unique over all time for a job id?
                        ignoredJobs.add(key)

        if stallForNJobs:
            waitFor = stallForNJobs
        else:
            waitFor = 0

        print " %s jobs in progress." % busy, "Waiting until %s in progress." % waitFor
        wait = busy > waitFor
        if not wait:
            break

        ### h2b.browseJsonHistoryAsUrlLastMatch("Jobs")
        if not h2o_args.no_timeout and (wait and waitTime > timeoutSecs):
            print dump_json(jobs)
            raise Exception("Some queued jobs haven't completed after %s seconds" % timeoutSecs)

        sys.stdout.write('.')
        sys.stdout.flush()
        time.sleep(retryDelaySecs)
        waitTime += retryDelaySecs

        # any time we're sitting around polling we might want to save logging info (cpu/disk/jstack)
        # test would pass ['cpu','disk','jstack'] kind of list
        if benchmarkLogging:
            h2o.cloudPerfH2O.get_log_save(benchmarkLogging)

        # check the sandbox for stack traces! just like we do when polling normally
        check_sandbox_for_errors()

    patternKeys = []
    for j in jobs:
        # save the destination keys in progress that match pattern (for returning)
        if pattern and pattern in j['destination_key']:
            patternKeys.append(j['destination_key'])

    return patternKeys
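# --- Illustrative sketch (not called by the harness) ---
# The polling pattern used by pollWaitJobs (poll, count busy jobs, sleep retryDelaySecs,
# give up after timeoutSecs) reduced to a tiny standalone helper. 'checkBusy' is a
# stand-in callable, not an h2o API; real tests call pollWaitJobs() directly after
# kicking off a job. The import is redundant for this module but keeps the sketch
# self-contained.
import time

def _sketch_poll_until_idle(checkBusy, timeoutSecs=60, retryDelaySecs=5):
    waitTime = 0
    while checkBusy():
        if waitTime > timeoutSecs:
            raise Exception("still busy after %s seconds" % timeoutSecs)
        time.sleep(retryDelaySecs)
        waitTime += retryDelaySecs
    return waitTime

# example: pretend the job finishes after three polls
# state = {'polls': 0}
# def fakeBusy():
#     state['polls'] += 1
#     return state['polls'] < 3
# print _sketch_poll_until_idle(fakeBusy, timeoutSecs=60, retryDelaySecs=0)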
def simpleCheckGBMView(node=None, gbmv=None, noPrint=False, **kwargs):
    if not node:
        node = h2o_nodes.nodes[0]

    if 'warnings' in gbmv:
        warnings = gbmv['warnings']
        # catch the 'Failed to converge' for now
        for w in warnings:
            if not noPrint:
                print "\nwarning:", w
            if ('Failed' in w) or ('failed' in w):
                raise Exception(w)

    if 'cm' in gbmv:
        cm = gbmv['cm'] # only one
    else:
        if 'gbm_model' in gbmv:
            gbm_model = gbmv['gbm_model']
        else:
            raise Exception("no gbm_model in gbmv? %s" % dump_json(gbmv))

        cms = gbm_model['cms']
        print "number of cms:", len(cms)
        print "FIX! need to add reporting of h2o's _perr per class error"
        # FIX! what if regression. is rf only classification?
        print "cms[-1]['_arr']:", cms[-1]['_arr']
        print "cms[-1]['_predErr']:", cms[-1]['_predErr']
        print "cms[-1]['_classErr']:", cms[-1]['_classErr']

        ## print "cms[-1]:", dump_json(cms[-1])
        ## for i,c in enumerate(cms):
        ##     print "cm %s: %s" % (i, c['_arr'])

        cm = cms[-1]['_arr'] # take the last one

    scoresList = cm

    used_trees = gbm_model['N']
    errs = gbm_model['errs']
    print "errs[0]:", errs[0]
    print "errs[-1]:", errs[-1]
    print "errs:", errs

    # if we got the ntree for comparison. Not always there in kwargs though!
    param_ntrees = kwargs.get('ntrees', None)
    if (param_ntrees is not None and used_trees != param_ntrees):
        raise Exception("used_trees should == param_ntree. used_trees: %s" % used_trees)
    if (used_trees + 1) != len(cms) or (used_trees + 1) != len(errs):
        raise Exception("len(cms): %s and len(errs): %s should be one more than N %s trees" % (len(cms), len(errs), used_trees))

    totalScores = 0
    totalRight = 0
    # individual scores can be all 0 if nothing for that output class
    # due to sampling
    classErrorPctList = []
    predictedClassDict = {} # may be missing some? so need a dict?
    for classIndex, s in enumerate(scoresList):
        classSum = sum(s)
        if classSum == 0:
            # why would the number of scores for a class be 0? does GBM CM have entries for non-existent classes
            # in a range??..in any case, tolerate. (it shows up in test.py on poker100)
            if not noPrint:
                print "class:", classIndex, "classSum", classSum, "<- why 0?"
        else:
            # H2O should really give me this since it's in the browser, but it doesn't
            classRightPct = ((s[classIndex] + 0.0) / classSum) * 100
            totalRight += s[classIndex]
            classErrorPct = round(100 - classRightPct, 2)
            classErrorPctList.append(classErrorPct)
            ### print "s:", s, "classIndex:", classIndex
            if not noPrint:
                print "class:", classIndex, "classSum", classSum, "classErrorPct:", "%4.2f" % classErrorPct

            # gather info for prediction summary
            for pIndex, p in enumerate(s):
                if pIndex not in predictedClassDict:
                    predictedClassDict[pIndex] = p
                else:
                    predictedClassDict[pIndex] += p

        totalScores += classSum

    #****************************
    if not noPrint:
        print "Predicted summary:"
        # FIX! Not sure why we weren't working with a list..hack with dict for now
        for predictedClass, p in predictedClassDict.items():
            print str(predictedClass) + ":", p

    # this should equal the num rows in the dataset if full scoring? (minus any NAs)
    print "totalScores:", totalScores
    print "totalRight:", totalRight
    if totalScores != 0:
        pctRight = 100.0 * totalRight / totalScores
    else:
        pctRight = 0.0
    pctWrong = 100 - pctRight
    print "pctRight:", "%5.2f" % pctRight
    print "pctWrong:", "%5.2f" % pctWrong

    #****************************
    # more testing for GBMView
    # it's legal to get 0's for oobe error
    # if sample_rate = 1
    sample_rate = kwargs.get('sample_rate', None)
    validation = kwargs.get('validation', None)
    if (sample_rate == 1 and not validation):
        pass
    elif (totalScores <= 0 or totalScores > 5e9):
        raise Exception("scores in GBMView seem wrong. scores: %s" % scoresList)

    varimp = gbm_model['varimp']
    treeStats = gbm_model['treeStats']
    if not treeStats:
        raise Exception("treeStats not right?: %s" % dump_json(treeStats))

    # print "json:", dump_json(gbmv)
    data_key = gbm_model['_dataKey']
    model_key = gbm_model['_key']
    classification_error = pctWrong

    if not noPrint:
        if 'minLeaves' not in treeStats or not treeStats['minLeaves']:
            raise Exception("treeStats seems to be missing minLeaves %s" % dump_json(treeStats))
        print """
 Leaves: {0} / {1} / {2}
  Depth: {3} / {4} / {5}
    Err: {6:0.2f} %
""".format(
            treeStats['minLeaves'],
            treeStats['meanLeaves'],
            treeStats['maxLeaves'],
            treeStats['minDepth'],
            treeStats['meanDepth'],
            treeStats['maxDepth'],
            classification_error,
        )

    ### modelInspect = node.inspect(model_key)
    dataInspect = h2o_cmd.runInspect(key=data_key)
    check_sandbox_for_errors()
    return (round(classification_error, 2), classErrorPctList, totalScores)
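# --- Illustrative sketch (not called by the harness) ---
# The confusion-matrix bookkeeping above, reduced to a standalone example:
# row index = actual class, column index = predicted class, diagonal = correct answers.
# The 3x3 matrix in the usage comment is made-up data, only to show the arithmetic.
def _sketch_cm_summary(cm):
    classErrorPctList = []
    totalScores = 0
    totalRight = 0
    for classIndex, s in enumerate(cm):
        classSum = sum(s)
        totalScores += classSum
        if classSum != 0:
            totalRight += s[classIndex]
            classErrorPctList.append(round(100 - 100.0 * s[classIndex] / classSum, 2))
    pctWrong = (100 - 100.0 * totalRight / totalScores) if totalScores else 0.0
    return pctWrong, classErrorPctList, totalScores

# example:
# cm = [[50, 2, 0], [3, 45, 1], [0, 4, 40]]
# print _sketch_cm_summary(cm)   # -> (~6.90, [3.85, 8.16, 9.09], 145)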
def pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=15, retryDelaySecs=5):
    busy = True
    trials = 0
    start = time.time()
    polls = 0
    statSum = {}
    # just init for worst case 64 nodes?
    lastUsedMemBytes = [1 for i in range(64)]
    while busy:
        polls += 1
        # get utilization and print it
        # any busy jobs
        a = h2o_nodes.nodes[0].jobs(timeoutSecs=60)
        busy = False
        for j in a['jobs']:
            if j['end_time'] == '' and not (j['cancelled'] or (j['result'].get('val', None) == 'CANCELLED')):
                busy = True
                verboseprint("Still busy")
                break

        cloudStatus = h2o_nodes.nodes[0].get_cloud(timeoutSecs=timeoutSecs)
        nodes = cloudStatus['nodes']
        for i, n in enumerate(nodes):
            # check for drop in tot_mem_bytes, and report as "probably post GC"
            totMemBytes = n['tot_mem_bytes']
            maxMemBytes = n['max_mem_bytes']
            freeMemBytes = n['free_mem_bytes']

            usedMemBytes = totMemBytes - freeMemBytes
            availMemBytes = maxMemBytes - usedMemBytes
            print 'Node %s:' % i, \
                'num_cpus:', n['num_cpus'], \
                'my_cpu_%:', n['my_cpu_%'], \
                'sys_cpu_%:', n['sys_cpu_%'], \
                'system_load:', n['system_load'], \
                'tot_mem_bytes: {:,}'.format(totMemBytes), \
                'max_mem_bytes: {:,}'.format(maxMemBytes), \
                'free_mem_bytes: {:,}'.format(freeMemBytes), \
                'usedMemBytes: {:,}'.format(usedMemBytes)

            decrease = round((0.0 + lastUsedMemBytes[i] - usedMemBytes) / lastUsedMemBytes[i], 3)
            if decrease > .05:
                print
                print "\nProbably GC at Node {:}: usedMemBytes decreased by {:f} pct.. {:,} {:,}".format(
                    i, 100 * decrease, lastUsedMemBytes[i], usedMemBytes)
                lastUsedMemBytes[i] = usedMemBytes
            # don't update lastUsedMemBytes if we're decreasing
            if usedMemBytes > lastUsedMemBytes[i]:
                lastUsedMemBytes[i] = usedMemBytes

            # sum all individual stats
            for stat in n:
                if stat in statSum:
                    try:
                        statSum[stat] += n[stat]
                    except TypeError:
                        # raise Exception("statSum[stat] should be a number: %s %s %s" % (statSum[stat], stat, n[stat]))
                        print "ERROR: statSum[stat] should be a number: %s %s %s" % (statSum[stat], stat, n[stat])
                        # do nothing
                else:
                    try:
                        statSum[stat] = n[stat] + 0.0
                    except TypeError:
                        pass # ignore non-numbers

        trials += 1
        if trials % 5 == 0:
            check_sandbox_for_errors()

        time.sleep(retryDelaySecs)
        if not h2o_args.no_timeout and ((time.time() - start) > timeoutSecs):
            raise Exception("Timeout while polling in pollStatsWhileBusy: %s seconds" % timeoutSecs)

    # now print the mean
    print "Did %s polls" % polls
    statMean = {}
    print "Values are summed across all nodes (cloud members), so divide by node count"
    for s in statSum:
        statMean[s] = round((statSum[s] + 0.0) / polls, 2)
        print "per poll mean", s + ':', statMean[s]

    return statMean
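# --- Illustrative sketch (not called by the harness) ---
# The "probably GC" heuristic above, in isolation: keep the last used-bytes sample
# per node and flag any poll where usage drops by more than 5%. The byte counts in
# the usage comment are invented, just to exercise the arithmetic.
def _sketch_gc_detect(samples, threshold=0.05):
    last = 1  # avoid divide-by-zero on the first sample, as the harness does
    events = []
    for poll, used in enumerate(samples):
        decrease = round((0.0 + last - used) / last, 3)
        if decrease > threshold:
            events.append((poll, decrease))
            last = used  # reset the baseline after a suspected GC
        elif used > last:
            last = used  # otherwise only track increases
    return events

# example:
# print _sketch_gc_detect([100, 900, 950, 400, 450])   # -> [(3, 0.579)]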
def simpleCheckGLM(self, model, parameters, labelList, labelListUsed, allowFailWarning=False, allowZeroCoeff=False,
    prettyPrint=False, noPrint=False, maxExpectedIterations=None, doNormalized=False, allowNaN=False):

    warnings = ''
    rank = model.rank
    binomial = model.binomial
    residual_deviance = model.residual_deviance
    threshold = model.threshold
    check_obj_has_good_numbers(threshold, 'threshold', allowNaN=allowNaN)

    auc = model.AUC
    # NaN if not logistic
    # check_obj_has_good_numbers(auc, 'model.AUC')

    best_lambda_idx = model.best_lambda_idx
    model_category = model.model_category
    name = model.name
    residual_degrees_of_freedom = model.residual_degrees_of_freedom

    # is this no longer used?
    coefficients_magnitude = model.coefficients_magnitude

    null_deviance = model.null_deviance
    check_obj_has_good_numbers(null_deviance, 'model.null_deviance', allowNaN=allowNaN)
    null_degrees_of_freedom = model.null_degrees_of_freedom
    check_obj_has_good_numbers(null_degrees_of_freedom, 'model.null_degrees_of_freedom', allowNaN=allowNaN)
    domains = model.domains

    # when is this okay to be NaN?
    AIC = model.AIC
    check_obj_has_good_numbers(AIC, 'model.AIC', allowNaN=allowNaN)
    names = model.names

    coeffs_names = model.coefficients_table.data[0]

    # these are returned as quoted strings. Turn them into numbers
    temp = model.coefficients_table.data[1]
    assert len(coeffs_names) == len(temp), "%s %s" % (len(coeffs_names), len(temp))

    # we need coefficients to be floats or empty
    check_obj_has_good_numbers(temp, 'model.coeffs', allowNaN=False)
    # print "temp", temp[0:10]
    # print "temp[5489:5500]", temp[5489:5500]

    # UPDATE: None (null json) is legal for coeffs
    coeffs = map(lambda x: float(x) if (x is not None and str(x) != "") else 0, temp)

    intercept = coeffs[-1]
    interceptName = coeffs_names[-1]
    assert interceptName == 'Intercept'
    assert len(coeffs) == len(coeffs_names), "%s %s" % (len(coeffs), len(coeffs_names))

    # FIX! if a coeff is zeroed/ignored, it doesn't show up?
    # get rid of intercept in glm response
    # assert (len(coeffs)-1) == len(labelListUsed), \
    #     "%s %s %s %s" % (len(coeffs), len(labelListUsed), coeffs, labelListUsed)

    # labelList still has the response column?
    # ignored columns aren't in model.names, but output response is.
    # labelListUsed has the response col removed so add 1
    # Hmm..dropped coefficients again? can't do this check?
    # assert len(model.names) == len(labelListUsed), \
    #     "%s %s %s %s" % (len(model.names), len(labelListUsed), model.names, labelList)

    # this is no longer true!
    # assert model.threshold!=0

    print "len(coeffs)", len(coeffs)
    print "coeffs:", coeffs

    # last one is intercept
    if interceptName != "Intercept" or abs(intercept) < 1e-26:
        raise Exception("'Intercept' should be last in coeffs_names %s %s" % (interceptName, intercept))

    y = parameters['response_column']

    cString = "\n"
    for i, c in enumerate(coeffs_names):
        cString += "%s: %.5e " % (coeffs_names[i], coeffs[i])

    print cString
    print "\nH2O intercept:\t\t%.5e" % intercept
    print "\nTotal # of coeffs:", len(coeffs_names)

    # intercept is buried in there too
    absIntercept = abs(float(intercept))
    self.assertGreater(absIntercept, 1e-26, (
        "abs. value of GLM coeffs['Intercept'] is " + str(absIntercept) + ", not >= 1e-26 for Intercept" + "\n" +
        "parameters:" + dump_json(parameters)))

    if (not allowZeroCoeff) and (len(coeffs) > 1):
        s = 0.0
        for c in coeffs:
            s += abs(float(c))

        self.assertGreater(s, 1e-26, (
            "sum of abs. value of GLM coeffs/intercept is " + str(s) + ", not >= 1e-26\n" +
            "parameters:" + dump_json(parameters)))

    # shouldn't have any errors
    check_sandbox_for_errors()

    return (warnings, coeffs, intercept)
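# --- Illustrative sketch (not called by the harness) ---
# The coefficient coercion and "not all zero" check above, standalone. H2O returns
# coefficients as quoted strings (None/"" allowed), so they are coerced to floats
# before summing absolute values. The input list in the usage comment is made up
# for illustration.
def _sketch_coeff_sanity(raw_coeffs, allowZeroCoeff=False):
    coeffs = map(lambda x: float(x) if (x is not None and str(x) != "") else 0, raw_coeffs)
    if (not allowZeroCoeff) and (len(coeffs) > 1):
        s = sum(abs(float(c)) for c in coeffs)
        if s <= 1e-26:
            raise Exception("sum of abs. coefficients is %s, looks like an all-zero model" % s)
    return coeffs

# example:
# print _sketch_coeff_sanity(["0.03", None, "-1.2", "", "-6.7"])
# -> [0.03, 0, -1.2, 0, -6.7]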
def simpleCheckRFView(node=None, rfv=None, checkScoringOnly=False, noPrint=False, **kwargs):
    if not node:
        node = h2o_nodes.nodes[0]

    if 'warnings' in rfv:
        warnings = rfv['warnings']
        # catch the 'Failed to converge' for now
        for w in warnings:
            if not noPrint:
                print "\nwarning:", w
            if ('Failed' in w) or ('failed' in w):
                raise Exception(w)

    #****************************
    # if we are checking after confusion_matrix for predict, the json schema is different
    if 'cm' in rfv:
        cm = rfv['cm'] # only one
    else:
        if 'drf_model' in rfv:
            rf_model = rfv['drf_model']
        elif 'speedrf_model' in rfv:
            rf_model = rfv['speedrf_model']
        elif 'rf_model' in rfv:
            rf_model = rfv['rf_model']
        else:
            raise Exception("no rf_model in rfv? %s" % dump_json(rfv))

        cms = rf_model['cms']
        print "number of cms:", len(cms)
        print "FIX! need to add reporting of h2o's _perr per class error"
        # FIX! what if regression. is rf only classification?
        print "cms[-1]['_arr']:", cms[-1]['_arr']
        print "cms[-1]['_predErr']:", cms[-1]['_predErr']
        print "cms[-1]['_classErr']:", cms[-1]['_classErr']

        ## print "cms[-1]:", dump_json(cms[-1])
        ## for i,c in enumerate(cms):
        ##     print "cm %s: %s" % (i, c['_arr'])

        cm = cms[-1]['_arr'] # take the last one

    scoresList = cm

    if not checkScoringOnly:
        used_trees = rf_model['N']
        errs = rf_model['errs']
        print "errs[0]:", errs[0]
        print "errs[-1]:", errs[-1]
        print "errs:", errs

        # if we got the ntree for comparison. Not always there in kwargs though!
        param_ntrees = kwargs.get('ntrees', None)
        if (param_ntrees is not None and used_trees != param_ntrees):
            raise Exception("used_trees should == param_ntree. used_trees: %s" % used_trees)
        if (used_trees + 1) != len(cms) or (used_trees + 1) != len(errs):
            raise Exception("len(cms): %s and len(errs): %s should be one more than N %s trees" % (len(cms), len(errs), used_trees))

    #****************************
    totalScores = 0
    totalRight = 0
    # individual scores can be all 0 if nothing for that output class
    # due to sampling
    classErrorPctList = []
    predictedClassDict = {} # may be missing some? so need a dict?
    for classIndex, s in enumerate(scoresList):
        classSum = sum(s)
        if classSum == 0:
            # why would the number of scores for a class be 0? does RF CM have entries for non-existent classes
            # in a range??..in any case, tolerate. (it shows up in test.py on poker100)
            if not noPrint:
                print "class:", classIndex, "classSum", classSum, "<- why 0?"
        else:
            # H2O should really give me this since it's in the browser, but it doesn't
            classRightPct = ((s[classIndex] + 0.0) / classSum) * 100
            totalRight += s[classIndex]
            classErrorPct = round(100 - classRightPct, 2)
            classErrorPctList.append(classErrorPct)
            ### print "s:", s, "classIndex:", classIndex
            if not noPrint:
                print "class:", classIndex, "classSum", classSum, "classErrorPct:", "%4.2f" % classErrorPct

            # gather info for prediction summary
            for pIndex, p in enumerate(s):
                if pIndex not in predictedClassDict:
                    predictedClassDict[pIndex] = p
                else:
                    predictedClassDict[pIndex] += p

        totalScores += classSum

    #****************************
    if not noPrint:
        print "Predicted summary:"
        # FIX! Not sure why we weren't working with a list..hack with dict for now
        for predictedClass, p in predictedClassDict.items():
            print str(predictedClass) + ":", p

    # this should equal the num rows in the dataset if full scoring? (minus any NAs)
    print "totalScores:", totalScores
    print "totalRight:", totalRight
    if totalScores != 0:
        pctRight = 100.0 * totalRight / totalScores
    else:
        pctRight = 0.0
    pctWrong = 100 - pctRight
    print "pctRight:", "%5.2f" % pctRight
    print "pctWrong:", "%5.2f" % pctWrong

    if checkScoringOnly:
        check_sandbox_for_errors()
        classification_error = pctWrong
        return (round(classification_error, 2), classErrorPctList, totalScores)

    # it's legal to get 0's for oobe error
    # if sample_rate = 1
    sample_rate = kwargs.get('sample_rate', None)
    validation = kwargs.get('validation', None)
    print "sample_rate:", sample_rate, "validation:", validation
    if (sample_rate == 1 and not validation):
        pass
    elif (totalScores <= 0 or totalScores > 5e9):
        raise Exception("scores in RFView seem wrong. scores: %s" % scoresList)

    varimp = rf_model['varimp']
    if 'importance' in kwargs and kwargs['importance']:
        max_var = varimp['max_var']
        variables = varimp['variables']
        varimpSD = varimp['varimpSD']
        varimp2 = varimp['varimp']

        # what is max_var? it's 100 while the length of the others is 54 for covtype
        if not max_var:
            raise Exception("varimp.max_var is None? %s" % max_var)
        # if not variables:
        #     raise Exception("varimp.variables is None? %s" % variables)
        if not varimpSD:
            raise Exception("varimp.varimpSD is None? %s" % varimpSD)
        if not varimp2:
            raise Exception("varimp.varimp is None? %s" % varimp2)

        # check that they all have the same length and that the importance is not all zero
        # if len(varimpSD)!=max_var or len(varimp2)!=max_var or len(variables)!=max_var:
        #     raise Exception("varimp lists seem to be wrong length: %s %s %s %s" % \
        #         (max_var, len(varimpSD), len(varimp2), len(variables)))

        # not checking maxvar or variables. Don't know what they should be
        if len(varimpSD) != len(varimp2):
            raise Exception("varimp lists seem to be wrong length: %s %s" % \
                (len(varimpSD), len(varimp2)))

        h2o_util.assertApproxEqual(sum(varimp2), 0.0, tol=1e-5,
            msg="Shouldn't have all 0's in varimp %s" % varimp2)

    treeStats = rf_model['treeStats']
    if not treeStats:
        raise Exception("treeStats not right?: %s" % dump_json(treeStats))

    # print "json:", dump_json(rfv)
    data_key = rf_model['_dataKey']
    model_key = rf_model['_key']
    classification_error = pctWrong

    if not noPrint:
        if 'minLeaves' not in treeStats or not treeStats['minLeaves']:
            raise Exception("treeStats seems to be missing minLeaves %s" % dump_json(treeStats))
        print """
 Leaves: {0} / {1} / {2}
  Depth: {3} / {4} / {5}
    Err: {6:0.2f} %
""".format(
            treeStats['minLeaves'],
            treeStats['meanLeaves'],
            treeStats['maxLeaves'],
            treeStats['minDepth'],
            treeStats['meanDepth'],
            treeStats['maxDepth'],
            classification_error,
        )

    ### modelInspect = node.inspect(model_key)
    dataInspect = h2o_cmd.runInspect(key=data_key)
    check_sandbox_for_errors()
    return (round(classification_error, 2), classErrorPctList, totalScores)
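# --- Illustrative sketch (not called by the harness) ---
# A standalone restatement of the intent of the variable-importance checks above:
# the per-variable importances and their standard deviations should be parallel
# lists of the same length, and the importances should not all be zero. This
# expresses the intent directly rather than going through h2o_util; values in the
# usage comment are invented.
def _sketch_check_varimp(varimp2, varimpSD):
    if len(varimpSD) != len(varimp2):
        raise Exception("varimp lists seem to be wrong length: %s %s" % (len(varimpSD), len(varimp2)))
    if all(abs(v) < 1e-5 for v in varimp2):
        raise Exception("Shouldn't have all 0's in varimp %s" % varimp2)
    return True

# example:
# print _sketch_check_varimp([0.4, 0.1, 0.0, 0.5], [0.02, 0.01, 0.0, 0.03])   # -> True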