def rapids(self, timeoutSecs=120, ignoreH2oError=False, **kwargs):
    '''
    Execute a Rapids expression on the h2o cluster (POST to Rapids.json).

    Accepts 'ast' or 'funs' keyword params; both are assumed to be strings
    (not lists) for now. Raises Exception if the response contains a
    non-empty 'exception', unless ignoreH2oError is True.
    '''
    # FIX! assume both of these are strings for now, not lists
    if 'ast' in kwargs and kwargs['ast'] is not None:
        assert isinstance(kwargs['ast'], basestring), "only string assumed? %s" % kwargs['ast']
    if 'funs' in kwargs and kwargs['funs'] is not None:
        assert isinstance(kwargs['funs'], basestring), "only string assumed? %s" % kwargs['funs']

    # currently runExec only does one or the other
    params_dict = {
        'ast': None,
        'funs': None,
    }
    check_params_update_kwargs(params_dict, kwargs, 'rapids', True)

    # There used to be an "if 1==1:" experiment toggling between POST with
    # postData and GET with params; only the POST branch was ever live, so
    # the dead GET branch was removed.
    result = self.do_json_request('Rapids.json', cmd='post', timeout=timeoutSecs, postData=params_dict)

    verboseprint("rapids result:", dump_json(result))

    # FIX! maybe add something for ignoring conditionally?
    if 'exception' in result and result['exception'] and not ignoreH2oError:
        exception = result['exception']
        raise Exception('rapids with kwargs:\n%s\ngot exception:\n"%s"\n' % (dump_json(kwargs), exception))

    h2o_sandbox.check_sandbox_for_errors()
    return result
def model_metrics(self, timeoutSecs=60, **kwargs):
    '''
    Fetch the full ModelMetrics list from the h2o cluster.
    '''
    response = self.do_json_request('/3/ModelMetrics.json', cmd='get', timeout=timeoutSecs)
    h2o_sandbox.check_sandbox_for_errors()
    return response
def model_builders(self, algo=None, timeoutSecs=10, **kwargs):
    '''
    Return one model builder (when algo is given) or every model builder
    known to the h2o cluster.

    The top-level "model_builders" dict in the result maps algorithm names
    to parameter lists; each parameter carries the metadata a client needs
    to present a model-building interface to the user.
    '''
    params_dict = {}
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'model_builders', False)
    endpoint = '3/ModelBuilders.json' + ("/" + algo if algo else "")
    result = self.do_json_request(endpoint, timeout=timeoutSecs, params=params_dict)
    h2o_sandbox.check_sandbox_for_errors()
    return result
def split_frame(self, timeoutSecs=120, noPoll=False, **kwargs): params_dict = { 'dataset': None, 'ratios': None, 'destKeys': None, # ['bigger', 'smaller'] } check_params_update_kwargs(params_dict, kwargs, 'split_frame', print_params=True) firstResult = self.do_json_request('3/SplitFrame.json', cmd='post', timeout=timeoutSecs, params=params_dict) print "firstResult:", dump_json(firstResult) # FIX! what is ['dest']['name'] ..It's not there at the beginning? job_key = firstResult['key']['name'] if noPoll: h2o_sandbox.check_sandbox_for_errors() return firstResult # is it polllable while it's in the CREATED state? msec looks wrong. start_time is 0 time.sleep(2) result = self.poll_job(job_key) verboseprint("split_frame result:", dump_json(result)) return result
def rapids_iseval(self, timeoutSecs=120, ignoreH2oError=False, **kwargs):
    '''
    Ask h2o whether an ast key is an eval result (GET /3/Rapids.json/isEval).
    Raises Exception on an 'exception' in the response unless ignoreH2oError.
    '''
    # FIX! assume this is a string for now, not a list
    astKey = kwargs.get('ast_key')
    if astKey is not None:
        assert isinstance(astKey, basestring), "only string assumed? %s" % astKey

    # currently runExec only does one or the other
    params_dict = {'ast_key': None}
    check_params_update_kwargs(params_dict, kwargs, 'rapids_iseval', True)

    # doesn't like 'put' here? doesn't like empty key
    result = self.do_json_request('3/Rapids.json/isEval', cmd='get', timeout=timeoutSecs, params=params_dict)
    verboseprint("rapids_iseval result:", dump_json(result))

    # FIX! maybe add something for ignoring conditionally?
    if 'exception' in result and result['exception'] and not ignoreH2oError:
        raise Exception('rapids with kwargs:\n%s\ngot exception:\n"%s"\n' % (dump_json(kwargs), result['exception']))

    h2o_sandbox.check_sandbox_for_errors()
    return result
def model_metrics(self, timeoutSecs=60, **kwargs):
    '''
    GET the list of all ModelMetrics from the h2o cluster.
    '''
    mm_list = self.do_json_request("/3/ModelMetrics.json", cmd="get", timeout=timeoutSecs)
    h2o_sandbox.check_sandbox_for_errors()
    return mm_list
def compute_model_metrics(self, model, frame, timeoutSecs=60, **kwargs): """ Score a model on the h2o cluster on the given Frame and return only the model metrics. """ assert model is not None, '"model" parameter is null' assert frame is not None, '"frame" parameter is null' models = self.models(key=model, timeoutSecs=timeoutSecs) assert models is not None, "/Models REST call failed" assert ( models["models"][0]["model_id"]["name"] == model ), "/Models/{0} returned Model {1} rather than Model {2}".format(model, models["models"][0]["key"]["name"], model) # TODO: test this assert, I don't think this is working. . . frames = self.frames(key=frame) assert frames is not None, "/Frames/{0} REST call failed".format(frame) print "frames:", dump_json(frames) # is the name not there? # assert frames['frames'][0]['model_id']['name'] == frame, "/Frames/{0} returned Frame {1} rather than Frame {2}".format(frame, models['models'][0]['key']['name'], frame) result = self.do_json_request( "/3/ModelMetrics.json/models/" + model + "/frames/" + frame, cmd="post", timeout=timeoutSecs ) mm = result["model_metrics"][0] verboseprint("model metrics: " + repr(mm)) h2o_sandbox.check_sandbox_for_errors() return mm
def compute_model_metrics(self, model, frame, timeoutSecs=60, **kwargs): ''' Score a model on the h2o cluster on the given Frame and return only the model metrics. ''' assert model is not None, '"model" parameter is null' assert frame is not None, '"frame" parameter is null' models = self.models(key=model, timeoutSecs=timeoutSecs) assert models is not None, "/Models REST call failed" assert models['models'][0]['model_id'][ 'name'] == model, "/Models/{0} returned Model {1} rather than Model {2}".format( model, models['models'][0]['key']['name'], model) # TODO: test this assert, I don't think this is working. . . frames = self.frames(key=frame) assert frames is not None, "/Frames/{0} REST call failed".format(frame) print "frames:", dump_json(frames) # is the name not there? # assert frames['frames'][0]['model_id']['name'] == frame, "/Frames/{0} returned Frame {1} rather than Frame {2}".format(frame, models['models'][0]['key']['name'], frame) result = self.do_json_request('/3/ModelMetrics.json/models/' + model + '/frames/' + frame, cmd='post', timeout=timeoutSecs) mm = result['model_metrics'][0] verboseprint("model metrics: " + repr(mm)) h2o_sandbox.check_sandbox_for_errors() return mm
def models(self, key=None, timeoutSecs=10, **kwargs):
    '''
    Return every model in the h2o cluster, or one model by key.

    Models come back unordered in a top-level "models" list.
    TODO: When find_compatible_frames is implemented, the top-level dict
    will also contain a "frames" list.
    '''
    params_dict = {'find_compatible_frames': False}
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'models', True)

    endpoint = '3/Models.json/' + key if key else '3/Models.json'
    result = self.do_json_request(endpoint, timeout=timeoutSecs, params=params_dict)

    verboseprint("models result:", dump_json(result))
    h2o_sandbox.check_sandbox_for_errors()
    return result
def import_files(self, path, timeoutSecs=180):
    '''
    Import a file or files into h2o. 'path' may be a directory or a single
    file, e.g.:
    192.168.0.37:54323/ImportFiles.html?file=%2Fhome%2F0xdiag%2Fdatasets
    '''
    imported = self.do_json_request("3/ImportFiles.json", timeout=timeoutSecs, params={"path": path})
    verboseprint("\nimport_files result:", dump_json(imported))
    h2o_sandbox.check_sandbox_for_errors()
    return imported
def import_files(self, path, timeoutSecs=180):
    '''
    Import a file or directory of files into h2o via 3/ImportFiles.json.
    Example UI equivalent:
    192.168.0.37:54323/ImportFiles.html?file=%2Fhome%2F0xdiag%2Fdatasets
    '''
    result = self.do_json_request('3/ImportFiles.json', timeout=timeoutSecs, params={"path": path})
    verboseprint("\nimport_files result:", dump_json(result))
    h2o_sandbox.check_sandbox_for_errors()
    return result
def summary(self, key, column="C1", timeoutSecs=10, **kwargs):
    '''
    Return the summary of one column of one Frame in the h2o cluster.
    '''
    # 'offset' and 'len' params are currently disabled here
    params_dict = {}
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'summary', True)
    summary_result = self.do_json_request(
        '3/Frames.json/%s/columns/%s/summary' % (key, column),
        timeout=timeoutSecs, params=params_dict)
    h2o_sandbox.check_sandbox_for_errors()
    return summary_result
def summary(self, key, column="C1", timeoutSecs=10, **kwargs):
    '''
    Fetch the summary for a single column of a single Frame.
    Defaults to the first 100 values starting at offset 0.
    '''
    params_dict = {'offset': 0, 'len': 100}
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'summary', True)
    endpoint = '3/Frames.json/%s/columns/%s/summary' % (key, column)
    result = self.do_json_request(endpoint, timeout=timeoutSecs, params=params_dict)
    h2o_sandbox.check_sandbox_for_errors()
    return result
def quantiles(self, timeoutSecs=300, print_params=True, **kwargs):
    '''
    Request quantiles from the h2o cluster (GET 3/Quantiles.json).
    '''
    legal = {
        'destination_key': None,
        'training_frame': None,
        'validation_frame': None,
        'ignored_columns': None,
        'score_each_iteration': None,
        'probs': None,
    }
    check_params_update_kwargs(legal, kwargs, 'quantiles', print_params)
    q = self.do_json_request('3/Quantiles.json', timeout=timeoutSecs, params=legal)
    verboseprint("\nquantiles result:", dump_json(q))
    h2o_sandbox.check_sandbox_for_errors()
    return q
def quantiles(self, timeoutSecs=300, print_params=True, **kwargs):
    '''
    Request quantiles for one column of a source key (GET Quantiles.json).
    '''
    legal = {
        'source_key': None,
        'column': None,
        'quantile': None,
        'max_qbins': None,
        'interpolation_type': None,
        'multiple_pass': None,
    }
    check_params_update_kwargs(legal, kwargs, 'quantiles', print_params)
    q = self.do_json_request('Quantiles.json', timeout=timeoutSecs, params=legal)
    verboseprint("\nquantiles result:", dump_json(q))
    h2o_sandbox.check_sandbox_for_errors()
    return q
def quantiles(self, timeoutSecs=300, print_params=True, **kwargs):
    '''
    Request quantiles from the h2o cluster (GET Quantiles.json, no version prefix).
    '''
    legal = {
        'destination_key': None,
        'training_frame': None,
        'validation_frame': None,
        'ignored_columns': None,
        'score_each_iteration': None,
        'probs': None,
    }
    check_params_update_kwargs(legal, kwargs, 'quantiles', print_params)
    response = self.do_json_request('Quantiles.json', timeout=timeoutSecs, params=legal)
    verboseprint("\nquantiles result:", dump_json(response))
    h2o_sandbox.check_sandbox_for_errors()
    return response
def frame_split(self, timeoutSecs=120, noPoll=False, **kwargs):
    '''
    Split a training frame by ratios (SplitFrame.json) and poll the
    returned job to completion unless noPoll is True.
    '''
    legal = {
        'training_frame': None,
        'ratios': None,
    }
    check_params_update_kwargs(legal, kwargs, 'frame_split', print_params=True)

    firstResult = self.do_json_request('SplitFrame.json', timeout=timeoutSecs, params=legal)
    job_key = firstResult['job']['key']['name']

    if noPoll:
        h2o_sandbox.check_sandbox_for_errors()
        return firstResult

    result = self.poll_job(job_key)
    verboseprint("frame_split result:", dump_json(result))
    return result
def interaction(self, timeoutSecs=120, noPoll=False, **kwargs):
    '''
    POST to 3/Interaction.json and poll the resulting job to completion
    unless noPoll is True.
    '''
    # FIX! have to add legal params
    legal = {}
    check_params_update_kwargs(legal, kwargs, 'interaction', print_params=True)

    firstResult = self.do_json_request('3/Interaction.json', cmd='post', timeout=timeoutSecs, params=legal)
    job_key = firstResult['dest']['name']

    if noPoll:
        h2o_sandbox.check_sandbox_for_errors()
        return firstResult

    result = self.poll_job(job_key)
    verboseprint("interaction result:", dump_json(result))
    return result
def create_frame(self, timeoutSecs=120, noPoll=False, **kwargs):
    '''
    POST to 3/CreateFrame.json to synthesize a frame and poll the job to
    completion unless noPoll is True.
    '''
    # FIX! have to add legal params
    legal = {}
    check_params_update_kwargs(legal, kwargs, 'create_frame', print_params=True)

    firstResult = self.do_json_request('3/CreateFrame.json', cmd='post', timeout=timeoutSecs, params=legal)
    job_key = firstResult['dest']['name']

    if noPoll:
        h2o_sandbox.check_sandbox_for_errors()
        return firstResult

    result = self.poll_job(job_key)
    verboseprint("create_frame result:", dump_json(result))
    return result
def poll_job2(self, firstResult, algo=None, timeoutSecs=60, noPoll=False, **kwargs): if noPoll: result = firstResult elif ('validation_error_count' in firstResult) and (firstResult['validation_error_count'] > 0): h2p.yellow_print("parameter error in %s" % algo) result = firstResult else: job_result = result1['jobs'][0] job_key = job_result['key']['name'] verboseprint("%s job_key: %s" % (algo, job_key)) job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs) verboseprint(job_result) elapsed = time.time() - start print algo, " end on ", training_frame, 'took', time.time( ) - start, 'seconds' print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100) if job_result: jobs = job_result['jobs'][0] description = jobs['description'] dest = jobs['dest'] msec = jobs['msec'] status = jobs['status'] progress = jobs['progress'] if status == 'FAILED': print dump_json(job_result) raise Exception("Taking exception on %s job status: %s %s %s %s" % \ (algo, status, progress, msec, description)) result = job_result else: raise Exception( "build_model didn't get a job_result when it expected one") verboseprint("result:", result) h2o_sandbox.check_sandbox_for_errors() return result
def predict(self, model, frame, timeoutSecs=60, **kwargs):
    '''
    Run predictions for a model against a frame on the h2o cluster
    (POST /3/Predictions.json/models/<model>/frames/<frame>).

    Fixes: the model-name assert compared the whole 'key' dict to the
    model-name string (always False); it now compares 'key'['name'],
    matching the sibling predict() implementation.
    '''
    assert model is not None, '"model" parameter is null'
    assert frame is not None, '"frame" parameter is null'

    models = self.models(key=model, timeoutSecs=timeoutSecs)
    assert models is not None, "/Models REST call failed"
    # was: models['models'][0]['key'] == model  (dict vs. string — never equal)
    assert models['models'][0]['key']['name'] == model, "/Models/{0} returned Model {1} rather than Model {2}".format(model, models['models'][0]['key']['name'], model)

    # TODO: test this assert, I don't think this is working. . .
    frames = self.frames(key=frame)
    assert frames is not None, "/Frames/{0} REST call failed".format(frame)
    assert frames['frames'][0]['key']['name'] == frame, "/Frames/{0} returned Frame {1} rather than Frame {2}".format(frame, frames['frames'][0]['key']['name'], frame)

    result = self.do_json_request('/3/Predictions.json/models/' + model + '/frames/' + frame, cmd='post', timeout=timeoutSecs)
    h2o_sandbox.check_sandbox_for_errors()
    return result
def predict(self, model, frame, timeoutSecs=60, **kwargs):
    '''
    POST a prediction request for the given model against the given frame
    and return the raw JSON response.
    '''
    assert model is not None, '"model" parameter is null'
    assert frame is not None, '"frame" parameter is null'

    models = self.models(key=model, timeoutSecs=timeoutSecs)
    assert models is not None, "/Models REST call failed"
    foundModel = models['models'][0]['key']['name']
    assert foundModel == model, "/Models/{0} returned Model {1} rather than Model {2}".format(model, models['models'][0]['key']['name'], model)

    # TODO: test this assert, I don't think this is working. . .
    frames = self.frames(key=frame)
    assert frames is not None, "/Frames/{0} REST call failed".format(frame)
    foundFrame = frames['frames'][0]['key']['name']
    assert foundFrame == frame, "/Frames/{0} returned Frame {1} rather than Frame {2}".format(frame, frames['frames'][0]['key']['name'], frame)

    endpoint = '/3/Predictions.json/models/' + model + '/frames/' + frame
    result = self.do_json_request(endpoint, cmd='post', timeout=timeoutSecs)
    h2o_sandbox.check_sandbox_for_errors()
    return result
def model_builders(self, algo=None, timeoutSecs=10, **kwargs):
    '''
    Return one model builder (when algo is given) or all model builders
    known to the h2o cluster, via the v2 endpoint.

    The result's top-level "model_builders" dict maps algorithm names to
    parameter lists carrying the metadata needed to build a model UI.
    '''
    params_dict = {}
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'model_builders', False)
    endpoint = '2/ModelBuilders.json' + ("/" + algo if algo else "")
    result = self.do_json_request(endpoint, timeout=timeoutSecs, params=params_dict)
    h2o_sandbox.check_sandbox_for_errors()
    return result
def models(self, key=None, timeoutSecs=10, **kwargs):
    '''
    Return all models in the cluster, or one model by key.

    The result carries an unordered top-level "models" list.
    TODO: when find_compatible_frames lands, a "frames" list appears too.
    '''
    params_dict = {'find_compatible_frames': False}
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'models', True)

    endpoint = '3/Models.json/' + key if key else '3/Models.json'
    result = self.do_json_request(endpoint, timeout=timeoutSecs, params=params_dict)

    verboseprint("models result:", dump_json(result))
    h2o_sandbox.check_sandbox_for_errors()
    return result
def poll_job(self, job_key, timeoutSecs=10, retryDelaySecs=0.5, key=None, **kwargs): ''' Poll a single job from the /Jobs endpoint until it is "status": "DONE" or "CANCELLED" or "FAILED" or we time out. ''' params_dict = {} # merge kwargs into params_dict h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'poll_job', False) start_time = time.time() pollCount = 0 while True: result = self.do_json_request('2/Jobs.json/' + job_key, timeout=timeoutSecs, params=params_dict) # print 'Job: ', dump_json(result) if key: frames_result = self.frames(key=key) print 'frames_result for key:', key, dump_json(result) jobs = result['jobs'][0] description = jobs['description'] dest = jobs['dest'] dest_name = dest['name'] msec = jobs['msec'] status = jobs['status'] progress = jobs['progress'] print description, \ "dest_name:", dest_name, \ "\tprogress:", "%-10s" % progress, \ "\tstatus:", "%-12s" % status, \ "\tmsec:", msec if status == 'DONE' or status == 'CANCELLED' or status == 'FAILED': h2o_sandbox.check_sandbox_for_errors() return result # FIX! what are the other legal polling statuses that we should check for? if not h2o_args.no_timeout and (time.time() - start_time > timeoutSecs): h2o_sandbox.check_sandbox_for_errors() emsg = "Job:", job_key, "timed out in:", timeoutSecs raise Exception(emsg) print emsg return None # check every other poll, for now if (pollCount % 2) == 0: h2o_sandbox.check_sandbox_for_errors() time.sleep(retryDelaySecs) pollCount += 1
def poll_job2(self, firstResult, algo=None, timeoutSecs=60, noPoll=False, **kwargs): if noPoll: result = firstResult elif 'validation_error_count' in firstResult: h2p.yellow_print("parameter error in %s" % algo) result = firstResult else: job_result = result1['jobs'][0] job_key = job_result['key']['name'] verboseprint("%s job_key: %s" % (algo, job_key)) job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs) verboseprint(job_result) elapsed = time.time() - start print algo, " end on ", training_frame, 'took', time.time() - start, 'seconds' print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) if job_result: jobs = job_result['jobs'][0] description = jobs['description'] dest = jobs['dest'] msec = jobs['msec'] status = jobs['status'] progress = jobs['progress'] if status=='FAILED': print dump_json(job_result) raise Exception("Taking exception on %s job status: %s %s %s %s" % \ (algo, status, progress, msec, description)) result = job_result else: raise Exception("build_model didn't get a job_result when it expected one") verboseprint("result:", result) h2o_sandbox.check_sandbox_for_errors() return result
def csv_download(self, key, csvPathname, timeoutSecs=60, **kwargs): params = { 'key': key } paramsStr = '?' + '&'.join(['%s=%s' % (k, v) for (k, v) in params.items()]) url = self.url('DownloadDataset.json') log('Start ' + url + paramsStr, comment=csvPathname) # do it (absorb in 1024 byte chunks) r = requests.get(url, params=params, timeout=timeoutSecs) print "csv_download r.headers:", r.headers if r.status_code == 200: f = open(csvPathname, 'wb') for chunk in r.iter_content(1024): f.write(chunk) else: raise Exception("unexpected status for DownloadDataset: %s" % r.status_code) print csvPathname, "size:", h2o_util.file_size_formatted(csvPathname) h2o_sandbox.check_sandbox_for_errors() # FIX! we're skipping all the checks in do_json_request. And no json return? return
def poll_job(self, job_key, timeoutSecs=10, retryDelaySecs=0.5, key=None, **kwargs): ''' Poll a single job from the /Jobs endpoint until it is "status": "DONE" or "CANCELLED" or "FAILED" or we time out. ''' params_dict = {} # merge kwargs into params_dict h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'poll_job', False) start_time = time.time() pollCount = 0 while True: result = self.do_json_request('3/Jobs.json/' + job_key, timeout=timeoutSecs, params=params_dict) # print 'Job: ', dump_json(result) if key: frames_result = self.frames(key=key) print 'frames_result for key:', key, dump_json(result) jobs = result['jobs'][0] description = jobs['description'] dest = jobs['dest'] dest_name = dest['name'] msec = jobs['msec'] status = jobs['status'] progress = jobs['progress'] print description, \ "dest_name:", dest_name, \ "\tprogress:", "%-10s" % progress, \ "\tstatus:", "%-12s" % status, \ "\tmsec:", msec if status=='DONE' or status=='CANCELLED' or status=='FAILED': h2o_sandbox.check_sandbox_for_errors() return result # what about 'CREATED' # FIX! what are the other legal polling statuses that we should check for? if not h2o_args.no_timeout and (time.time() - start_time > timeoutSecs): h2o_sandbox.check_sandbox_for_errors() emsg = "Job:", job_key, "timed out in:", timeoutSecs # for debug a = h2o.nodes[0].get_cloud() print "cloud.json:", dump_json(a) raise Exception(emsg) print emsg return None # check every other poll, for now if (pollCount % 2) == 0: h2o_sandbox.check_sandbox_for_errors() time.sleep(retryDelaySecs) pollCount += 1
def check_sandbox_for_errors(cloudShutdownIsError=False, sandboxIgnoreErrors=False, python_test_name=''):
    '''
    Scan the sandbox logs for errors, reporting only the first find so
    tearDown and tearDownClass don't both report the same error.
    '''
    if nodes and nodes[0].sandbox_error_report():  # gets current state
        return

    # A cloud can be built to ignore sandbox findings that would normally fatal the test.
    # Kludge: a test may set this directly rather than through the build_cloud parameter.
    # sandbox_ignore_errors is needed for the test teardown_cloud..the state disappears!
    ignore = sandboxIgnoreErrors or (nodes and nodes[0].sandbox_ignore_errors)
    found = h2o_sandbox.check_sandbox_for_errors(
        LOG_DIR=LOG_DIR,
        sandboxIgnoreErrors=ignore,
        cloudShutdownIsError=cloudShutdownIsError,
        python_test_name=python_test_name)

    if found and nodes:
        nodes[0].sandbox_error_report(True)  # sets
def check_sandbox_for_errors(cloudShutdownIsError=False, sandboxIgnoreErrors=False, python_test_name=''):
    '''
    Scan the sandbox logs for errors, using a module-level flag so only the
    first of tearDown / tearDownClass reports a given error.
    '''
    global sandbox_error_was_reported
    if sandbox_error_was_reported:  # gets current state
        return

    # A cloud can be built to ignore sandbox findings that would normally fatal the test.
    # Kludge: a test may set this directly rather than through the build_cloud parameter.
    # sandbox_ignore_errors is needed for the test teardown_cloud..the state disappears!
    ignore = sandboxIgnoreErrors or (h2o_nodes.nodes and h2o_nodes.nodes[0].sandbox_ignore_errors)
    found = h2o_sandbox.check_sandbox_for_errors(
        LOG_DIR=LOG_DIR,
        sandboxIgnoreErrors=ignore,
        cloudShutdownIsError=cloudShutdownIsError,
        python_test_name=python_test_name)

    if found:
        sandbox_error_was_reported = True
def build_model(self, algo, training_frame, parameters, destination_key=None,
        timeoutSecs=60, asynchronous=False, **kwargs):
    '''
    Build a model on the h2o cluster using the given algorithm, training
    Frame and model parameters (v2 ModelBuilders endpoint).

    Validates that the algo is a known model builder and that the training
    frame exists, POSTs the build, then (unless asynchronous) polls the
    job to completion. Raises Exception on a FAILED job or a missing job
    result. NOTE: 'parameters' is mutated in place (training_frame and
    optional destination_key are written into it).
    '''
    assert algo is not None, '"algo" parameter is null'
    assert training_frame is not None, '"training_frame" parameter is null'
    assert parameters is not None, '"parameters" parameter is null'

    # why always check that the algo is in here?
    model_builders = self.model_builders(timeoutSecs=timeoutSecs)
    assert model_builders is not None, "/ModelBuilders REST call failed"
    assert algo in model_builders['model_builders'], "%s %s" % (algo, [k for k in model_builders['model_builders']])
    builder = model_builders['model_builders'][algo]

    # TODO: test this assert, I don't think this is working. . .
    frames = self.frames(key=training_frame)
    assert frames is not None, "/Frames/{0} REST call failed".format(training_frame)
    key_name = frames['frames'][0]['key']['name']
    assert key_name==training_frame, \
        "/Frames/{0} returned Frame {1} rather than Frame {2}".format(training_frame, key_name, training_frame)

    parameters['training_frame'] = training_frame
    if destination_key is not None:
        parameters['destination_key'] = destination_key
    print "build_model parameters", parameters
    result1 = self.do_json_request('/2/ModelBuilders.json/' + algo, cmd='post', timeout=timeoutSecs, postData=parameters)
    verboseprint("build_model result", dump_json(result1))

    if asynchronous:
        # caller will poll the job itself
        result = result1
    elif 'validation_error_count' in result1:
        h2p.yellow_print("parameter error in model_builders")
        # parameters validation failure
        # TODO: add schema_type and schema_version into all the schemas to make this clean to check
        result = result1
    else:
        job_result = result1['jobs'][0]
        job_key = job_result['key']['name']
        verboseprint("build_model job_key: " + repr(job_key))
        job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs)
        verboseprint(job_result)

        if job_result:
            jobs = job_result['jobs'][0]
            description = jobs['description']
            dest = jobs['dest']
            msec = jobs['msec']
            status = jobs['status']
            progress = jobs['progress']
            # can condition this with a parameter if some FAILED are expected by tests.
            if status=='FAILED':
                print dump_json(job_result)
                raise Exception("Taking exception on build_model job status: %s %s %s %s" % \
                    (status, progress, msec, description))
            result = job_result
        else:
            # ? we should always get a job_json result
            raise Exception("build_model didn't get a job_result when it expected one")
            # return None

    verboseprint("result:", result)
    h2o_sandbox.check_sandbox_for_errors()
    return result
def build_model(
    self, algo, training_frame, parameters, destination_frame=None, model_id=None, timeoutSecs=60, noPoll=False, **kwargs
):
    # destination_key was renamed to model_id in the v3 API; fail fast on the old name
    if "destination_key" in kwargs:
        raise Exception("Change destination_key in build_model() to model_id")
    """
    Build a model on the h2o cluster using the given algorithm, training
    Frame and model parameters (v3 ModelBuilders endpoint).

    Validates the algo against /ModelBuilders and the training frame
    against /Frames, POSTs the build, then (unless noPoll) polls the job
    to completion. Raises Exception on a FAILED job or a missing job
    result. NOTE: 'parameters' is mutated in place. The returned dict
    gains a 'python_elapsed' entry with the wall-clock seconds spent.
    """
    assert algo is not None, '"algo" parameter is null'
    assert training_frame is not None, '"training_frame" parameter is null'
    assert parameters is not None, '"parameters" parameter is null'

    # why always check that the algo is in here?
    model_builders = self.model_builders(timeoutSecs=timeoutSecs)
    assert model_builders is not None, "/ModelBuilders REST call failed"
    assert algo in model_builders["model_builders"], "%s %s" % (algo, [k for k in model_builders["model_builders"]])
    builder = model_builders["model_builders"][algo]

    # TODO: test this assert, I don't think this is working. . .
    frames = self.frames(key=training_frame)
    assert frames is not None, "/Frames/{0} REST call failed".format(training_frame)
    key_name = frames["frames"][0]["frame_id"]["name"]
    assert key_name == training_frame, "/Frames/{0} returned Frame {1} rather than Frame {2}".format(
        training_frame, key_name, training_frame
    )

    parameters["training_frame"] = training_frame

    # accept the deprecated destination_frame, but model_id wins if both are given
    if destination_frame is not None:
        print "destination_frame should be replaced by model_id now"
        parameters["model_id"] = destination_frame
    if model_id is not None:
        parameters["model_id"] = model_id

    print "build_model parameters", parameters
    start = time.time()
    result1 = self.do_json_request(
        "/3/ModelBuilders.json/" + algo, cmd="post", timeout=timeoutSecs, postData=parameters
    )
    # make get overwritten after polling
    elapsed = time.time() - start
    verboseprint("build_model result", dump_json(result1))

    if noPoll:
        result = result1
    elif ("validation_error_count" in result1) and (result1["validation_error_count"] > 0):
        h2p.yellow_print("parameter error in model_builders: %s" % result1)
        # parameters validation failure
        # TODO: add schema_type and schema_version into all the schemas to make this clean to check
        result = result1
        # don't bother printing a time message
    elif "exception_msg" in result1:
        h2p.yellow_print("exception msg in model_builders: %s" % result1["exception_msg"])
        result = result1
    else:
        job_result = result1["job"]
        job_key = job_result["key"]["name"]
        verboseprint("build_model job_key: " + repr(job_key))
        job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs)
        verboseprint(job_result)

        elapsed = time.time() - start
        print "ModelBuilders", algo, "end on", training_frame, "took", time.time() - start, "seconds"
        print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)

        if job_result:
            jobs = job_result["jobs"][0]
            description = jobs["description"]
            dest = jobs["dest"]
            msec = jobs["msec"]
            status = jobs["status"]
            progress = jobs["progress"]
            # can condition this with a parameter if some FAILED are expected by tests.
            if status == "FAILED":
                print dump_json(job_result)
                raise Exception(
                    "Taking exception on build_model job status: %s %s %s %s" % (status, progress, msec, description)
                )
            result = job_result
        else:
            # ? we should always get a job_json result
            raise Exception("build_model didn't get a job_result when it expected one")
            # return None

    verboseprint("result:", result)
    h2o_sandbox.check_sandbox_for_errors()
    result["python_elapsed"] = elapsed
    return result
def build_model(self, algo, training_frame, parameters, destination_frame=None, model_id=None,
        timeoutSecs=60, noPoll=False, **kwargs):
    # destination_key was renamed to model_id in the v3 API; fail fast on the old name
    if 'destination_key' in kwargs:
        raise Exception('Change destination_key in build_model() to model_id')
    '''
    Build a model on the h2o cluster using the given algorithm, training
    Frame and model parameters (v3 ModelBuilders endpoint).

    Validates the algo against /ModelBuilders and the training frame
    against /Frames, POSTs the build, then (unless noPoll) polls the job
    to completion. Raises Exception on a FAILED job or a missing job
    result. NOTE: 'parameters' is mutated in place. The returned dict
    gains a 'python_elapsed' entry with the wall-clock seconds spent.
    '''
    assert algo is not None, '"algo" parameter is null'
    assert training_frame is not None, '"training_frame" parameter is null'
    assert parameters is not None, '"parameters" parameter is null'

    # why always check that the algo is in here?
    model_builders = self.model_builders(timeoutSecs=timeoutSecs)
    assert model_builders is not None, "/ModelBuilders REST call failed"
    assert algo in model_builders['model_builders'], "%s %s" % (
        algo, [k for k in model_builders['model_builders']])
    builder = model_builders['model_builders'][algo]

    # TODO: test this assert, I don't think this is working. . .
    frames = self.frames(key=training_frame)
    assert frames is not None, "/Frames/{0} REST call failed".format(
        training_frame)
    key_name = frames['frames'][0]['frame_id']['name']
    assert key_name==training_frame, \
        "/Frames/{0} returned Frame {1} rather than Frame {2}".format(training_frame, key_name, training_frame)

    parameters['training_frame'] = training_frame

    # accept the deprecated destination_frame, but model_id wins if both are given
    if destination_frame is not None:
        print "destination_frame should be replaced by model_id now"
        parameters['model_id'] = destination_frame
    if model_id is not None:
        parameters['model_id'] = model_id

    print "build_model parameters", parameters
    start = time.time()
    result1 = self.do_json_request('/3/ModelBuilders.json/' + algo, cmd='post',
        timeout=timeoutSecs, postData=parameters)
    # make get overwritten after polling
    elapsed = time.time() - start
    verboseprint("build_model result", dump_json(result1))

    if noPoll:
        result = result1
    elif ('validation_error_count' in result1) and (result1['validation_error_count'] > 0):
        h2p.yellow_print("parameter error in model_builders: %s" % result1)
        # parameters validation failure
        # TODO: add schema_type and schema_version into all the schemas to make this clean to check
        result = result1
        # don't bother printing a time message
    elif 'exception_msg' in result1:
        h2p.yellow_print("exception msg in model_builders: %s" % result1['exception_msg'])
        result = result1
    else:
        job_result = result1['job']
        job_key = job_result['key']['name']
        verboseprint("build_model job_key: " + repr(job_key))
        job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs)
        verboseprint(job_result)

        elapsed = time.time() - start
        print "ModelBuilders", algo, "end on", training_frame, 'took', time.time(
        ) - start, 'seconds'
        print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)

        if job_result:
            jobs = job_result['jobs'][0]
            description = jobs['description']
            dest = jobs['dest']
            msec = jobs['msec']
            status = jobs['status']
            progress = jobs['progress']
            # can condition this with a parameter if some FAILED are expected by tests.
            if status == 'FAILED':
                print dump_json(job_result)
                raise Exception("Taking exception on build_model job status: %s %s %s %s" % \
                    (status, progress, msec, description))
            result = job_result
        else:
            # ? we should always get a job_json result
            raise Exception(
                "build_model didn't get a job_result when it expected one")
            # return None

    verboseprint("result:", result)
    h2o_sandbox.check_sandbox_for_errors()
    result['python_elapsed'] = elapsed
    return result
#!/usr/bin/python import sys sys.path.extend(['.', '..', 'py']) import h2o_sandbox print "Will look at all the files in ./sandbox assuming they are stdout/stderr log files" h2o_sandbox.check_sandbox_for_errors(pattern='*')
def parse(self, key, hex_key=None, columnTypeDict=None, timeoutSecs=300, retryDelaySecs=0.2, initialDelaySecs=None, pollTimeoutSecs=180, noise=None, benchmarkLogging=None, noPoll=False, intermediateResults=False, **kwargs): ''' Parse an imported raw file or files into a Frame. ''' # these should override what parse setup gets below params_dict = { 'source_frames': None, 'destination_frame': hex_key, 'parse_type': None, # file type 'separator': None, 'single_quotes': None, 'check_header': None, # forces first line to be seen as column names 'number_columns': None, 'column_names': None, # a list 'column_types': None, # a list. or can use columnTypeDict param (see below) 'na_strings': None, # a list 'chunk_size': None, # are these two no longer supported? 'delete_on_done': None, 'blocking': None, } # if key is a list, create a comma separated string # list or tuple but not string if not isinstance(key, basestring): # it's a list of some kind (tuple ok?) # if len(key) > 1: # print "I noticed you're giving me a list of > 1 keys %s to parse:" % len(key), key # len 1 is ok here. 0 not. what if None or [None] here if not key: raise Exception( "key seems to be bad in parse. Should be list or string. %s" % key) # have to put double quotes around the individual list items (single not legal) source_frames = "[" + ",".join(map( (lambda x: '"' + x + '"'), key)) + "]" else: # what if None here source_frames = '["' + key + '"]' # quotes required on key params_dict['source_frames'] = source_frames # merge kwargs into params_dict # =None overwrites params_dict # columnTypeDict not used here h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse before setup merge', print_params=False) # Call ParseSetup?source_frames=[keys] . . . 
# if benchmarkLogging: # cloudPerfH2O.get_log_save(initOnly=True) # h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse_setup', print_params=True) params_setup = {'source_frames': source_frames} setup_result = self.do_json_request(jsonRequest="3/ParseSetup.json", cmd='post', timeout=timeoutSecs, postData=params_setup) h2o_sandbox.check_sandbox_for_errors() verboseprint("ParseSetup result:", dump_json(setup_result)) # this should match what we gave as input? if setup_result['source_frames']: # should these be quoted? source_framesStr = "[" + ",".join([ ('"%s"' % src['name']) for src in setup_result['source_frames'] ]) + "]" else: source_framesStr = None # I suppose we need a way for parameters to parse() to override these # should it be an array or a dict? if setup_result['column_names']: # single quotes not legal..need double quotes columnNamesStr = "[" + ",".join( map((lambda x: '"' + x + '"'), setup_result['column_names'])) + "]" else: columnNamesStr = None columnTypes = setup_result['column_types'] assert columnTypes is not None, "%s %s" % ("column_types:", columnTypes) if setup_result['na_strings']: # single quotes not legal..need double quotes naStrings = "[" + ",".join( map((lambda x: '"' + x + '"' if x != None else '""'), setup_result['na_strings'])) + "]" else: naStrings = None # dict parameter to update columnTypeDict? # but we don't pass columnNames like this? ct = setup_result['column_types'] if columnTypeDict: for k, v in columnTypeDict.iteritems(): if isinstance(k, int): # if a column index if k >= 0 and k < len(ct): ct[k] = v else: raise Exception( "bad col index %s in columnTypeDict param %s" % (k, columnTypeDict)) # if a column name elif isinstance(k, basestring): # find the index if k not in columnNames: raise Exception( "bad col name %s in columnTypeDict param %s. 
columnNames: %s" % (k, columnTypeDict, columnNames)) ci = columnNames.index(k) ct[ci] = v else: raise Exception("%s %s should be int or string" % (k, type(k))) columnTypesStr = "[" + ",".join(map((lambda x: '"' + x + '"'), ct)) + "]" parse_params = { 'source_frames': source_framesStr, 'destination_frame': setup_result['destination_frame'], 'parse_type': setup_result['parse_type'], 'separator': setup_result['separator'], 'single_quotes': setup_result['single_quotes'], 'check_header': setup_result['check_header'], 'number_columns': setup_result['number_columns'], 'column_names': columnNamesStr, 'column_types': columnTypesStr, 'na_strings': naStrings, 'chunk_size': setup_result['chunk_size'], # No longer supported? how come these aren't in setup_result? 'delete_on_done': params_dict['delete_on_done'], 'blocking': params_dict['blocking'], } # HACK: if there are too many column names..don't print! it is crazy output # just check the output of parse setup. Don't worry about columnNames passed as params here. tooManyColNamesToPrint = setup_result['column_names'] and len( setup_result['column_names']) > 2000 if tooManyColNamesToPrint: h2p.yellow_print( "Not printing the parameters to Parse because the columnNames are too lengthy." ) h2p.yellow_print("See sandbox/commands.log") # merge params_dict into parse_params # don't want =None to overwrite parse_params h2o_methods.check_params_update_kwargs( parse_params, params_dict, 'parse after merge into parse setup', print_params=not tooManyColNamesToPrint, ignoreNone=True) print "parse source_frames is length:", len(parse_params['source_frames']) # This can be null now? parseSetup doesn't return default colnames? # print "parse column_names is length:", len(parse_params['column_names']) # none of the kwargs passed to here! 
parse_result = self.do_json_request(jsonRequest="3/Parse.json", cmd='post', postData=parse_params, timeout=timeoutSecs) verboseprint("Parse result:", dump_json(parse_result)) job_key = parse_result['job']['key']['name'] hex_key = parse_params['destination_frame'] # TODO: dislike having different shapes for noPoll and poll if noPoll: # ?? h2o_sandbox.check_sandbox_for_errors() # return self.jobs(job_key) return parse_result # does Frame also, while polling if intermediateResults: key = hex_key else: key = None job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs, key=key) if job_result: jobs = job_result['jobs'][0] description = jobs['description'] dest = jobs['dest'] msec = jobs['msec'] status = jobs['status'] progress = jobs['progress'] dest_key = dest['name'] # can condition this with a parameter if some FAILED are expected by tests. if status == 'FAILED': print dump_json(job_result) raise Exception("Taking exception on parse job status: %s %s %s %s %s" % \ (status, progress, msec, dest_key, description)) return self.frames(dest_key) else: # ? we should always get a job_json result raise Exception("parse didn't get a job_result when it expected one")
#!/usr/bin/python import sys sys.path.extend(['.','..','py']) import h2o_sandbox print "Will look at all the files in ./sandbox assuming they are stdout/stderr log files" h2o_sandbox.check_sandbox_for_errors(pattern='*')
def sh2junit(name='NoName', cmd_string='/bin/ls', timeout=300, shdir=None, **kwargs):
    '''
    Run a shell command, teeing its stdout/stderr to our stdout and to log files
    in ./sandbox, then scan the sandbox logs and emit a junit XML result.

    name: test name used in the junit XML and the sandbox log filenames.
    cmd_string: whitespace-split into the argv list for psutil.Popen.
    timeout: seconds before the subprocess is considered hung and terminated.
    shdir: if given, directory to chdir into before spawning (restored after).
    kwargs: passed through to psutil.Popen.

    Returns (errors, outpath, errpath) when no errors were detected.
    Raises Exception (with captured output embedded) on sandbox log errors,
    timeout, or a failed/non-zero subprocess exit.
    '''
    # split by arbitrary strings of whitespace characters (space, tab, newline, return, formfeed)
    print "cmd_string:", cmd_string
    cmdList = cmd_string.split()
    # these are absolute paths
    outfd, outpath = sandbox_tmp_file(prefix=name + '.stdout.', suffix='.log')
    errfd, errpath = sandbox_tmp_file(prefix=name + '.stderr.', suffix='.log')
    # make outpath and errpath full paths, so we can redirect
    print "outpath:", outpath
    print "errpath:", errpath

    start = time.time()
    print "psutil.Popen:", cmdList, outpath, errpath
    import subprocess
    # start the process in the target dir, if desired
    if shdir:
        currentDir = os.getcwd()
        os.chdir(shdir)
    ps = psutil.Popen(cmdList, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs)
    if shdir:
        os.chdir(currentDir)

    comment = 'PID %d, stdout %s, stderr %s' % (
        ps.pid, os.path.basename(outpath), os.path.basename(errpath))
    print "spawn_cmd", cmd_string, comment

    # Reads the subprocess stdout until it is closed and
    # ...echo it our python stdout and also the R stdout file in sandbox
    # Then wait for the program to exit.
    # Read before wait so that you don't risk the pipe filling up and hanging the program.
    # You wait after read for the final program exit and return code.
    # If you don't wait, you'll get a zombie process (at least on linux)
    # this might not do what we want..see:
    # http://stackoverflow.com/questions/2804543/read-subprocess-stdout-line-by-line
    # I suppose we'll stop early?

    # shouldn't need a delay before checking this?
    if not ps.is_running():
        raise Exception("sh2junit: not immediate ps.is_running after start")

    # Until we get the rc, it can be a zombie process.
    # A zombie process is not a real process.
    # it's just a remaining entry in the process table until the parent process requests the child's return code.
    # The actual process has ended and requires no other resources but said process table entry.
    linesMayExist = True
    errors = 0
    timeoutError = False
    while linesMayExist:
        # get whatever accumulated, up to nothing returned
        # only do up to 20 lines before we check timeout again
        linesMayExist = ps.is_running() and not ps.status() == psutil.STATUS_ZOMBIE
        lineBurstCnt = 0
        # stdout from subprocess
        line = ps.stdout.readline()
        # R apparently uses stderr a lot, so want to mix that in. We don't grab it until we hit a stall in R stdout though.
        while line:
            lineBurstCnt += 1
            # maybe I should use p.communicate() instead. have to keep it to stdout? or do stdout+stderr here
            sys.stdout.write("R->" + line) # to our python stdout, with a prefix so it's obviously from R
            os.write(outfd, line) # to sandbox R stdout
            elapsed = time.time() - start
            if elapsed > timeout:
                timeoutError = True
                errors += 1
                print "ERROR: sh2junit: elapsed: %0.2f timeout: %s (secs) while echoing subprocess stdout" % (elapsed, timeout)
                #kill R subprocess but don't kill me
                terminate_process_tree(ps.pid, including_parent=False)
                break
            line = ps.stdout.readline()

        if timeoutError:
            print "\n\n\nERROR: timeout"
            break

        # stderr from subprocess
        line = ps.stderr.readline()
        while line:
            lineBurstCnt += 1
            sys.stdout.write("Re->" + line) # to our python stdout, with a prefix so it's obviously from R stderr
            os.write(errfd, line) # to sandbox R stderr
            line = ps.stderr.readline()

        print "lineBurstCnt:", lineBurstCnt

        # Check. may have flipped to not running, and we just got the last bit.
        # shouldn't be a race on a transition here, if ps.wait(0) completion syncs the transition
        if linesMayExist:
            print "ps.is_running():", ps.is_running(), ps.pid, ps.name, ps.status, ps.create_time
            # unload the return code without waiting..so we don't have a zombie!
            (lastrc, error) = rc_if_exists_and_done(ps)
            errors += error

            elapsed = time.time() - start
            # forever if timeout is None
            #if timeout and elapsed > timeout:
            if elapsed > timeout:
                timeoutError = True
                errors += 1
                # we don't want to exception here, because we're going to print the xml that says there's an error
                # I guess we'll end up terminating the R process down below
                # could we have lines in stdout we didn't catch up on? maybe, but do we care?
                print "ERROR: sh2junit: elapsed: %0.2f timeout: %s (secs) while echoing subprocess stdout" % (elapsed, timeout)
                #kill R subprocess but don't kill me
                #terminate_process_tree(ps.pid, including_parent=False)
                break

            # wait for some more output to accumulate
            time.sleep(0.25)

    # It shouldn't be running now?

    # timeout=None waits forever. timeout=0 returns immediately.
    # default above is 5 minutes
    # Wait for process termination. Since child: return the exit code.
    # If the process is already terminated does not raise NoSuchProcess exception
    # but just return None immediately.
    # If timeout is specified and process is still alive raises psutil.TimeoutExpired() exception.
    # old
    # rc = ps.wait(timeout)
    (lastrc, error) = rc_if_exists_and_done(ps)
    errors += error
    elapsed = time.time() - start

    # Prune h2o logs to interesting lines and detect errors.
    # Error lines are returned. warning/info are printed to our (python stdout)
    # so that's always printed/saved?
    # None if no error
    sandboxErrorMessage = h2o_sandbox.check_sandbox_for_errors(
        LOG_DIR='./sandbox',
        python_test_name=name,
        cloudShutdownIsError=True,
        sandboxIgnoreErrors=True) # don't take exception on error

    if sandboxErrorMessage:
        errors += 1

    out = file(outpath).read()
    err = file(errpath).read()
    create_junit_xml(name, out, err, sandboxErrorMessage, errors=errors, elapsed=elapsed)

    if not errors:
        return (errors, outpath, errpath)
    else:
        # dump all the info as part of the exception? maybe too much
        # is this bad to do in all cases? do we need it?
        hline = "\n===========================================BEGIN DUMP=============================================================\n"
        hhline = "\n===========================================END DUMP=============================================================\n"
        out = '[stdout->err]: '.join(out.splitlines(True))
        err = '[sterr->err]: '.join(err.splitlines(True))
        if ps.is_running():
            print "Before terminate:", ps.pid, ps.is_running()
            terminate_process_tree(ps.pid, including_parent=True)
        if sandboxErrorMessage:
            print "\n\n\nError in Sandbox. Ending test. Dumping sub-process output.\n"
            print hline
            raise Exception("%s %s \n\tlastrc:%s \n\terrors:%s \n\tErrors found in ./sandbox log files?.\nR stdout:\n%s\n\nR stderr:\n%s\n%s" %
                (name, cmd_string, lastrc, errors, out, err, hhline))
        # could have already terminated?
        elif timeoutError:
            print "\n\n\nTimeout Error. Ending test. Dumping sub-process output.\n"
            print hline
            raise Exception("%s %s \n\tlastrc:%s \n\terrors:%s \n\ttimed out after %d secs. \nR stdout:\n%s\n\nR stderr:\n%s\n%s" %
                (name, cmd_string, lastrc, errors, timeout or 0, out, err, hhline))
        else:
            print "\n\n\nCaught exception. Ending test. Dumping sub-process output.\n"
            print hline
            raise Exception("%s %s \n\tlastrc:%s \n\terrors:%s \n\tLikely non-zero exit code from R.\nR stdout:\n%s\n\nR stderr:\n%s\n%s" %
                (name, cmd_string, lastrc, errors, out, err, hhline))
def sh2junit(name='NoName', cmd_string='/bin/ls', timeout=300, shdir=None, **kwargs):
    '''
    Run a shell command, teeing its stdout/stderr to our stdout and to log files
    in ./sandbox, then scan the sandbox logs and emit a junit XML result.

    This variant flushes python stdout after each echoed line and treats the
    subprocess as readable while ps.is_running() (zombie state not excluded).

    name: test name used in the junit XML and the sandbox log filenames.
    cmd_string: whitespace-split into the argv list for psutil.Popen.
    timeout: seconds before the subprocess is considered hung and terminated.
    shdir: if given, directory to chdir into before spawning (restored after).
    kwargs: passed through to psutil.Popen.

    Returns (errors, outpath, errpath) when no errors were detected.
    Raises Exception (with captured output embedded) on sandbox log errors,
    timeout, or a failed/non-zero subprocess exit.
    '''
    # split by arbitrary strings of whitespace characters (space, tab, newline, return, formfeed)
    print "cmd_string:", cmd_string
    cmdList = cmd_string.split()
    # these are absolute paths
    outfd, outpath = sandbox_tmp_file(prefix=name + '.stdout.', suffix='.log')
    errfd, errpath = sandbox_tmp_file(prefix=name + '.stderr.', suffix='.log')
    # make outpath and errpath full paths, so we can redirect
    print "outpath:", outpath
    print "errpath:", errpath

    start = time.time()
    print "psutil.Popen:", cmdList, outpath, errpath
    import subprocess
    # start the process in the target dir, if desired
    if shdir:
        currentDir = os.getcwd()
        os.chdir(shdir)
    ps = psutil.Popen(cmdList, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs)
    if shdir:
        os.chdir(currentDir)

    comment = 'PID %d, stdout %s, stderr %s' % (
        ps.pid, os.path.basename(outpath), os.path.basename(errpath))
    print "spawn_cmd", cmd_string, comment

    # Reads the subprocess stdout until it is closed and
    # ...echo it our python stdout and also the R stdout file in sandbox
    # Then wait for the program to exit.
    # Read before wait so that you don't risk the pipe filling up and hanging the program.
    # You wait after read for the final program exit and return code.
    # If you don't wait, you'll get a zombie process (at least on linux)
    # this might not do what we want..see:
    # http://stackoverflow.com/questions/2804543/read-subprocess-stdout-line-by-line
    # I suppose we'll stop early?

    # shouldn't need a delay before checking this?
    if not ps.is_running():
        raise Exception("sh2junit: not immediate ps.is_running after start")

    # Until we get the rc, it can be a zombie process.
    # A zombie process is not a real process.
    # it's just a remaining entry in the process table until the parent process requests the child's return code.
    # The actual process has ended and requires no other resources but said process table entry.
    linesMayExist = True
    errors = 0
    timeoutError = False
    while linesMayExist:
        # get whatever accumulated, up to nothing returned
        # only do up to 20 lines before we check timeout again
        # why was R processes not completing on centos?
        # linesMayExist = ps.is_running() and not ps.status() == psutil.STATUS_ZOMBIE
        linesMayExist = ps.is_running()
        lineBurstCnt = 0
        # stdout from subprocess
        line = ps.stdout.readline()
        # R apparently uses stderr a lot, so want to mix that in. We don't grab it until we hit a stall in R stdout though.
        while line:
            lineBurstCnt += 1
            # maybe I should use p.communicate() instead. have to keep it to stdout? or do stdout+stderr here
            sys.stdout.write("R->" + line) # to our python stdout, with a prefix so it's obviously from R
            sys.stdout.flush()
            os.write(outfd, line) # to sandbox R stdout
            elapsed = time.time() - start
            if elapsed > timeout:
                timeoutError = True
                errors += 1
                print "ERROR: sh2junit: elapsed: %0.2f timeout: %s (secs) while echoing subprocess stdout" % (elapsed, timeout)
                #kill R subprocess but don't kill me
                terminate_process_tree(ps.pid, including_parent=False)
                break
            line = ps.stdout.readline()

        if timeoutError:
            print "\n\n\nERROR: timeout"
            break

        # stderr from subprocess
        line = ps.stderr.readline()
        while line:
            lineBurstCnt += 1
            sys.stdout.write("Re->" + line) # to our python stdout, with a prefix so it's obviously from R stderr
            sys.stdout.flush()
            os.write(errfd, line) # to sandbox R stderr
            line = ps.stderr.readline()

        print "lineBurstCnt:", lineBurstCnt

        # Check. may have flipped to not running, and we just got the last bit.
        # shouldn't be a race on a transition here, if ps.wait(0) completion syncs the transition
        if linesMayExist:
            print "ps.is_running():", ps.is_running(), ps.pid, ps.name, ps.status, ps.create_time
            # unload the return code without waiting..so we don't have a zombie!
            (lastrc, error) = rc_if_exists_and_done(ps)
            errors += error

            elapsed = time.time() - start
            # forever if timeout is None
            #if timeout and elapsed > timeout:
            if elapsed > timeout:
                timeoutError = True
                errors += 1
                # we don't want to exception here, because we're going to print the xml that says there's an error
                # I guess we'll end up terminating the R process down below
                # could we have lines in stdout we didn't catch up on? maybe, but do we care?
                print "ERROR: sh2junit: elapsed: %0.2f timeout: %s (secs) while echoing subprocess stdout" % (elapsed, timeout)
                #kill R subprocess but don't kill me
                #terminate_process_tree(ps.pid, including_parent=False)
                break

            # wait for some more output to accumulate
            time.sleep(0.25)

    # It shouldn't be running now?

    # timeout=None waits forever. timeout=0 returns immediately.
    # default above is 5 minutes
    # Wait for process termination. Since child: return the exit code.
    # If the process is already terminated does not raise NoSuchProcess exception
    # but just return None immediately.
    # If timeout is specified and process is still alive raises psutil.TimeoutExpired() exception.
    # old
    # rc = ps.wait(timeout)
    (lastrc, error) = rc_if_exists_and_done(ps)
    errors += error
    elapsed = time.time() - start

    # Prune h2o logs to interesting lines and detect errors.
    # Error lines are returned. warning/info are printed to our (python stdout)
    # so that's always printed/saved?
    # None if no error
    sandboxErrorMessage = h2o_sandbox.check_sandbox_for_errors(
        LOG_DIR='./sandbox',
        python_test_name=name,
        cloudShutdownIsError=True,
        sandboxIgnoreErrors=True) # don't take exception on error

    if sandboxErrorMessage:
        errors += 1

    out = file(outpath).read()
    err = file(errpath).read()
    create_junit_xml(name, out, err, sandboxErrorMessage, errors=errors, elapsed=elapsed)

    if not errors:
        return (errors, outpath, errpath)
    else:
        # dump all the info as part of the exception? maybe too much
        # is this bad to do in all cases? do we need it?
        hline = "\n===========================================BEGIN DUMP=============================================================\n"
        hhline = "\n===========================================END DUMP=============================================================\n"
        out = '[stdout->err]: '.join(out.splitlines(True))
        err = '[sterr->err]: '.join(err.splitlines(True))
        if ps.is_running():
            print "Before terminate:", ps.pid, ps.is_running()
            terminate_process_tree(ps.pid, including_parent=True)
        if sandboxErrorMessage:
            print "\n\n\nError in Sandbox. Ending test. Dumping sub-process output.\n"
            print hline
            raise Exception("%s %s \n\tlastrc:%s \n\terrors:%s \n\tErrors found in ./sandbox log files?.\nR stdout:\n%s\n\nR stderr:\n%s\n%s" %
                (name, cmd_string, lastrc, errors, out, err, hhline))
        # could have already terminated?
        elif timeoutError:
            print "\n\n\nTimeout Error. Ending test. Dumping sub-process output.\n"
            print hline
            raise Exception("%s %s \n\tlastrc:%s \n\terrors:%s \n\ttimed out after %d secs. \nR stdout:\n%s\n\nR stderr:\n%s\n%s" %
                (name, cmd_string, lastrc, errors, timeout or 0, out, err, hhline))
        else:
            print "\n\n\nCaught exception. Ending test. Dumping sub-process output.\n"
            print hline
            raise Exception("%s %s \n\tlastrc:%s \n\terrors:%s \n\tLikely non-zero exit code from R.\nR stdout:\n%s\n\nR stderr:\n%s\n%s" %
                (name, cmd_string, lastrc, errors, out, err, hhline))
def parse(self, key, hex_key=None, timeoutSecs=300, retryDelaySecs=0.2, initialDelaySecs=None, pollTimeoutSecs=180, noise=None, benchmarkLogging=None, noPoll=False, intermediateResults=False, **kwargs): ''' Parse an imported raw file or files into a Frame. ''' # these should override what parse setup gets below params_dict = { 'srcs': None, 'hex': hex_key, 'pType': None, # This is a list? 'sep': None, 'ncols': None, 'checkHeader': None, # how is this used 'singleQuotes': None, 'columnNames': None, # list? 'delete_on_done': None, 'blocking': None, } # if key is a list, create a comma separated string # list or tuple but not string if not isinstance(key, basestring): # it's a list of some kind (tuple ok?) # if len(key) > 1: # print "I noticed you're giving me a list of > 1 keys %s to parse:" % len(key), key # len 1 is ok here. 0 not. what if None or [None] here if not key: raise Exception("key seems to be bad in parse. Should be list or string. %s" % key) srcs = "[" + ",".join(key) + "]" else: # what if None here srcs = "[" + key + "]" params_dict['srcs'] = srcs # merge kwargs into params_dict # =None overwrites params_dict h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse before setup merge', print_params=False) # Call ParseSetup?srcs=[keys] . . . 
# if benchmarkLogging: # cloudPerfH2O.get_log_save(initOnly=True) # h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse_setup', print_params=True) params_setup = {'srcs': srcs} setup_result = self.do_json_request(jsonRequest="ParseSetup.json", cmd='post', timeout=timeoutSecs, postData=params_setup) h2o_sandbox.check_sandbox_for_errors() verboseprint("ParseSetup result:", dump_json(setup_result)) # and then Parse?srcs=<keys list> and params from the ParseSetup result # Parse?srcs=[nfs://Users/rpeck/Source/h2o2/smalldata/logreg/prostate.csv]&hex=prostate.hex&pType=CSV&sep=44&ncols=9&checkHeader=0&singleQuotes=false&columnNames=[ID,%20CAPSULE,%20AGE,%20RACE,%20DPROS,%20DCAPS,%20PSA,%20VOL,%20GLEASON] if setup_result['srcs']: setupSrcs = "[" + ",".join([src['name'] for src in setup_result['srcs'] ]) + "]" else: setupSrcs = None # I suppose we need a way for parameters to parse() to override these if setup_result['columnNames']: ascii_column_names = "[" + ",".join(setup_result['columnNames']) + "]" else: ascii_column_names = None parse_params = { 'srcs': setupSrcs, 'hex': setup_result['hexName'], 'pType': setup_result['pType'], 'sep': setup_result['sep'], 'ncols': setup_result['ncols'], 'checkHeader': setup_result['checkHeader'], 'singleQuotes': setup_result['singleQuotes'], 'columnNames': ascii_column_names, # how come these aren't in setup_result? 'delete_on_done': params_dict['delete_on_done'], 'blocking': params_dict['blocking'], } # HACK: if there are too many column names..don't print! it is crazy output # just check the output of parse setup. Don't worry about columnNames passed as params here. 
tooManyColNamesToPrint = setup_result['columnNames'] and len(setup_result['columnNames']) > 2000 if tooManyColNamesToPrint: h2p.yellow_print("Not printing the parameters to Parse because the columnNames are too lengthy.") h2p.yellow_print("See sandbox/commands.log") # merge params_dict into parse_params # don't want =None to overwrite parse_params h2o_methods.check_params_update_kwargs(parse_params, params_dict, 'parse after merge into parse setup', print_params=not tooManyColNamesToPrint, ignoreNone=True) print "parse srcs is length:", len(parse_params['srcs']) print "parse columnNames is length:", len(parse_params['columnNames']) # none of the kwargs passed to here! parse_result = self.do_json_request( jsonRequest="Parse.json", cmd='post', postData=parse_params, timeout=timeoutSecs) verboseprint("Parse result:", dump_json(parse_result)) job_key = parse_result['job']['name'] hex_key = parse_params['hex'] # TODO: dislike having different shapes for noPoll and poll if noPoll: # ?? h2o_sandbox.check_sandbox_for_errors() return this.jobs(job_key) # does Frame also, while polling if intermediateResults: key = hex_key else: key = None job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs, key=key) if job_result: jobs = job_result['jobs'][0] description = jobs['description'] dest = jobs['dest'] msec = jobs['msec'] status = jobs['status'] progress = jobs['progress'] dest_key = dest['name'] # can condition this with a parameter if some FAILED are expected by tests. if status=='FAILED': print dump_json(job_result) raise Exception("Taking exception on parse job status: %s %s %s %s %s" % \ (status, progress, msec, dest_key, description)) return self.frames(dest_key) else: # ? we should always get a job_json result raise Exception("parse didn't get a job_result when it expected one")
def parse(self, key, hex_key=None,
        timeoutSecs=300, retryDelaySecs=0.2, initialDelaySecs=None, pollTimeoutSecs=180,
        noise=None, benchmarkLogging=None, noPoll=False, intermediateResults=False, **kwargs):
    '''
    Parse an imported raw file or files into a Frame (2/ParseSetup + 2/Parse API).

    key: source key (string) or list/tuple of source keys to parse.
    hex_key: destination key for the parsed Frame (None lets h2o pick one).
    noPoll: if True, check the sandbox and return the raw Parse result.
    intermediateResults: while polling, also fetch the destination Frame.

    Returns the /Frames result for the destination key (or the Parse result on noPoll).
    Raises Exception on a bad key argument, a FAILED parse job, or a missing job result.
    '''
    # these should override what parse setup gets below
    params_dict = {
        'srcs': None,
        'hex': hex_key,
        'pType': None, # This is a list?
        'sep': None,
        'ncols': None,
        'checkHeader': None, # how is this used
        'singleQuotes': None,
        'columnNames': None, # list?
        'delete_on_done': None,
        'blocking': None,
    }

    # if key is a list, create a comma separated string
    # list or tuple but not string
    if not isinstance(key, basestring):
        # it's a list of some kind (tuple ok?)
        # if len(key) > 1:
        #     print "I noticed you're giving me a list of > 1 keys %s to parse:" % len(key), key
        # len 1 is ok here. 0 not. what if None or [None] here
        if not key:
            raise Exception("key seems to be bad in parse. Should be list or string. %s" % key)
        srcs = "[" + ",".join(key) + "]"
    else:
        # what if None here
        srcs = "[" + key + "]"

    params_dict['srcs'] = srcs

    # merge kwargs into params_dict
    # =None overwrites params_dict
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse before setup merge', print_params=False)

    # Call ParseSetup?srcs=[keys] . . .
    # if benchmarkLogging:
    #     cloudPerfH2O.get_log_save(initOnly=True)
    # h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse_setup', print_params=True)
    params_setup = {'srcs': srcs}
    setup_result = self.do_json_request(jsonRequest="2/ParseSetup.json", cmd='post',
        timeout=timeoutSecs, postData=params_setup)
    h2o_sandbox.check_sandbox_for_errors()
    verboseprint("ParseSetup result:", dump_json(setup_result))

    # and then Parse?srcs=<keys list> and params from the ParseSetup result
    # Parse?srcs=[nfs://Users/rpeck/Source/h2o2/smalldata/logreg/prostate.csv]&hex=prostate.hex&pType=CSV&sep=44&ncols=9&checkHeader=0&singleQuotes=false&columnNames=[ID,%20CAPSULE,%20AGE,%20RACE,%20DPROS,%20DCAPS,%20PSA,%20VOL,%20GLEASON]
    if setup_result['srcs']:
        setupSrcs = "[" + ",".join(
            [src['name'] for src in setup_result['srcs']]) + "]"
    else:
        setupSrcs = None

    # I suppose we need a way for parameters to parse() to override these
    if setup_result['columnNames']:
        ascii_column_names = "[" + ",".join(setup_result['columnNames']) + "]"
    else:
        ascii_column_names = None

    parse_params = {
        'srcs': setupSrcs,
        'hex': setup_result['hexName'],
        'pType': setup_result['pType'],
        'sep': setup_result['sep'],
        'ncols': setup_result['ncols'],
        'checkHeader': setup_result['checkHeader'],
        'singleQuotes': setup_result['singleQuotes'],
        'columnNames': ascii_column_names,
        # how come these aren't in setup_result?
        'delete_on_done': params_dict['delete_on_done'],
        'blocking': params_dict['blocking'],
    }

    # HACK: if there are too many column names..don't print! it is crazy output
    # just check the output of parse setup. Don't worry about columnNames passed as params here.
    tooManyColNamesToPrint = setup_result['columnNames'] and len(setup_result['columnNames']) > 2000
    if tooManyColNamesToPrint:
        h2p.yellow_print("Not printing the parameters to Parse because the columnNames are too lengthy.")
        h2p.yellow_print("See sandbox/commands.log")

    # merge params_dict into parse_params
    # don't want =None to overwrite parse_params
    h2o_methods.check_params_update_kwargs(parse_params, params_dict, 'parse after merge into parse setup',
        print_params=not tooManyColNamesToPrint, ignoreNone=True)

    print "parse srcs is length:", len(parse_params['srcs'])
    # NOTE(review): looks like this raises TypeError if ParseSetup returned no columnNames — confirm
    print "parse columnNames is length:", len(parse_params['columnNames'])

    # none of the kwargs passed to here!
    parse_result = self.do_json_request(jsonRequest="2/Parse.json", cmd='post',
        postData=parse_params, timeout=timeoutSecs)
    verboseprint("Parse result:", dump_json(parse_result))

    job_key = parse_result['job']['key']['name']
    hex_key = parse_params['hex']

    # TODO: dislike having different shapes for noPoll and poll
    if noPoll:
        # ??
        h2o_sandbox.check_sandbox_for_errors()
        # return self.jobs(job_key)
        return parse_result

    # does Frame also, while polling
    if intermediateResults:
        key = hex_key
    else:
        key = None

    job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs, key=key)
    if job_result:
        jobs = job_result['jobs'][0]
        description = jobs['description']
        dest = jobs['dest']
        msec = jobs['msec']
        status = jobs['status']
        progress = jobs['progress']
        dest_key = dest['name']
        # can condition this with a parameter if some FAILED are expected by tests.
        if status == 'FAILED':
            print dump_json(job_result)
            raise Exception("Taking exception on parse job status: %s %s %s %s %s" % \
                (status, progress, msec, dest_key, description))
        return self.frames(dest_key)
    else:
        # ? we should always get a job_json result
        raise Exception("parse didn't get a job_result when it expected one")
def parse(self, key, hex_key=None, columnTypeDict=None, timeoutSecs=300, retryDelaySecs=0.2, initialDelaySecs=None, pollTimeoutSecs=180, noise=None, benchmarkLogging=None, noPoll=False, intermediateResults=False, **kwargs): ''' Parse an imported raw file or files into a Frame. ''' # these should override what parse setup gets below params_dict = { 'source_keys': None, 'destination_key': hex_key, 'parse_type': None, # file type 'separator': None, 'single_quotes': None, 'check_header': None, # forces first line to be seen as column names 'number_columns': None, 'column_names': None, # a list 'column_types': None, # a list. or can use columnTypeDict param (see below) 'na_strings' : None, # a list 'chunk_size': None, # are these two no longer supported? 'delete_on_done': None, 'blocking': None, } # if key is a list, create a comma separated string # list or tuple but not string if not isinstance(key, basestring): # it's a list of some kind (tuple ok?) # if len(key) > 1: # print "I noticed you're giving me a list of > 1 keys %s to parse:" % len(key), key # len 1 is ok here. 0 not. what if None or [None] here if not key: raise Exception("key seems to be bad in parse. Should be list or string. %s" % key) # have to put quotes around the individual list items source_keys = "[" + ",".join(map((lambda x: "'" + x + "'"), key)) + "]" else: # what if None here source_keys = "['" + key + "']" # quotes required on key params_dict['source_keys'] = source_keys # merge kwargs into params_dict # =None overwrites params_dict # columnTypeDict not used here h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse before setup merge', print_params=False) # Call ParseSetup?source_keys=[keys] . . . 
# if benchmarkLogging: # cloudPerfH2O.get_log_save(initOnly=True) # h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse_setup', print_params=True) params_setup = {'source_keys': source_keys} setup_result = self.do_json_request(jsonRequest="2/ParseSetup.json", cmd='post', timeout=timeoutSecs, postData=params_setup) h2o_sandbox.check_sandbox_for_errors() verboseprint("ParseSetup result:", dump_json(setup_result)) # this should match what we gave as input? if setup_result['source_keys']: # should these be quoted? source_keysStr = "[" + ",".join([("'%s'" % src['name']) for src in setup_result['source_keys'] ]) + "]" else: source_keysStr = None # I suppose we need a way for parameters to parse() to override these # should it be an array or a dict? if setup_result['column_names']: columnNamesStr = "[" + ",".join(map((lambda x: "'" + x + "'"), setup_result['column_names'])) + "]" else: columnNamesStr = None columnTypes = setup_result['column_types'] assert columnTypes is not None, "%s %s" % ("column_types:", columnTypes) if setup_result['na_strings']: naStrings = "[" + ",".join(map((lambda x: "'" + x + "'" if x != None else "''"), setup_result['na_strings'])) + "]" else: naStrings = None # dict parameter to update columnTypeDict? # but we don't pass columnNames like this? ct = setup_result['column_types'] if columnTypeDict: for k,v in columnTypeDict.iteritems(): if isinstance(k, int): # if a column index if k>=0 and k<len(ct): ct[k] = v else: raise Exception("bad col index %s in columnTypeDict param %s" % (k, columnTypeDict)) # if a column name elif isinstance(k, basestring): # find the index if k not in columnNames: raise Exception("bad col name %s in columnTypeDict param %s. 
columnNames: %s" % (k, columnTypeDict, columnNames)) ci = columnNames.index(k) ct[ci] = v else: raise Exception("%s %s should be int or string" % (k, type(k))) columnTypesStr = "[" + ",".join(map((lambda x: "'" + x + "'"), ct)) + "]" parse_params = { 'source_keys': source_keysStr, 'destination_key': setup_result['destination_key'], 'parse_type': setup_result['parse_type'], 'separator': setup_result['separator'], 'single_quotes': setup_result['single_quotes'], 'check_header': setup_result['check_header'], 'number_columns': setup_result['number_columns'], 'column_names': columnNamesStr, 'column_types': columnTypesStr, 'na_strings': naStrings, 'chunk_size': setup_result['chunk_size'], # No longer supported? how come these aren't in setup_result? 'delete_on_done': params_dict['delete_on_done'], 'blocking': params_dict['blocking'], } # HACK: if there are too many column names..don't print! it is crazy output # just check the output of parse setup. Don't worry about columnNames passed as params here. tooManyColNamesToPrint = setup_result['column_names'] and len(setup_result['column_names']) > 2000 if tooManyColNamesToPrint: h2p.yellow_print("Not printing the parameters to Parse because the columnNames are too lengthy.") h2p.yellow_print("See sandbox/commands.log") # merge params_dict into parse_params # don't want =None to overwrite parse_params h2o_methods.check_params_update_kwargs(parse_params, params_dict, 'parse after merge into parse setup', print_params=not tooManyColNamesToPrint, ignoreNone=True) print "parse source_keys is length:", len(parse_params['source_keys']) # This can be null now? parseSetup doesn't return default colnames? # print "parse column_names is length:", len(parse_params['column_names']) # none of the kwargs passed to here! 
parse_result = self.do_json_request( jsonRequest="2/Parse.json", cmd='post', postData=parse_params, timeout=timeoutSecs) verboseprint("Parse result:", dump_json(parse_result)) job_key = parse_result['job']['key']['name'] hex_key = parse_params['destination_key'] # TODO: dislike having different shapes for noPoll and poll if noPoll: # ?? h2o_sandbox.check_sandbox_for_errors() # return self.jobs(job_key) return parse_result # does Frame also, while polling if intermediateResults: key = hex_key else: key = None job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs, key=key) if job_result: jobs = job_result['jobs'][0] description = jobs['description'] dest = jobs['dest'] msec = jobs['msec'] status = jobs['status'] progress = jobs['progress'] dest_key = dest['name'] # can condition this with a parameter if some FAILED are expected by tests. if status=='FAILED': print dump_json(job_result) raise Exception("Taking exception on parse job status: %s %s %s %s %s" % \ (status, progress, msec, dest_key, description)) return self.frames(dest_key) else: # ? we should always get a job_json result raise Exception("parse didn't get a job_result when it expected one")