class H2O(object): # static (class) variables ipaddr_from_cmd_line = None debugger = False json_url_history = [] python_test_name = inspect.stack()[1][1] verbose = False experimental_algos = ["pca", "svd", "glrm"] ## TODO: support api_version parameter for all api calls! # Also a global in the H2O object set at creation time. # TODO: ensure that all of this is really necessary: def __init__(self, use_this_ip_addr=None, port=54321, capture_output=True, use_debugger=None, classpath=None, use_hdfs=False, use_maprfs=False, # hdfs_version="cdh4", hdfs_name_node="192.168.1.151", # hdfs_version="cdh3", hdfs_name_node="192.168.1.176", hdfs_version=None, hdfs_name_node=None, hdfs_config=None, aws_credentials=None, use_flatfile=False, java_heap_GB=None, java_heap_MB=None, java_extra_args=None, use_home_for_ice=False, node_id=None, username=None, random_udp_drop=False, redirect_import_folder_to_s3_path=None, redirect_import_folder_to_s3n_path=None, disable_h2o_log=False, enable_benchmark_log=False, h2o_remote_buckets_root=None, delete_keys_at_teardown=False, cloud_name=None, ): if use_hdfs: # see if we can touch a 0xdata machine try: # long timeout in ec2...bad a = requests.get('http://192.168.1.176:80', timeout=1) hdfs_0xdata_visible = True except: hdfs_0xdata_visible = False # different defaults, depending on where we're running if hdfs_name_node is None: if hdfs_0xdata_visible: hdfs_name_node = "192.168.1.176" else: # ec2 hdfs_name_node = "10.78.14.235:9000" if hdfs_version is None: if hdfs_0xdata_visible: hdfs_version = "cdh3" else: # ec2 hdfs_version = "0.20.2" self.redirect_import_folder_to_s3_path = redirect_import_folder_to_s3_path self.redirect_import_folder_to_s3n_path = redirect_import_folder_to_s3n_path self.aws_credentials = aws_credentials self.port = port # None is legal for self.addr. # means we won't give an ip to the jar when we start. # Or we can say use use_this_ip_addr=127.0.0.1, or the known address # if use_this_addr is None, use 127.0.0.1 for urls and json # Command line arg 'ipaddr_from_cmd_line' dominates: if H2O.ipaddr_from_cmd_line: self.addr = H2O.ipaddr_from_cmd_line else: self.addr = use_this_ip_addr if self.addr is not None: self.http_addr = self.addr else: self.http_addr = get_ip_address() # command line should always dominate for enabling if H2O.debugger: use_debugger = True self.use_debugger = use_debugger self.classpath = classpath self.capture_output = capture_output self.use_hdfs = use_hdfs self.use_maprfs = use_maprfs self.hdfs_name_node = hdfs_name_node self.hdfs_version = hdfs_version self.hdfs_config = hdfs_config self.use_flatfile = use_flatfile self.java_heap_GB = java_heap_GB self.java_heap_MB = java_heap_MB self.java_extra_args = java_extra_args self.use_home_for_ice = use_home_for_ice self.node_id = node_id if username: self.username = username else: self.username = getpass.getuser() # don't want multiple reports from tearDown and tearDownClass # have nodes[0] remember (0 always exists) self.sandbox_error_was_reported = False self.sandbox_ignore_errors = False self.random_udp_drop = random_udp_drop self.disable_h2o_log = disable_h2o_log # this dumps stats from tests, and perf stats while polling to benchmark.log self.enable_benchmark_log = enable_benchmark_log self.h2o_remote_buckets_root = h2o_remote_buckets_root self.delete_keys_at_teardown = delete_keys_at_teardown if cloud_name: self.cloud_name = cloud_name else: self.cloud_name = 'pytest-%s-%s' % (getpass.getuser(), os.getpid()) ''' Printable string representation of an H2O node object. ''' def __str__(self): return '%s - http://%s:%d/' % (type(self), self.http_addr, self.port) # TODO: UGH, move this. @staticmethod def verboseprint(*args, **kwargs): if H2O.verbose: for x in args: # so you don't have to create a single string print x, for x in kwargs: # so you don't have to create a single string print x, print sys.stdout.flush() def __url(self, loc, port=None): # always use the new api port if port is None: port = self.port if loc.startswith('/'): delim = '' else: delim = '/' u = 'http://%s:%d%s%s' % (self.http_addr, port, delim, loc) return u ''' Make a REST request to the h2o server and if succesful return a dict containing the JSON result. ''' # @profile def __do_json_request(self, jsonRequest=None, fullUrl=None, timeout=10, params=None, postData=None, returnFast=False, cmd='get', extraComment=None, ignoreH2oError=False, noExtraErrorCheck=False, raiseIfNon200=True, **kwargs): H2O.verboseprint("__do_json_request, timeout: " + str(timeout)) # if url param is used, use it as full url. otherwise crate from the jsonRequest if fullUrl: url = fullUrl else: url = self.__url(jsonRequest) # remove any params that are 'None' # need to copy dictionary, since can't delete while iterating if params is not None: params_serialized = params.copy() for k in params_serialized: if params_serialized[k] is None: del params[k] paramsStr = '?' + '&'.join(['%s=%s' % (k, v) for (k, v) in params.items()]) else: paramsStr = '' # The requests package takes array parameters and explodes them: ['f00', 'b4r'] becomes "f00,b4r". # NOTE: this handles 1D arrays only; if we need ND this needs to be recursive. # NOTE: we currently don't need to do this for GET, so that's not implemented. if postData is not None: munged_postData = {} for k, v in postData.iteritems(): if type(v) is list: if len(v) == 0: munged_postData[k] = '[]' else: first = True array_str = '[' for val in v: if not first: array_str += ', ' if val is None: array_str += 'null' elif isinstance(val, basestring): array_str += "\"" + str(val) + "\"" else: array_str += str(val) first = False array_str += ']' munged_postData[k] = array_str else: # not list: munged_postData[k] = v else: # None munged_postData = postData if extraComment: log('Start ' + url + paramsStr, comment=extraComment) else: log('Start ' + url + paramsStr) log_rest("") log_rest("----------------------------------------------------------------------\n") if extraComment: log_rest("# Extra comment info about this request: " + extraComment) if cmd == 'get': log_rest("GET") else: log_rest("POST") log_rest(url + paramsStr) # file get passed thru kwargs here try: if 'post' == cmd: # NOTE == cmd: for now, since we don't have deserialization from JSON in h2o-dev, we use form-encoded POST. # This is temporary. # # This following does application/json (aka, posting JSON in the body): # r = requests.post(url, timeout=timeout, params=params, data=json.dumps(munged_postData), **kwargs) # # This does form-encoded, which doesn't allow POST of nested structures r = requests.post(url, timeout=timeout, params=params, data=munged_postData, **kwargs) elif 'delete' == cmd: r = requests.delete(url, timeout=timeout, params=params, **kwargs) elif 'get' == cmd: r = requests.get(url, timeout=timeout, params=params, **kwargs) else: raise ValueError("Unknown HTTP command (expected 'get', 'post' or 'delete'): " + cmd) except Exception, e: # rethrow the exception after we've checked for stack trace from h2o # out of memory errors maybe don't show up right away? so we should wait for h2o # to get it out to h2o stdout. We don't want to rely on cloud teardown to check # because there's no delay, and we don't want to delay all cloud teardowns by waiting. # (this is new/experimental) exc_info = sys.exc_info() # use this to ignore the initial connection errors during build cloud when h2o is coming up if not noExtraErrorCheck: h2p.red_print( "ERROR: got exception on %s to h2o. \nGoing to check sandbox, then rethrow.." % (url + paramsStr)) time.sleep(2) H2O.check_sandbox_for_errors(python_test_name=H2O.python_test_name); log_rest("") log_rest("EXCEPTION CAUGHT DOING REQUEST: " + str(e.message)) raise exc_info[1], None, exc_info[2] H2O.verboseprint("r: " + repr(r)) if raiseIfNon200 and 200 != r.status_code: print "JSON call returned non-200 status: ", url print "r.status_code: " + str(r.status_code) print "r.headers: " + repr(r.headers) print "r.text: " + r.text log_rest("") try: if r is None: log_rest("r is None") else: log_rest("HTTP status code: " + str(r.status_code)) # The following accesses to r.text were taking most of the runtime: log_text = False if log_text: if hasattr(r, 'text'): if r.text is None: log_rest("r.text is None") else: log_rest(r.text) else: log_rest("r does not have attr text") except Exception, e: # Paranoid exception catch. # Ignore logging exceptions in the case that the above error checking isn't sufficient. print "Caught exception from result logging: ", e, "; result: ", repr(r)
def __do_json_request(self, jsonRequest=None, fullUrl=None, timeout=10, params=None, postData=None, returnFast=False, cmd='get', extraComment=None, ignoreH2oError=False, noExtraErrorCheck=False, raiseIfNon200=True, suppressErrorMsg=False, **kwargs): H2O.verboseprint("__do_json_request, timeout: " + str(timeout)) # if url param is used, use it as full url. otherwise crate from the jsonRequest if fullUrl: url = fullUrl else: url = self.__url(jsonRequest) # remove any params that are 'None' # need to copy dictionary, since can't delete while iterating if params is not None: params_serialized = params.copy() for k in params_serialized: if params_serialized[k] is None: del params[k] paramsStr = '?' + '&'.join(['%s=%s' % (k, v) for (k, v) in params.items()]) else: paramsStr = '' # The requests package takes array parameters and explodes them: ['f00', 'b4r'] becomes "f00,b4r". # NOTE: this handles 1D arrays only; if we need ND this needs to be recursive. # NOTE: we currently don't need to do this for GET, so that's not implemented. if postData is not None: munged_postData = {} for k, v in postData.iteritems(): if type(v) is list: if len(v) == 0: munged_postData[k] = '[]' else: first = True array_str = '[' for val in v: if not first: array_str += ', ' if val is None: array_str += 'null' elif isinstance(val, basestring): array_str += "\"" + str(val) + "\"" else: array_str += str(val) first = False array_str += ']' munged_postData[k] = array_str elif type(v) is dict: if len(v) == 0: munged_postData[k] = '{}' else: first = True map_str = '{' for key, val in v.iteritems(): if not first: map_str += ', ' if val is None: map_str += "\"" + key + "\"" + ': null' elif isinstance(val, basestring): map_str += "\"" + str(key) + "\"" + ":" + "\"" + str(val) + "\"" else: map_str += "\"" + key + "\"" + ':' + str(val) first = False map_str += '}' munged_postData[k] = map_str else: # not list: munged_postData[k] = v else: # None munged_postData = postData # print("munged_postData: " + repr(munged_postData)) if extraComment: log('Start ' + url + paramsStr, comment=extraComment) else: log('Start ' + url + paramsStr) log_rest("") log_rest("----------------------------------------------------------------------\n") if extraComment: log_rest("# Extra comment info about this request: " + extraComment) if cmd == 'get': log_rest("GET") else: log_rest("POST") log_rest(url + paramsStr) # file get passed thru kwargs here try: if 'post' == cmd: # NOTE == cmd: for now, since we don't have deserialization from JSON in h2o-dev, we use form-encoded POST. # This is temporary. # # This following does application/json (aka, posting JSON in the body): # r = requests.post(url, timeout=timeout, params=params, data=json.dumps(munged_postData), **kwargs) # # This does form-encoded, which doesn't allow POST of nested structures r = requests.post(url, timeout=timeout, params=params, data=munged_postData, **kwargs) elif 'delete' == cmd: r = requests.delete(url, timeout=timeout, params=params, **kwargs) elif 'get' == cmd: r = requests.get(url, timeout=timeout, params=params, **kwargs) else: raise ValueError("Unknown HTTP command (expected 'get', 'post' or 'delete'): " + cmd) except Exception as e: # rethrow the exception after we've checked for stack trace from h2o # out of memory errors maybe don't show up right away? so we should wait for h2o # to get it out to h2o stdout. We don't want to rely on cloud teardown to check # because there's no delay, and we don't want to delay all cloud teardowns by waiting. # (this is new/experimental) exc_info = sys.exc_info() # use this to ignore the initial connection errors during build cloud when h2o is coming up if not noExtraErrorCheck: h2p.red_print( "ERROR: got exception on %s to h2o. \nGoing to check sandbox, then rethrow.." % (url + paramsStr)) time.sleep(2) H2O.check_sandbox_for_errors(python_test_name=H2O.python_test_name); log_rest("") log_rest("EXCEPTION CAUGHT DOING REQUEST: " + str(e.message)) raise (exc_info[1], None, exc_info[2]) H2O.verboseprint("r: " + repr(r)) if 200 != r.status_code: pp = pprint.PrettyPrinter(indent=4) msg = "JSON call returned non-200 status: " + url json = r.json() if None != json and 'dev_msg' in json: msg += "\ndev_msg: " msg += str(json['dev_msg']) msg += "\nr.status_code: " + str(r.status_code) msg += "\nr.headers: " + repr(r.headers) if None == json: msg += '\nERROR: the error output from H2O is not JSON!' msg += "\nr.text: " + r.text else: msg += "\nr.json: " msg += pp.pformat(json) if raiseIfNon200: pass # we'll pass msg up with the exception elif not suppressErrorMsg: print(msg) log_rest(msg) log_rest("") try: if r is None: log_rest("r is None") else: log_rest("HTTP status code: " + str(r.status_code)) # The following accesses to r.text were taking most of the runtime: log_text = False if log_text: if hasattr(r, 'text'): if r.text is None: log_rest("r.text is None") else: log_rest(r.text) else: log_rest("r does not have attr text") except Exception as e: # Paranoid exception catch. # Ignore logging exceptions in the case that the above error checking isn't sufficient. print("Caught exception from result logging: ", e, "; result: ", repr(r)) # fatal if no response if raiseIfNon200 and not r: raise Exception("Maybe bad url? no r in __do_json_request in %s:" % inspect.stack()[1][3] + "\n\n" + msg) # this is used to open a browser on results, or to redo the operation in the browser # we don't' have that may urls flying around, so let's keep them all H2O.json_url_history.append(r.url) # if r.json(): # raise Exception("Maybe bad url? no r.json in __do_json_request in %s:" % inspect.stack()[1][3]) rjson = None if returnFast: return try: rjson = r.json() except: print(h2o_test_utils.dump_json(r.text)) if not isinstance(r, (list, dict)): raise Exception("h2o json responses should always be lists or dicts, see previous for text") raise Exception("Could not decode any json from the request.") # TODO # TODO # TODO # TODO: we should really only look in the response object. This check # prevents us from having a field called "error" (e.g., for a scoring result). for e in ['error', 'Error', 'errors', 'Errors']: # error can be null (python None). This happens in exec2 if e in rjson and rjson[e]: H2O.verboseprint("rjson:" + h2o_test_utils.dump_json(rjson)) emsg = 'rjson %s in %s: %s' % (e, inspect.stack()[1][3], rjson[e]) if ignoreH2oError: # well, we print it..so not totally ignore. test can look at rjson returned print(emsg) else: print(emsg) raise Exception(emsg) for w in ['warning', 'Warning', 'warnings', 'Warnings']: # warning can be null (python None). if w in rjson and rjson[w]: H2O.verboseprint(dump_json(rjson)) print('rjson %s in %s: %s' % (w, inspect.stack()[1][3], rjson[w])) # Allow the caller to check things like __http_request.status_code. # The response object is not JSON-serializable, so we capture the fields we want here: response = {} # response['headers'] = r.headers response['url'] = r.url response['status_code'] = r.status_code response['text'] = r.text rjson['__http_response'] = response return rjson
def __do_json_request(self, jsonRequest=None, fullUrl=None, timeout=10, params=None, postData=None, returnFast=False, cmd='get', extraComment=None, ignoreH2oError=False, noExtraErrorCheck=False, raiseIfNon200=True, **kwargs): H2O.verboseprint("__do_json_request, timeout: " + str(timeout)) # if url param is used, use it as full url. otherwise crate from the jsonRequest if fullUrl: url = fullUrl else: url = self.__url(jsonRequest) # remove any params that are 'None' # need to copy dictionary, since can't delete while iterating if params is not None: params_serialized = params.copy() for k in params_serialized: if params_serialized[k] is None: del params[k] paramsStr = '?' + '&'.join(['%s=%s' % (k, v) for (k, v) in params.items()]) else: paramsStr = '' # The requests package takes array parameters and explodes them: ['f00', 'b4r'] becomes "f00,b4r". # NOTE: this handles 1D arrays only; if we need ND this needs to be recursive. # NOTE: we currently don't need to do this for GET, so that's not implemented. if postData is not None: munged_postData = {} for k, v in postData.iteritems(): if type(v) is list: if len(v) == 0: munged_postData[k] = '[]' else: first = True array_str = '[' for val in v: if not first: array_str += ', ' if val is None: array_str += 'null' elif isinstance(val, basestring): array_str += "\"" + str(val) + "\"" else: array_str += str(val) first = False array_str += ']' munged_postData[k] = array_str else: # not list: munged_postData[k] = v else: # None munged_postData = postData if extraComment: log('Start ' + url + paramsStr, comment=extraComment) else: log('Start ' + url + paramsStr) log_rest("") log_rest("----------------------------------------------------------------------\n") if extraComment: log_rest("# Extra comment info about this request: " + extraComment) if cmd == 'get': log_rest("GET") else: log_rest("POST") log_rest(url + paramsStr) # file get passed thru kwargs here try: if 'post' == cmd: # NOTE == cmd: for now, since we don't have deserialization from JSON in h2o-dev, we use form-encoded POST. # This is temporary. # # This following does application/json (aka, posting JSON in the body): # r = requests.post(url, timeout=timeout, params=params, data=json.dumps(munged_postData), **kwargs) # # This does form-encoded, which doesn't allow POST of nested structures r = requests.post(url, timeout=timeout, params=params, data=munged_postData, **kwargs) elif 'delete' == cmd: r = requests.delete(url, timeout=timeout, params=params, **kwargs) elif 'get' == cmd: r = requests.get(url, timeout=timeout, params=params, **kwargs) else: raise ValueError("Unknown HTTP command (expected 'get', 'post' or 'delete'): " + cmd) except Exception, e: # rethrow the exception after we've checked for stack trace from h2o # out of memory errors maybe don't show up right away? so we should wait for h2o # to get it out to h2o stdout. We don't want to rely on cloud teardown to check # because there's no delay, and we don't want to delay all cloud teardowns by waiting. # (this is new/experimental) exc_info = sys.exc_info() # use this to ignore the initial connection errors during build cloud when h2o is coming up if not noExtraErrorCheck: h2p.red_print( "ERROR: got exception on %s to h2o. \nGoing to check sandbox, then rethrow.." % (url + paramsStr)) time.sleep(2) H2O.check_sandbox_for_errors(python_test_name=H2O.python_test_name); log_rest("") log_rest("EXCEPTION CAUGHT DOING REQUEST: " + str(e.message)) raise exc_info[1], None, exc_info[2] H2O.verboseprint("r: " + repr(r))