def get_hyperparams(self, id, display=True):
    """
    Return the hyperparameter values actually used by one model of this grid search.

    Parameters
    ----------
    id: str or int
      The model id to look up in ``self.model_ids``. An integer is used directly as the
      position of the model within this grid.
    display: boolean
      Flag to indicate whether to print the hyperparameter names.

    Returns
    -------
      A list with one value per entry of ``self.hyper_params``, in iteration order.
    """
    position = id if is_int(id) else self.model_ids.index(id)
    model = self[position]

    # If cross-validation is turned on, it is one of the fold models (and not the main model
    # that is returned) that actually contains the max_runtime_secs parameter, so the values
    # are read from the first fold model instead.
    if model._is_xvalidated:
        model = h2o.get_model(model._xval_keys[0])

    values = []
    for name in self.hyper_params:
        actual = model.params[name]['actual']
        # A list-valued "actual" entry is reduced to its first element.
        if isinstance(actual, list):
            actual = actual[0]
        values.append(actual)

    if display:
        names = ', '.join(self.hyper_params.keys())
        print('Hyperparameters: [' + names + ']')
    return values
def get_hyperparams_dict(self, id, display=True):
    """
    Derive and return the model parameters used to train one particular grid search model.

    Parameters
    ----------
    id: str or int
      The model id to look up in ``self.model_ids``. An integer is used directly as the
      position of the model within this grid.
    display: boolean
      Flag to indicate whether to print the hyperparameter names.

    Returns
    -------
      A dict mapping every name in ``self.hyper_names`` to the value used by this model.
    """
    position = id if is_int(id) else self.model_ids.index(id)
    model = self[position]

    # If cross-validation is turned on, it is one of the fold models (and not the main model
    # that is returned) that actually contains the max_runtime_secs parameter, so the values
    # are read from the first fold model instead.
    if model._is_xvalidated:
        model = h2o.get_model(model._xval_keys[0])

    def actual_value(param_name):
        # A list-valued "actual" entry is reduced to its first element.
        actual = model.params[param_name]['actual']
        return actual[0] if isinstance(actual, list) else actual

    model_params = {param_name: actual_value(param_name) for param_name in self.hyper_names}

    if display:
        names = ', '.join(self.hyper_params.keys())
        print('Hyperparameters: [' + names + ']')
    return model_params
def interaction(data, factors, pairwise, max_factors, min_occurrence, destination_frame=None):
    """
    Categorical Interaction Feature Creation in H2O.

    Creates a frame in H2O with n-th order interaction features between categorical columns,
    as specified by the user.

    Parameters
    ----------
    data : H2OFrame
      The H2OFrame that holds the target categorical columns.
    factors : list
      Factor columns (either indices or column names). Indices are translated to names
      through ``data.names``.
    pairwise : bool
      Whether to create pairwise interactions between factors (otherwise create one
      higher-order interaction). Only applicable if there are 3 or more factors.
    max_factors : int
      Max. number of factor levels in pair-wise interaction terms (if enforced, one extra
      catch-all factor will be made).
    min_occurrence : int
      Min. occurrence threshold for factor levels in pair-wise interaction terms.
    destination_frame : str
      A string indicating the destination key. If None, a temporary key is generated with
      ``py_tmp_key``.

    Returns
    -------
      H2OFrame
    """
    # Resolve every factor to a column name before anything else is computed.
    factor_names = []
    for factor in factors:
        factor_names.append(data.names[factor] if is_int(factor) else factor)

    if destination_frame is None:
        dest_key = py_tmp_key(append=h2oconn.session_id)
    else:
        dest_key = destination_frame

    request = {
        "dest": dest_key,
        "source_frame": data.frame_id,
        "factor_columns": [quoted(name) for name in factor_names],
        "pairwise": pairwise,
        "max_factors": max_factors,
        "min_occurrence": min_occurrence,
    }
    # Submit the job and wait for it to finish before fetching the resulting frame.
    job = H2OJob(api("POST /3/Interaction", data=request), "Interactions")
    job.poll()
    return get_frame(request["dest"])
def _get_type_name(types):
    """
    Return the name of the provided type.

    Every item of ``types`` is rendered separately and the pieces are joined with ``|``.
    A ``None`` item produces no piece of its own; instead the first piece gets a ``?``
    prefix (or the result is ``"None"`` when there are no other pieces).

        >>> _get_type_name([int]) == "integer"
        >>> _get_type_name([str]) == "string"
        >>> _get_type_name([tuple]) == "tuple"
        >>> _get_type_name([Exception]) == "Exception"
        >>> _get_type_name((int, float, bool)) == "integer|float|bool"
        >>> _get_type_name((H2OFrame, None)) == "?H2OFrame"

    Raises RuntimeError for an item that matches none of the supported kinds.
    """
    from h2o.utils.typechecks import is_str, is_int, U, I, numeric

    def nested(spec):
        # Looked up lazily so that simple items never touch H2OTypeError.
        return H2OTypeError._get_type_name(spec)

    optional = False
    pieces = []
    for item in types:
        if item is None:
            optional = True
            continue

        # The order of these checks matters: exact type identities first, then literal
        # values, then the U / I wrappers, then plain classes, then container instances.
        if item is str:
            piece = "string"
        elif item is int:
            piece = "integer"
        elif item is numeric:
            piece = "numeric"
        elif is_str(item):
            piece = '"%s"' % repr(item)[1:-1]
        elif is_int(item):
            piece = str(item)
        elif isinstance(item, U):
            piece = nested(item)
        elif isinstance(item, I):
            piece = "&".join(nested([part]) for part in item)
        elif isinstance(item, type):
            piece = item.__name__
        elif isinstance(item, list):
            piece = "list(%s)" % nested(item)
        elif isinstance(item, set):
            piece = "set(%s)" % nested(item)
        elif isinstance(item, tuple):
            piece = "(%s)" % ", ".join(nested([part]) for part in item)
        elif isinstance(item, dict):
            pairs = ("%s: %s" % (nested([key]), nested([value])) for key, value in item.items())
            piece = "dict(%s)" % ", ".join(pairs)
        else:
            raise RuntimeError("Unexpected `tt`: %r" % item)
        pieces.append(piece)

    if optional:
        if not pieces:
            return "None"
        pieces[0] = "?" + pieces[0]
    return "|".join(pieces)
def _get_type_name(types):
    """
    Return the name of the provided type.

    The items of ``types`` are described one by one and the descriptions are joined with
    ``|``. ``None`` is not described; its presence prefixes the first description with
    ``?``, or yields ``"None"`` when nothing else was described.

        >>> _get_type_name([int]) == "integer"
        >>> _get_type_name([str]) == "string"
        >>> _get_type_name([tuple]) == "tuple"
        >>> _get_type_name([Exception]) == "Exception"
        >>> _get_type_name((int, float, bool)) == "integer|float|bool"
        >>> _get_type_name((H2OFrame, None)) == "?H2OFrame"

    Raises RuntimeError for an item that matches none of the supported kinds.
    """
    from h2o.utils.typechecks import is_str, is_int, U, I, numeric

    def describe(tt):
        # Describe a single non-None item; the checks run in a fixed order.
        if tt is str:
            return "string"
        if tt is int:
            return "integer"
        if tt is numeric:
            return "numeric"
        if is_str(tt):
            return '"%s"' % repr(tt)[1:-1]
        if is_int(tt):
            return str(tt)
        if isinstance(tt, U):
            return H2OTypeError._get_type_name(tt)
        if isinstance(tt, I):
            return "&".join(H2OTypeError._get_type_name([member]) for member in tt)
        if isinstance(tt, type):
            return tt.__name__
        if isinstance(tt, list):
            return "list(%s)" % H2OTypeError._get_type_name(tt)
        if isinstance(tt, set):
            return "set(%s)" % H2OTypeError._get_type_name(tt)
        if isinstance(tt, tuple):
            return "(%s)" % ", ".join(H2OTypeError._get_type_name([member]) for member in tt)
        if isinstance(tt, dict):
            entries = []
            for tk, tv in tt.items():
                key_name = H2OTypeError._get_type_name([tk])
                value_name = H2OTypeError._get_type_name([tv])
                entries.append("%s: %s" % (key_name, value_name))
            return "dict(%s)" % ", ".join(entries)
        raise RuntimeError("Unexpected `tt`: %r" % tt)

    saw_none = False
    names = []
    # Single pass over `types`, so one-shot iterables behave exactly as before.
    for tt in types:
        if tt is None:
            saw_none = True
        else:
            names.append(describe(tt))

    if saw_none:
        if not names:
            return "None"
        names[0] = "?" + names[0]
    return "|".join(names)
def _model_build(self, x, y, tframe, vframe, kwargs):
    """
    Assemble the model-builder request, submit it, and resolve the resulting model.

    :param x: predictor column(s): a single name/index, or a list/tuple of names or of indices
        into ``tframe.names``. Whether indices are given is decided from the first element only.
    :param y: response column name, index into ``tframe.names``, or None.
    :param tframe: training frame; stored in ``kwargs`` as "training_frame".
    :param vframe: validation frame, or None to leave "validation_frame" unset.
    :param kwargs: dict of builder parameters. It must already contain the keys "offset_column",
        "fold_column" and "weights_column" (a KeyError is raised otherwise). The dict passed in is
        modified in place before the final request dict is derived from it.

    :returns: None. When ``self._future`` is truthy the job is stored in ``self._job`` and the
        method returns without waiting; otherwise the job is polled and the finished model is
        handed to ``self._resolve_model``.
    """
    kwargs['training_frame'] = tframe
    if vframe is not None:
        kwargs["validation_frame"] = vframe
    if is_int(y):
        y = tframe.names[y]
    if y is not None:
        kwargs['response_column'] = y
    if not isinstance(x, (list, tuple)):
        x = [x]
    # Guard against an empty `x` so that indexing x[0] cannot raise IndexError.
    if x and is_int(x[0]):
        x = [tframe.names[i] for i in x]

    offset = kwargs["offset_column"]
    folds = kwargs["fold_column"]
    weights = kwargs["weights_column"]
    # Every training-frame column that is not a predictor, the response, or one of the special
    # columns is ignored. set.union() takes any iterable, so a tuple `x` works here too
    # (concatenating a tuple with a list would raise TypeError).
    used_columns = set(x).union([y, offset, folds, weights])
    ignored_columns = list(set(tframe.names) - used_columns)
    kwargs["ignored_columns"] = [quoted(col) for col in ignored_columns] if ignored_columns else None

    interactions = kwargs.get("interactions")
    kwargs["interactions"] = None if interactions is None else [quoted(col) for col in interactions]

    # Build the request payload as a new dict, passing each value through _keyify_if_H2OFrame.
    kwargs = {k: H2OEstimator._keyify_if_H2OFrame(v) for k, v in kwargs.items()}
    # "_rest_version" is a control entry, not a builder parameter; REST version 3 is the default.
    rest_ver = kwargs.pop("_rest_version", 3)
    algo = self._compute_algo()

    model = H2OJob(h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, algo), data=kwargs),
                   job_type=(algo + " Model Build"))

    if self._future:
        # Asynchronous mode: keep the job handle and return without polling.
        self._job = model
        return

    model.poll()
    model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
    self._resolve_model(model.dest_key, model_json)
def _add_agg(self, op, col, na):
    """
    Register the aggregation ``op`` for one or several columns and return ``self``.

    :param op: name of the aggregation. For "nrow" the column is always forced to index 0.
    :param col: a column name, a column index, a list-like of either, or None. With None the
        aggregation is registered for every column index of ``self._fr`` not in ``self._by``.
    :param na: stored unchanged alongside the aggregation.

    :returns: self
    :raises ValueError: if ``col`` is not a name, an index, a list-like or None.
    """
    if op == "nrow":
        col = 0

    if col is None:
        for index in range(self._fr.ncol):
            if index in self._by:
                continue
            self._add_agg(op, index, na)
        return self

    if not (is_str(col) or is_int(col)):
        if not is_listlike(col):
            raise ValueError("col must be a column name or index.")
        # Several columns: register each one separately.
        for item in col:
            self._add_agg(op, item, na)
        return self

    # Single column: turn a name into its position; an index is used as is.
    cidx = self._fr.names.index(col) if is_str(col) else col
    key = "{}_{}".format(op, self._fr.names[cidx])
    self._aggs[key] = [op, cidx, na]
    return self
def interaction(data, factors, pairwise, max_factors, min_occurrence, destination_frame=None):
    """
    Categorical Interaction Feature Creation in H2O.

    Creates a frame in H2O with n-th order interaction features between categorical columns,
    as specified by the user.

    Parameters
    ----------
    data : H2OFrame
      The H2OFrame that holds the target categorical columns.
    factors : list
      Factor columns (either indices or column names); indices are looked up in ``data.names``.
    pairwise : bool
      Whether to create pairwise interactions between factors (otherwise create one
      higher-order interaction). Only applicable if there are 3 or more factors.
    max_factors : int
      Max. number of factor levels in pair-wise interaction terms (if enforced, one extra
      catch-all factor will be made).
    min_occurrence : int
      Min. occurrence threshold for factor levels in pair-wise interaction terms.
    destination_frame : str
      A string indicating the destination key. If None, ``py_tmp_key`` generates one.

    Returns
    -------
      H2OFrame
    """
    def column_name(factor):
        # Integer factors are positions in the frame; everything else is taken as a name.
        return data.names[factor] if is_int(factor) else factor

    factors = list(map(column_name, factors))

    dest = destination_frame
    if dest is None:
        dest = py_tmp_key(append=h2oconn.session_id)

    parms = dict(
        dest=dest,
        source_frame=data.frame_id,
        factor_columns=[quoted(f) for f in factors],
        pairwise=pairwise,
        max_factors=max_factors,
        min_occurrence=min_occurrence,
    )
    # Block until the server-side job has finished, then fetch the frame it produced.
    H2OJob(api("POST /3/Interaction", data=parms), "Interactions").poll()
    return get_frame(parms["dest"])
def start(jar_path=None, nthreads=-1, enable_assertions=True, max_mem_size=None, min_mem_size=None,
          ice_root=None, port="54321+", verbose=True):
    """
    Start new H2O server on the local machine.

    :param jar_path: Path to the h2o.jar executable. If not given, then we will search for h2o.jar in the
        locations returned by `._jar_paths()`.
    :param nthreads: Number of threads in the thread pool. This should be related to the number of CPUs used.
        -1 means use all CPUs on the host. A positive integer specifies the number of CPUs directly.
    :param enable_assertions: If True, pass `-ea` option to the JVM.
    :param max_mem_size: Maximum heap size (jvm option Xmx), in bytes.
    :param min_mem_size: Minimum heap size (jvm option Xms), in bytes.
    :param ice_root: A directory where H2O stores its temporary files. Default location is determined by
        tempfile.mkdtemp().
    :param port: Port where to start the new server. This could be either an integer, or a string of the form
        "DDDDD+", indicating that the server should start looking for an open port starting from DDDDD and up.
        None is treated as "54321+".
    :param verbose: If True, then connection info will be printed to the stdout.

    :returns: a new H2OLocalServer instance
    :raises AssertionError: if an argument has the wrong type or an out-of-range value.
    """
    # NOTE: validation is done with `assert` statements, which Python skips when run with -O.
    assert jar_path is None or is_str(jar_path), "`jar_path` should be string, got %s" % type(jar_path)
    assert jar_path is None or jar_path.endswith("h2o.jar"), \
        "`jar_path` should be a path to an h2o.jar executable, got %s" % jar_path
    assert is_int(nthreads), "`nthreads` should be integer, got %s" % type(nthreads)
    assert nthreads == -1 or 1 <= nthreads <= 4096, "`nthreads` is out of bounds: %d" % nthreads
    assert isinstance(enable_assertions, bool), \
        "`enable_assertions` should be bool, got %s" % type(enable_assertions)
    assert max_mem_size is None or is_int(max_mem_size), \
        "`max_mem_size` should be integer, got %s" % type(max_mem_size)
    # 1 << 25 bytes (32 MiB) is the smallest accepted maximum heap size.
    assert max_mem_size is None or max_mem_size >= 1 << 25, "`max_mem_size` too small: %d" % max_mem_size
    assert min_mem_size is None or is_int(min_mem_size), \
        "`min_mem_size` should be integer, got %s" % type(min_mem_size)
    assert min_mem_size is None or max_mem_size is None or min_mem_size <= max_mem_size, \
        "`min_mem_size`=%d is larger than the `max_mem_size`=%d" % (min_mem_size, max_mem_size)
    if ice_root:
        assert is_str(ice_root), "`ice_root` should be string, got %r" % type(ice_root)
        assert os.path.isdir(ice_root), "`ice_root` is not a valid directory: %s" % ice_root

    if port is None:
        port = "54321+"
    baseport = None
    if is_str(port):
        if port.isdigit():
            # A purely numeric string is an exact port.
            port = int(port)
        else:
            # "DDDD+" means: search for a free port from DDDD upwards. `endswith` is used instead of
            # indexing port[-1] so that an empty string fails this assertion with its descriptive
            # message rather than raising IndexError.
            assert port.endswith("+") and port[:-1].isdigit(), \
                "`port` should be of the form 'DDDD+', where D is a digit. Got: %s" % port
            baseport = int(port[:-1])
            port = 0
    assert is_int(port), "`port` should be integer (or string). Got: %s" % type(port)

    hs = H2OLocalServer()
    hs._verbose = bool(verbose)
    hs._jar_path = hs._find_jar(jar_path)
    hs._ice_root = ice_root
    if not ice_root:
        # No directory supplied: create a temporary one and record it separately in `_tempdir`.
        hs._ice_root = tempfile.mkdtemp()
        hs._tempdir = hs._ice_root

    if verbose:
        print("Attempting to start a local H2O server...")
    hs._launch_server(port=port, baseport=baseport, nthreads=int(nthreads), ea=enable_assertions,
                      mmax=max_mem_size, mmin=min_mem_size)
    if verbose:
        print("Server is running at %s://%s:%d" % (hs.scheme, hs.ip, hs.port))
    # Make sure the server's shutdown() is invoked when the interpreter exits.
    atexit.register(lambda: hs.shutdown())
    return hs
def _model_build(self, x, y, tframe, vframe, kwargs):
    """
    Assemble the grid-search request, submit it, and resolve the resulting grid.

    :param x: predictor column(s): a single name/index, or a list-like of names or of indices into
        ``tframe.names``. Whether indices are given is decided from the first element only.
    :param y: response column name, index into ``tframe.names``, or None.
    :param tframe: training frame; stored in ``kwargs`` as "training_frame".
    :param vframe: validation frame, or None to leave "validation_frame" unset.
    :param kwargs: dict of builder parameters. It must already contain the keys "offset_column",
        "fold_column" and "weights_column" (a KeyError is raised otherwise). The dict passed in is
        modified in place before the final request dict is derived from it.

    :returns: None. When ``self._future`` is truthy the job is stored in ``self._job`` and the
        method returns without waiting. Otherwise the job is polled, ``self.models`` is filled and
        the grid is handed to ``self._resolve_grid``.
    :raises ValueError: if the finished grid contains no models.
    """
    kwargs['training_frame'] = tframe
    if vframe is not None:
        kwargs["validation_frame"] = vframe
    if is_int(y):
        y = tframe.names[y]
    if y is not None:
        kwargs['response_column'] = y
    if not is_listlike(x):
        x = [x]
    # Guard against an empty `x` so that indexing x[0] cannot raise IndexError.
    if len(x) > 0 and is_int(x[0]):
        x = [tframe.names[i] for i in x]

    offset = kwargs["offset_column"]
    folds = kwargs["fold_column"]
    weights = kwargs["weights_column"]
    # Every training-frame column that is not a predictor, the response, or one of the special
    # columns is ignored. set.union() takes any iterable, so a non-list `x` (e.g. a tuple of
    # names) works here too; concatenating it with a list would raise TypeError.
    used_columns = set(x).union([y, offset, folds, weights])
    ignored_columns = list(set(tframe.names) - used_columns)
    kwargs["ignored_columns"] = [quoted(col) for col in ignored_columns] if ignored_columns else None

    # Build the request payload as a new dict: unset (None) entries are dropped and H2OFrame
    # values are replaced by their frame ids.
    kwargs = {k: (v.frame_id if isinstance(v, H2OFrame) else v)
              for k, v in kwargs.items() if v is not None}
    algo = self.model._compute_algo()  # unique to grid search
    if self.grid_id is not None:
        kwargs["grid_id"] = self.grid_id
    # "_rest_version" is a control entry, not a builder parameter.
    rest_ver = kwargs.pop("_rest_version", None)

    grid = H2OJob(h2o.api("POST /99/Grid/%s" % algo, data=kwargs), job_type=(algo + " Grid Build"))

    if self._future:
        # Asynchronous mode: keep the job handle and return without polling.
        self._job = grid
        return

    grid.poll()
    grid_json = h2o.api("GET /99/Grids/%s" % grid.dest_key)

    # Failures are only reported when the caller supplied an explicit "_rest_version".
    if rest_ver is not None:
        failure_details = grid_json["failure_details"]
        if len(failure_details) > 0:
            print("Errors/Warnings building gridsearch model\n")
        for error_index, error_message in enumerate(failure_details):
            failed_params = grid_json["failed_params"][error_index]
            if isinstance(failed_params, dict):
                for h_name in grid_json['hyper_names']:
                    print("Hyper-parameter: {0}, {1}".format(h_name, failed_params[h_name]))
            if len(grid_json["failure_stack_traces"]) > error_index:
                print("failure_details: {0}\nfailure_stack_traces: "
                      "{1}\n".format(error_message, grid_json['failure_stack_traces'][error_index]))

    self.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]

    # Get the first model returned in the list of models from grid search to get the model class
    # (binomial, multinomial, etc). Sometimes no model is returned due to bad parameter values
    # provided by the user.
    if len(grid_json['model_ids']) > 0:
        first_model_json = h2o.api(
            "GET /%d/Models/%s" % (rest_ver or 3, grid_json['model_ids'][0]['name']))['models'][0]
        self._resolve_grid(grid.dest_key, grid_json, first_model_json)
    else:
        raise ValueError("Gridsearch returns no model due to bad parameter values or other reasons....")
def start(jar_path=None, nthreads=-1, enable_assertions=True, max_mem_size=None, min_mem_size=None,
          ice_root=None, port="54321+", verbose=True):
    """
    Start new H2O server on the local machine.

    :param jar_path: Path to the h2o.jar executable. If not given, then we will search for h2o.jar in the
        locations returned by `._jar_paths()`.
    :param nthreads: Number of threads in the thread pool. This should be related to the number of CPUs used.
        -1 means use all CPUs on the host. A positive integer specifies the number of CPUs directly.
    :param enable_assertions: If True, pass `-ea` option to the JVM.
    :param max_mem_size: Maximum heap size (jvm option Xmx), in bytes.
    :param min_mem_size: Minimum heap size (jvm option Xms), in bytes.
    :param ice_root: A directory where H2O stores its temporary files. Default location is determined by
        tempfile.mkdtemp().
    :param port: Port where to start the new server. This could be either an integer, or a string of the form
        "DDDDD+", indicating that the server should start looking for an open port starting from DDDDD and up.
        None is treated as "54321+".
    :param verbose: If True, then connection info will be printed to the stdout.

    :returns: a new H2OLocalServer instance
    :raises AssertionError: if an argument has the wrong type or an out-of-range value.
    """
    # NOTE: validation is done with `assert` statements, which Python skips when run with -O.
    assert jar_path is None or is_str(jar_path), "`jar_path` should be string, got %s" % type(jar_path)
    assert jar_path is None or jar_path.endswith("h2o.jar"), \
        "`jar_path` should be a path to an h2o.jar executable, got %s" % jar_path
    assert is_int(nthreads), "`nthreads` should be integer, got %s" % type(nthreads)
    assert nthreads == -1 or 1 <= nthreads <= 4096, "`nthreads` is out of bounds: %d" % nthreads
    assert isinstance(enable_assertions, bool), \
        "`enable_assertions` should be bool, got %s" % type(enable_assertions)
    assert max_mem_size is None or is_int(max_mem_size), \
        "`max_mem_size` should be integer, got %s" % type(max_mem_size)
    # 1 << 25 bytes (32 MiB) is the smallest accepted maximum heap size.
    assert max_mem_size is None or max_mem_size >= 1 << 25, "`max_mem_size` too small: %d" % max_mem_size
    assert min_mem_size is None or is_int(min_mem_size), \
        "`min_mem_size` should be integer, got %s" % type(min_mem_size)
    assert min_mem_size is None or max_mem_size is None or min_mem_size <= max_mem_size, \
        "`min_mem_size`=%d is larger than the `max_mem_size`=%d" % (min_mem_size, max_mem_size)
    if ice_root:
        assert is_str(ice_root), "`ice_root` should be string, got %r" % type(ice_root)
        assert os.path.isdir(ice_root), "`ice_root` is not a valid directory: %s" % ice_root

    if port is None:
        port = "54321+"
    baseport = None
    if is_str(port):
        if port.isdigit():
            # A purely numeric string is an exact port.
            port = int(port)
        else:
            # "DDDD+" means: search for a free port from DDDD upwards. `endswith` is used instead of
            # indexing port[-1] so that an empty string fails this assertion with its descriptive
            # message rather than raising IndexError.
            assert port.endswith("+") and port[:-1].isdigit(), \
                "`port` should be of the form 'DDDD+', where D is a digit. Got: %s" % port
            baseport = int(port[:-1])
            port = 0
    assert is_int(port), "`port` should be integer (or string). Got: %s" % type(port)

    hs = H2OLocalServer()
    hs._verbose = bool(verbose)
    hs._jar_path = hs._find_jar(jar_path)
    hs._ice_root = ice_root
    if not ice_root:
        # No directory supplied: create a temporary one and record it separately in `_tempdir`.
        hs._ice_root = tempfile.mkdtemp()
        hs._tempdir = hs._ice_root

    if verbose:
        print("Attempting to start a local H2O server...")
    hs._launch_server(port=port, baseport=baseport, nthreads=int(nthreads), ea=enable_assertions,
                      mmax=max_mem_size, mmin=min_mem_size)
    if verbose:
        print(" Server is running at %s://%s:%d" % (hs.scheme, hs.ip, hs.port))
    # Make sure the server's shutdown() is invoked when the interpreter exits.
    atexit.register(lambda: hs.shutdown())
    return hs