def __init__(self, fr, by): """ Return a new ``GroupBy`` object using the H2OFrame specified in fr and the desired grouping columns specified in by. The original H2O frame will be stored as member _fr. Information on the new grouping of the original frame is described in a new H2OFrame in member frame. The returned groups are sorted by the natural group-by column sort. :param H2OFrame fr: H2OFrame that you want the group by operation to be performed on. :param by: can be a column name (str) or an index (int) of a single column, or a list for multiple columns denoting the set of columns to group by. """ self._fr = fr # IN self._by = by # IN self._aggs = {} # IN self._res = None # OUT if is_type(by, str): self._by = [self._fr.names.index(by)] elif is_type(by, list, tuple): self._by = [ self._fr.names.index(b) if is_type(b, str) else b for b in by ] else: self._by = [self._by]
def _arg_to_expr(arg):
    if arg is not None and isinstance(arg, range):
        arg = list(arg)
    if arg is None:
        return "[]"  # empty list
    elif isinstance(arg, ExprNode):
        return arg._get_ast_str(False)
    elif isinstance(arg, ASTId):
        return str(arg)
    elif isinstance(arg, bool):
        return "{}".format("TRUE" if arg else "FALSE")
    elif is_type(arg, numeric):
        return "{}".format("NaN" if math.isnan(arg) else arg)
    elif is_type(arg, str):
        return '"' + arg + '"'
    elif isinstance(arg, slice):
        return "[{}:{}]".format(0 if arg.start is None else arg.start,
                                "NaN" if (arg.stop is None or math.isnan(arg.stop)) else
                                (arg.stop) if arg.start is None else (arg.stop - arg.start))
    elif isinstance(arg, list):
        if is_type(arg, [str]):
            return "[%s]" % " ".join('"%s"' % elem for elem in arg)
        else:
            return "[%s]" % " ".join("NaN" if i == 'NaN' or math.isnan(i) else str(i) for i in arg)
    raise ValueError("Unexpected arg type: " + str(type(arg)) + " " + str(arg.__class__) + " " + arg.__repr__())
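# Hedged illustration of the Rapids serialization above. _arg_to_expr is a private
# helper, so the expected strings are inferred from the branches shown, not from
# official documentation:
#
#   _arg_to_expr(None)          -> []
#   _arg_to_expr(True)          -> TRUE
#   _arg_to_expr(float("nan"))  -> NaN
#   _arg_to_expr("abc")         -> "abc"
#   _arg_to_expr(slice(2, 5))   -> [2:3]   (stop is rebased to a length when start is given)
#   _arg_to_expr([1.0, 2.5])    -> [1.0 2.5]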
def __getitem__(self, item):
    if is_type(item, int, str):
        # single col selection returns list
        if is_type(item, int):
            index = item
            if index < 0:
                index += len(self._col_header)
            if index < 0 or index >= len(self._col_header):
                raise H2OValueError("Index %d is out of range" % item)
        else:
            if item in self._col_header:
                index = self._col_header.index(item)
            else:
                raise H2OValueError("Column `%s` does not exist in the table" % item)
        return [row[index] for row in self._cell_values]
    elif isinstance(item, slice):
        # row selection if item is slice returns H2OTwoDimTable
        # FIXME! slice behavior should be consistent with other selectors - return columns instead of rows...
        self._cell_values = [self._cell_values[ii] for ii in range(*item.indices(len(self._cell_values)))]
        return self
    elif is_type(item, [int, str]):
        # multiple col selection returns list of cols
        return [self[i] for i in item]
    else:
        raise TypeError('can not support getting item for ' + str(item))
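# Hedged usage sketch for H2OTwoDimTable indexing (tables like this back many of the
# client's printed summaries; `tbl` below stands for any such table object):
#
#   col0 = tbl[0]          # single column selected by index -> plain Python list
#   named = tbl["mse"]     # single column selected by name (name is illustrative)
#   pair = tbl[[0, 1]]     # multiple columns -> list of column lists
#   head = tbl[:3]         # slice selects *rows*; note the FIXME above, and that this
#                          # version mutates the table in place rather than copying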
def _model_build(self, x, y, tframe, vframe, kwargs):
    kwargs['training_frame'] = tframe
    if vframe is not None:
        kwargs["validation_frame"] = vframe
    if is_type(y, int):
        y = tframe.names[y]
    if y is not None:
        kwargs['response_column'] = y
    if not is_type(x, list, tuple):
        x = [x]
    if is_type(x[0], int):
        x = [tframe.names[i] for i in x]
    offset = kwargs["offset_column"]
    folds = kwargs["fold_column"]
    weights = kwargs["weights_column"]
    ignored_columns = list(set(tframe.names) - set(x + [y, offset, folds, weights]))
    kwargs["ignored_columns"] = None if not ignored_columns else [quoted(col) for col in ignored_columns]
    kwargs = dict([(k, kwargs[k].frame_id if isinstance(kwargs[k], H2OFrame) else kwargs[k])
                   for k in kwargs if kwargs[k] is not None])  # gruesome one-liner
    algo = self.model._compute_algo()  # unique to grid search
    if self.grid_id is not None:
        kwargs["grid_id"] = self.grid_id
    rest_ver = kwargs.pop("_rest_version") if "_rest_version" in kwargs else None

    grid = H2OJob(h2o.api("POST /99/Grid/%s" % algo, data=kwargs), job_type=(algo + " Grid Build"))
    if self._future:
        self._job = grid
        return
    grid.poll()
    grid_json = h2o.api("GET /99/Grids/%s" % (grid.dest_key))
    failure_messages_stacks = ""
    error_index = 0
    if len(grid_json["failure_details"]) > 0:
        print("Errors/Warnings building gridsearch model\n")
        # will raise error if no grid model is returned, store error messages here
        for error_message in grid_json["failure_details"]:
            if isinstance(grid_json["failed_params"][error_index], dict):
                for h_name in grid_json['hyper_names']:
                    print("Hyper-parameter: {0}, {1}".format(h_name,
                                                             grid_json['failed_params'][error_index][h_name]))
            if len(grid_json["failure_stack_traces"]) > error_index:
                print("failure_details: {0}\nfailure_stack_traces: "
                      "{1}\n".format(error_message, grid_json['failure_stack_traces'][error_index]))
            failure_messages_stacks += error_message + '\n'
            error_index += 1

    self.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]

    # get first model returned in list of models from grid search to get model class (binomial, multinomial, etc)
    # sometimes no model is returned due to bad parameter values provided by the user.
    if len(grid_json['model_ids']) > 0:
        first_model_json = h2o.api("GET /%d/Models/%s"
                                   % (rest_ver or 3, grid_json['model_ids'][0]['name']))['models'][0]
        self._resolve_grid(grid.dest_key, grid_json, first_model_json)
    else:
        if len(failure_messages_stacks) > 0:
            raise ValueError(failure_messages_stacks)
        else:
            raise ValueError("Gridsearch returns no model due to bad parameter values or other reasons....")
def assert_is_step(s):
    assert is_type(s, dict), "each step must be a dict with an 'id' key and an optional 'weight' key"
    assert 'id' in s, "each step must have an 'id' key"
    assert len(s) == 1 or ('weight' in s and is_type(s['weight'], int)), "weight must be an integer"
    return True
def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
          weights_column=None, validation_frame=None, **params):
    """
    Train the model synchronously (i.e. do not return until the model finishes training).

    To train asynchronously call :meth:`start`.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold, offset, and weights).
    :param offset_column: The name or index of the column in training_frame that holds the offsets.
    :param fold_column: The name or index of the column in training_frame that holds the per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
    :param validation_frame: H2OFrame with validation data to be scored on while training.
    """
    algo_params = locals()
    parms = self._parms.copy()
    parms.update({k: v for k, v in algo_params.items() if k not in ["self", "params", "algo_params", "parms"]})
    # dictionaries have special handling in grid search, avoid the implicit conversion
    parms["search_criteria"] = None if self.search_criteria is None else str(self.search_criteria)
    parms["export_checkpoints_dir"] = self.export_checkpoints_dir
    parms["parallelism"] = self._parallelism
    parms["hyper_parameters"] = None if self.hyper_params is None else str(self.hyper_params)  # unique to grid search
    parms.update({k: v for k, v in list(self.model._parms.items()) if v is not None})  # unique to grid search
    parms.update(params)
    if '__class__' in parms:  # FIXME: hack for PY3
        del parms['__class__']
    y = algo_params["y"]
    tframe = algo_params["training_frame"]
    if tframe is None:
        raise ValueError("Missing training_frame")
    if y is not None:
        if is_type(y, list, tuple):
            if len(y) == 1:
                parms["y"] = y[0]
            else:
                raise ValueError('y must be a single column reference')
    if x is None:
        if isinstance(y, int):
            xset = set(range(training_frame.ncols)) - {y}
        else:
            xset = set(training_frame.names) - {y}
    else:
        xset = set()
        if is_type(x, int, str):
            x = [x]
        for xi in x:
            if is_type(xi, int):
                if not (-training_frame.ncols <= xi < training_frame.ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % xi)
                xset.add(training_frame.names[xi])
            else:
                if xi not in training_frame.names:
                    raise H2OValueError("Column %s not in the training frame" % xi)
                xset.add(xi)
    x = list(xset)
    parms["x"] = x
    self.build_model(parms)
def __validate_modeling_plan(self, modeling_plan):
    if modeling_plan is None:
        return None

    supported_aliases = ['all', 'defaults', 'grids']

    def assert_is_step_def(sd):
        assert 'name' in sd, "each definition must have a 'name' key"
        assert 0 < len(sd) < 3, "each definition must have only 1 or 2 keys: name, name+alias or name+steps"
        assert len(sd) == 1 or 'alias' in sd or 'steps' in sd, \
            "steps definitions support only the following keys: name, alias, steps"
        assert 'alias' not in sd or sd['alias'] in supported_aliases, \
            "alias must be one of %s" % supported_aliases
        assert 'steps' not in sd or (is_type(sd['steps'], list) and all(assert_is_step(s) for s in sd['steps']))

    def assert_is_step(s):
        assert is_type(s, dict), "each step must be a dict with an 'id' key and optional keys among: weight, group"
        assert 'id' in s, "each step must have an 'id' key"
        assert len(s) == 1 or 'weight' in s or 'group' in s, "steps support only the following keys: weight, group"
        assert 'weight' not in s or is_type(s['weight'], int), "weight must be an integer"
        assert 'group' not in s or is_type(s['group'], int), "group must be an integer"
        return True

    plan = []
    for step_def in modeling_plan:
        assert_is_type(step_def, dict, tuple, str)
        if is_type(step_def, dict):
            assert_is_step_def(step_def)
            plan.append(step_def)
        elif is_type(step_def, str):
            plan.append(dict(name=step_def))
        else:
            assert 0 < len(step_def) < 3
            assert_is_type(step_def[0], str)
            name = step_def[0]
            if len(step_def) == 1:
                plan.append(dict(name=name))
            else:
                assert_is_type(step_def[1], str, list)
                ids = step_def[1]
                if is_type(ids, str):
                    assert_is_type(ids, *supported_aliases)
                    plan.append(dict(name=name, alias=ids))
                else:
                    plan.append(dict(name=name, steps=[dict(id=i) for i in ids]))
    return plan
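# Hedged illustration of the modeling_plan shapes the validator above accepts.
# Step names and ids ("XGBoost", "def_1", ...) are illustrative values only.
modeling_plan = [
    "XGBoost",                                           # bare string: name only
    ("GBM", "grids"),                                    # (name, alias) tuple
    ("DRF", ["def_1", "XRT"]),                           # (name, [step ids]) tuple
    dict(name="DeepLearning",
         steps=[dict(id="def_1", weight=10, group=1)]),  # fully explicit dict form
]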
def base_models(self, base_models):
    def _get_id(something):
        if isinstance(something, Keyed):
            return something.key
        return something

    if not is_type(base_models, list):
        base_models = [base_models]
    if is_type(base_models, [H2OEstimator, H2OGridSearch, str]):
        base_models = [_get_id(b) for b in base_models]
        self._parms["base_models"] = base_models
    else:
        assert_is_type(base_models, None)
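# Hedged usage sketch for the base_models setter above (assumes `my_gbm` and `my_rf`
# were trained on the same frame with keep_cross_validation_predictions=True and a
# shared fold assignment; `predictors`, `response`, and `train` are assumed variables):
from h2o.estimators import H2OStackedEnsembleEstimator

ensemble = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf])  # estimators, grids, or id strings
ensemble.train(x=predictors, y=response, training_frame=train)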
def get_hyperparams_dict(self, id, display=True):
    """
    Derive and return the model parameters used to train the particular grid search model.

    Parameters
    ----------
    id: str
        The model id of the model with hyperparameters of interest.
    display: boolean
        Flag to indicate whether to display the hyperparameter names.

    Returns
    -------
    A dict of model parameters derived from the hyper-parameters used to train this particular model.
    """
    idx = id if is_type(id, int) else self.model_ids.index(id)
    model = self[idx]
    model_params = dict()

    # if cross-validation is turned on, one of the fold models actually contains the max_runtime_secs
    # parameter, and not the main model that is returned.
    if model._is_xvalidated:
        model = h2o.get_model(model._xval_keys[0])

    for param_name in self.hyper_names:
        model_params[param_name] = model.params[param_name]['actual'][0] if \
            isinstance(model.params[param_name]['actual'], list) else model.params[param_name]['actual']

    if display:
        print('Hyperparameters: [' + ', '.join(list(self.hyper_params.keys())) + ']')
    return model_params
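# Hedged usage sketch (assumes `grid` is an already-trained H2OGridSearch; the
# returned parameter names depend on the grid's hyper_params):
#
#   params = grid.get_hyperparams_dict(grid.model_ids[0], display=False)
#   # e.g. {'max_depth': 5, 'ntrees': 50} for an illustrative tree grid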
def base_models(self, base_models):
    if is_type(base_models, [H2OEstimator]):
        base_models = [b.model_id for b in base_models]
        self._parms["base_models"] = base_models
    else:
        assert_is_type(base_models, None, [str])
        self._parms["base_models"] = base_models
def __init__(self, model, hyper_params, grid_id=None, search_criteria=None,
             export_checkpoints_dir=None, parallelism=1):
    assert_is_type(model, None, H2OEstimator, lambda mdl: issubclass(mdl, H2OEstimator))
    assert_is_type(hyper_params, dict)
    assert_is_type(grid_id, None, str)
    assert_is_type(search_criteria, None, dict)
    if not (model is None or is_type(model, H2OEstimator)):
        model = model()
    self._id = grid_id
    self.model = model
    self.hyper_params = dict(hyper_params)
    self.search_criteria = None if search_criteria is None else dict(search_criteria)
    self.export_checkpoints_dir = export_checkpoints_dir
    self._parallelism = parallelism  # Degree of parallelism during model building
    self._grid_json = None
    self.models = None  # list of H2O Estimator instances
    self._parms = {}  # internal, for object recycle #
    self.parms = {}  # external#
    self._future = False  # used by __repr__/show to query job state#
    self._job = None  # used when _future is True#
def __new__(cls, keyvals):
    # This method is called by the simplejson.json(object_pairs_hook=<this>)
    # `keyvals` is a list of (key,value) tuples. For example:
    #    [("schema_version", 3), ("schema_name", "InitIDV3"), ("schema_type", "Iced")]
    schema = None
    for k, v in keyvals:
        if k == "__meta" and isinstance(v, dict):
            schema = v["schema_name"]
            break
        if k == "__schema" and is_type(v, str):
            schema = v
            break
    if schema == "MetadataV3": return H2OMetadataV3.make(keyvals)
    if schema == "CloudV3": return H2OCluster.make(keyvals)
    if schema == "H2OErrorV3": return H2OErrorV3.make(keyvals)
    if schema == "H2OModelBuilderErrorV3": return H2OModelBuilderErrorV3.make(keyvals)
    if schema == "TwoDimTableV3": return H2OTwoDimTable.make(keyvals)
    if schema == "ModelMetricsRegressionV3": return H2ORegressionModelMetrics.make(keyvals)
    if schema == "ModelMetricsClusteringV3": return H2OClusteringModelMetrics.make(keyvals)
    if schema == "ModelMetricsBinomialV3": return H2OBinomialModelMetrics.make(keyvals)
    if schema == "ModelMetricsBinomialUpliftV3": return H2OBinomialUpliftModelMetrics.make(keyvals)
    if schema == "ModelMetricsMultinomialV3": return H2OMultinomialModelMetrics.make(keyvals)
    if schema == "ModelMetricsOrdinalV3": return H2OOrdinalModelMetrics.make(keyvals)
    if schema == "ModelMetricsAutoEncoderV3": return H2OAutoEncoderModelMetrics.make(keyvals)
    return super(H2OResponse, cls).__new__(cls, keyvals)
def get_hyperparams(self, id, display=True):
    """
    Get the hyperparameters of a model explored by grid search.

    Parameters
    ----------
    id: str
        The model id of the model with hyperparameters of interest.
    display: boolean
        Flag to indicate whether to display the hyperparameter names.

    Returns
    -------
    A list of the hyperparameters for the specified model.
    """
    idx = id if is_type(id, int) else self.model_ids.index(id)
    model = self[idx]

    # if cross-validation is turned on, one of the fold models actually contains the max_runtime_secs
    # parameter, and not the main model that is returned.
    if model._is_xvalidated:
        model = h2o.get_model(model._xval_keys[0])

    res = [model.params[h]['actual'][0] if isinstance(model.params[h]['actual'], list)
           else model.params[h]['actual'] for h in self.hyper_params]
    if display:
        print('Hyperparameters: [' + ', '.join(list(self.hyper_params.keys())) + ']')
    return res
def _handle_python_dicts(python_obj, check_header):
    header = list(python_obj.keys()) if python_obj else _gen_header(1)
    # is this a valid header?
    is_valid = all(re.match(r"^[a-zA-Z_][a-zA-Z0-9_.]*$", col) for col in header)
    if not is_valid:
        raise ValueError("Did not get a valid set of column names! Must match the regular expression: "
                         "^[a-zA-Z_][a-zA-Z0-9_.]*$ ")
    # check that each value entry is a flat list/tuple or single int, float, or string
    for k in python_obj:
        v = python_obj[k]
        if isinstance(v, (tuple, list)):  # if value is a tuple/list, then it must be flat
            if _is_list_of_lists(v):
                raise ValueError("Values in the dictionary must be flattened!")
        elif is_type(v, str, numeric):
            python_obj[k] = [v]
        else:
            raise ValueError("Encountered invalid dictionary value when constructing H2OFrame. Got: {0}".format(v))

    zipper = getattr(itertools, "zip_longest", None) or getattr(itertools, "izip_longest", None) or zip
    rows = list(map(list, zipper(*list(python_obj.values()))))
    data_to_write = [dict(list(zip(header, row))) for row in rows]
    return header, data_to_write
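# Hedged usage sketch: this helper backs H2OFrame construction from a dict of flat
# columns (assumes a cluster already started with h2o.init()).
fr = h2o.H2OFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]})
# keys must match ^[a-zA-Z_][a-zA-Z0-9_.]*$ and every value must be a flat list or a scalar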
def _fetch_leaderboard(aml_id, extensions=None):
    assert_is_type(extensions, None, str, [str])
    extensions = ([] if extensions is None
                  else [extensions] if is_type(extensions, str)
                  else extensions)
    resp = h2o.api("GET /99/Leaderboards/%s" % aml_id, data=dict(extensions=extensions))
    dest_key = resp['project_name'].split('@', 1)[0] + "_custom_leaderboard"
    return _fetch_table(resp['table'], key=dest_key, progress_bar=False)
def start(jar_path=None, nthreads=-1, enable_assertions=True, max_mem_size=None, min_mem_size=None,
          ice_root=None, port="54321+", verbose=True):
    """
    Start new H2O server on the local machine.

    :param jar_path: Path to the h2o.jar executable. If not given, then we will search for h2o.jar in the
        locations returned by `._jar_paths()`.
    :param nthreads: Number of threads in the thread pool. This should be related to the number of CPUs used.
        -1 means use all CPUs on the host. A positive integer specifies the number of CPUs directly.
    :param enable_assertions: If True, pass `-ea` option to the JVM.
    :param max_mem_size: Maximum heap size (jvm option Xmx), in bytes.
    :param min_mem_size: Minimum heap size (jvm option Xms), in bytes.
    :param ice_root: A directory where H2O stores its temporary files. Default location is determined by
        tempfile.mkdtemp().
    :param port: Port where to start the new server. This could be either an integer, or a string of the form
        "DDDDD+", indicating that the server should start looking for an open port starting from DDDDD and up.
    :param verbose: If True, then connection info will be printed to the stdout.
    :returns: a new H2OLocalServer instance
    """
    assert_is_type(jar_path, None, str)
    assert_is_type(port, None, int, str)
    assert_is_type(nthreads, -1, BoundInt(1, 4096))
    assert_is_type(enable_assertions, bool)
    assert_is_type(min_mem_size, None, int)
    assert_is_type(max_mem_size, None, BoundInt(1 << 25))
    assert_is_type(ice_root, None, I(str, os.path.isdir))
    if jar_path:
        assert_satisfies(jar_path, jar_path.endswith("h2o.jar"))

    if min_mem_size is not None and max_mem_size is not None and min_mem_size > max_mem_size:
        raise H2OValueError("`min_mem_size`=%d is larger than the `max_mem_size`=%d" % (min_mem_size, max_mem_size))
    if port is None:
        port = "54321+"
    baseport = None
    # TODO: get rid of this port gimmick and have 2 separate parameters.
    if is_type(port, str):
        if port.isdigit():
            port = int(port)
        else:
            if not (port[-1] == "+" and port[:-1].isdigit()):
                raise H2OValueError("`port` should be of the form 'DDDD+', where D is a digit. Got: %s" % port)
            baseport = int(port[:-1])
            port = 0

    hs = H2OLocalServer()
    hs._verbose = bool(verbose)
    hs._jar_path = hs._find_jar(jar_path)
    hs._ice_root = ice_root
    if not ice_root:
        hs._ice_root = tempfile.mkdtemp()
        hs._tempdir = hs._ice_root

    if verbose:
        print("Attempting to start a local H2O server...")
    hs._launch_server(port=port, baseport=baseport, nthreads=int(nthreads),
                      ea=enable_assertions, mmax=max_mem_size, mmin=min_mem_size)
    if verbose:
        print("  Server is running at %s://%s:%d" % (hs.scheme, hs.ip, hs.port))
    atexit.register(lambda: hs.shutdown())
    return hs
def _add_agg(self, op, col, na):
    if op == "nrow":
        col = 0
    if col is None:
        for i in range(self._fr.ncol):
            if i not in self._by:
                self._add_agg(op, i, na)
        return self
    elif is_type(col, str):
        cidx = self._fr.names.index(col)
    elif is_type(col, int):
        cidx = col
    elif is_type(col, list, tuple):
        for i in col:
            self._add_agg(op, i, na)
        return self
    else:
        raise ValueError("col must be a column name or index.")
    name = "{}_{}".format(op, self._fr.names[cidx])
    self._aggs[name] = [op, cidx, na]
    return self
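# Hedged continuation of the GroupBy sketch above: the public aggregators funnel
# through _add_agg (column names remain illustrative).
grouped = fr.group_by("RACE")
grouped.count(na="all").sum("AGE").mean(["AGE", "PSA"])  # each call records an [op, col, na] spec
print(grouped.get_frame())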
def metric(self, metric, thresholds=None):
    """
    :param str metric: A metric among :const:`maximizing_metrics`.
    :param thresholds: thresholds parameter must be a number or a list (i.e. [0.01, 0.5, 0.99]).
        If None, then the threshold maximizing the metric will be used.
        If 'all', then all stored thresholds are used and returned with the matching metric.
    :returns: The set of metrics for the list of thresholds.
        The returned list has a 'value' property holding only the metric value (if no threshold provided
        or if provided as a number), or all the metric values (if thresholds provided as a list).
    """
    assert_is_type(thresholds, None, 'all', numeric, [numeric])
    if metric not in H2OBinomialModelMetrics.maximizing_metrics:
        raise ValueError("The only allowable metrics are {}".format(
            ', '.join(H2OBinomialModelMetrics.maximizing_metrics)))

    h2o_metric = (H2OBinomialModelMetrics.metrics_aliases[metric]
                  if metric in H2OBinomialModelMetrics.metrics_aliases
                  else metric)
    value_is_scalar = is_type(metric, str) and (thresholds is None or is_type(thresholds, numeric))
    if thresholds is None:
        thresholds = [self.find_threshold_by_max_metric(h2o_metric)]
    elif thresholds == 'all':
        thresholds = None
    elif is_type(thresholds, numeric):
        thresholds = [thresholds]

    metrics = List()
    thresh2d = self._metric_json['thresholds_and_metric_scores']
    if thresholds is None:  # fast path to return all thresholds: skipping find_idx logic
        metrics.extend(list(t) for t in zip(thresh2d['threshold'], thresh2d[h2o_metric]))
    else:
        for t in thresholds:
            idx = self.find_idx_by_threshold(t)
            metrics.append([t, thresh2d[h2o_metric][idx]])

    setattr(metrics, 'value',
            metrics[0][1] if value_is_scalar
            else list(r[1] for r in metrics))
    return metrics
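# Hedged usage sketch (assumes `perf` is an H2OBinomialModelMetrics object, e.g.
# returned by model.model_performance(test_frame)):
#
#   perf.metric("f1")               # [[best_threshold, f1]]; .value is the scalar f1
#   perf.metric("f1", 0.5)          # f1 at threshold 0.5; .value is still a scalar
#   perf.metric("f1", [0.3, 0.7])   # f1 at both thresholds; .value is a list
#   perf.metric("tpr", "all")       # tpr at every stored threshold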
def _model_build(self, x, y, tframe, vframe, kwargs): kwargs["training_frame"] = tframe if vframe is not None: kwargs["validation_frame"] = vframe if is_type(y, int): y = tframe.names[y] if y is not None: kwargs["response_column"] = y if not isinstance(x, (list, tuple)): x = [x] if is_type(x[0], int): x = [tframe.names[i] for i in x] offset = kwargs["offset_column"] folds = kwargs["fold_column"] weights = kwargs["weights_column"] ignored_columns = list( set(tframe.names) - set(x + [y, offset, folds, weights])) kwargs["ignored_columns"] = None if ignored_columns == [] else [ quoted(col) for col in ignored_columns ] kwargs["interactions"] = (None if "interactions" not in kwargs or kwargs["interactions"] is None else [ quoted(col) for col in kwargs["interactions"] ]) kwargs = { k: H2OEstimator._keyify_if_h2oframe(kwargs[k]) for k in kwargs } rest_ver = kwargs.pop( "_rest_version") if "_rest_version" in kwargs else 3 model = H2OJob(h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=kwargs), job_type=(self.algo + " Model Build")) if self._future: self._job = model return model.poll() model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0] self._resolve_model(model.dest_key, model_json)
def train(self, x, y=None, training_frame=None, offset_column=None, fold_column=None,
          weights_column=None, validation_frame=None, **params):
    """
    Train the model synchronously (i.e. do not return until the model finishes training).

    To train asynchronously call :meth:`start`.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold, offset, and weights).
    :param offset_column: The name or index of the column in training_frame that holds the offsets.
    :param fold_column: The name or index of the column in training_frame that holds the per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
    :param validation_frame: H2OFrame with validation data to be scored on while training.
    """
    algo_params = locals()
    parms = self._parms.copy()
    parms.update({k: v for k, v in algo_params.items() if k not in ["self", "params", "algo_params", "parms"]})
    parms["search_criteria"] = self.search_criteria
    parms["hyper_parameters"] = self.hyper_params  # unique to grid search
    parms.update({k: v for k, v in list(self.model._parms.items()) if v is not None})  # unique to grid search
    parms.update(params)
    if '__class__' in parms:  # FIXME: hack for PY3
        del parms['__class__']
    y = algo_params["y"]
    tframe = algo_params["training_frame"]
    if tframe is None:
        raise ValueError("Missing training_frame")
    if y is not None:
        if is_type(y, list, tuple):
            if len(y) == 1:
                parms["y"] = y[0]
            else:
                raise ValueError('y must be a single column reference')
        self._estimator_type = "classifier" if tframe[y].isfactor() else "regressor"
    self.build_model(parms)
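# Hedged usage sketch for grid training (assumes a running cluster and a frame
# `train` whose column names below are illustrative):
from h2o.estimators import H2OGradientBoostingEstimator
from h2o.grid import H2OGridSearch

grid = H2OGridSearch(H2OGradientBoostingEstimator,
                     hyper_params={"ntrees": [20, 50], "max_depth": [3, 5]})
grid.train(x=["a", "b"], y="response", training_frame=train)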
def _log_message(self, msg):
    """
    Log the message `msg` to the destination `self._logging_dest`.

    If this destination is a file name, then we append the message to the file and then close the file
    immediately. If the destination is an open file handle, then we simply write the message there and do not
    attempt to close it.
    """
    if is_type(self._logging_dest, str):
        with open(self._logging_dest, "at", encoding="utf-8") as f:
            f.write(msg)
    else:
        self._logging_dest.write(msg)
def __init__(self, fr, by): """ Return a new ``GroupBy`` object using the H2OFrame specified in fr and the desired grouping columns specified in by. The original H2O frame will be stored as member _fr. Information on the new grouping of the original frame is described in a new H2OFrame in member frame. The returned groups are sorted by the natural group-by column sort. :param H2OFrame fr: H2OFrame that you want the group by operation to be performed on. :param by: can be a column name (str) or an index (int) of a single column, or a list for multiple columns denoting the set of columns to group by. """ self._fr = fr # IN self._by = by # IN self._aggs = {} # IN self._res = None # OUT if is_type(by, str): self._by = [self._fr.names.index(by)] elif is_type(by, list, tuple): self._by = [self._fr.names.index(b) if is_type(b, str) else b for b in by] else: self._by = [self._by]
def __getitem__(self, item):
    if is_type(item, int, str):
        # single col selection returns list
        if is_type(item, int):
            index = item
            if index < 0:
                index += len(self._col_header)
            if index < 0 or index >= len(self._col_header):
                raise H2OValueError("Index %d is out of range" % item)
        else:
            if item in self._col_header:
                index = self._col_header.index(item)
            else:
                raise H2OValueError("Column `%s` does not exist in the table" % item)
        return [row[index] for row in self._cell_values]
    elif isinstance(item, slice):
        # row selection if item is slice returns H2OTwoDimTable
        # (slice works like pandas DataFrame, not like H2OFrame)
        new_table = copy.deepcopy(self)
        new_table._cell_values = [self._cell_values[ii] for ii in range(*item.indices(len(self._cell_values)))]
        return new_table
    elif is_type(item, [int, str]):
        # multiple col selection returns list of cols
        return [self[i] for i in item]
    else:
        raise TypeError('can not support getting item for ' + str(item))
def __validate_distribution(self, distribution):
    if is_type(distribution, str):
        distribution = distribution.lower()
        if distribution == "custom":
            raise H2OValueError('Distribution "custom" has to be specified as a '
                                'dictionary with its respective parameters, e.g., '
                                '`dict(type="custom", custom_distribution_func="...")`.')
        return distribution
    if is_type(distribution, dict):
        dist = distribution["type"].lower()
        allowed_distribution_parameters = dict(
            custom='custom_distribution_func',
            huber='huber_alpha',
            quantile='quantile_alpha',
            tweedie='tweedie_power')
        assert distribution.get(allowed_distribution_parameters.get(dist)) is not None \
            or len(distribution) == 1, \
            ("Distribution dictionary should contain distribution and a distribution parameter. "
             "For example `dict(type=\"{}\", {}=...)`.").format(dist, allowed_distribution_parameters[dist])
        if distribution["type"] == "custom" and "custom_distribution_func" not in distribution.keys():
            raise H2OValueError('Distribution "custom" has to be specified as a '
                                'dictionary with its respective parameters, e.g., '
                                '`dict(type="custom", custom_distribution_func="...")`.')
        if allowed_distribution_parameters.get(dist) in distribution.keys():
            setattr(self, "_" + allowed_distribution_parameters[dist],
                    distribution[allowed_distribution_parameters[dist]])
        return dist
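# Hedged illustration of the distribution specs the validator above accepts (the
# method is private/name-mangled, so these are shown as data rather than calls):
#
#   "gaussian"                                           -> returned as "gaussian"
#   dict(type="huber", huber_alpha=0.9)                  -> "huber"; also sets self._huber_alpha
#   dict(type="tweedie", tweedie_power=1.5)              -> "tweedie"
#   dict(type="custom", custom_distribution_func="...")  -> "custom" (the func reference is required)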
def __init__(self, widgets, title, file_mode):
    super(ProgressBarWidget, self).__init__()
    self._file_mode = file_mode
    self._width = min(self._get_terminal_size(), 100)
    self._encoding = (sys.stdout.encoding or "").lower()
    wlist = []
    for widget in (widgets or [title + ":", PBWBar(), PBWPercentage()]):
        if is_type(widget, str):
            widget = PBWString(widget)
        widget.set_mode("file" if file_mode else "tty")
        widget.set_encoding(self._encoding)
        wlist.append(widget)
    self._to_render = None  # Render this string on the next rendering cycle. Rarely used.
    self._widgets = tuple(wlist)
    self._widget_lengths = self._compute_widget_sizes()
    self._rendered = ""
def __init__(self, model, hyper_params, grid_id=None, search_criteria=None):
    super(H2OGridSearch, self).__init__()
    assert_is_type(model, None, H2OEstimator, lambda mdl: issubclass(mdl, H2OEstimator))
    assert_is_type(hyper_params, dict)
    assert_is_type(grid_id, None, str)
    assert_is_type(search_criteria, None, dict)
    if not (model is None or is_type(model, H2OEstimator)):
        model = model()
    self._id = grid_id
    self.model = model
    self.hyper_params = dict(hyper_params)
    self.search_criteria = None if search_criteria is None else dict(search_criteria)
    self._grid_json = None
    self.models = None  # list of H2O Estimator instances
    self._parms = {}  # internal, for object recycle #
    self.parms = {}  # external#
    self._future = False  # used by __repr__/show to query job state#
    self._job = None  # used when _future is True#
def manual_partial_dependence(model, datafile, xlist, xname, weightV):
    dataframe = h2o.import_file(pyunit_utils.locate(datafile))
    meanV = []
    stdV = []
    stderrV = []
    nRows = dataframe.nrow
    nCols = dataframe.ncol - 1

    for xval in xlist:
        cons = [xval] * nRows
        if xname in dataframe.names:
            dataframe = dataframe.drop(xname)
        if not ((is_type(xval, str) and xval == 'NA') or (isinstance(xval, float) and math.isnan(xval))):
            dataframe = dataframe.cbind(h2o.H2OFrame(cons))
            dataframe.set_name(nCols, xname)

        pred = model.predict(dataframe).as_data_frame(use_pandas=False, header=False)
        pIndex = len(pred[0]) - 1
        sumEle = 0.0
        sumEleSq = 0.0
        sumWeight = 0.0
        numNonZeroWeightCount = 0.0
        m = 1.0 / math.sqrt(dataframe.nrow * 1.0)
        for rindex in range(len(pred)):
            val = float(pred[rindex][pIndex])
            weight = weightV[rindex]
            if (abs(weight) > 0) and isinstance(val, float) and not math.isnan(val):
                temp = val * weight
                sumEle = sumEle + temp
                sumEleSq = sumEleSq + temp * val
                sumWeight = sumWeight + weight
                numNonZeroWeightCount = numNonZeroWeightCount + 1
        wMean = sumEle / sumWeight
        scale = numNonZeroWeightCount * 1.0 / (numNonZeroWeightCount - 1)
        wSTD = math.sqrt((sumEleSq / sumWeight - wMean * wMean) * scale)
        meanV.append(wMean)
        stdV.append(wSTD)
        stderrV.append(wSTD * m)

    return meanV, stdV, stderrV
def _handle_python_dicts(python_obj, check_header):
    header = list(python_obj.keys())
    # is this a valid header?
    is_valid = all(re.match(r"^[a-zA-Z_][a-zA-Z0-9_.]*$", col) for col in header)
    if not is_valid:
        raise ValueError("Did not get a valid set of column names! Must match the regular expression: "
                         "^[a-zA-Z_][a-zA-Z0-9_.]*$ ")
    # check that each value entry is a flat list/tuple or single int, float, or string
    for k in python_obj:
        v = python_obj[k]
        if isinstance(v, (tuple, list)):  # if value is a tuple/list, then it must be flat
            if _is_list_of_lists(v):
                raise ValueError("Values in the dictionary must be flattened!")
        elif is_type(v, str, numeric):
            python_obj[k] = [v]
        else:
            raise ValueError("Encountered invalid dictionary value when constructing H2OFrame. Got: {0}".format(v))
    zipper = getattr(itertools, "zip_longest", None) or getattr(itertools, "izip_longest", None) or zip
    rows = list(map(list, zipper(*list(python_obj.values()))))
    data_to_write = [dict(list(zip(header, row))) for row in rows]
    return header, data_to_write
def __new__(cls, keyvals):
    # This method is called by the simplejson.json(object_pairs_hook=<this>)
    # `keyvals` is a list of (key,value) tuples. For example:
    #    [("schema_version", 3), ("schema_name", "InitIDV3"), ("schema_type", "Iced")]
    schema = None
    for k, v in keyvals:
        if k == "__meta" and isinstance(v, dict):
            schema = v["schema_name"]
            break
        if k == "__schema" and is_type(v, str):
            schema = v
            break
    if schema == "CloudV3": return H2OCluster.from_kvs(keyvals)
    if schema == "H2OErrorV3": return H2OErrorV3(keyvals)
    if schema == "H2OModelBuilderErrorV3": return H2OModelBuilderErrorV3(keyvals)
    if schema == "TwoDimTableV3": return H2OTwoDimTable.make(keyvals)
    if schema == "ModelMetricsRegressionV3": return H2ORegressionModelMetrics.make(keyvals)
    if schema == "ModelMetricsClusteringV3": return H2OClusteringModelMetrics.make(keyvals)
    if schema == "ModelMetricsBinomialV3": return H2OBinomialModelMetrics.make(keyvals)
    if schema == "ModelMetricsMultinomialV3": return H2OMultinomialModelMetrics.make(keyvals)
    if schema == "ModelMetricsAutoEncoderV3": return H2OAutoEncoderModelMetrics.make(keyvals)
    return super(H2OResponse, cls).__new__(cls, keyvals)
def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
          weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None):
    """
    Train the H2O model.

    Parameters
    ----------
    x : list, None
        A list of column names or indices indicating the predictor columns.
    y :
        An index or a column name indicating the response column.
    training_frame : H2OFrame
        The H2OFrame having the columns indicated by x and y (as well as any additional columns specified
        by fold, offset, and weights).
    offset_column : str, optional
        The name or index of the column in training_frame that holds the offsets.
    fold_column : str, optional
        The name or index of the column in training_frame that holds the per-row fold assignments.
    weights_column : str, optional
        The name or index of the column in training_frame that holds the per-row weights.
    validation_frame : H2OFrame, optional
        H2OFrame with validation data to be scored on while training.
    max_runtime_secs : float
        Maximum allowed runtime in seconds for model training. Use 0 to disable.
    """
    assert_is_type(training_frame, H2OFrame)
    assert_is_type(validation_frame, None, H2OFrame)
    assert_is_type(y, None, int, str)
    assert_is_type(x, None, int, str, [str, int], {str, int})
    assert_is_type(ignored_columns, None, [str, int], {str, int})
    assert_is_type(offset_column, None, int, str)
    assert_is_type(fold_column, None, int, str)
    assert_is_type(weights_column, None, int, str)
    assert_is_type(max_runtime_secs, None, numeric)
    algo = self.algo
    parms = self._parms.copy()
    if "__class__" in parms:  # FIXME: hack for PY3
        del parms["__class__"]
    is_auto_encoder = bool(parms.get("autoencoder"))
    is_supervised = not(is_auto_encoder or algo in {"pca", "svd", "kmeans", "glrm", "word2vec"})
    ncols = training_frame.ncols
    names = training_frame.names
    if is_supervised:
        if y is None:
            y = "response"
        if is_type(y, int):
            if not (-ncols <= y < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % y)
            y = names[y]
        else:
            if y not in names:
                raise H2OValueError("Column %s does not exist in the training frame" % y)
        self._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor"
    elif y is not None:
        raise H2OValueError("y should not be provided for an unsupervised model")
    assert_is_type(y, str, None)
    ignored_columns_set = set()
    if ignored_columns is not None:
        if x is not None:
            raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
        for ic in ignored_columns:
            if is_type(ic, int):
                if not (-ncols <= ic < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % ic)
                ignored_columns_set.add(names[ic])
            else:
                if ic not in names:
                    raise H2OValueError("Column %s not in the training frame" % ic)
                ignored_columns_set.add(ic)
    if x is None:
        xset = set(names) - {y} - ignored_columns_set
    else:
        xset = set()
        if is_type(x, int, str):
            x = [x]
        for xi in x:
            if is_type(xi, int):
                if not (-ncols <= xi < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % xi)
                xset.add(names[xi])
            else:
                if xi not in names:
                    raise H2OValueError("Column %s not in the training frame" % xi)
                xset.add(xi)
    x = list(xset)
    parms["offset_column"] = offset_column
    parms["fold_column"] = fold_column
    parms["weights_column"] = weights_column
    parms["max_runtime_secs"] = max_runtime_secs

    # Step 2
    is_auto_encoder = "autoencoder" in parms and parms["autoencoder"]
    is_unsupervised = is_auto_encoder or self.algo in {"pca", "svd", "kmeans", "glrm", "word2vec"}
    if is_auto_encoder and y is not None:
        raise ValueError("y should not be specified for autoencoder.")
    if not is_unsupervised and y is None:
        raise ValueError("Missing response")

    # Step 3
    parms["training_frame"] = training_frame
    if validation_frame is not None:
        parms["validation_frame"] = validation_frame
    if is_type(y, int):
        y = training_frame.names[y]
    if y is not None:
        parms["response_column"] = y
    if not isinstance(x, (list, tuple)):
        x = [x]
    if is_type(x[0], int):
        x = [training_frame.names[i] for i in x]
    offset = parms["offset_column"]
    folds = parms["fold_column"]
    weights = parms["weights_column"]
    ignored_columns = list(set(training_frame.names) - set(x + [y, offset, folds, weights]))
    parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
    parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None
                             else [quoted(col) for col in parms["interactions"]])
    parms = {k: H2OEstimator._keyify_if_h2oframe(parms[k]) for k in parms}
    rest_ver = parms.pop("_rest_version") if "_rest_version" in parms else 3

    model = H2OJob(h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms),
                   job_type=(self.algo + " Model Build"))
    if self._future:
        self._job = model
        self._rest_version = rest_ver
        return
    model.poll()
    model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
    self._resolve_model(model.dest_key, model_json)
def _is_num_list(l):
    return is_type(l, [numeric])
def _is_str_list(l):
    return is_type(l, [str])
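# Hedged illustration of the [T] "list of T" type-spec used by the two helpers above:
#
#   _is_num_list([1, 2.5])    # True: every element matches `numeric`
#   _is_num_list([1, "a"])    # False
#   _is_str_list(["a", "b"])  # True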
def start(jar_path=None, nthreads=-1, enable_assertions=True, max_mem_size=None, min_mem_size=None,
          ice_root=None, log_dir=None, log_level=None, port="54321+", name=None, extra_classpath=None,
          verbose=True, jvm_custom_args=None, bind_to_localhost=True):
    """
    Start new H2O server on the local machine.

    :param jar_path: Path to the h2o.jar executable. If not given, then we will search for h2o.jar in the
        locations returned by `._jar_paths()`.
    :param nthreads: Number of threads in the thread pool. This should be related to the number of CPUs used.
        -1 means use all CPUs on the host. A positive integer specifies the number of CPUs directly.
    :param enable_assertions: If True, pass `-ea` option to the JVM.
    :param max_mem_size: Maximum heap size (jvm option Xmx), in bytes.
    :param min_mem_size: Minimum heap size (jvm option Xms), in bytes.
    :param log_dir: Directory for H2O logs to be stored if a new instance is started. Default directory is
        determined by H2O internally.
    :param log_level: The logger level for H2O if a new instance is started.
    :param ice_root: A directory where H2O stores its temporary files. Default location is determined by
        tempfile.mkdtemp().
    :param port: Port where to start the new server. This could be either an integer, or a string of the form
        "DDDDD+", indicating that the server should start looking for an open port starting from DDDDD and up.
    :param name: name of the h2o cluster to be started
    :param extra_classpath: List of paths to libraries that should be included on the Java classpath.
    :param verbose: If True, then connection info will be printed to the stdout.
    :param jvm_custom_args: Custom, user-defined arguments for the JVM H2O is instantiated in.
    :param bind_to_localhost: A flag indicating whether access to the H2O instance should be restricted to
        the local machine (default) or if it can be reached from other computers on the network.
        Only applicable when H2O is started from the Python client.
    :returns: a new H2OLocalServer instance
    """
    assert_is_type(jar_path, None, str)
    assert_is_type(port, None, int, str)
    assert_is_type(name, None, str)
    assert_is_type(nthreads, -1, BoundInt(1, 4096))
    assert_is_type(enable_assertions, bool)
    assert_is_type(min_mem_size, None, int)
    assert_is_type(max_mem_size, None, BoundInt(1 << 25))
    assert_is_type(log_dir, str, None)
    assert_is_type(log_level, str, None)
    assert_satisfies(log_level, log_level in [None, "TRACE", "DEBUG", "INFO", "WARN", "ERRR", "FATA"])
    assert_is_type(ice_root, None, I(str, os.path.isdir))
    assert_is_type(extra_classpath, None, [str])
    assert_is_type(jvm_custom_args, list, None)
    assert_is_type(bind_to_localhost, bool)
    if jar_path:
        assert_satisfies(jar_path, jar_path.endswith("h2o.jar"))

    if min_mem_size is not None and max_mem_size is not None and min_mem_size > max_mem_size:
        raise H2OValueError("`min_mem_size`=%d is larger than the `max_mem_size`=%d" % (min_mem_size, max_mem_size))
    if port is None:
        port = "54321+"
    baseport = None
    # TODO: get rid of this port gimmick and have 2 separate parameters.
    if is_type(port, str):
        if port.isdigit():
            port = int(port)
        else:
            if not (port[-1] == "+" and port[:-1].isdigit()):
                raise H2OValueError("`port` should be of the form 'DDDD+', where D is a digit. Got: %s" % port)
            baseport = int(port[:-1])
            port = 0

    hs = H2OLocalServer()
    hs._verbose = bool(verbose)
    hs._jar_path = hs._find_jar(jar_path)
    hs._extra_classpath = extra_classpath
    hs._ice_root = ice_root
    hs._name = name
    if not ice_root:
        hs._ice_root = tempfile.mkdtemp()
        hs._tempdir = hs._ice_root

    if verbose:
        print("Attempting to start a local H2O server...")
    hs._launch_server(port=port, baseport=baseport, nthreads=int(nthreads), ea=enable_assertions,
                      mmax=max_mem_size, mmin=min_mem_size, jvm_custom_args=jvm_custom_args,
                      bind_to_localhost=bind_to_localhost, log_dir=log_dir, log_level=log_level)
    if verbose:
        print("  Server is running at %s://%s:%d" % (hs.scheme, hs.ip, hs.port))
    atexit.register(lambda: hs.shutdown())
    return hs
def train(self, x=None, y=None, training_frame=None, fold_column=None, weights_column=None,
          validation_frame=None, leaderboard_frame=None, blending_frame=None):
    """
    Begins an AutoML task, a background task that automatically builds a number of models
    with various algorithms and tracks their performance in a leaderboard. At any point
    in the process you may use H2O's performance or prediction functions on the resulting
    models.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param fold_column: The name or index of the column in training_frame that holds per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds per-row weights.
    :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold_column or weights_column).
    :param validation_frame: H2OFrame with validation data. This argument is ignored unless the user sets
        nfolds = 0. If cross-validation is turned off, then a validation frame can be specified and used
        for early stopping of individual models and early stopping of the grid searches. By default and
        when nfolds > 1, cross-validation metrics will be used for early stopping and thus
        validation_frame will be ignored.
    :param leaderboard_frame: H2OFrame with test data for scoring the leaderboard. This is optional and
        if this is set to None (the default), then cross-validation metrics will be used to generate the
        leaderboard rankings instead.
    :param blending_frame: H2OFrame used to train the metalearning algorithm in Stacked Ensembles (instead
        of relying on cross-validated predicted values). This is optional, but when provided, it is also
        recommended to disable cross validation by setting `nfolds=0` and to provide a leaderboard frame
        for scoring purposes.

    :returns: An H2OAutoML object.

    :examples:
    >>> # Set up an H2OAutoML object
    >>> aml = H2OAutoML(max_runtime_secs=30)
    >>> # Launch an AutoML run
    >>> aml.train(y=y, training_frame=train)
    """
    ncols = training_frame.ncols
    names = training_frame.names

    # Set project name if None
    if self.project_name is None:
        self.project_name = "automl_" + training_frame.frame_id
        self.build_control["project_name"] = self.project_name

    # Minimal required arguments are training_frame and y (response)
    if y is None:
        raise ValueError('The response column (y) is not set; please set it to the name of the '
                         'column that you are trying to predict in your data.')
    else:
        assert_is_type(y, int, str)
        if is_type(y, int):
            if not (-ncols <= y < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % y)
            y = names[y]
        else:
            if y not in names:
                raise H2OValueError("Column %s does not exist in the training frame" % y)
        input_spec = {
            'response_column': y,
        }

    if training_frame is None:
        raise ValueError('The training frame is not set!')
    else:
        assert_is_type(training_frame, H2OFrame)
        input_spec['training_frame'] = training_frame.frame_id

    if fold_column is not None:
        assert_is_type(fold_column, int, str)
        input_spec['fold_column'] = fold_column

    if weights_column is not None:
        assert_is_type(weights_column, int, str)
        input_spec['weights_column'] = weights_column

    if validation_frame is not None:
        assert_is_type(validation_frame, H2OFrame)
        input_spec['validation_frame'] = validation_frame.frame_id

    if leaderboard_frame is not None:
        assert_is_type(leaderboard_frame, H2OFrame)
        input_spec['leaderboard_frame'] = leaderboard_frame.frame_id

    if blending_frame is not None:
        assert_is_type(blending_frame, H2OFrame)
        input_spec['blending_frame'] = blending_frame.frame_id

    if self.sort_metric is not None:
        assert_is_type(self.sort_metric, str)
        sort_metric = self.sort_metric.lower()
        # Changed the API to use "deviance" to be consistent with stopping_metric values
        # TODO: let's change the backend to use "deviance" since we use the term "deviance";
        # after that we can take this `if` statement out
        if sort_metric == "deviance":
            sort_metric = "mean_residual_deviance"
        input_spec['sort_metric'] = sort_metric

    if x is not None:
        assert_is_type(x, list)
        xset = set()
        if is_type(x, int, str):
            x = [x]
        for xi in x:
            if is_type(xi, int):
                if not (-ncols <= xi < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % xi)
                xset.add(names[xi])
            else:
                if xi not in names:
                    raise H2OValueError("Column %s not in the training frame" % xi)
                xset.add(xi)
        x = list(xset)
        ignored_columns = set(names) - {y} - set(x)
        if fold_column is not None and fold_column in ignored_columns:
            ignored_columns.remove(fold_column)
        if weights_column is not None and weights_column in ignored_columns:
            ignored_columns.remove(weights_column)
        if ignored_columns is not None:
            input_spec['ignored_columns'] = list(ignored_columns)

    automl_build_params = dict(input_spec=input_spec)

    # NOTE: if the user hasn't specified some block of parameters don't send them!
    # This lets the back end use the defaults.
    automl_build_params['build_control'] = self.build_control
    automl_build_params['build_models'] = self.build_models

    resp = h2o.api('POST /99/AutoMLBuilder', json=automl_build_params)
    if 'job' not in resp:
        print("Exception from the back end: ")
        print(resp)
        return
    self._job = H2OJob(resp['job'], "AutoML")
    self._job.poll()
    self._fetch()
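# Hedged follow-up sketch: once the AutoML job finishes, results are exposed on the
# object itself (see also _fetch_leaderboard above; `test` is an illustrative frame):
#
#   lb = aml.leaderboard       # H2OFrame ranking every model that was built
#   best = aml.leader          # best model according to the sort metric
#   preds = best.predict(test)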
def _train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
           weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
           model_id=None, verbose=False, extend_parms_fn=None):
    assert_is_type(training_frame, None, H2OFrame)
    assert_is_type(validation_frame, None, H2OFrame)
    assert_is_type(y, None, int, str)
    assert_is_type(x, None, int, str, [str, int], {str, int})
    assert_is_type(ignored_columns, None, [str, int], {str, int})
    assert_is_type(offset_column, None, int, str)
    assert_is_type(fold_column, None, int, str)
    assert_is_type(weights_column, None, int, str)
    assert_is_type(max_runtime_secs, None, numeric)
    assert_is_type(model_id, None, str)
    assert_is_type(verbose, bool)
    assert_is_type(extend_parms_fn, None, FunctionType)

    if self._requires_training_frame() and training_frame is None:
        raise H2OValueError("Training frame required for %s algorithm, but none was given." % self.algo)

    # NOTE: `training_frame_missing` is True when no training frame was supplied.
    training_frame_missing = training_frame is None
    if training_frame_missing:
        self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)

    algo = self.algo
    if verbose and algo not in ["drf", "gbm", "deeplearning", "xgboost"]:
        raise H2OValueError("Verbose should only be set to True for drf, gbm, deeplearning, and xgboost models")
    parms = self._parms.copy()
    if "__class__" in parms:  # FIXME: hack for PY3
        del parms["__class__"]
    is_auto_encoder = bool(parms.get("autoencoder"))
    is_supervised = not (is_auto_encoder or algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec",
                                                     "isolationforest", "generic"})
    if not training_frame_missing:
        names = training_frame.names
        ncols = training_frame.ncols
    if is_supervised:
        if y is None: y = "response"
        if is_type(y, int):
            if not (-ncols <= y < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % y)
            y = names[y]
        else:
            if y not in names:
                raise H2OValueError("Column %s does not exist in the training frame" % y)
        self._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor"
    else:
        # If `y` is provided for an unsupervised model we'll simply ignore it.
        # This way an unsupervised model can be used as a step in sklearn's pipeline.
        y = None

    if not training_frame_missing:
        assert_is_type(y, str, None)
        ignored_columns_set = set()
        if ignored_columns is None and "ignored_columns" in parms:
            ignored_columns = parms["ignored_columns"]
        if ignored_columns is not None:
            if x is not None:
                raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
            for ic in ignored_columns:
                if is_type(ic, int):
                    if not (-ncols <= ic < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % ic)
                    ignored_columns_set.add(names[ic])
                else:
                    if ic not in names:
                        raise H2OValueError("Column %s does not exist in the training frame" % ic)
                    ignored_columns_set.add(ic)
        if x is None:
            xset = set(names) - {y} - ignored_columns_set
        else:
            xset = set()
            if is_type(x, int, str): x = [x]
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError("Column %s does not exist in the training frame" % xi)
                    xset.add(xi)
        x = list(xset)

    self._check_and_save_parm(parms, "offset_column", offset_column)
    self._check_and_save_parm(parms, "weights_column", weights_column)
    self._check_and_save_parm(parms, "fold_column", fold_column)

    if max_runtime_secs is not None: parms["max_runtime_secs"] = max_runtime_secs

    # Overwrite the model_id parameter only if model_id was passed in.
    if model_id is not None:
        parms["model_id"] = model_id

    # Step 2
    is_auto_encoder = "autoencoder" in parms and parms["autoencoder"]
    is_unsupervised = is_auto_encoder or self.algo in {"aggregator", "pca", "svd", "kmeans", "glrm",
                                                       "word2vec", "isolationforest"}
    if is_auto_encoder and y is not None:
        raise ValueError("y should not be specified for autoencoder.")
    if not is_unsupervised and y is None and self.algo not in ["generic"]:
        raise ValueError("Missing response")

    # Step 3
    if not training_frame_missing:
        parms["training_frame"] = training_frame
        offset = parms["offset_column"]
        folds = parms["fold_column"]
        weights = parms["weights_column"]

    if validation_frame is not None: parms["validation_frame"] = validation_frame
    if is_type(y, int): y = training_frame.names[y]
    if y is not None: parms["response_column"] = y
    if not isinstance(x, (list, tuple)): x = [x]
    if is_type(x[0], int):
        x = [training_frame.names[i] for i in x]

    if not training_frame_missing:
        ignored_columns = list(set(training_frame.names) - set(x + [y, offset, folds, weights]))
        parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]

    parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None
                             else [quoted(col) for col in parms["interactions"]])
    parms["interaction_pairs"] = (None if "interaction_pairs" not in parms or parms["interaction_pairs"] is None
                                  else [tuple(map(quoted, ip)) for ip in parms["interaction_pairs"]])

    # Internal hook allowing subclasses to extend the train parms before submission.
    if extend_parms_fn is not None:
        extend_parms_fn(parms)

    parms = {k: H2OEstimator._keyify_if_h2oframe(parms[k]) for k in parms}
    if "stopping_metric" in parms and "r2" in parms["stopping_metric"]:
        raise H2OValueError("r2 cannot be used as an early stopping_metric yet. "
                            "Check this JIRA https://0xdata.atlassian.net/browse/PUBDEV-5381 for progress.")
    rest_ver = parms.pop("_rest_version", 3)

    model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms)
    model = H2OJob(model_builder_json, job_type=(self.algo + " Model Build"))

    if self._future:
        self._job = model
        self._rest_version = rest_ver
        return

    model.poll(verbose_model_scoring_history=verbose)
    model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
    self._resolve_model(model.dest_key, model_json)
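# A minimal sketch of the `extend_parms_fn` hook used by _train above: callers
# can pass a function that mutates the parms dict in place right before it is
# posted to the ModelBuilders endpoint. The parameter name and value below are
# hypothetical, for illustration only:
#
#     def _extend_parms(parms):
#         parms["custom_metric_func"] = "my_metric_ref"  # hypothetical entry
#
#     estimator._train(x=predictors, y="label", training_frame=train_frame,
#                      extend_parms_fn=_extend_parms)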
def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
          weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
          model_id=None, verbose=False):
    """
    Train the H2O model.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param H2OFrame training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold, offset, and weights).
    :param offset_column: The name or index of the column in training_frame that holds the offsets.
    :param fold_column: The name or index of the column in training_frame that holds the per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
    :param validation_frame: H2OFrame with validation data to be scored on while training.
    :param float max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
    :param ignored_columns: A list of column names or indices to be excluded from training. Cannot be
        specified together with ``x``.
    :param model_id: A custom id for the model; if not given, an id is generated automatically.
    :param bool verbose: Print scoring history to stdout. Defaults to False.
    """
    assert_is_type(training_frame, None, H2OFrame)
    assert_is_type(validation_frame, None, H2OFrame)
    assert_is_type(y, None, int, str)
    assert_is_type(x, None, int, str, [str, int], {str, int})
    assert_is_type(ignored_columns, None, [str, int], {str, int})
    assert_is_type(offset_column, None, int, str)
    assert_is_type(fold_column, None, int, str)
    assert_is_type(weights_column, None, int, str)
    assert_is_type(max_runtime_secs, None, numeric)
    assert_is_type(model_id, None, str)
    assert_is_type(verbose, bool)

    if self._requires_training_frame() and training_frame is None:
        raise H2OValueError("Training frame required for %s algorithm, but none was given." % self.algo)

    # NOTE: `training_frame_missing` is True when no training frame was supplied.
    training_frame_missing = training_frame is None
    if training_frame_missing:
        self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)

    algo = self.algo
    if verbose and algo not in ["drf", "gbm", "deeplearning", "xgboost"]:
        raise H2OValueError("Verbose should only be set to True for drf, gbm, deeplearning, and xgboost models")
    parms = self._parms.copy()
    if "__class__" in parms:  # FIXME: hack for PY3
        del parms["__class__"]
    is_auto_encoder = bool(parms.get("autoencoder"))
    is_supervised = not (is_auto_encoder or algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec"})
    if not training_frame_missing:
        names = training_frame.names
        ncols = training_frame.ncols
    if is_supervised:
        if y is None: y = "response"
        if is_type(y, int):
            if not (-ncols <= y < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % y)
            y = names[y]
        else:
            if y not in names:
                raise H2OValueError("Column %s does not exist in the training frame" % y)
        self._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor"
    else:
        # If `y` is provided for an unsupervised model we'll simply ignore it.
        # This way an unsupervised model can be used as a step in sklearn's pipeline.
        y = None

    if not training_frame_missing:
        assert_is_type(y, str, None)
        ignored_columns_set = set()
        if ignored_columns is not None:
            if x is not None:
                raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
            for ic in ignored_columns:
                if is_type(ic, int):
                    if not (-ncols <= ic < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % ic)
                    ignored_columns_set.add(names[ic])
                else:
                    if ic not in names:
                        raise H2OValueError("Column %s does not exist in the training frame" % ic)
                    ignored_columns_set.add(ic)
        if x is None:
            xset = set(names) - {y} - ignored_columns_set
        else:
            xset = set()
            if is_type(x, int, str): x = [x]
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError("Column %s does not exist in the training frame" % xi)
                    xset.add(xi)
        x = list(xset)

    parms["offset_column"] = offset_column
    parms["fold_column"] = fold_column
    parms["weights_column"] = weights_column
    parms["max_runtime_secs"] = max_runtime_secs

    # Overwrite the model_id parameter only if model_id was passed in.
    if model_id is not None:
        parms["model_id"] = model_id

    # Step 2
    is_auto_encoder = "autoencoder" in parms and parms["autoencoder"]
    is_unsupervised = is_auto_encoder or self.algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec"}
    if is_auto_encoder and y is not None:
        raise ValueError("y should not be specified for autoencoder.")
    if not is_unsupervised and y is None:
        raise ValueError("Missing response")

    # Step 3
    if not training_frame_missing:
        parms["training_frame"] = training_frame
        offset = parms["offset_column"]
        folds = parms["fold_column"]
        weights = parms["weights_column"]

    if validation_frame is not None: parms["validation_frame"] = validation_frame
    if is_type(y, int): y = training_frame.names[y]
    if y is not None: parms["response_column"] = y
    if not isinstance(x, (list, tuple)): x = [x]
    if is_type(x[0], int):
        x = [training_frame.names[i] for i in x]

    if not training_frame_missing:
        ignored_columns = list(set(training_frame.names) - set(x + [y, offset, folds, weights]))
        parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]

    parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None
                             else [quoted(col) for col in parms["interactions"]])
    parms["interaction_pairs"] = (None if "interaction_pairs" not in parms or parms["interaction_pairs"] is None
                                  else [tuple(map(quoted, ip)) for ip in parms["interaction_pairs"]])

    parms = {k: H2OEstimator._keyify_if_h2oframe(parms[k]) for k in parms}
    rest_ver = parms.pop("_rest_version", 3)

    model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms)
    model = H2OJob(model_builder_json, job_type=(self.algo + " Model Build"))

    if self._future:
        self._job = model
        self._rest_version = rest_ver
        return

    model.poll(verbose_model_scoring_history=verbose)
    model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
    self._resolve_model(model.dest_key, model_json)
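# A typical call to train(), sketched for illustration. The file path, column
# names, and choice of estimator are hypothetical; any H2OEstimator subclass
# follows the same pattern:
#
#     import h2o
#     from h2o.estimators.gbm import H2OGradientBoostingEstimator
#
#     h2o.init()
#     train_frame = h2o.import_file("train.csv")  # hypothetical dataset
#     model = H2OGradientBoostingEstimator(ntrees=50)
#     model.train(x=["x1", "x2"], y="label", training_frame=train_frame)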
def open(server=None, url=None, ip=None, port=None, https=None, auth=None,
         verify_ssl_certificates=True, proxy=None, cookies=None, verbose=True, _msgs=None):
    r"""
    Establish a connection to an existing H2O server.

    The connection is not kept alive, so what this method actually does is attempt to connect to the
    specified server, and check that the server is healthy and responds to REST API requests. If the H2O
    server cannot be reached, an :class:`H2OConnectionError` will be raised. On success this method
    returns a new :class:`H2OConnection` object, and it is the only "official" way to create instances
    of this class.

    There are 3 ways to specify the target to connect to (these settings are mutually exclusive):

        * pass a ``server`` option,
        * pass the full ``url`` for the connection,
        * provide a triple of parameters ``ip``, ``port``, ``https``.

    :param H2OLocalServer server: connect to the specified local server instance. There is a slight
        difference between connecting to a local server by specifying its IP address and port, and
        connecting through an H2OLocalServer instance: if the server becomes unresponsive, then having
        access to its process handle will allow us to query the server status through the OS and
        potentially provide a snapshot of the server's error log in the exception information.
    :param url: full url of the server to connect to.
    :param ip: target server's IP address or hostname (default "localhost").
    :param port: H2O server's port (default 54321).
    :param https: if True then connect using https instead of http (default False).
    :param verify_ssl_certificates: if False then SSL certificate checking will be disabled (default
        True). This setting should rarely be disabled, as it makes your connection vulnerable to
        man-in-the-middle attacks. When used, it will generate a warning from the requests library.
        Has no effect when ``https`` is False.
    :param auth: authentication token for connecting to the remote server. This can be either a
        (username, password) tuple, or an authenticator (AuthBase) object. Please refer to the
        documentation in the ``requests.auth`` module.
    :param proxy: url address of a proxy server. If you do not specify the proxy, then the requests
        module will attempt to use a proxy specified in the environment (in HTTP_PROXY / HTTPS_PROXY
        variables). We check for the presence of these variables and issue a warning if they are found.
        In order to suppress that warning and use the proxy from the environment, pass
        ``proxy="(default)"``.
    :param cookies: a cookie (or list of cookies) to add to each request.
    :param verbose: if True, then connection progress info will be printed to stdout.
    :param _msgs: custom messages to display during connection. This is a tuple
        (initial message, success message, failure message).

    :returns: A new :class:`H2OConnection` instance.
    :raises H2OConnectionError: if the server cannot be reached.
    :raises H2OServerError: if the server is in an unhealthy state (although this might be a recoverable
        error, the client itself should decide whether it wants to retry or not).
    """
    if server is not None:
        assert_is_type(server, H2OLocalServer)
        assert_is_type(ip, None, "`ip` should be None when `server` parameter is supplied")
        assert_is_type(url, None, "`url` should be None when `server` parameter is supplied")
        if not server.is_running():
            raise H2OConnectionError("Unable to connect to server because it is not running")
        ip = server.ip
        port = server.port
        scheme = server.scheme
        context_path = ''
    elif url is not None:
        assert_is_type(url, str)
        assert_is_type(ip, None, "`ip` should be None when `url` parameter is supplied")
        # We don't allow any Unicode characters in the URL. Maybe some day we will...
        match = assert_matches(url, H2OConnection.url_pattern)
        scheme = match.group(1)
        ip = match.group(2)
        port = int(match.group(3))
        context_path = '' if match.group(4) is None else match.group(4)
    else:
        if ip is None: ip = "localhost"
        if port is None: port = 54321
        if https is None: https = False
        if is_type(port, str) and port.isdigit(): port = int(port)
        assert_is_type(ip, str)
        assert_is_type(port, int)
        assert_is_type(https, bool)
        assert_matches(ip, r"(?:[\w-]+\.)*[\w-]+")
        assert_satisfies(port, 1 <= port <= 65535)
        scheme = "https" if https else "http"
        context_path = ''

    if verify_ssl_certificates is None: verify_ssl_certificates = True
    assert_is_type(verify_ssl_certificates, bool)
    assert_is_type(proxy, str, None)
    assert_is_type(auth, AuthBase, (str, str), None)
    assert_is_type(cookies, str, [str], None)
    assert_is_type(_msgs, None, (str, str, str))

    conn = H2OConnection()
    conn._verbose = bool(verbose)
    conn._local_server = server
    conn._base_url = "%s://%s:%d%s" % (scheme, ip, port, context_path)
    conn._verify_ssl_cert = bool(verify_ssl_certificates)
    conn._auth = auth
    conn._cookies = cookies
    conn._proxies = None
    if proxy and proxy != "(default)":
        conn._proxies = {scheme: proxy}
    elif not proxy:
        # Warn the user if any "*_proxy" variables are present in the environment. [PUBDEV-2504]
        # To suppress the warning pass proxy="(default)".
        for name in os.environ:
            if name.lower() == scheme + "_proxy":
                warn("Proxy is defined in the environment: %s. "
                     "This may interfere with your H2O Connection." % name)

    try:
        retries = 20 if server else 5
        conn._stage = 1
        conn._timeout = 3.0
        conn._cluster = conn._test_connection(retries, messages=_msgs)
        # If a server is unable to respond within 1s, it should be considered a bug. However we disable
        # this setting for now, for no reason other than to tolerate all those bugs :(
        conn._timeout = None
        # On the surface this registers a callback to be invoked when the script is about to finish;
        # it also has a side effect: the ``atexit`` service holds a reference to the current connection
        # until the very end, which means the connection will never be garbage-collected.
        atexit.register(lambda: conn.close())
    except Exception:
        # Reset _stage so that we know the connection was not initialized properly.
        conn._stage = 0
        raise
    return conn
def train(self, x=None, y=None, training_frame=None, fold_column=None,
          weights_column=None, validation_frame=None, leaderboard_frame=None):
    """
    Begin an AutoML task: a background task that automatically builds a number of models with various
    algorithms and tracks their performance in a leaderboard. At any point in the process you may use
    H2O's performance or prediction functions on the resulting models.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param fold_column: The name or index of the column in training_frame that holds per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds per-row weights.
    :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold_column or weights_column).
    :param validation_frame: H2OFrame with validation data to be scored on while training. Optional.
        This frame is used for early stopping of individual models and of the grid searches (unless
        max_models or max_runtime_secs overrides metric-based early stopping).
    :param leaderboard_frame: H2OFrame with test data for scoring the leaderboard. This is optional and
        if this is set to None (the default), then cross-validation metrics will be used to generate the
        leaderboard rankings instead.
    :returns: None. The trained models and the leaderboard are made available on this H2OAutoML object
        once the run completes.

    :examples:
    >>> # Set up an H2OAutoML object
    >>> aml = H2OAutoML(max_runtime_secs=30)
    >>> # Launch an AutoML run
    >>> aml.train(y=y, training_frame=train)
    """
    # Validate the training frame before touching its metadata.
    if training_frame is None:
        raise ValueError('The training frame is not set!')
    assert_is_type(training_frame, H2OFrame)
    ncols = training_frame.ncols
    names = training_frame.names

    # Minimal required arguments are training_frame and y (response)
    if y is None:
        raise ValueError('The response column (y) is not set; please set it to the name of the '
                         'column that you are trying to predict in your data.')
    assert_is_type(y, int, str)
    if is_type(y, int):
        if not (-ncols <= y < ncols):
            raise H2OValueError("Column %d does not exist in the training frame" % y)
        y = names[y]
    else:
        if y not in names:
            raise H2OValueError("Column %s does not exist in the training frame" % y)

    input_spec = {
        'response_column': y,
    }
    input_spec['training_frame'] = training_frame.frame_id

    if fold_column is not None:
        assert_is_type(fold_column, int, str)
        input_spec['fold_column'] = fold_column

    if weights_column is not None:
        assert_is_type(weights_column, int, str)
        input_spec['weights_column'] = weights_column

    if validation_frame is not None:
        assert_is_type(validation_frame, H2OFrame)
        input_spec['validation_frame'] = validation_frame.frame_id

    if leaderboard_frame is not None:
        assert_is_type(leaderboard_frame, H2OFrame)
        input_spec['leaderboard_frame'] = leaderboard_frame.frame_id

    if x is not None:
        assert_is_type(x, list)
        xset = set()
        for xi in x:
            if is_type(xi, int):
                if not (-ncols <= xi < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % xi)
                xset.add(names[xi])
            else:
                if xi not in names:
                    raise H2OValueError("Column %s does not exist in the training frame" % xi)
                xset.add(xi)
        x = list(xset)
        ignored_columns = set(names) - {y} - set(x)
        # Use discard() to drop these in place: set.remove() returns None (so its result must
        # not be assigned back) and raises KeyError when the column is absent from the set.
        if fold_column is not None:
            ignored_columns.discard(fold_column)
        if weights_column is not None:
            ignored_columns.discard(weights_column)
        if ignored_columns:
            input_spec['ignored_columns'] = list(ignored_columns)

    automl_build_params = dict(input_spec=input_spec)
    # NOTE: if the user hasn't specified some block of parameters, don't send it;
    # this lets the back end fall back to its defaults.
    automl_build_params['build_control'] = self.build_control
    automl_build_params['build_models'] = self.build_models

    resp = h2o.api('POST /99/AutoMLBuilder', json=automl_build_params)
    if 'job' not in resp:
        print("Exception from the back end:")
        print(resp)
        return
    self._job = H2OJob(resp['job'], "AutoML")
    self._automl_key = self._job.dest_key
    self._job.poll()
    self._fetch()
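# Follow-up sketch: once the AutoML job finishes, results are usually inspected
# through the object itself. This assumes the standard H2OAutoML accessors
# (e.g. a `leaderboard` property), which are defined elsewhere in this module;
# the training frame below is hypothetical:
#
#     aml = H2OAutoML(max_runtime_secs=30)
#     aml.train(y="label", training_frame=train_frame)
#     print(aml.leaderboard)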