def read_cms(cms=None, domains=None):
    """
    Build one ``ConfusionMatrix`` per entry of ``cms``.

    :param cms: a list whose every element is itself list-like.
    :param domains: passed unchanged as the second argument of every ``ConfusionMatrix``.
    :returns: a list of ``ConfusionMatrix`` objects, in the same order as ``cms``.
    :raises ValueError: if ``cms`` is None, is not a list, or contains an element that is not list-like.
    """
    if cms is None:
        raise ValueError("Missing data, no `cms`.")

    # The outer container must be a real list; the elements only need to be list-like.
    if not isinstance(cms, list) or any(not is_listlike(entry) for entry in cms):
        raise ValueError("`cms` must be a list of lists")

    return [ConfusionMatrix(matrix, domains) for matrix in cms]
def lazy_import(path):
    """
    Import one data file, or several of them.

    :param path: a path to a data file (remote or local), or a list-like of such paths.
    :returns: for a single path, whatever ``_import(path)`` returns; for a list-like, a list holding
        the first element of each per-path ``_import`` result.
    """
    if not is_listlike(path):
        # Single-path case: the argument is checked to be a string before importing.
        assert_is_str(path)
        return _import(path)

    return [_import(entry)[0] for entry in path]
def __init__(self, fr, by):
    """
    Set up a group-by over ``fr`` keyed on the column(s) given in ``by``.

    :param fr: the frame being grouped; its ``names`` list is used to turn column names into positions.
    :param by: a column name, a list-like of column names and/or other values (kept as given),
        or any other single value. The result is always stored as a list in ``self._by``.
    """
    self._fr = fr  # IN
    self._aggs = {}  # IN
    self._res = None  # OUT

    def _as_position(column):
        # Names are resolved against the frame; anything else passes through untouched.
        if is_str(column):
            return self._fr.names.index(column)
        return column

    if is_str(by):
        grouping = [self._fr.names.index(by)]
    elif is_listlike(by):
        grouping = [_as_position(member) for member in by]
    else:
        grouping = [by]
    self._by = grouping  # IN
def _add_agg(self, op, col, na):
    """
    Register the aggregation ``op`` for one or more columns.

    :param op: name of the aggregation; ``"nrow"`` always targets column 0 regardless of ``col``.
    :param col: a column name, a column index, a list-like of either, or None for every column that
        is not one of the group-by columns.
    :param na: stored alongside the aggregation, unchanged.
    :returns: ``self``, so calls can be chained.
    :raises ValueError: if ``col`` is not None, a string, an int or list-like.
    """
    if op == "nrow":
        col = 0

    if col is None:
        # No column requested: apply the aggregation to each non-grouping column in turn.
        for position in range(self._fr.ncol):
            if position in self._by:
                continue
            self._add_agg(op, position, na)
        return self

    if is_str(col):
        target = self._fr.names.index(col)
    elif is_int(col):
        target = col
    elif is_listlike(col):
        # Recurse once per member; each member goes through the same dispatch.
        for member in col:
            self._add_agg(op, member, na)
        return self
    else:
        raise ValueError("col must be a column name or index.")

    # Aggregations are keyed by "<op>_<column name>", so re-adding the same pair overwrites it.
    label = "{}_{}".format(op, self._fr.names[target])
    self._aggs[label] = [op, target, na]
    return self
def train(self, x, y=None, training_frame=None, offset_column=None, fold_column=None, weights_column=None,
          validation_frame=None, **params):
    """
    Assemble the parameters for a grid build and hand them to ``self.build_model``.

    The parameter dict is layered, later layers overriding earlier ones: a copy of ``self._parms``,
    the named arguments of this call, ``search_criteria`` / ``hyper_parameters`` taken from this
    object, the non-None entries of ``self.model._parms``, and finally ``**params``.

    :param x: forwarded under the key ``"x"``.
    :param y: response column reference, or a list-like holding exactly one such reference, or None.
    :param training_frame: required; indexed with ``y`` to decide the estimator type.
    :param offset_column: forwarded under the key ``"offset_column"``.
    :param fold_column: forwarded under the key ``"fold_column"``.
    :param weights_column: forwarded under the key ``"weights_column"``.
    :param validation_frame: forwarded under the key ``"validation_frame"``.
    :param params: extra parameters; they override everything collected before them.
    :raises ValueError: if ``training_frame`` is None, or ``y`` is list-like with a length other than 1.
    """
    # same api as estimator_base train
    # The named arguments are listed explicitly instead of being harvested with locals(): that
    # relied on an exclusion list of local variable names and could pick up implicit names.
    algo_params = {
        "x": x,
        "y": y,
        "training_frame": training_frame,
        "offset_column": offset_column,
        "fold_column": fold_column,
        "weights_column": weights_column,
        "validation_frame": validation_frame,
    }
    parms = self._parms.copy()
    parms.update(algo_params)
    parms["search_criteria"] = self.search_criteria
    parms["hyper_parameters"] = self.hyper_params  # unique to grid search
    parms.update({k: v for k, v in self.model._parms.items() if v is not None})  # unique to grid search
    parms.update(params)
    # Kept from the original: a "__class__" entry must never reach the build parameters.
    parms.pop("__class__", None)

    if training_frame is None:
        raise ValueError("Missing training_frame")
    if y is not None:
        if is_listlike(y):
            if len(y) != 1:
                raise ValueError('y must be a single column reference')
            parms["y"] = y[0]
        self._estimator_type = "classifier" if training_frame[y].isfactor() else "regressor"
    self.build_model(parms)
def _model_build(self, x, y, tframe, vframe, kwargs):
    """
    Submit the grid build request and, unless running in "future" mode, wait for it and load the models.

    :param x: predictor column(s): a single name or index, or a list-like of names or of indices.
    :param y: response column name or index, or None.
    :param tframe: training frame; its ``names`` are used to turn indices into column names.
    :param vframe: validation frame, or None.
    :param kwargs: dict of build parameters. It must contain the keys ``offset_column``,
        ``fold_column`` and ``weights_column``. It is updated in place with the frame, response and
        ignored-column entries before a cleaned copy is sent in the request.
    :returns: None. When ``self._future`` is set the job is stored in ``self._job`` and the method
        returns right after submitting it.
    :raises ValueError: if the finished grid contains no models.
    """
    kwargs['training_frame'] = tframe
    if vframe is not None:
        kwargs["validation_frame"] = vframe
    if is_int(y):
        y = tframe.names[y]
    if y is not None:
        kwargs['response_column'] = y
    if not is_listlike(x):
        x = [x]
    if is_int(x[0]):
        frame_names = tframe.names
        x = [frame_names[i] for i in x]
    offset = kwargs["offset_column"]
    folds = kwargs["fold_column"]
    weights = kwargs["weights_column"]
    # Every frame column that is not a predictor or a special column is ignored. A set union is
    # used so that x may be any iterable; list concatenation only worked when x was a list.
    used_columns = set(x) | {y, offset, folds, weights}
    ignored_columns = list(set(tframe.names) - used_columns)
    kwargs["ignored_columns"] = None if not ignored_columns else [quoted(col) for col in ignored_columns]
    # Drop unset parameters and replace frames by their ids before sending the request.
    kwargs = {k: (v.frame_id if isinstance(v, H2OFrame) else v) for k, v in kwargs.items() if v is not None}
    algo = self.model._compute_algo()  # unique to grid search
    if self.grid_id is not None:
        kwargs["grid_id"] = self.grid_id
    rest_ver = kwargs.pop("_rest_version", None)

    grid = H2OJob(h2o.api("POST /99/Grid/%s" % algo, data=kwargs), job_type=(algo + " Grid Build"))

    if self._future:
        self._job = grid
        return

    grid.poll()
    grid_json = h2o.api("GET /99/Grids/%s" % grid.dest_key)

    # Failure details are only reported when an explicit REST version was requested.
    if rest_ver is not None and grid_json["failure_details"]:
        print("Errors/Warnings building gridsearch model\n")
        for error_index, error_message in enumerate(grid_json["failure_details"]):
            failed_params = grid_json["failed_params"][error_index]
            if isinstance(failed_params, dict):
                for h_name in grid_json['hyper_names']:
                    print("Hyper-parameter: {0}, {1}".format(h_name, failed_params[h_name]))

            if len(grid_json["failure_stack_traces"]) > error_index:
                print("failure_details: {0}\nfailure_stack_traces: "
                      "{1}\n".format(error_message, grid_json['failure_stack_traces'][error_index]))

    model_ids = grid_json['model_ids']
    self.models = [h2o.get_model(key['name']) for key in model_ids]

    # get first model returned in list of models from grid search to get model class (binomial, multinomial, etc)
    # sometimes no model is returned due to bad parameter values provided by the user.
    if len(model_ids) == 0:
        raise ValueError("Gridsearch returns no model due to bad parameter values or other reasons....")

    first_model_json = h2o.api("GET /%d/Models/%s" % (rest_ver or 3, model_ids[0]['name']))['models'][0]
    self._resolve_grid(grid.dest_key, grid_json, first_model_json)