Exemplo n.º 1
0
    def __init__(self, fr, by):
        """
        Create a ``GroupBy`` wrapper around the H2OFrame ``fr``, grouping on the
        column(s) named in ``by``.

        The source frame is kept in member ``_fr``; information on the new
        grouping of the original frame is described in a new H2OFrame in member
        ``frame``.  The returned groups are sorted by the natural group-by
        column sort.

        :param H2OFrame fr: frame on which the group-by operation is performed.
        :param by: a single column given as a name (str) or an index (int), or
            a list/tuple of such values denoting the set of columns to group by.
        """
        self._fr = fr  # IN
        self._aggs = {}  # IN
        self._res = None  # OUT

        # Normalize `by` into a list of column indices.
        if is_type(by, str):
            normalized = [self._fr.names.index(by)]
        elif is_type(by, list, tuple):
            normalized = [self._fr.names.index(col) if is_type(col, str) else col
                          for col in by]
        else:
            normalized = [by]
        self._by = normalized  # IN
Exemplo n.º 2
0
 def _arg_to_expr(arg):
     """
     Render a Python value as a Rapids expression-string token.

     Dispatch is order-sensitive: ``bool`` is tested before ``numeric`` because
     ``bool`` is a subclass of ``int``, and ``ExprNode``/``ASTId`` take
     precedence over plain containers.

     :param arg: value to serialize (None, ExprNode, ASTId, bool, number,
         str, slice, range, or a flat list of strings or numbers).
     :returns: the Rapids string representation of ``arg``.
     :raises ValueError: if ``arg`` is of an unsupported type.
     """
     # Ranges are materialized so they fall into the list branch below.
     if arg is not None and isinstance(arg, range): arg = list(arg)
     if arg is None:
         return "[]"  # empty list
     elif isinstance(arg, ExprNode):
         return arg._get_ast_str(False)
     elif isinstance(arg, ASTId):
         return str(arg)
     elif isinstance(arg, bool):
         return "{}".format("TRUE" if arg else "FALSE")
     elif is_type(arg, numeric):
         return "{}".format("NaN" if math.isnan(arg) else arg)
     elif is_type(arg, str):
         return '"' + arg + '"'
     elif isinstance(arg, slice):
         # Encoded as [start:length]: when a start is given, the second field
         # is the span (stop - start); a missing/NaN stop becomes "NaN".
         return "[{}:{}]".format(
             0 if arg.start is None else arg.start, "NaN" if
             (arg.stop is None or math.isnan(arg.stop)) else
             (arg.stop) if arg.start is None else (arg.stop - arg.start))
     elif isinstance(arg, list):
         if is_type(arg, [str]):
             return "[%s]" % " ".join('"%s"' % elem for elem in arg)
         else:
             # NOTE(review): math.isnan(i) raises TypeError for non-numeric
             # elements in a mixed list — assumes numeric-only here; confirm.
             return "[%s]" % " ".join(
                 "NaN" if i == 'NaN' or math.isnan(i) else str(i)
                 for i in arg)
     raise ValueError("Unexpected arg type: " + str(type(arg)) + " " +
                      str(arg.__class__) + " " + arg.__repr__())
Exemplo n.º 3
0
 def __getitem__(self, item):
     """
     Select data from the table.

     - int or str: select a single column (by index or header name) and
       return it as a plain list of cell values;
     - slice: select rows, MUTATING this table in place and returning self;
     - list of int/str: select multiple columns, returned as a list of lists.

     :raises H2OValueError: when a column index/name does not exist.
     :raises TypeError: for any other selector type.
     """
     if is_type(item, int, str):
         # single col selection returns list
         if is_type(item, int):
             index = item
             # Support negative indices, Python-style.
             if index < 0: index += len(self._col_header)
             if index < 0 or index >= len(self._col_header):
                 raise H2OValueError("Index %d is out of range" % item)
         else:
             if item in self._col_header:
                 index = self._col_header.index(item)
             else:
                 raise H2OValueError(
                     "Column `%s` does not exist in the table" % item)
         return [row[index] for row in self._cell_values]
     elif isinstance(item, slice):
         # row selection if item is slice returns H2OTwoDimTable
         # FIXME! slice behavior should be consistent with other selectors - return columns instead of rows...
         # NOTE: this discards the non-selected rows from this instance.
         self._cell_values = [
             self._cell_values[ii]
             for ii in range(*item.indices(len(self._cell_values)))
         ]
         return self
     elif is_type(item, [int, str]):
         # multiple col selection returns list of cols
         return [self[i] for i in item]
     else:
         raise TypeError('can not support getting item for ' + str(item))
Exemplo n.º 4
0
    def _model_build(self, x, y, tframe, vframe, kwargs):
        """
        Launch the grid-search build on the backend and collect the resulting models.

        Normalizes ``x``/``y`` to column names, fills in ``ignored_columns``,
        POSTs the grid job, then (unless running asynchronously) polls it,
        prints any per-model failures reported by the server, and resolves the
        grid from the first returned model.

        :param x: predictor column name(s) or index(es).
        :param y: response column name or index (may be None).
        :param tframe: training H2OFrame.
        :param vframe: optional validation H2OFrame.
        :param kwargs: dict of remaining build parameters; mutated/rebuilt here.
        :raises ValueError: when the grid returns no model.
        """
        kwargs['training_frame'] = tframe
        if vframe is not None: kwargs["validation_frame"] = vframe
        # Convert column indices to names so everything below works on names.
        if is_type(y, int): y = tframe.names[y]
        if y is not None: kwargs['response_column'] = y
        if not is_type(x, list, tuple): x = [x]
        if is_type(x[0], int):
            x = [tframe.names[i] for i in x]
        offset = kwargs["offset_column"]
        folds = kwargs["fold_column"]
        weights = kwargs["weights_column"]
        # Everything not used as predictor/response/special column is ignored.
        ignored_columns = list(set(tframe.names) - set(x + [y, offset, folds, weights]))
        kwargs["ignored_columns"] = None if not ignored_columns else [quoted(col) for col in ignored_columns]
        # Drop None entries and replace frames by their ids for the REST call.
        kwargs = dict([(k, kwargs[k].frame_id if isinstance(kwargs[k], H2OFrame) else kwargs[k]) for k in kwargs if
                       kwargs[k] is not None])  # gruesome one-liner
        algo = self.model._compute_algo()  # unique to grid search
        if self.grid_id is not None: kwargs["grid_id"] = self.grid_id
        rest_ver = kwargs.pop("_rest_version") if "_rest_version" in kwargs else None

        grid = H2OJob(h2o.api("POST /99/Grid/%s" % algo, data=kwargs), job_type=(algo + " Grid Build"))

        # Asynchronous mode: stash the job and let the caller poll it.
        if self._future:
            self._job = grid
            return

        grid.poll()

        grid_json = h2o.api("GET /99/Grids/%s" % (grid.dest_key))
        failure_messages_stacks = ""
        error_index = 0
        if len(grid_json["failure_details"]) > 0:
            print("Errors/Warnings building gridsearch model\n")
            # Collected messages are raised later if no model is returned.

            for error_message in grid_json["failure_details"]:
                if isinstance(grid_json["failed_params"][error_index], dict):
                    for h_name in grid_json['hyper_names']:
                        print("Hyper-parameter: {0}, {1}".format(h_name,
                                                                 grid_json['failed_params'][error_index][h_name]))

                if len(grid_json["failure_stack_traces"]) > error_index:
                    print("failure_details: {0}\nfailure_stack_traces: "
                          "{1}\n".format(error_message, grid_json['failure_stack_traces'][error_index]))
                    failure_messages_stacks += error_message+'\n'
                error_index += 1

        self.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]

        # get first model returned in list of models from grid search to get model class (binomial, multinomial, etc)
        # sometimes no model is returned due to bad parameter values provided by the user.
        if len(grid_json['model_ids']) > 0:
            first_model_json = h2o.api("GET /%d/Models/%s" %
                                       (rest_ver or 3, grid_json['model_ids'][0]['name']))['models'][0]
            self._resolve_grid(grid.dest_key, grid_json, first_model_json)
        else:
            if len(failure_messages_stacks)>0:
                raise ValueError(failure_messages_stacks)
            else:
                raise ValueError("Gridsearch returns no model due to bad parameter values or other reasons....")
Exemplo n.º 5
0
    def _model_build(self, x, y, tframe, vframe, kwargs):
        """
        Submit the grid-search build job to the backend and gather its models.

        Converts ``x``/``y`` indices to column names, computes the
        ``ignored_columns`` set, POSTs the grid build, and — unless the search
        is running asynchronously — polls the job, reports any server-side
        model failures, and resolves the grid metadata from the first model.

        :param x: predictor column name(s) or index(es).
        :param y: response column name or index (may be None).
        :param tframe: training H2OFrame.
        :param vframe: optional validation H2OFrame.
        :param kwargs: remaining build parameters; mutated/rebuilt here.
        :raises ValueError: when no model comes back from the grid search.
        """
        kwargs['training_frame'] = tframe
        if vframe is not None: kwargs["validation_frame"] = vframe
        # Work with column names from here on.
        if is_type(y, int): y = tframe.names[y]
        if y is not None: kwargs['response_column'] = y
        if not is_type(x, list, tuple): x = [x]
        if is_type(x[0], int):
            x = [tframe.names[i] for i in x]
        offset = kwargs["offset_column"]
        folds = kwargs["fold_column"]
        weights = kwargs["weights_column"]
        # Columns that are neither predictors nor special columns get ignored.
        ignored_columns = list(set(tframe.names) - set(x + [y, offset, folds, weights]))
        kwargs["ignored_columns"] = None if not ignored_columns else [quoted(col) for col in ignored_columns]
        # Strip None values and send frame ids instead of frame objects.
        kwargs = dict([(k, kwargs[k].frame_id if isinstance(kwargs[k], H2OFrame) else kwargs[k]) for k in kwargs if
                       kwargs[k] is not None])  # gruesome one-liner
        algo = self.model._compute_algo()  # unique to grid search
        if self.grid_id is not None: kwargs["grid_id"] = self.grid_id
        rest_ver = kwargs.pop("_rest_version") if "_rest_version" in kwargs else None

        grid = H2OJob(h2o.api("POST /99/Grid/%s" % algo, data=kwargs), job_type=(algo + " Grid Build"))

        # Asynchronous mode: remember the job; caller is responsible for polling.
        if self._future:
            self._job = grid
            return

        grid.poll()

        grid_json = h2o.api("GET /99/Grids/%s" % (grid.dest_key))
        failure_messages_stacks = ""
        error_index = 0
        if len(grid_json["failure_details"]) > 0:
            print("Errors/Warnings building gridsearch model\n")
            # Failure messages are accumulated and raised if no model is built.

            for error_message in grid_json["failure_details"]:
                if isinstance(grid_json["failed_params"][error_index], dict):
                    for h_name in grid_json['hyper_names']:
                        print("Hyper-parameter: {0}, {1}".format(h_name,
                                                                 grid_json['failed_params'][error_index][h_name]))

                if len(grid_json["failure_stack_traces"]) > error_index:
                    print("failure_details: {0}\nfailure_stack_traces: "
                          "{1}\n".format(error_message, grid_json['failure_stack_traces'][error_index]))
                    failure_messages_stacks += error_message+'\n'
                error_index += 1

        self.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]

        # get first model returned in list of models from grid search to get model class (binomial, multinomial, etc)
        # sometimes no model is returned due to bad parameter values provided by the user.
        if len(grid_json['model_ids']) > 0:
            first_model_json = h2o.api("GET /%d/Models/%s" %
                                       (rest_ver or 3, grid_json['model_ids'][0]['name']))['models'][0]
            self._resolve_grid(grid.dest_key, grid_json, first_model_json)
        else:
            if len(failure_messages_stacks)>0:
                raise ValueError(failure_messages_stacks)
            else:
                raise ValueError("Gridsearch returns no model due to bad parameter values or other reasons....")
Exemplo n.º 6
0
 def assert_is_step(s):
     """Validate one modeling-plan step; return True or raise AssertionError."""
     assert is_type(s, dict), \
         "each step must be a dict with an 'id' key and an optional 'weight' key"
     assert 'id' in s, "each step must have an 'id' key"
     # Beyond 'id', the only extra key allowed is an integer 'weight'.
     weight_ok = len(s) == 1 or ('weight' in s and is_type(s['weight'], int))
     assert weight_ok, "weight must be an integer"
     return True
Exemplo n.º 7
0
    def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None, weights_column=None,
              validation_frame=None, **params):
        """
        Train the model synchronously (i.e. do not return until the model finishes training).

        To train asynchronously call :meth:`start`.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).
        :param offset_column: The name or index of the column in training_frame that holds the offsets.
        :param fold_column: The name or index of the column in training_frame that holds the per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
        :param validation_frame: H2OFrame with validation data to be scored on while training.
        :raises ValueError: if training_frame is missing or y is a multi-column reference.
        :raises H2OValueError: if a predictor column does not exist in training_frame.
        """
        # Capture the call's arguments by name, then fold them into the parms dict.
        algo_params = locals()
        parms = self._parms.copy()
        parms.update({k: v for k, v in algo_params.items() if k not in ["self", "params", "algo_params", "parms"]})
        # dictionaries have special handling in grid search, avoid the implicit conversion
        parms["search_criteria"] = None if self.search_criteria is None else str(self.search_criteria)
        parms["export_checkpoints_dir"] = self.export_checkpoints_dir
        parms["parallelism"] = self._parallelism
        parms["hyper_parameters"] = None if self.hyper_params  is None else str(self.hyper_params) # unique to grid search
        parms.update({k: v for k, v in list(self.model._parms.items()) if v is not None})  # unique to grid search
        parms.update(params)
        if '__class__' in parms:  # FIXME: hackt for PY3
            del parms['__class__']
        y = algo_params["y"]
        tframe = algo_params["training_frame"]
        if tframe is None: raise ValueError("Missing training_frame")
        if y is not None:
            # A single-element list/tuple is unwrapped; anything longer is an error.
            # NOTE(review): only parms["y"] is unwrapped — the local `y` stays a
            # list, which would make the {y} set below fail; confirm intended.
            if is_type(y, list, tuple):
                if len(y) == 1:
                    parms["y"] = y[0]
                else:
                    raise ValueError('y must be a single column reference')
        if x is None:
            # No predictors given: use every column except the response.
            if(isinstance(y, int)):
                xset = set(range(training_frame.ncols)) - {y}
            else:
                xset = set(training_frame.names) - {y}
        else:
            xset = set()
            if is_type(x, int, str): x = [x]
            for xi in x:
                if is_type(xi, int):
                    # Accept negative indices within frame bounds.
                    if not (-training_frame.ncols <= xi < training_frame.ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % xi)
                    xset.add(training_frame.names[xi])
                else:
                    if xi not in training_frame.names:
                        raise H2OValueError("Column %s not in the training frame" % xi)
                    xset.add(xi)
        x = list(xset)
        parms["x"] = x
        self.build_model(parms)
Exemplo n.º 8
0
    def __validate_modeling_plan(self, modeling_plan):
        """
        Validate and normalize a user-supplied modeling plan.

        Each entry of ``modeling_plan`` may be a dict (full step definition),
        a plain string (step-group name), or a 1- or 2-tuple of
        ``(name,)`` / ``(name, alias)`` / ``(name, [step_ids])``.  All forms
        are normalized to dicts.

        :param modeling_plan: the plan to validate, or None.
        :returns: a list of normalized step-definition dicts, or None.
        :raises AssertionError: when a definition does not match the schema.
        """
        if modeling_plan is None:
            return None

        supported_aliases = ['all', 'defaults', 'grids']

        def assert_is_step_def(sd):
            # Validate one step-definition dict: name [+ alias | + steps].
            assert 'name' in sd, "each definition must have a 'name' key"
            assert 0 < len(
                sd
            ) < 3, "each definition must have only 1 or 2 keys: name, name+alias or name+steps"
            assert len(
                sd
            ) == 1 or 'alias' in sd or 'steps' in sd, "steps definitions support only the following keys: name, alias, steps"
            assert 'alias' not in sd or sd[
                'alias'] in supported_aliases, "alias must be one of %s" % supported_aliases
            assert 'steps' not in sd or (is_type(sd['steps'], list) and all(
                assert_is_step(s) for s in sd['steps']))

        def assert_is_step(s):
            # Validate one step dict: id [+ integer weight] [+ integer group].
            assert is_type(
                s, dict
            ), "each step must be a dict with an 'id' key and optional keys among: weight, group"
            assert 'id' in s, "each step must have an 'id' key"
            assert len(
                s
            ) == 1 or 'weight' in s or 'group' in s, "steps support only the following keys: weight, group"
            assert 'weight' not in s or is_type(
                s['weight'], int), "weight must be an integer"
            assert 'group' not in s or is_type(s['group'],
                                               int), "group must be an integer"
            return True

        plan = []
        for step_def in modeling_plan:
            assert_is_type(step_def, dict, tuple, str)
            if is_type(step_def, dict):
                assert_is_step_def(step_def)
                plan.append(step_def)
            elif is_type(step_def, str):
                # Bare string: shorthand for a name-only definition.
                plan.append(dict(name=step_def))
            else:
                # Tuple form: (name,) or (name, alias-or-ids).
                assert 0 < len(step_def) < 3
                assert_is_type(step_def[0], str)
                name = step_def[0]
                if len(step_def) == 1:
                    plan.append(dict(name=name))
                else:
                    assert_is_type(step_def[1], str, list)
                    ids = step_def[1]
                    if is_type(ids, str):
                        # Second element is an alias literal.
                        assert_is_type(ids, *supported_aliases)
                        plan.append(dict(name=name, alias=ids))
                    else:
                        plan.append(
                            dict(name=name, steps=[dict(id=i) for i in ids]))
        return plan
Exemplo n.º 9
0
    def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None, weights_column=None,
              validation_frame=None, **params):
        """
        Train the model synchronously (i.e. do not return until the model finishes training).

        To train asynchronously call :meth:`start`.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).
        :param offset_column: The name or index of the column in training_frame that holds the offsets.
        :param fold_column: The name or index of the column in training_frame that holds the per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
        :param validation_frame: H2OFrame with validation data to be scored on while training.
        :raises ValueError: if training_frame is missing or y references multiple columns.
        :raises H2OValueError: if a predictor column is absent from training_frame.
        """
        # Snapshot this call's named arguments, then merge them into parms.
        algo_params = locals()
        parms = self._parms.copy()
        parms.update({k: v for k, v in algo_params.items() if k not in ["self", "params", "algo_params", "parms"]})
        # dictionaries have special handling in grid search, avoid the implicit conversion
        parms["search_criteria"] = None if self.search_criteria is None else str(self.search_criteria)
        parms["hyper_parameters"] = None if self.hyper_params  is None else str(self.hyper_params) # unique to grid search
        parms.update({k: v for k, v in list(self.model._parms.items()) if v is not None})  # unique to grid search
        parms.update(params)
        if '__class__' in parms:  # FIXME: hackt for PY3
            del parms['__class__']
        y = algo_params["y"]
        tframe = algo_params["training_frame"]
        if tframe is None: raise ValueError("Missing training_frame")
        if y is not None:
            # Unwrap a single-element list/tuple; longer sequences are rejected.
            # NOTE(review): the local `y` is not unwrapped, so the {y} set math
            # below would fail for list input — confirm intended behavior.
            if is_type(y, list, tuple):
                if len(y) == 1:
                    parms["y"] = y[0]
                else:
                    raise ValueError('y must be a single column reference')
        if x is None:
            # Default predictors: all columns except the response.
            if(isinstance(y, int)):
                xset = set(range(training_frame.ncols)) - {y}
            else:
                xset = set(training_frame.names) - {y}
        else:
            xset = set()
            if is_type(x, int, str): x = [x]
            for xi in x:
                if is_type(xi, int):
                    # Negative indices are allowed within frame bounds.
                    if not (-training_frame.ncols <= xi < training_frame.ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % xi)
                    xset.add(training_frame.names[xi])
                else:
                    if xi not in training_frame.names:
                        raise H2OValueError("Column %s not in the training frame" % xi)
                    xset.add(xi)
        x = list(xset)
        parms["x"] = x
        self.build_model(parms)
Exemplo n.º 10
0
    def __init__(self, fr, by):
        """Construct a GroupBy on frame ``fr`` grouped by the column(s) in ``by``."""
        self._fr = fr  # IN
        self._aggs = {}  # IN
        self._res = None  # OUT

        # Reduce `by` (name, index, or a sequence of either) to a list of indices.
        if is_type(by, str):
            by_cols = [self._fr.names.index(by)]
        elif is_type(by, list, tuple):
            by_cols = [self._fr.names.index(b) if is_type(b, str) else b
                       for b in by]
        else:
            by_cols = [by]
        self._by = by_cols  # IN
Exemplo n.º 11
0
 def assert_is_step(s):
     """Check one step dict (keys: id, optional weight/group); return True."""
     assert is_type(s, dict), \
         "each step must be a dict with an 'id' key and optional keys among: weight, group"
     assert 'id' in s, "each step must have an 'id' key"
     # Only 'weight' and 'group' may accompany 'id', and both must be ints.
     assert len(s) == 1 or 'weight' in s or 'group' in s, \
         "steps support only the following keys: weight, group"
     assert 'weight' not in s or is_type(s['weight'], int), \
         "weight must be an integer"
     assert 'group' not in s or is_type(s['group'], int), \
         "group must be an integer"
     return True
Exemplo n.º 12
0
    def base_models(self, base_models):
        """Set the base models, accepting Keyed objects, estimators, grids, or model ids."""
        def _key_of(item):
            # Keyed objects are stored by their server-side key; plain ids pass through.
            return item.key if isinstance(item, Keyed) else item

        models = base_models if is_type(base_models, list) else [base_models]
        if is_type(models, [H2OEstimator, H2OGridSearch, str]):
            self._parms["base_models"] = [_key_of(m) for m in models]
        else:
            assert_is_type(models, None)
Exemplo n.º 13
0
    def __init__(self, fr, by):
        """Build a GroupBy over ``fr`` using the grouping column(s) given in ``by``."""
        self._fr = fr  # IN
        self._by = by  # IN
        self._aggs = {}  # IN
        self._res = None  # OUT

        # Convert whatever form `by` takes into a list of column indices.
        if is_type(by, list, tuple):
            self._by = [self._fr.names.index(item) if is_type(item, str) else item
                        for item in by]
        elif is_type(by, str):
            self._by = [self._fr.names.index(by)]
        else:
            self._by = [self._by]
Exemplo n.º 14
0
    def get_hyperparams_dict(self, id, display=True):
        """
        Derived and returned the model parameters used to train the particular grid search model.

        Parameters
        ----------
        id: str
          The model id of the model with hyperparameters of interest.
        display: boolean
          Flag to indicate whether to display the hyperparameter names.

        Returns
        -------
          A dict of model parameters derived from the hyper-parameters used to train this particular model.
        """
        # `id` may also be an int index into self.model_ids.
        idx = id if is_type(id, int) else self.model_ids.index(id)
        model = self[idx]

        model_params = dict()

        # if cross-validation is turned on, parameters in one of the fold model actual contains the max_runtime_secs
        # parameter and not the main model that is returned.
        if model._is_xvalidated:
            model = h2o.get_model(model._xval_keys[0])

        # 'actual' values may be wrapped in a single-element list.
        for param_name in self.hyper_names:
            model_params[param_name] = model.params[param_name]['actual'][0] if \
                isinstance(model.params[param_name]['actual'], list) else model.params[param_name]['actual']

        # NOTE(review): values come from hyper_names but the printed names come
        # from hyper_params.keys() — confirm these two collections agree.
        if display: print('Hyperparameters: [' + ', '.join(list(self.hyper_params.keys())) + ']')
        return model_params
Exemplo n.º 15
0
 def base_models(self, base_models):
     """Set the base models, accepting either estimator instances or model ids."""
     if is_type(base_models, [H2OEstimator]):
         # Estimators are stored by their model id.
         self._parms["base_models"] = [model.model_id for model in base_models]
     else:
         assert_is_type(base_models, None, [str])
         self._parms["base_models"] = base_models
Exemplo n.º 16
0
 def __init__(self,
              model,
              hyper_params,
              grid_id=None,
              search_criteria=None,
              export_checkpoints_dir=None,
              parallelism=1):
     """
     Set up a grid search over ``hyper_params`` for the given estimator.

     :param model: an H2OEstimator instance or class (a class is instantiated here), or None.
     :param dict hyper_params: mapping of parameter name to the list of values to search.
     :param str grid_id: optional id for the grid on the backend.
     :param dict search_criteria: optional search-strategy settings.
     :param export_checkpoints_dir: optional directory for exported checkpoints.
     :param int parallelism: degree of parallelism during model building.
     """
     assert_is_type(model, None, H2OEstimator,
                    lambda mdl: issubclass(mdl, H2OEstimator))
     assert_is_type(hyper_params, dict)
     assert_is_type(grid_id, None, str)
     assert_is_type(search_criteria, None, dict)
     # If a class (not an instance) was passed, instantiate it.
     if not (model is None or is_type(model, H2OEstimator)): model = model()
     # NOTE(review): stored as _id; other code references self.grid_id —
     # confirm a corresponding property exists on this class.
     self._id = grid_id
     self.model = model
     # Defensive copies so caller mutations don't leak in.
     self.hyper_params = dict(hyper_params)
     self.search_criteria = None if search_criteria is None else dict(
         search_criteria)
     self.export_checkpoints_dir = export_checkpoints_dir
     self._parallelism = parallelism  # Degree of parallelism during model building
     self._grid_json = None
     self.models = None  # list of H2O Estimator instances
     self._parms = {}  # internal, for object recycle #
     self.parms = {}  # external#
     self._future = False  # used by __repr__/show to query job state#
     self._job = None  # used when _future is True#
Exemplo n.º 17
0
 def __new__(cls, keyvals):
     """
     Deserialize a JSON response into the schema-specific H2O object.

     Called by simplejson via ``object_pairs_hook``; ``keyvals`` is a list of
     (key, value) tuples, e.g.
     ``[("schema_version", 3), ("schema_name", "InitIDV3"), ("schema_type", "Iced")]``.
     """
     # Determine the schema name from the "__meta" or "__schema" entry.
     schema = None
     for key, value in keyvals:
         if key == "__meta" and isinstance(value, dict):
             schema = value["schema_name"]
             break
         if key == "__schema" and is_type(value, str):
             schema = value
             break
     # Dispatch to the dedicated maker for known schemas.
     makers = {
         "MetadataV3": H2OMetadataV3.make,
         "CloudV3": H2OCluster.make,
         "H2OErrorV3": H2OErrorV3.make,
         "H2OModelBuilderErrorV3": H2OModelBuilderErrorV3.make,
         "TwoDimTableV3": H2OTwoDimTable.make,
         "ModelMetricsRegressionV3": H2ORegressionModelMetrics.make,
         "ModelMetricsClusteringV3": H2OClusteringModelMetrics.make,
         "ModelMetricsBinomialV3": H2OBinomialModelMetrics.make,
         "ModelMetricsBinomialUpliftV3": H2OBinomialUpliftModelMetrics.make,
         "ModelMetricsMultinomialV3": H2OMultinomialModelMetrics.make,
         "ModelMetricsOrdinalV3": H2OOrdinalModelMetrics.make,
         "ModelMetricsAutoEncoderV3": H2OAutoEncoderModelMetrics.make,
     }
     if schema in makers:
         return makers[schema](keyvals)
     # Unknown schema: fall back to the plain H2OResponse construction.
     return super(H2OResponse, cls).__new__(cls, keyvals)
Exemplo n.º 18
0
    def get_hyperparams(self, id, display=True):
        """
        Get the hyperparameters of a model explored by grid search.

        Parameters
        ----------
        id: str
          The model id of the model with hyperparameters of interest.
        display: boolean
          Flag to indicate whether to display the hyperparameter names.

        Returns
        -------
          A list of the hyperparameters for the specified model.
        """
        # `id` may also be an int index into self.model_ids.
        idx = id if is_type(id, int) else self.model_ids.index(id)
        model = self[idx]

        # if cross-validation is turned on, parameters in one of the fold model actually contains the max_runtime_secs
        # parameter and not the main model that is returned.
        if model._is_xvalidated:
            model = h2o.get_model(model._xval_keys[0])

        # 'actual' values may be wrapped in a single-element list.
        res = [model.params[h]['actual'][0] if isinstance(model.params[h]['actual'], list)
               else model.params[h]['actual']
               for h in self.hyper_params]
        if display: print('Hyperparameters: [' + ', '.join(list(self.hyper_params.keys())) + ']')
        return res
Exemplo n.º 19
0
def _handle_python_dicts(python_obj, check_header):
    """
    Convert a Python dict of column-name -> values into header + row dicts.

    :param dict python_obj: mapping of column name to a flat list/tuple of
        values, or to a single str/number (promoted to a one-element list).
    :param check_header: unused here — kept for signature parity with the
        sibling ``_handle_python_*`` converters (assumption; confirm callers).
    :returns: tuple ``(header, data_to_write)`` where ``header`` is the list of
        column names and ``data_to_write`` is a list of per-row dicts.
    :raises ValueError: for invalid column names, nested lists, or
        unsupported value types.
    """
    header = list(python_obj.keys()) if python_obj else _gen_header(1)
    # Column names must be valid identifiers (letters, digits, '_', '.').
    is_valid = all(
        re.match(r"^[a-zA-Z_][a-zA-Z0-9_.]*$", col)
        for col in header)  # is this a valid header?
    if not is_valid:
        raise ValueError(
            "Did not get a valid set of column names! Must match the regular expression: ^[a-zA-Z_][a-zA-Z0-9_.]*$ "
        )
    for k in python_obj:  # check that each value entry is a flat list/tuple or single int, float, or string
        v = python_obj[k]
        if isinstance(
                v,
            (tuple, list)):  # if value is a tuple/list, then it must be flat
            if _is_list_of_lists(v):
                raise ValueError("Values in the dictionary must be flattened!")
        elif is_type(v, str, numeric):
            # Promote a scalar to a one-element column.
            python_obj[k] = [v]
        else:
            raise ValueError(
                "Encountered invalid dictionary value when constructing H2OFrame. Got: {0}"
                .format(v))

    # zip_longest (py3) / izip_longest (py2) pads shorter columns with None.
    zipper = getattr(itertools, "zip_longest", None) or getattr(
        itertools, "izip_longest", None) or zip
    rows = list(map(list, zipper(*list(python_obj.values()))))
    data_to_write = [dict(list(zip(header, row))) for row in rows]
    return header, data_to_write
Exemplo n.º 20
0
    def get_hyperparams_dict(self, id, display=True):
        """
        Derive the model parameters used to train one particular grid-search model.

        :param str id: The model id of the model with hyperparameters of interest.
        :param bool display: Flag to indicate whether to display the hyperparameter names.

        :returns: A dict of model parameters derived from the hyper-parameters used to train this particular model.
        """
        # `id` may be either an int index or a model-id string.
        model_index = id if is_type(id, int) else self.model_ids.index(id)
        model = self[model_index]

        # With cross-validation enabled, parameters such as max_runtime_secs
        # live on the fold models rather than on the returned main model.
        if model._is_xvalidated:
            model = h2o.get_model(model._xval_keys[0])

        hyperparams = dict()
        for name in self.hyper_names:
            actual = model.params[name]['actual']
            # 'actual' may be wrapped in a single-element list.
            hyperparams[name] = actual[0] if isinstance(actual, list) else actual

        if display: print('Hyperparameters: [' + ', '.join(list(self.hyper_params.keys())) + ']')
        return hyperparams
Exemplo n.º 21
0
    def get_hyperparams(self, id, display=True):
        """
        Get the hyperparameters of a model explored by grid search.

        :param str id: The model id of the model with hyperparameters of interest.
        :param bool display: Flag to indicate whether to display the hyperparameter names.

        :returns: A list of the hyperparameters for the specified model.
        """
        # `id` may be either an int index or a model-id string.
        model_index = id if is_type(id, int) else self.model_ids.index(id)
        model = self[model_index]

        # Under cross-validation, the interesting parameter values (e.g.
        # max_runtime_secs) are recorded on the fold models, not the main one.
        if model._is_xvalidated:
            model = h2o.get_model(model._xval_keys[0])

        values = []
        for name in self.hyper_params:
            actual = model.params[name]['actual']
            # 'actual' may be wrapped in a single-element list.
            values.append(actual[0] if isinstance(actual, list) else actual)

        if display: print('Hyperparameters: [' + ', '.join(list(self.hyper_params.keys())) + ']')
        return values
Exemplo n.º 22
0
 def base_models(self, base_models):
     """Store the base models for stacking, converting estimators to their model ids."""
     if is_type(base_models, [H2OEstimator]):
         # A list of estimators is reduced to their model ids.
         self._parms["base_models"] = [estimator.model_id for estimator in base_models]
     else:
         assert_is_type(base_models, None, [str])
         self._parms["base_models"] = base_models
Exemplo n.º 23
0
def _fetch_leaderboard(aml_id, extensions=None):
    """Fetch the leaderboard for the given AutoML run, optionally with extra columns."""
    assert_is_type(extensions, None, str, [str])
    # Normalize `extensions` to a list: None -> [], single str -> [str].
    if extensions is None:
        extensions = []
    elif is_type(extensions, str):
        extensions = [extensions]
    resp = h2o.api("GET /99/Leaderboards/%s" % aml_id,
                   data=dict(extensions=extensions))
    dest_key = resp['project_name'].split('@', 1)[0] + "_custom_leaderboard"
    return _fetch_table(resp['table'], key=dest_key, progress_bar=False)
Exemplo n.º 24
0
    def start(jar_path=None, nthreads=-1, enable_assertions=True, max_mem_size=None, min_mem_size=None,
              ice_root=None, port="54321+", verbose=True):
        """
        Start new H2O server on the local machine.

        :param jar_path: Path to the h2o.jar executable. If not given, then we will search for h2o.jar in the
            locations returned by `._jar_paths()`.
        :param nthreads: Number of threads in the thread pool. This should be related to the number of CPUs used.
            -1 means use all CPUs on the host. A positive integer specifies the number of CPUs directly.
        :param enable_assertions: If True, pass `-ea` option to the JVM.
        :param max_mem_size: Maximum heap size (jvm option Xmx), in bytes.
        :param min_mem_size: Minimum heap size (jvm option Xms), in bytes.
        :param ice_root: A directory where H2O stores its temporary files. Default location is determined by
            tempfile.mkdtemp().
        :param port: Port where to start the new server. This could be either an integer, or a string of the form
            "DDDDD+", indicating that the server should start looking for an open port starting from DDDDD and up.
        :param verbose: If True, then connection info will be printed to the stdout.

        :returns: a new H2OLocalServer instance

        :raises H2OValueError: if the port string is malformed, or min_mem_size > max_mem_size.
        """
        # Validate all arguments up-front, before any side effects occur.
        assert_is_type(jar_path, None, str)
        assert_is_type(port, None, int, str)
        assert_is_type(nthreads, -1, BoundInt(1, 4096))
        assert_is_type(enable_assertions, bool)
        assert_is_type(min_mem_size, None, int)
        assert_is_type(max_mem_size, None, BoundInt(1 << 25))  # heap must be at least 32 MiB
        assert_is_type(ice_root, None, I(str, os.path.isdir))
        if jar_path:
            assert_satisfies(jar_path, jar_path.endswith("h2o.jar"))

        if min_mem_size is not None and max_mem_size is not None and min_mem_size > max_mem_size:
            raise H2OValueError("`min_mem_size`=%d is larger than the `max_mem_size`=%d" % (min_mem_size, max_mem_size))
        if port is None: port = "54321+"
        baseport = None
        # TODO: get rid of this port gimmick and have 2 separate parameters.
        if is_type(port, str):
            if port.isdigit():
                # Plain digits: a fixed port number.
                port = int(port)
            else:
                # "DDDD+" form: scan upwards for a free port starting at `baseport`.
                if not(port[-1] == "+" and port[:-1].isdigit()):
                    raise H2OValueError("`port` should be of the form 'DDDD+', where D is a digit. Got: %s" % port)
                baseport = int(port[:-1])
                # NOTE(review): presumably port=0 together with baseport triggers the
                # scan inside _launch_server — confirm against _launch_server.
                port = 0

        hs = H2OLocalServer()
        hs._verbose = bool(verbose)
        hs._jar_path = hs._find_jar(jar_path)
        hs._ice_root = ice_root
        if not ice_root:
            # We created this temp dir ourselves, so remember it separately (for cleanup).
            hs._ice_root = tempfile.mkdtemp()
            hs._tempdir = hs._ice_root

        if verbose: print("Attempting to start a local H2O server...")
        hs._launch_server(port=port, baseport=baseport, nthreads=int(nthreads), ea=enable_assertions,
                          mmax=max_mem_size, mmin=min_mem_size)
        if verbose: print("  Server is running at %s://%s:%d" % (hs.scheme, hs.ip, hs.port))
        # Ensure the spawned server is shut down when this Python process exits.
        atexit.register(lambda: hs.shutdown())
        return hs
Exemplo n.º 25
0
    def start(jar_path=None, nthreads=-1, enable_assertions=True, max_mem_size=None, min_mem_size=None,
              ice_root=None, port="54321+", verbose=True):
        """
        Start new H2O server on the local machine.

        :param jar_path: Path to the h2o.jar executable. If not given, then we will search for h2o.jar in the
            locations returned by `._jar_paths()`.
        :param nthreads: Number of threads in the thread pool. This should be related to the number of CPUs used.
            -1 means use all CPUs on the host. A positive integer specifies the number of CPUs directly.
        :param enable_assertions: If True, pass `-ea` option to the JVM.
        :param max_mem_size: Maximum heap size (jvm option Xmx), in bytes.
        :param min_mem_size: Minimum heap size (jvm option Xms), in bytes.
        :param ice_root: A directory where H2O stores its temporary files. Default location is determined by
            tempfile.mkdtemp().
        :param port: Port where to start the new server. This could be either an integer, or a string of the form
            "DDDDD+", indicating that the server should start looking for an open port starting from DDDDD and up.
        :param verbose: If True, then connection info will be printed to the stdout.

        :returns: a new H2OLocalServer instance

        :raises H2OValueError: if the port string is malformed, or min_mem_size > max_mem_size.
        """
        # Validate all arguments up-front, before any side effects occur.
        assert_is_type(jar_path, None, str)
        assert_is_type(port, None, int, str)
        assert_is_type(nthreads, -1, BoundInt(1, 4096))
        assert_is_type(enable_assertions, bool)
        assert_is_type(min_mem_size, None, int)
        assert_is_type(max_mem_size, None, BoundInt(1 << 25))  # heap must be at least 32 MiB
        assert_is_type(ice_root, None, I(str, os.path.isdir))
        if jar_path:
            assert_satisfies(jar_path, jar_path.endswith("h2o.jar"))

        if min_mem_size is not None and max_mem_size is not None and min_mem_size > max_mem_size:
            raise H2OValueError("`min_mem_size`=%d is larger than the `max_mem_size`=%d" % (min_mem_size, max_mem_size))
        if port is None: port = "54321+"
        baseport = None
        # TODO: get rid of this port gimmick and have 2 separate parameters.
        if is_type(port, str):
            if port.isdigit():
                # Plain digits: a fixed port number.
                port = int(port)
            else:
                # "DDDD+" form: scan upwards for a free port starting at `baseport`.
                if not(port[-1] == "+" and port[:-1].isdigit()):
                    raise H2OValueError("`port` should be of the form 'DDDD+', where D is a digit. Got: %s" % port)
                baseport = int(port[:-1])
                # NOTE(review): presumably port=0 together with baseport triggers the
                # scan inside _launch_server — confirm against _launch_server.
                port = 0

        hs = H2OLocalServer()
        hs._verbose = bool(verbose)
        hs._jar_path = hs._find_jar(jar_path)
        hs._ice_root = ice_root
        if not ice_root:
            # We created this temp dir ourselves, so remember it separately (for cleanup).
            hs._ice_root = tempfile.mkdtemp()
            hs._tempdir = hs._ice_root

        if verbose: print("Attempting to start a local H2O server...")
        hs._launch_server(port=port, baseport=baseport, nthreads=int(nthreads), ea=enable_assertions,
                          mmax=max_mem_size, mmin=min_mem_size)
        if verbose: print("  Server is running at %s://%s:%d" % (hs.scheme, hs.ip, hs.port))
        # Ensure the spawned server is shut down when this Python process exits.
        atexit.register(lambda: hs.shutdown())
        return hs
Exemplo n.º 26
0
 def _add_agg(self, op, col, na):
     """Register aggregate `op` over column(s) `col` with NA-handling `na`; returns self."""
     if op == "nrow": col = 0
     if col is None:
         # No column given: apply the aggregate to every non-grouping column.
         for idx in range(self._fr.ncol):
             if idx not in self._by: self._add_agg(op, idx, na)
         return self
     if is_type(col, list, tuple):
         # Multiple columns: recurse on each one.
         for c in col:
             self._add_agg(op, c, na)
         return self
     if is_type(col, str):
         cidx = self._fr.names.index(col)
     elif is_type(col, int):
         cidx = col
     else:
         raise ValueError("col must be a column name or index.")
     agg_name = "{}_{}".format(op, self._fr.names[cidx])
     self._aggs[agg_name] = [op, cidx, na]
     return self
Exemplo n.º 27
0
    def metric(self, metric, thresholds=None):
        """
        :param str metric: A metric among :const:`maximizing_metrics`.
        :param thresholds: thresholds parameter must be a number or a list (i.e. [0.01, 0.5, 0.99]).
            If None, then the threshold maximizing the metric will be used.
            If 'all', then all stored thresholds are used and returned with the matching metric.
        :returns: The set of metrics for the list of thresholds.
            The returned list has a 'value' property holding only
            the metric value (if no threshold provided or if provided as a number),
            or all the metric values (if thresholds provided as a list)
        """
        assert_is_type(thresholds, None, 'all', numeric, [numeric])
        if metric not in H2OBinomialModelMetrics.maximizing_metrics:
            raise ValueError("The only allowable metrics are {}".format(
                ', '.join(H2OBinomialModelMetrics.maximizing_metrics)))

        # Translate a user-facing metric name to its backend name when an alias exists.
        h2o_metric = (H2OBinomialModelMetrics.metrics_aliases[metric] if metric
                      in H2OBinomialModelMetrics.metrics_aliases else metric)
        # Decide the shape of `.value` BEFORE `thresholds` is normalized below —
        # the normalization reassigns `thresholds` and would change this test.
        value_is_scalar = is_type(metric,
                                  str) and (thresholds is None
                                            or is_type(thresholds, numeric))
        # Normalize `thresholds`: after this, it is either a list of numbers, or
        # None meaning "use every stored threshold" (fast path below).
        if thresholds is None:
            thresholds = [self.find_threshold_by_max_metric(h2o_metric)]
        elif thresholds == 'all':
            thresholds = None
        elif is_type(thresholds, numeric):
            thresholds = [thresholds]

        metrics = List()
        thresh2d = self._metric_json['thresholds_and_metric_scores']
        if thresholds is None:  # fast path to return all thresholds: skipping find_idx logic
            metrics.extend(
                list(t)
                for t in zip(thresh2d['threshold'], thresh2d[h2o_metric]))
        else:
            for t in thresholds:
                idx = self.find_idx_by_threshold(t)
                metrics.append([t, thresh2d[h2o_metric][idx]])

        # Attach the convenience `.value` attribute described in the docstring.
        setattr(
            metrics, 'value',
            metrics[0][1] if value_is_scalar else list(r[1] for r in metrics))
        return metrics
Exemplo n.º 28
0
 def _add_agg(self, op, col, na):
     """Record aggregate `op` for column(s) `col` under NA-handling mode `na` (fluent)."""
     if op == "nrow": col = 0
     if col is None:
         # Apply to all columns that are not part of the group-by key.
         for column_index in range(self._fr.ncol):
             if column_index not in self._by: self._add_agg(op, column_index, na)
         return self
     if is_type(col, str):
         resolved = self._fr.names.index(col)
     elif is_type(col, int):
         resolved = col
     elif is_type(col, list, tuple):
         for member in col:
             self._add_agg(op, member, na)
         return self
     else:
         raise ValueError("col must be a column name or index.")
     self._aggs["{}_{}".format(op, self._fr.names[resolved])] = [op, resolved, na]
     return self
Exemplo n.º 29
0
    def _model_build(self, x, y, tframe, vframe, kwargs):
        """Assemble the ModelBuilders request from x/y/frames, POST it, and resolve the model."""
        kwargs["training_frame"] = tframe
        if vframe is not None: kwargs["validation_frame"] = vframe
        # Resolve numeric column references to column names.
        if is_type(y, int): y = tframe.names[y]
        if y is not None: kwargs["response_column"] = y
        if not isinstance(x, (list, tuple)): x = [x]
        if is_type(x[0], int):
            x = [tframe.names[i] for i in x]
        offset = kwargs["offset_column"]
        folds = kwargs["fold_column"]
        weights = kwargs["weights_column"]
        # Every column that is neither a predictor nor a special column is ignored.
        unused_cols = list(set(tframe.names) - set(x + [y, offset, folds, weights]))
        kwargs["ignored_columns"] = None if unused_cols == [] else [quoted(col) for col in unused_cols]
        interactions = kwargs.get("interactions")
        kwargs["interactions"] = None if interactions is None else [quoted(col) for col in interactions]
        # H2OFrame values must be sent to the backend as their keys.
        kwargs = {k: H2OEstimator._keyify_if_h2oframe(v) for k, v in kwargs.items()}
        rest_ver = kwargs.pop("_rest_version") if "_rest_version" in kwargs else 3

        job = H2OJob(h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=kwargs),
                     job_type=(self.algo + " Model Build"))

        if self._future:
            # Asynchronous mode: keep the job handle and return immediately.
            self._job = job
            return

        job.poll()
        model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, job.dest_key))["models"][0]
        self._resolve_model(job.dest_key, model_json)
Exemplo n.º 30
0
 def assert_is_step_def(sd):
     """Validate one step-definition dict: a 'name' key plus optionally 'alias' or 'steps'."""
     assert 'name' in sd, "each definition must have a 'name' key"
     # At most two keys total: name alone, name+alias, or name+steps.
     assert 0 < len(sd) < 3, "each definition must have only 1 or 2 keys: name, name+alias or name+steps"
     assert len(sd) == 1 or 'alias' in sd or 'steps' in sd, "steps definitions support only the following keys: name, alias, steps"
     assert 'alias' not in sd or sd['alias'] in supported_aliases, "alias must be one of %s" % supported_aliases
     # Each entry of 'steps', when present, must itself be a valid step.
     assert 'steps' not in sd or (is_type(sd['steps'], list) and all(assert_is_step(s) for s in sd['steps']))
Exemplo n.º 31
0
    def train(self,
              x,
              y=None,
              training_frame=None,
              offset_column=None,
              fold_column=None,
              weights_column=None,
              validation_frame=None,
              **params):
        """
        Train the model synchronously (i.e. do not return until the model finishes training).

        To train asynchronously call :meth:`start`.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).
        :param offset_column: The name or index of the column in training_frame that holds the offsets.
        :param fold_column: The name or index of the column in training_frame that holds the per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
        :param validation_frame: H2OFrame with validation data to be scored on while training.

        :raises ValueError: if training_frame is missing or y refers to more than one column.
        """
        # Capture all named arguments via locals(); this must stay the FIRST
        # statement — defining any other local before it would pollute the dict.
        algo_params = locals()
        parms = self._parms.copy()
        parms.update({
            k: v
            for k, v in algo_params.items()
            if k not in ["self", "params", "algo_params", "parms"]
        })
        parms["search_criteria"] = self.search_criteria
        parms["hyper_parameters"] = self.hyper_params  # unique to grid search
        parms.update({
            k: v
            for k, v in list(self.model._parms.items()) if v is not None
        })  # unique to grid search
        parms.update(params)
        # locals() on Python 3 also captures __class__ (via zero-arg super support);
        # it must not be sent to the backend.
        if '__class__' in parms:  # FIXME: hackt for PY3
            del parms['__class__']
        y = algo_params["y"]
        tframe = algo_params["training_frame"]
        if tframe is None: raise ValueError("Missing training_frame")
        if y is not None:
            # A single-element list/tuple is unwrapped; anything longer is an error.
            if is_type(y, list, tuple):
                if len(y) == 1:
                    parms["y"] = y[0]
                else:
                    raise ValueError('y must be a single column reference')
            # Categorical response => classification, numeric response => regression.
            self._estimator_type = "classifier" if tframe[y].isfactor(
            ) else "regressor"
        self.build_model(parms)
Exemplo n.º 32
0
    def _log_message(self, msg):
        """
        Log the message `msg` to the destination `self._logging_dest`.

        A string destination is treated as a file name: the message is appended
        and the file is closed right away. Any other destination is assumed to
        be an open file handle and is written to without being closed.
        """
        dest = self._logging_dest
        if is_type(dest, str):
            with open(dest, "at", encoding="utf-8") as log_file:
                log_file.write(msg)
        else:
            dest.write(msg)
Exemplo n.º 33
0
    def _log_message(self, msg):
        """
        Append `msg` to the logging destination `self._logging_dest`.

        String destinations are file names (opened in append mode and closed
        immediately); everything else is treated as an already-open handle.
        """
        target = self._logging_dest
        if not is_type(target, str):
            target.write(msg)
        else:
            with open(target, "at", encoding="utf-8") as handle:
                handle.write(msg)
Exemplo n.º 34
0
    def __init__(self, fr, by):
        """
        Create a ``GroupBy`` over the H2OFrame ``fr``, grouped by the columns in ``by``.

        The original frame is kept in member ``_fr``; the grouping columns are
        normalized into a list of column indices in ``_by``. The resulting groups
        are sorted by the natural group-by column sort.

        :param H2OFrame fr: H2OFrame that you want the group by operation to be performed on.
        :param by: a single column name (str) or index (int), or a list of names/indices
            identifying the set of columns to group by.
        """
        self._fr = fr  # IN
        self._by = by  # IN
        self._aggs = {}  # IN
        self._res = None  # OUT

        # Normalize `by` to a list of column indices.
        if is_type(by, str):
            self._by = [self._fr.names.index(by)]
        elif is_type(by, list, tuple):
            resolved = []
            for b in by:
                resolved.append(self._fr.names.index(b) if is_type(b, str) else b)
            self._by = resolved
        else:
            self._by = [self._by]
Exemplo n.º 35
0
 def __getitem__(self, item):
     """Select column(s) by index/name (or a list thereof), or rows by slice."""
     if is_type(item, int, str):
         # Single column selection returns a plain list of cell values.
         if is_type(item, int):
             index = item
             if index < 0: index += len(self._col_header)
             if index < 0 or index >= len(self._col_header):
                 raise H2OValueError("Index %d is out of range" % item)
         else:
             if item not in self._col_header:
                 raise H2OValueError("Column `%s` does not exist in the table" % item)
             index = self._col_header.index(item)
         return [row[index] for row in self._cell_values]
     if isinstance(item, slice):
         # A slice selects ROWS and yields a new H2OTwoDimTable
         # (pandas-DataFrame-like behavior, unlike H2OFrame).
         new_table = copy.deepcopy(self)
         new_table._cell_values = [self._cell_values[i] for i in range(*item.indices(len(self._cell_values)))]
         return new_table
     if is_type(item, [int, str]):
         # A list of columns returns a list of column value-lists.
         return [self[i] for i in item]
     raise TypeError('can not support getting item for ' + str(item))
Exemplo n.º 36
0
 def __validate_distribution(self, distribution):
     """Validate and normalize the distribution argument (str or dict); returns the distribution name."""
     custom_err = ('Distribution "custom" has to be specified as a '
                   'dictionary with their respective parameters, e.g., '
                   '`dict(type = "custom", custom_distribution_func = "..."))`.')
     if is_type(distribution, str):
         distribution = distribution.lower()
         # "custom" is only valid in dict form, since it needs a function reference.
         if distribution == "custom":
             raise H2OValueError(custom_err)
         return distribution
     if is_type(distribution, dict):
         dist = distribution["type"].lower()
         # Distributions that carry an extra tuning parameter, and its key name.
         dist_params = dict(
             custom='custom_distribution_func',
             huber='huber_alpha',
             quantile='quantile_alpha',
             tweedie='tweedie_power')
         # A two-key dict must pair the type with ITS OWN parameter.
         assert distribution.get(dist_params.get(dist)) is not None or len(distribution) == 1, (
             "Distribution dictionary should contain distribution and a distribution "
             "parameter. For example `dict(type=\"{}\", {}=...)`.").format(dist, dist_params[dist])
         if distribution["type"] == "custom" and "custom_distribution_func" not in distribution.keys():
             raise H2OValueError(custom_err)
         if dist_params.get(dist) in distribution.keys():
             # Store the distribution parameter on the estimator instance.
             setattr(self, "_" + dist_params[dist], distribution[dist_params[dist]])
         return dist
Exemplo n.º 37
0
 def __getitem__(self, item):
     """Select column(s) by index/name (or a list thereof), or rows by slice (mutating)."""
     if is_type(item, int, str):
         # Single column selection returns a plain list of cell values.
         if is_type(item, int):
             col_idx = item
             if col_idx < 0: col_idx += len(self._col_header)
             if col_idx < 0 or col_idx >= len(self._col_header):
                 raise H2OValueError("Index %d is out of range" % item)
         else:
             if item not in self._col_header:
                 raise H2OValueError("Column `%s` does not exist in the table" % item)
             col_idx = self._col_header.index(item)
         return [row[col_idx] for row in self._cell_values]
     if isinstance(item, slice):
         # Row selection: note this MUTATES the table in place and returns self.
         # FIXME! slice behavior should be consistent with other selectors - return columns instead of rows...
         self._cell_values = [self._cell_values[i] for i in range(*item.indices(len(self._cell_values)))]
         return self
     if is_type(item, [int, str]):
         # A list of columns returns a list of column value-lists.
         return [self[i] for i in item]
     raise TypeError('can not support getting item for ' + str(item))
Exemplo n.º 38
0
 def __init__(self, widgets, title, file_mode):
     """Prepare the widget list (defaulting to title + bar + percentage) and layout state."""
     super(ProgressBarWidget, self).__init__()
     self._file_mode = file_mode
     self._width = min(self._get_terminal_size(), 100)
     self._encoding = (sys.stdout.encoding or "").lower()
     prepared = []
     # Default widget list is built lazily (only when `widgets` is falsy).
     for widget in (widgets or [title + ":", PBWBar(), PBWPercentage()]):
         # Bare strings become static-text widgets.
         if is_type(widget, str):
             widget = PBWString(widget)
         widget.set_mode("file" if file_mode else "tty")
         widget.set_encoding(self._encoding)
         prepared.append(widget)
     self._to_render = None  # string to render on the next cycle; rarely used
     self._widgets = tuple(prepared)
     self._widget_lengths = self._compute_widget_sizes()
     self._rendered = ""
Exemplo n.º 39
0
 def __init__(self, widgets, title, file_mode):
     """Set up progress-bar widgets, converting bare strings and caching sizes."""
     super(ProgressBarWidget, self).__init__()
     self._file_mode = file_mode
     self._width = min(self._get_terminal_size(), 100)
     self._encoding = (sys.stdout.encoding or "").lower()
     collected = []
     # Fall back to "<title>: [bar] NN%" when no explicit widgets were given.
     for item in (widgets or [title + ":", PBWBar(), PBWPercentage()]):
         widget = PBWString(item) if is_type(item, str) else item
         widget.set_mode("file" if file_mode else "tty")
         widget.set_encoding(self._encoding)
         collected.append(widget)
     self._to_render = None  # string to render on the next cycle; rarely used
     self._widgets = tuple(collected)
     self._widget_lengths = self._compute_widget_sizes()
     self._rendered = ""
Exemplo n.º 40
0
 def __init__(self, model, hyper_params, grid_id=None, search_criteria=None):
     """Initialize a grid search over `hyper_params` for the given estimator (class or instance)."""
     super(H2OGridSearch, self).__init__()
     assert_is_type(model, None, H2OEstimator, lambda mdl: issubclass(mdl, H2OEstimator))
     assert_is_type(hyper_params, dict)
     assert_is_type(grid_id, None, str)
     assert_is_type(search_criteria, None, dict)
     # An estimator class (rather than an instance) gets instantiated here.
     if not (model is None or is_type(model, H2OEstimator)): model = model()
     self._id = grid_id
     self.model = model
     self.hyper_params = dict(hyper_params)
     self.search_criteria = dict(search_criteria) if search_criteria is not None else None
     self._grid_json = None
     self.models = None  # list of H2O Estimator instances
     self._parms = {}  # internal, for object recycle
     self.parms = {}  # external
     self._future = False  # used by __repr__/show to query job state
     self._job = None  # used when _future is True
Exemplo n.º 41
0
 def __init__(self, model, hyper_params, grid_id=None, search_criteria=None):
     """Build a grid-search wrapper around an estimator and its hyperparameter grid."""
     super(H2OGridSearch, self).__init__()
     assert_is_type(model, None, H2OEstimator, lambda mdl: issubclass(mdl, H2OEstimator))
     assert_is_type(hyper_params, dict)
     assert_is_type(grid_id, None, str)
     assert_is_type(search_criteria, None, dict)
     # Accept an estimator class and instantiate it on the caller's behalf.
     if not (model is None or is_type(model, H2OEstimator)): model = model()
     self._id = grid_id
     self.model = model
     # Copy the dicts so later caller-side mutation cannot leak in.
     self.hyper_params = dict(hyper_params)
     self.search_criteria = None if search_criteria is None else dict(search_criteria)
     self._grid_json = None
     self.models = None  # list of H2O Estimator instances
     self._parms = {}  # internal, for object recycle
     self.parms = {}  # external
     self._future = False  # used by __repr__/show to query job state
     self._job = None  # used when _future is True
Exemplo n.º 42
0
def manual_partial_dependence(model, datafile, xlist, xname, weightV):
    """
    Compute partial-dependence statistics for column `xname` by hand.

    For every value in `xlist`, the column is replaced by that constant, the
    model re-predicts, and the weighted mean / std / std-error of the last
    prediction column are collected.

    :returns: three parallel lists: (means, stds, standard errors).
    """
    frame = h2o.import_file(pyunit_utils.locate(datafile))
    means, stds, stderrs = [], [], []
    n_rows = frame.nrow
    last_col = frame.ncol - 1

    for xval in xlist:
        constants = [xval] * n_rows
        if xname in frame.names:
            frame = frame.drop(xname)
        # Missing values ('NA' string or NaN) leave the column dropped entirely.
        is_missing = (is_type(xval, str) and xval == 'NA') or \
                     (isinstance(xval, float) and math.isnan(xval))
        if not is_missing:
            frame = frame.cbind(h2o.H2OFrame(constants))
            frame.set_name(last_col, xname)

        pred = model.predict(frame).as_data_frame(use_pandas=False, header=False)
        pcol = len(pred[0]) - 1
        w_sum = 0.0
        wx_sum = 0.0
        wxx_sum = 0.0
        n_nonzero = 0.0
        inv_sqrt_n = 1.0 / math.sqrt(frame.nrow * 1.0)
        for ridx in range(len(pred)):
            val = float(pred[ridx][pcol])
            weight = weightV[ridx]
            # Only rows with a nonzero weight and a real prediction contribute.
            if (abs(weight) > 0) and isinstance(val, float) and not math.isnan(val):
                wx = val * weight
                wx_sum = wx_sum + wx
                wxx_sum = wxx_sum + wx * val
                w_sum = w_sum + weight
                n_nonzero = n_nonzero + 1
        w_mean = wx_sum / w_sum
        # Bessel-style correction based on the count of contributing rows.
        bias_scale = n_nonzero * 1.0 / (n_nonzero - 1)
        w_std = math.sqrt((wxx_sum / w_sum - w_mean * w_mean) * bias_scale)
        means.append(w_mean)
        stds.append(w_std)
        stderrs.append(w_std * inv_sqrt_n)

    return means, stds, stderrs
Exemplo n.º 43
0
def _handle_python_dicts(python_obj, check_header):
    """Convert a {column: values} dict into (header, list-of-row-dicts) for upload."""
    header = list(python_obj.keys())
    # Column names must look like identifiers (letter/underscore, then word chars or dots).
    if not all(re.match(r"^[a-zA-Z_][a-zA-Z0-9_.]*$", col) for col in header):
        raise ValueError(
            "Did not get a valid set of column names! Must match the regular expression: ^[a-zA-Z_][a-zA-Z0-9_.]*$ ")
    for key in python_obj:
        # Every value must be a FLAT list/tuple, or a single str/number (which gets wrapped).
        value = python_obj[key]
        if isinstance(value, (tuple, list)):
            if _is_list_of_lists(value):
                raise ValueError("Values in the dictionary must be flattened!")
        elif is_type(value, str, numeric):
            python_obj[key] = [value]
        else:
            raise ValueError("Encountered invalid dictionary value when constructing H2OFrame. Got: {0}".format(value))

    # zip_longest pads ragged columns; fall back across py3/py2 spellings.
    zipper = getattr(itertools, "zip_longest", None) or getattr(itertools, "izip_longest", None) or zip
    rows = list(map(list, zipper(*list(python_obj.values()))))
    data_to_write = [dict(list(zip(header, row))) for row in rows]
    return header, data_to_write
Exemplo n.º 44
0
def manual_partial_dependence(model, datafile, xlist, xname, weightV):
    """Hand-compute partial dependence of ``model`` on column ``xname``.

    For every grid value in ``xlist`` the column is replaced by a constant,
    the model re-predicts the frame, and the weighted mean / std / std-error
    of the last prediction column are accumulated.

    :param model: trained H2O model to score with.
    :param datafile: dataset path, resolved via pyunit_utils.locate().
    :param xlist: grid values to substitute into column ``xname``.
    :param xname: name of the column being varied.
    :param weightV: per-row weights, indexed like the prediction rows.
    :returns: (means, stds, stderrs) lists, one entry per grid value.
    """
    frame = h2o.import_file(pyunit_utils.locate(datafile))
    means = []
    stds = []
    stderrs = []
    n_rows = frame.nrow
    new_col_index = frame.ncol - 1

    for grid_val in xlist:
        constant_col = [grid_val] * n_rows
        if xname in frame.names:
            frame = frame.drop(xname)
        # 'NA' (categorical) or NaN (numeric) grid values mean "leave the
        # column out" instead of binding a constant column.
        grid_val_missing = ((is_type(grid_val, str) and grid_val == 'NA') or
                            (isinstance(grid_val, float) and math.isnan(grid_val)))
        if not grid_val_missing:
            frame = frame.cbind(h2o.H2OFrame(constant_col))
            frame.set_name(new_col_index, xname)

        preds = model.predict(frame).as_data_frame(use_pandas=False, header=False)
        last_col = len(preds[0]) - 1
        wx_sum = 0.0        # sum of w*x
        wxx_sum = 0.0       # sum of w*x*x
        w_sum = 0.0         # sum of weights
        nonzero_count = 0.0
        inv_sqrt_n = 1.0 / math.sqrt(frame.nrow * 1.0)
        for row, pred_row in enumerate(preds):
            val = float(pred_row[last_col])
            weight = weightV[row]
            # Skip zero-weight rows and NaN predictions.
            if (abs(weight) > 0) and isinstance(val, float) and not math.isnan(val):
                weighted = val * weight
                wx_sum = wx_sum + weighted
                wxx_sum = wxx_sum + weighted * val
                w_sum = w_sum + weight
                nonzero_count = nonzero_count + 1
        w_mean = wx_sum / w_sum
        # Bessel-style correction factor n/(n-1) for the weighted variance.
        correction = nonzero_count * 1.0 / (nonzero_count - 1)
        w_std = math.sqrt((wxx_sum / w_sum - w_mean * w_mean) * correction)
        means.append(w_mean)
        stds.append(w_std)
        stderrs.append(w_std * inv_sqrt_n)

    return means, stds, stderrs
Exemplo n.º 45
0
 def __new__(cls, keyvals):
     # This method is called by the simplejson.json(object_pairs_hook=<this>)
     # `keyvals` is a list of (key,value) tuples. For example:
     #    [("schema_version", 3), ("schema_name", "InitIDV3"), ("schema_type", "Iced")]
     # Peek at the response's metadata to discover its schema name, then
     # dispatch construction to the matching specialized wrapper class.
     schema = None
     for k, v in keyvals:
         # "__meta" carries a dict whose "schema_name" identifies the payload.
         if k == "__meta" and isinstance(v, dict):
             schema = v["schema_name"]
             break
         # Some responses announce the schema directly under "__schema".
         if k == "__schema" and is_type(v, str):
             schema = v
             break
     # Known schemas get purpose-built wrappers; anything else falls through
     # to the generic H2OResponse construction below.
     if schema == "CloudV3": return H2OCluster.from_kvs(keyvals)
     if schema == "H2OErrorV3": return H2OErrorV3(keyvals)
     if schema == "H2OModelBuilderErrorV3": return H2OModelBuilderErrorV3(keyvals)
     if schema == "TwoDimTableV3": return H2OTwoDimTable.make(keyvals)
     if schema == "ModelMetricsRegressionV3": return H2ORegressionModelMetrics.make(keyvals)
     if schema == "ModelMetricsClusteringV3": return H2OClusteringModelMetrics.make(keyvals)
     if schema == "ModelMetricsBinomialV3": return H2OBinomialModelMetrics.make(keyvals)
     if schema == "ModelMetricsMultinomialV3": return H2OMultinomialModelMetrics.make(keyvals)
     if schema == "ModelMetricsAutoEncoderV3": return H2OAutoEncoderModelMetrics.make(keyvals)
     return super(H2OResponse, cls).__new__(cls, keyvals)
Exemplo n.º 46
0
 def train(self,
           x,
           y=None,
           training_frame=None,
           offset_column=None,
           fold_column=None,
           weights_column=None,
           validation_frame=None,
           **params):
     """Launch the grid search build.

     Accepts the same arguments as H2OEstimator.train(); any extra keyword
     arguments in ``params`` are forwarded to the backend builder as-is.
     """
     # same api as estimator_base train
     # NOTE: locals() is captured before any other local is created, so it
     # holds exactly the caller-supplied arguments.
     algo_params = locals()
     parms = self._parms.copy()
     # Merge the explicit train() arguments over the pre-configured
     # parameters, dropping the bookkeeping names.
     parms.update({
         k: v
         for k, v in algo_params.items()
         if k not in ["self", "params", "algo_params", "parms"]
     })
     parms["search_criteria"] = self.search_criteria
     parms["hyper_parameters"] = self.hyper_params  # unique to grid search
     # Non-None parameters of the wrapped model also flow into the grid build.
     parms.update({
         k: v
         for k, v in list(self.model._parms.items()) if v is not None
     })  # unique to grid search
     parms.update(params)
     if '__class__' in parms:  # FIXME: hackt for PY3
         del parms['__class__']
     y = algo_params["y"]
     tframe = algo_params["training_frame"]
     if tframe is None: raise ValueError("Missing training_frame")
     if y is not None:
         # y must resolve to exactly one response column.
         if is_type(y, list, tuple):
             if len(y) == 1:
                 parms["y"] = y[0]
             else:
                 raise ValueError('y must be a single column reference')
         # A factor (categorical) response implies classification.
         self._estimator_type = "classifier" if tframe[y].isfactor(
         ) else "regressor"
     self.build_model(parms)
Exemplo n.º 47
0
 def train(self, x, y=None, training_frame=None, offset_column=None, fold_column=None, weights_column=None,
           validation_frame=None, **params):
     """Launch the grid search build; same signature as H2OEstimator.train().

     Extra keyword arguments in ``params`` are forwarded to the backend.
     """
     # same api as estimator_base train
     # NOTE: locals() is captured before any other local is created, so it
     # holds exactly the caller-supplied arguments.
     algo_params = locals()
     parms = self._parms.copy()
     # Explicit train() arguments (minus bookkeeping names) override presets.
     parms.update({k: v for k, v in algo_params.items() if k not in ["self", "params", "algo_params", "parms"]})
     parms["search_criteria"] = self.search_criteria
     parms["hyper_parameters"] = self.hyper_params  # unique to grid search
     # Non-None parameters of the wrapped model also flow into the grid build.
     parms.update({k: v for k, v in list(self.model._parms.items()) if v is not None})  # unique to grid search
     parms.update(params)
     if '__class__' in parms:  # FIXME: hackt for PY3
         del parms['__class__']
     y = algo_params["y"]
     tframe = algo_params["training_frame"]
     if tframe is None: raise ValueError("Missing training_frame")
     if y is not None:
         # y must resolve to exactly one response column.
         if is_type(y, list, tuple):
             if len(y) == 1:
                 parms["y"] = y[0]
             else:
                 raise ValueError('y must be a single column reference')
         # A factor (categorical) response implies classification.
         self._estimator_type = "classifier" if tframe[y].isfactor() else "regressor"
     self.build_model(parms)
Exemplo n.º 48
0
    def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
              weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None):
        """
        Train the H2O model.

        Parameters
        ----------
        x : list, None
            A list of column names or indices indicating the predictor columns.

        y :
            An index or a column name indicating the response column.

        training_frame : H2OFrame
            The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).

        offset_column : str, optional
            The name or index of the column in training_frame that holds the offsets.

        fold_column : str, optional
            The name or index of the column in training_frame that holds the per-row fold
            assignments.

        weights_column : str, optional
            The name or index of the column in training_frame that holds the per-row weights.

        validation_frame : H2OFrame, optional
            H2OFrame with validation data to be scored on while training.

        max_runtime_secs : float
            Maximum allowed runtime in seconds for model training. Use 0 to disable.
        """
        # Step 1: validate argument types before doing any work.
        assert_is_type(training_frame, H2OFrame)
        assert_is_type(validation_frame, None, H2OFrame)
        assert_is_type(y, None, int, str)
        assert_is_type(x, None, int, str, [str, int], {str, int})
        assert_is_type(ignored_columns, None, [str, int], {str, int})
        assert_is_type(offset_column, None, int, str)
        assert_is_type(fold_column, None, int, str)
        assert_is_type(weights_column, None, int, str)
        assert_is_type(max_runtime_secs, None, numeric)
        algo = self.algo
        parms = self._parms.copy()
        if "__class__" in parms:  # FIXME: hackt for PY3
            del parms["__class__"]
        is_auto_encoder = bool(parms.get("autoencoder"))
        # These algos take no response column.
        is_supervised = not(is_auto_encoder or algo in {"pca", "svd", "kmeans", "glrm", "word2vec"})
        ncols = training_frame.ncols
        names = training_frame.names
        # Resolve the response column to a name; ints may be negative
        # (counted from the end of the frame).
        if is_supervised:
            if y is None: y = "response"
            if is_type(y, int):
                if not (-ncols <= y < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % y)
                y = names[y]
            else:
                if y not in names:
                    raise H2OValueError("Column %s does not exist in the training frame" % y)
            # An enum (categorical) response means classification.
            self._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor"
        elif y is not None:
            raise H2OValueError("y should not be provided for an unsupervised model")
        assert_is_type(y, str, None)
        # Resolve ignored_columns (mutually exclusive with x) to a name set.
        ignored_columns_set = set()
        if ignored_columns is not None:
            if x is not None:
                raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
            for ic in ignored_columns:
                if is_type(ic, int):
                    if not (-ncols <= ic < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % ic)
                    ignored_columns_set.add(names[ic])
                else:
                    if ic not in names:
                        raise H2OValueError("Column %s not in the training frame" % ic)
                    ignored_columns_set.add(ic)
        # Resolve the predictor set: default is every column except the
        # response and the ignored columns.
        if x is None:
            xset = set(names) - {y} - ignored_columns_set
        else:
            xset = set()
            if is_type(x, int, str): x = [x]
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError("Column %s not in the training frame" % xi)
                    xset.add(xi)
        x = list(xset)

        parms["offset_column"] = offset_column
        parms["fold_column"] = fold_column
        parms["weights_column"] = weights_column
        parms["max_runtime_secs"] = max_runtime_secs

        # Step 2
        is_auto_encoder = "autoencoder" in parms and parms["autoencoder"]
        is_unsupervised = is_auto_encoder or self.algo in {"pca", "svd", "kmeans", "glrm", "word2vec"}
        if is_auto_encoder and y is not None: raise ValueError("y should not be specified for autoencoder.")
        if not is_unsupervised and y is None: raise ValueError("Missing response")

        # Step 3: assemble the parms dict the backend expects. The server is
        # told which columns to IGNORE rather than which to use.
        parms["training_frame"] = training_frame
        if validation_frame is not None: parms["validation_frame"] = validation_frame
        if is_type(y, int): y = training_frame.names[y]
        if y is not None: parms["response_column"] = y
        if not isinstance(x, (list, tuple)): x = [x]
        if is_type(x[0], int):
            x = [training_frame.names[i] for i in x]
        offset = parms["offset_column"]
        folds = parms["fold_column"]
        weights = parms["weights_column"]
        ignored_columns = list(set(training_frame.names) - set(x + [y, offset, folds, weights]))
        parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
        parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None else
                                 [quoted(col) for col in parms["interactions"]])
        # H2OFrame-valued parameters are sent by frame key, not by value.
        parms = {k: H2OEstimator._keyify_if_h2oframe(parms[k]) for k in parms}
        rest_ver = parms.pop("_rest_version") if "_rest_version" in parms else 3

        # Kick off the model build job on the backend.
        model = H2OJob(h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms),
                       job_type=(self.algo + " Model Build"))

        # In "future" mode, return immediately and let the caller poll the job.
        if self._future:
            self._job = model
            self._rest_version = rest_ver
            return

        model.poll()
        model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
        self._resolve_model(model.dest_key, model_json)
Exemplo n.º 49
0
def _is_num_list(l):
    """Return True iff ``l`` is a flat list whose elements are all numeric."""
    expected_shape = [numeric]
    return is_type(l, expected_shape)
Exemplo n.º 50
0
def _is_str_list(l):
    """Return True iff ``l`` is a flat list whose elements are all strings."""
    expected_shape = [str]
    return is_type(l, expected_shape)
Exemplo n.º 51
0
    def start(jar_path=None, nthreads=-1, enable_assertions=True, max_mem_size=None, min_mem_size=None,
              ice_root=None, log_dir=None, log_level=None, port="54321+", name=None, extra_classpath=None,
              verbose=True, jvm_custom_args=None, bind_to_localhost=True):
        """
        Start new H2O server on the local machine.

        :param jar_path: Path to the h2o.jar executable. If not given, then we will search for h2o.jar in the
            locations returned by `._jar_paths()`.
        :param nthreads: Number of threads in the thread pool. This should be related to the number of CPUs used.
            -1 means use all CPUs on the host. A positive integer specifies the number of CPUs directly.
        :param enable_assertions: If True, pass `-ea` option to the JVM.
        :param max_mem_size: Maximum heap size (jvm option Xmx), in bytes.
        :param min_mem_size: Minimum heap size (jvm option Xms), in bytes.
        :param log_dir: Directory for H2O logs to be stored if a new instance is started. Default directory is
            determined by H2O internally.
        :param log_level: The logger level for H2O if a new instance is started.
        :param ice_root: A directory where H2O stores its temporary files. Default location is determined by
            tempfile.mkdtemp().
        :param port: Port where to start the new server. This could be either an integer, or a string of the form
            "DDDDD+", indicating that the server should start looking for an open port starting from DDDDD and up.
        :param name: name of the h2o cluster to be started
        :param extra_classpath: List of paths to libraries that should be included on the Java classpath.
        :param verbose: If True, then connection info will be printed to the stdout.
        :param jvm_custom_args: Custom, user-defined arguments for the JVM H2O is instantiated in.
        :param bind_to_localhost: A flag indicating whether access to the H2O instance should be restricted to the local
            machine (default) or if it can be reached from other computers on the network.
            Only applicable when H2O is started from the Python client.

        :returns: a new H2OLocalServer instance
        """
        # Validate every argument before doing anything with side effects.
        assert_is_type(jar_path, None, str)
        assert_is_type(port, None, int, str)
        assert_is_type(name, None, str)
        assert_is_type(nthreads, -1, BoundInt(1, 4096))
        assert_is_type(enable_assertions, bool)
        assert_is_type(min_mem_size, None, int)
        assert_is_type(max_mem_size, None, BoundInt(1 << 25))
        assert_is_type(log_dir, str, None)
        assert_is_type(log_level, str, None)
        assert_satisfies(log_level, log_level in [None, "TRACE", "DEBUG", "INFO", "WARN", "ERRR", "FATA"])
        assert_is_type(ice_root, None, I(str, os.path.isdir))
        assert_is_type(extra_classpath, None, [str])
        assert_is_type(jvm_custom_args, list, None)
        assert_is_type(bind_to_localhost, bool)
        if jar_path:
            assert_satisfies(jar_path, jar_path.endswith("h2o.jar"))

        if min_mem_size is not None and max_mem_size is not None and min_mem_size > max_mem_size:
            raise H2OValueError("`min_mem_size`=%d is larger than the `max_mem_size`=%d" % (min_mem_size, max_mem_size))
        if port is None: port = "54321+"
        baseport = None
        # TODO: get rid of this port gimmick and have 2 separate parameters.
        # "DDDDD" means an exact port; "DDDDD+" means scan upward from DDDDD
        # (encoded as port=0 with baseport set).
        if is_type(port, str):
            if port.isdigit():
                port = int(port)
            else:
                if not(port[-1] == "+" and port[:-1].isdigit()):
                    raise H2OValueError("`port` should be of the form 'DDDD+', where D is a digit. Got: %s" % port)
                baseport = int(port[:-1])
                port = 0

        # Construct the server handle and resolve runtime paths.
        hs = H2OLocalServer()
        hs._verbose = bool(verbose)
        hs._jar_path = hs._find_jar(jar_path)
        hs._extra_classpath = extra_classpath
        hs._ice_root = ice_root
        hs._name = name
        # No ice_root given: use a fresh temp dir and remember it as tempdir.
        if not ice_root:
            hs._ice_root = tempfile.mkdtemp()
            hs._tempdir = hs._ice_root

        if verbose: print("Attempting to start a local H2O server...")
        hs._launch_server(port=port, baseport=baseport, nthreads=int(nthreads), ea=enable_assertions,
                          mmax=max_mem_size, mmin=min_mem_size, jvm_custom_args=jvm_custom_args,
                          bind_to_localhost=bind_to_localhost, log_dir=log_dir, log_level=log_level)
        if verbose: print("  Server is running at %s://%s:%d" % (hs.scheme, hs.ip, hs.port))
        # Ensure the spawned server is shut down when this process exits.
        atexit.register(lambda: hs.shutdown())
        return hs
Exemplo n.º 52
0
    def train(self, x=None, y=None, training_frame=None, fold_column=None,
              weights_column=None, validation_frame=None, leaderboard_frame=None, blending_frame=None):
        """
        Begins an AutoML task, a background task that automatically builds a number of models
        with various algorithms and tracks their performance in a leaderboard. At any point
        in the process you may use H2O's performance or prediction functions on the resulting
        models.

        :param x: A list of column names or indices indicating the predictor columns
            (a single name/index is also accepted).
        :param y: An index or a column name indicating the response column.
        :param fold_column: The name or index of the column in training_frame that holds per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds per-row weights.
        :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold_column or weights_column).
        :param validation_frame: H2OFrame with validation data. This argument is ignored unless the user sets
            nfolds = 0. If cross-validation is turned off, then a validation frame can be specified and used
            for early stopping of individual models and early stopping of the grid searches.  By default and
            when nfolds > 1, cross-validation metrics will be used for early stopping and thus validation_frame will be ignored.
        :param leaderboard_frame: H2OFrame with test data for scoring the leaderboard.  This is optional and
            if this is set to None (the default), then cross-validation metrics will be used to generate the leaderboard
            rankings instead.
        :param blending_frame: H2OFrame used to train the the metalearning algorithm in Stacked Ensembles (instead of relying on cross-validated predicted values).
            This is optional, but when provided, it is also recommended to disable cross validation
            by setting `nfolds=0` and to provide a leaderboard frame for scoring purposes.

        :returns: An H2OAutoML object.

        :examples:
        >>> # Set up an H2OAutoML object
        >>> aml = H2OAutoML(max_runtime_secs=30)
        >>> # Launch an AutoML run
        >>> aml.train(y=y, training_frame=train)
        """
        # Validate training_frame FIRST: previously `training_frame.ncols` was
        # read before the None-check, so a missing frame raised AttributeError
        # instead of this intended, friendlier error.
        if training_frame is None:
            raise ValueError('The training frame is not set!')
        assert_is_type(training_frame, H2OFrame)
        ncols = training_frame.ncols
        names = training_frame.names

        # Set project name if None
        if self.project_name is None:
            self.project_name = "automl_" + training_frame.frame_id
            self.build_control["project_name"] = self.project_name

        # Minimal required arguments are training_frame and y (response)
        if y is None:
            raise ValueError('The response column (y) is not set; please set it to the name of the column that you are trying to predict in your data.')
        assert_is_type(y, int, str)
        # Resolve the response to a column name; ints may be negative
        # (counted from the end of the frame).
        if is_type(y, int):
            if not (-ncols <= y < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % y)
            y = names[y]
        else:
            if y not in names:
                raise H2OValueError("Column %s does not exist in the training frame" % y)

        input_spec = {
            'response_column': y,
            'training_frame': training_frame.frame_id,
        }

        if fold_column is not None:
            assert_is_type(fold_column, int, str)
            input_spec['fold_column'] = fold_column

        if weights_column is not None:
            assert_is_type(weights_column, int, str)
            input_spec['weights_column'] = weights_column

        if validation_frame is not None:
            assert_is_type(validation_frame, H2OFrame)
            input_spec['validation_frame'] = validation_frame.frame_id

        if leaderboard_frame is not None:
            assert_is_type(leaderboard_frame, H2OFrame)
            input_spec['leaderboard_frame'] = leaderboard_frame.frame_id

        if blending_frame is not None:
            assert_is_type(blending_frame, H2OFrame)
            input_spec['blending_frame'] = blending_frame.frame_id

        if self.sort_metric is not None:
            assert_is_type(self.sort_metric, str)
            sort_metric = self.sort_metric.lower()
            # Changed the API to use "deviance" to be consistent with stopping_metric values
            # TO DO: let's change the backend to use "deviance" since we use the term "deviance"
            # After that we can take this `if` statement out
            if sort_metric == "deviance":
                sort_metric = "mean_residual_deviance"
            input_spec['sort_metric'] = sort_metric

        if x is not None:
            # Accept a single column (int/str) as well as a list, matching
            # H2OEstimator.train(); previously the scalar branch was
            # unreachable behind assert_is_type(x, list).
            assert_is_type(x, list, int, str)
            if is_type(x, int, str): x = [x]
            xset = set()
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError("Column %s not in the training frame" % xi)
                    xset.add(xi)
            x = list(xset)
            # Everything that is not a predictor, the response, or a special
            # column is explicitly ignored on the backend.
            ignored_columns = set(names) - {y} - set(x)
            if fold_column is not None and fold_column in ignored_columns:
                ignored_columns.remove(fold_column)
            if weights_column is not None and weights_column in ignored_columns:
                ignored_columns.remove(weights_column)
            input_spec['ignored_columns'] = list(ignored_columns)

        automl_build_params = dict(input_spec=input_spec)

        # NOTE: if the user hasn't specified some block of parameters don't send them!
        # This lets the back end use the defaults.
        automl_build_params['build_control'] = self.build_control
        automl_build_params['build_models'] = self.build_models

        resp = h2o.api('POST /99/AutoMLBuilder', json=automl_build_params)
        if 'job' not in resp:
            print("Exception from the back end: ")
            print(resp)
            return

        self._job = H2OJob(resp['job'], "AutoML")
        self._job.poll()
        self._fetch()
Exemplo n.º 53
0
    def _train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
               weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
               model_id=None, verbose=False, extend_parms_fn=None):
        """
        Internal implementation of model training shared by the public train() wrappers.

        :param x: predictor column names/indices, or None to use all non-special columns.
        :param y: response column name/index (supervised algos only; ignored otherwise).
        :param H2OFrame training_frame: frame with the training data.
        :param offset_column: name/index of the offsets column.
        :param fold_column: name/index of the per-row fold assignment column.
        :param weights_column: name/index of the per-row weights column.
        :param validation_frame: frame with validation data scored during training.
        :param float max_runtime_secs: maximum allowed training time; 0 disables the limit.
        :param ignored_columns: columns to exclude (mutually exclusive with x).
        :param model_id: explicit id for the new model.
        :param bool verbose: print scoring history while training (drf/gbm/deeplearning/xgboost only).
        :param extend_parms_fn: hook called with the parms dict so subclasses can extend it.
        """
        assert_is_type(training_frame, None, H2OFrame)
        assert_is_type(validation_frame, None, H2OFrame)
        assert_is_type(y, None, int, str)
        assert_is_type(x, None, int, str, [str, int], {str, int})
        assert_is_type(ignored_columns, None, [str, int], {str, int})
        assert_is_type(offset_column, None, int, str)
        assert_is_type(fold_column, None, int, str)
        assert_is_type(weights_column, None, int, str)
        assert_is_type(max_runtime_secs, None, numeric)
        assert_is_type(model_id, None, str)
        assert_is_type(verbose, bool)
        assert_is_type(extend_parms_fn, None, FunctionType)

        if self._requires_training_frame() and training_frame is None:
            raise H2OValueError("Training frame required for %s algorithm, but none was given." % self.algo)

        # Renamed from the misleading "training_frame_exists": this flag is
        # True precisely when NO training frame was supplied.
        training_frame_missing = training_frame is None
        if training_frame_missing:
            # NOTE(review): presumably this rejects frame-dependent arguments
            # when no frame is given -- confirm against _verify_training_frame_params.
            self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)

        algo = self.algo
        if verbose and algo not in ["drf", "gbm", "deeplearning", "xgboost"]:
            raise H2OValueError("Verbose should only be set to True for drf, gbm, deeplearning, and xgboost models")
        parms = self._parms.copy()
        if "__class__" in parms:  # FIXME: hackt for PY3
            del parms["__class__"]
        is_auto_encoder = bool(parms.get("autoencoder"))
        # These algos take no response column.
        is_supervised = not(is_auto_encoder or algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec", "isolationforest", "generic"})
        if not training_frame_missing:
            names = training_frame.names
            ncols = training_frame.ncols

        if is_supervised:
            # Resolve the response column to a name; ints may be negative.
            if y is None: y = "response"
            if is_type(y, int):
                if not (-ncols <= y < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % y)
                y = names[y]
            else:
                if y not in names:
                    raise H2OValueError("Column %s does not exist in the training frame" % y)
            # An enum (categorical) response means classification.
            self._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor"
        else:
            # If `y` is provided for an unsupervised model we'll simply ignore
            # it. This way an unsupervised model can be used as a step in
            # sklearn's pipeline.
            y = None

        if not training_frame_missing:
            assert_is_type(y, str, None)
            # Resolve ignored_columns (mutually exclusive with x) and the
            # predictor set; default predictors are all non-special columns.
            ignored_columns_set = set()
            if ignored_columns is None and "ignored_columns" in parms:
                ignored_columns = parms['ignored_columns']
            if ignored_columns is not None:
                if x is not None:
                    raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
                for ic in ignored_columns:
                    if is_type(ic, int):
                        if not (-ncols <= ic < ncols):
                            raise H2OValueError("Column %d does not exist in the training frame" % ic)
                        ignored_columns_set.add(names[ic])
                    else:
                        if ic not in names:
                            raise H2OValueError("Column %s not in the training frame" % ic)
                        ignored_columns_set.add(ic)
            if x is None:
                xset = set(names) - {y} - ignored_columns_set
            else:
                xset = set()
                if is_type(x, int, str): x = [x]
                for xi in x:
                    if is_type(xi, int):
                        if not (-ncols <= xi < ncols):
                            raise H2OValueError("Column %d does not exist in the training frame" % xi)
                        xset.add(names[xi])
                    else:
                        if xi not in names:
                            raise H2OValueError("Column %s not in the training frame" % xi)
                        xset.add(xi)
            x = list(xset)
            self._check_and_save_parm(parms, "offset_column", offset_column)
            self._check_and_save_parm(parms, "weights_column", weights_column)
            self._check_and_save_parm(parms, "fold_column", fold_column)

        if max_runtime_secs is not None: parms["max_runtime_secs"] = max_runtime_secs

        # Overwrites the model_id parameter only if model_id is passed
        if model_id is not None:
            parms["model_id"] = model_id

        # Step 2
        is_auto_encoder = "autoencoder" in parms and parms["autoencoder"]
        is_unsupervised = is_auto_encoder or self.algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec", "isolationforest"}
        if is_auto_encoder and y is not None: raise ValueError("y should not be specified for autoencoder.")
        if not is_unsupervised and y is None and self.algo not in ["generic"]: raise ValueError("Missing response")

        # Step 3: assemble the parms dict for the backend; the server receives
        # the complement of x as "ignored_columns".
        if not training_frame_missing:
            parms["training_frame"] = training_frame
            offset = parms["offset_column"]
            folds = parms["fold_column"]
            weights = parms["weights_column"]

        if validation_frame is not None: parms["validation_frame"] = validation_frame
        if is_type(y, int): y = training_frame.names[y]
        if y is not None: parms["response_column"] = y
        if not isinstance(x, (list, tuple)): x = [x]
        if is_type(x[0], int):
            x = [training_frame.names[i] for i in x]
        if not training_frame_missing:
            ignored_columns = list(set(training_frame.names) - set(x + [y, offset, folds, weights]))
            parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
        parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None else
                                 [quoted(col) for col in parms["interactions"]])
        parms["interaction_pairs"] = (None if "interaction_pairs" not in parms or parms["interaction_pairs"] is None else
                                      [tuple(map(quoted, ip)) for ip in parms["interaction_pairs"]])

        # Internal hook allowing subclasses to extend the training parms.
        if extend_parms_fn is not None:
            extend_parms_fn(parms)

        # H2OFrame-valued parameters are sent by frame key, not by value.
        parms = {k: H2OEstimator._keyify_if_h2oframe(parms[k]) for k in parms}
        if ("stopping_metric" in parms.keys()) and ("r2" in parms["stopping_metric"]):
            raise H2OValueError("r2 cannot be used as an early stopping_metric yet.  Check this JIRA https://0xdata.atlassian.net/browse/PUBDEV-5381 for progress.")
        rest_ver = parms.pop("_rest_version") if "_rest_version" in parms else 3

        model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms)
        model = H2OJob(model_builder_json, job_type=(self.algo + " Model Build"))

        # In "future" mode, return immediately and let the caller poll the job.
        if self._future:
            self._job = model
            self._rest_version = rest_ver
            return

        model.poll(verbose_model_scoring_history=verbose)
        model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
        self._resolve_model(model.dest_key, model_json)
Exemplo n.º 54
0
    def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
              weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
              model_id=None, verbose=False):
        """
        Train the H2O model.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param H2OFrame training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).
        :param offset_column: The name or index of the column in training_frame that holds the offsets.
        :param fold_column: The name or index of the column in training_frame that holds the per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
        :param validation_frame: H2OFrame with validation data to be scored on while training.
        :param float max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
        :param ignored_columns: A list of column names or indices to be excluded from the predictors
            (mutually exclusive with ``x``).
        :param model_id: An id to assign to the built model (optional).
        :param bool verbose: Print scoring history to stdout. Defaults to False.

        :raises H2OValueError: on invalid argument combinations or unknown column references.
        """
        assert_is_type(training_frame, None, H2OFrame)
        assert_is_type(validation_frame, None, H2OFrame)
        assert_is_type(y, None, int, str)
        assert_is_type(x, None, int, str, [str, int], {str, int})
        assert_is_type(ignored_columns, None, [str, int], {str, int})
        assert_is_type(offset_column, None, int, str)
        assert_is_type(fold_column, None, int, str)
        assert_is_type(weights_column, None, int, str)
        assert_is_type(max_runtime_secs, None, numeric)
        assert_is_type(model_id, None, str)
        assert_is_type(verbose, bool)

        if self._requires_training_frame() and training_frame is None:
            raise H2OValueError("Training frame required for %s algorithm, but none was given.", self.algo)

        # NOTE: this flag was previously (and confusingly) named ``training_frame_exists`` although it is
        # True precisely when NO training frame was supplied; renamed so it says what it means.
        training_frame_missing = training_frame is None
        if training_frame_missing:
            # Frame-bound parameters (offset/fold/weights/validation) make no sense without a frame.
            self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)

        algo = self.algo
        if verbose and algo not in ["drf", "gbm", "deeplearning", "xgboost"]:
            raise H2OValueError("Verbose should only be set to True for drf, gbm, deeplearning, and xgboost models")
        parms = self._parms.copy()
        if "__class__" in parms:  # FIXME: hackt for PY3
            del parms["__class__"]
        is_auto_encoder = bool(parms.get("autoencoder"))
        is_supervised = not(is_auto_encoder or algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec"})
        if not training_frame_missing:
            names = training_frame.names
            ncols = training_frame.ncols

        if is_supervised:
            # Resolve the response column to a name; default to "response" when not given.
            # NOTE(review): ``names``/``ncols``/``training_frame`` are only bound when a training frame was
            # supplied; a supervised algo without one would fail below — presumed unreachable because
            # supervised algos require a training frame (checked above). TODO confirm.
            if y is None: y = "response"
            if is_type(y, int):
                if not (-ncols <= y < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % y)
                y = names[y]
            else:
                if y not in names:
                    raise H2OValueError("Column %s does not exist in the training frame" % y)
            self._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor"
        else:
            # If `y` is provided for an unsupervised model we'll simply ignore
            # it. This way an unsupervised model can be used as a step in
            # sklearn's pipeline.
            y = None

        if not training_frame_missing:
            assert_is_type(y, str, None)
            # Resolve ignored columns (indices -> names), then derive the predictor set ``x``.
            ignored_columns_set = set()
            if ignored_columns is not None:
                if x is not None:
                    raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
                for ic in ignored_columns:
                    if is_type(ic, int):
                        if not (-ncols <= ic < ncols):
                            raise H2OValueError("Column %d does not exist in the training frame" % ic)
                        ignored_columns_set.add(names[ic])
                    else:
                        if ic not in names:
                            raise H2OValueError("Column %s not in the training frame" % ic)
                        ignored_columns_set.add(ic)
            if x is None:
                # Default predictors: every column except the response and the ignored ones.
                xset = set(names) - {y} - ignored_columns_set
            else:
                xset = set()
                if is_type(x, int, str): x = [x]
                for xi in x:
                    if is_type(xi, int):
                        if not (-ncols <= xi < ncols):
                            raise H2OValueError("Column %d does not exist in the training frame" % xi)
                        xset.add(names[xi])
                    else:
                        if xi not in names:
                            raise H2OValueError("Column %s not in the training frame" % xi)
                        xset.add(xi)
            x = list(xset)

            parms["offset_column"] = offset_column
            parms["fold_column"] = fold_column
            parms["weights_column"] = weights_column
            parms["max_runtime_secs"] = max_runtime_secs

        # Overwrites the model_id parameter only if model_id is passed
        if model_id is not None:
            parms["model_id"] = model_id

        # Step 2: sanity-check the response against the (un)supervised nature of the algorithm.
        # (The original recomputed is_auto_encoder / is_unsupervised here from the same unchanged
        # inputs; the earlier flags are reused instead — ``is_unsupervised`` was ``not is_supervised``.)
        if is_auto_encoder and y is not None: raise ValueError("y should not be specified for autoencoder.")
        if is_supervised and y is None: raise ValueError("Missing response")

        # Step 3: assemble the final parameter dict for the REST call.
        if not training_frame_missing:
            parms["training_frame"] = training_frame
            offset = parms["offset_column"]
            folds = parms["fold_column"]
            weights = parms["weights_column"]

        if validation_frame is not None: parms["validation_frame"] = validation_frame
        if is_type(y, int): y = training_frame.names[y]
        if y is not None: parms["response_column"] = y
        if not isinstance(x, (list, tuple)): x = [x]
        if is_type(x[0], int):
            x = [training_frame.names[i] for i in x]
        if not training_frame_missing:
            # Everything that is neither a predictor nor a special column gets ignored on the backend.
            ignored_columns = list(set(training_frame.names) - set(x + [y, offset, folds, weights]))
            parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
        parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None else
                                 [quoted(col) for col in parms["interactions"]])
        parms["interaction_pairs"] = (None if "interaction_pairs" not in parms or parms["interaction_pairs"] is None else
                                 [tuple(map(quoted, ip)) for ip in parms["interaction_pairs"]])

        # H2OFrame-valued parameters must be sent to the backend as their frame keys.
        parms = {k: H2OEstimator._keyify_if_h2oframe(parms[k]) for k in parms}
        rest_ver = parms.pop("_rest_version") if "_rest_version" in parms else 3

        model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms)
        model = H2OJob(model_builder_json, job_type=(self.algo + " Model Build"))

        if self._future:
            # Asynchronous mode: stash the job and return immediately without polling.
            self._job = model
            self._rest_version = rest_ver
            return

        model.poll(verbose_model_scoring_history=verbose)
        model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
        self._resolve_model(model.dest_key, model_json)
Exemplo n.º 55
0
    def open(server=None, url=None, ip=None, port=None, https=None, auth=None, verify_ssl_certificates=True,
             proxy=None, cookies=None, verbose=True, _msgs=None):
        r"""
        Establish connection to an existing H2O server.

        The connection is not kept alive, so what this method actually does is it attempts to connect to the
        specified server, and checks that the server is healthy and responds to REST API requests. If the H2O server
        cannot be reached, an :class:`H2OConnectionError` will be raised. On success this method returns a new
        :class:`H2OConnection` object, and it is the only "official" way to create instances of this class.

        There are 3 ways to specify the target to connect to (these settings are mutually exclusive):

            * pass a ``server`` option,
            * pass the full ``url`` for the connection,
            * provide a triple of parameters ``ip``, ``port``, ``https``.

        :param H2OLocalServer server: connect to the specified local server instance. There is a slight difference
            between connecting to a local server by specifying its ip and address, and connecting through
            an H2OLocalServer instance: if the server becomes unresponsive, then having access to its process handle
            will allow us to query the server status through OS, and potentially provide snapshot of the server's
            error log in the exception information.
        :param url: full url of the server to connect to.
        :param ip: target server's IP address or hostname (default "localhost").
        :param port: H2O server's port (default 54321).
        :param https: if True then connect using https instead of http (default False).
        :param verify_ssl_certificates: if False then SSL certificate checking will be disabled (default True). This
            setting should rarely be disabled, as it makes your connection vulnerable to man-in-the-middle attacks. When
            used, it will generate a warning from the requests library. Has no effect when ``https`` is False.
        :param auth: authentication token for connecting to the remote server. This can be either a
            (username, password) tuple, or an authenticator (AuthBase) object. Please refer to the documentation in
            the ``requests.auth`` module.
        :param proxy: url address of a proxy server. If you do not specify the proxy, then the requests module
            will attempt to use a proxy specified in the environment (in HTTP_PROXY / HTTPS_PROXY variables). We
            check for the presence of these variables and issue a warning if they are found. In order to suppress
            that warning and use proxy from the environment, pass ``proxy="(default)"``.
        :param cookies: Cookie (or list of) to add to requests
        :param verbose: if True, then connection progress info will be printed to the stdout.
        :param _msgs: custom messages to display during connection. This is a tuple (initial message, success message,
            failure message).

        :returns: A new :class:`H2OConnection` instance.
        :raises H2OConnectionError: if the server cannot be reached.
        :raises H2OServerError: if the server is in an unhealthy state (although this might be a recoverable error, the
            client itself should decide whether it wants to retry or not).
        """
        # Resolve scheme/ip/port/context_path from whichever one of the three mutually-exclusive
        # target specifications was supplied.
        if server is not None:
            assert_is_type(server, H2OLocalServer)
            assert_is_type(ip, None, "`ip` should be None when `server` parameter is supplied")
            # BUG FIX: the message for the `url` check previously said "`ip` should be None" (copy-paste).
            assert_is_type(url, None, "`url` should be None when `server` parameter is supplied")
            if not server.is_running():
                raise H2OConnectionError("Unable to connect to server because it is not running")
            ip = server.ip
            port = server.port
            scheme = server.scheme
            context_path = ''
        elif url is not None:
            assert_is_type(url, str)
            assert_is_type(ip, None, "`ip` should be None when `url` parameter is supplied")
            # We don't allow any Unicode characters in the URL. Maybe some day we will...
            match = assert_matches(url, H2OConnection.url_pattern)
            scheme = match.group(1)
            ip = match.group(2)
            port = int(match.group(3))
            context_path = '' if match.group(4) is None else "%s" % (match.group(4))
        else:
            # ip/port/https triple, with defaults for anything omitted.
            if ip is None: ip = str("localhost")
            if port is None: port = 54321
            if https is None: https = False
            if is_type(port, str) and port.isdigit(): port = int(port)
            assert_is_type(ip, str)
            assert_is_type(port, int)
            assert_is_type(https, bool)
            assert_matches(ip, r"(?:[\w-]+\.)*[\w-]+")
            assert_satisfies(port, 1 <= port <= 65535)
            scheme = "https" if https else "http"
            context_path = ''

        if verify_ssl_certificates is None: verify_ssl_certificates = True
        assert_is_type(verify_ssl_certificates, bool)
        assert_is_type(proxy, str, None)
        assert_is_type(auth, AuthBase, (str, str), None)
        assert_is_type(cookies, str, [str], None)
        assert_is_type(_msgs, None, (str, str, str))

        conn = H2OConnection()
        conn._verbose = bool(verbose)
        conn._local_server = server
        conn._base_url = "%s://%s:%d%s" % (scheme, ip, port, context_path)
        conn._verify_ssl_cert = bool(verify_ssl_certificates)
        conn._auth = auth
        conn._cookies = cookies
        conn._proxies = None
        if proxy and proxy != "(default)":
            conn._proxies = {scheme: proxy}
        elif not proxy:
            # Give user a warning if there are any "*_proxy" variables in the environment. [PUBDEV-2504]
            # To suppress the warning pass proxy = "(default)".
            for name in os.environ:
                if name.lower() == scheme + "_proxy":
                    warn("Proxy is defined in the environment: %s. "
                         "This may interfere with your H2O Connection." % name)

        try:
            # A managed local server gets more retries since it may still be starting up.
            retries = 20 if server else 5
            conn._stage = 1
            conn._timeout = 3.0
            conn._cluster = conn._test_connection(retries, messages=_msgs)
            # If a server is unable to respond within 1s, it should be considered a bug. However we disable this
            # setting for now, for no good reason other than to ignore all those bugs :(
            conn._timeout = None
            # This is a good one! On the surface it registers a callback to be invoked when the script is about
            # to finish, but it also has a side effect in that the reference to current connection will be held
            # by the ``atexit`` service till the end -- which means it will never be garbage-collected.
            atexit.register(lambda: conn.close())
        except Exception:
            # Reset _session_id so that we know the connection was not initialized properly.
            conn._stage = 0
            raise
        return conn
Exemplo n.º 56
0
    def train(self, x=None, y=None, training_frame=None, fold_column=None,
              weights_column=None, validation_frame=None, leaderboard_frame=None):
        """
        Begins an AutoML task, a background task that automatically builds a number of models
        with various algorithms and tracks their performance in a leaderboard. At any point
        in the process you may use H2O's performance or prediction functions on the resulting
        models.

        :param x: A list of column names or indices indicating the predictor columns (a single
            column name or index is also accepted).
        :param y: An index or a column name indicating the response column.
        :param fold_column: The name or index of the column in training_frame that holds per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds per-row weights.
        :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold_column or weights_column).
        :param validation_frame: H2OFrame with validation data to be scored on while training. Optional.
            This frame is used early stopping of individual models and early stopping of the grid searches
            (unless max_models or max_runtime_secs overrides metric-based early stopping).
        :param leaderboard_frame: H2OFrame with test data for scoring the leaderboard.  This is optional and
            if this is set to None (the default), then cross-validation metrics will be used to generate the leaderboard
            rankings instead.

        :returns: None. The job results are attached to this H2OAutoML instance once polling completes.

        :examples:
        >>> # Set up an H2OAutoML object
        >>> aml = H2OAutoML(max_runtime_secs=30)
        >>> # Launch an AutoML run
        >>> aml.train(y=y, training_frame=train)
        """
        # BUG FIX: validate the training frame before dereferencing it; the original read
        # .ncols/.names first, raising an opaque AttributeError when training_frame was None.
        if training_frame is None:
            raise ValueError('The training frame is not set!')
        assert_is_type(training_frame, H2OFrame)
        ncols = training_frame.ncols
        names = training_frame.names

        # Minimal required arguments are training_frame and y (response)
        if y is None:
            raise ValueError('The response column (y) is not set; please set it to the name of the column that you are trying to predict in your data.')
        assert_is_type(y, int, str)
        if is_type(y, int):
            if not (-ncols <= y < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % y)
            y = names[y]
        else:
            if y not in names:
                raise H2OValueError("Column %s does not exist in the training frame" % y)

        input_spec = {
            'response_column': y,
            'training_frame': training_frame.frame_id,
        }

        if fold_column is not None:
            assert_is_type(fold_column, int, str)
            input_spec['fold_column'] = fold_column

        if weights_column is not None:
            assert_is_type(weights_column, int, str)
            input_spec['weights_column'] = weights_column

        if validation_frame is not None:
            # BUG FIX: the original asserted training_frame here instead of validation_frame.
            assert_is_type(validation_frame, H2OFrame)
            input_spec['validation_frame'] = validation_frame.frame_id

        if leaderboard_frame is not None:
            # BUG FIX: the original asserted training_frame here instead of leaderboard_frame.
            assert_is_type(leaderboard_frame, H2OFrame)
            input_spec['leaderboard_frame'] = leaderboard_frame.frame_id

        if x is not None:
            # Accept a single column (name or index) as well as a list of them; the original
            # asserted `list` only, which made the single-value wrap below dead code.
            assert_is_type(x, list, int, str)
            if is_type(x, int, str): x = [x]
            xset = set()
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError("Column %s not in the training frame" % xi)
                    xset.add(xi)
            x = list(xset)
            ignored_columns = set(names) - {y} - set(x)
            # BUG FIX: set.remove() returns None, so the original ``ignored_columns =
            # ignored_columns.remove(...)`` wiped the whole set (and raised KeyError when the
            # column was absent). discard() mutates in place and tolerates missing members.
            if fold_column is not None: ignored_columns.discard(fold_column)
            if weights_column is not None: ignored_columns.discard(weights_column)
            input_spec['ignored_columns'] = list(ignored_columns)

        automl_build_params = dict(input_spec=input_spec)

        # NOTE: if the user hasn't specified some block of parameters don't send them!
        # This lets the back end use the defaults.
        automl_build_params['build_control'] = self.build_control
        automl_build_params['build_models'] = self.build_models

        resp = h2o.api('POST /99/AutoMLBuilder', json=automl_build_params)
        if 'job' not in resp:
            print("Exception from the back end: ")
            print(resp)
            return

        # Poll the backend job to completion, then pull the AutoML results into this object.
        self._job = H2OJob(resp['job'], "AutoML")
        self._automl_key = self._job.dest_key
        self._job.poll()
        self._fetch()