示例#1
0
    def get_hyperparams(self, id, display=True):
        """
        Return the actual hyperparameter values of one model explored by grid search.

        Parameters
        ----------
        id: str or int
          The model id of the model of interest (looked up in ``self.model_ids``). An integer is
          used directly as the index of the model within this grid.
        display: boolean
          Flag to indicate whether to print the hyperparameter names.

        Returns
        -------
          A list with one value per entry of ``self.hyper_params``, in iteration order.
        """
        if is_int(id):
            position = id
        else:
            position = self.model_ids.index(id)
        chosen = self[position]

        # With cross-validation turned on, the values are read from the first fold model: it is the
        # fold model that actually carries parameters such as max_runtime_secs, not the main model.
        if chosen._is_xvalidated:
            chosen = h2o.get_model(chosen._xval_keys[0])

        values = []
        for hyper_name in self.hyper_params:
            actual = chosen.params[hyper_name]['actual']
            # A list-valued "actual" entry is reduced to its first element.
            values.append(actual[0] if isinstance(actual, list) else actual)

        if display:
            names = ', '.join(self.hyper_params.keys())
            print('Hyperparameters: [' + names + ']')
        return values
示例#2
0
    def get_hyperparams_dict(self, id, display=True):
        """
        Derive and return the model parameters used to train a particular grid search model.

        Parameters
        ----------
        id: str or int
          The model id of the model of interest (looked up in ``self.model_ids``). An integer is
          used directly as the index of the model within this grid.
        display: boolean
          Flag to indicate whether to print the hyperparameter names.

        Returns
        -------
          A dict mapping each name in ``self.hyper_names`` to the actual value used by the model.
        """
        position = id if is_int(id) else self.model_ids.index(id)
        chosen = self[position]

        # With cross-validation turned on, the values are read from the first fold model: it is the
        # fold model that actually carries parameters such as max_runtime_secs, not the main model.
        if chosen._is_xvalidated:
            chosen = h2o.get_model(chosen._xval_keys[0])

        def _first_if_list(actual):
            # A list-valued "actual" entry is reduced to its first element.
            if isinstance(actual, list):
                return actual[0]
            return actual

        settings = {
            name: _first_if_list(chosen.params[name]['actual'])
            for name in self.hyper_names
        }

        if display:
            shown = list(self.hyper_params.keys())
            print('Hyperparameters: [' + ', '.join(shown) + ']')
        return settings
示例#3
0
def interaction(data,
                factors,
                pairwise,
                max_factors,
                min_occurrence,
                destination_frame=None):
    """
    Categorical Interaction Feature Creation in H2O.
    Creates a frame in H2O with n-th order interaction features between categorical columns, as
    specified by the user.

    Parameters
    ----------
      data : H2OFrame
        the H2OFrame that holds the target categorical columns.

      factors : list
        Factor columns (either indices or column names). Indices are translated to column names
        through ``data.names``.

      pairwise : bool
        Whether to create pairwise interactions between factors (otherwise create one
        higher-order interaction). Only applicable if there are 3 or more factors.

      max_factors : int
        Max. number of factor levels in pair-wise interaction terms (if enforced, one extra
        catch-all factor will be made)

      min_occurrence : int
        Min. occurrence threshold for factor levels in pair-wise interaction terms

      destination_frame : str
        A string indicating the destination key. If None, a temporary key is generated.

    Returns
    -------
      H2OFrame
    """
    # Resolve integer positions to column names; names are passed through untouched.
    factor_names = []
    for item in factors:
        factor_names.append(data.names[item] if is_int(item) else item)

    if destination_frame is None:
        dest_key = py_tmp_key(append=h2oconn.session_id)
    else:
        dest_key = destination_frame

    request = {
        "dest": dest_key,
        "source_frame": data.frame_id,
        "factor_columns": [quoted(name) for name in factor_names],
        "pairwise": pairwise,
        "max_factors": max_factors,
        "min_occurrence": min_occurrence,
    }

    # Submit the job and wait for it to finish before fetching the resulting frame.
    job = H2OJob(api("POST /3/Interaction", data=request), "Interactions")
    job.poll()
    return get_frame(request["dest"])
示例#4
0
    def _get_type_name(types):
        """
        Return a human-readable name for the provided collection of type specifiers.

        Each non-None element of ``types`` is rendered to a string and the pieces are joined
        with "|". If ``types`` contains None, the first piece is prefixed with "?" (or "None"
        is returned when there is nothing else).

            >>> _get_type_name([int]) == "integer"
            >>> _get_type_name([str]) == "string"
            >>> _get_type_name([tuple]) == "tuple"
            >>> _get_type_name([Exception]) == "Exception"
            >>> _get_type_name((int, float, bool)) == "integer|float|bool"
            >>> _get_type_name((H2OFrame, None)) == "?H2OFrame"

        Raises RuntimeError for an element that matches none of the supported kinds.
        """
        from h2o.utils.typechecks import is_str, is_int, U, I, numeric

        def _nested(spec):
            # Recursion goes through the class attribute, exactly like the top-level entry point.
            return H2OTypeError._get_type_name(spec)

        def _render(tt):
            # The order of the checks below is significant and mirrors the dispatch order.
            if tt is str:
                return "string"
            if tt is int:
                return "integer"
            if tt is numeric:
                return "numeric"
            if is_str(tt):
                # Literal string value: shown in double quotes, using repr() minus its first and last char.
                return '"%s"' % repr(tt)[1:-1]
            if is_int(tt):
                # Literal integer value.
                return str(tt)
            if isinstance(tt, U):
                return _nested(tt)
            if isinstance(tt, I):
                return "&".join(_nested([member]) for member in tt)
            if isinstance(tt, type):
                return tt.__name__
            if isinstance(tt, list):
                return "list(%s)" % _nested(tt)
            if isinstance(tt, set):
                return "set(%s)" % _nested(tt)
            if isinstance(tt, tuple):
                return "(%s)" % ", ".join(_nested([item]) for item in tt)
            if isinstance(tt, dict):
                pairs = ("%s: %s" % (_nested([key]), _nested([value]))
                         for key, value in tt.items())
                return "dict(%s)" % ", ".join(pairs)
            raise RuntimeError("Unexpected `tt`: %r" % tt)

        optional = False
        names = []
        for tt in types:
            if tt is None:
                optional = True
            else:
                names.append(_render(tt))

        if optional:
            if not names:
                return "None"
            names[0] = "?" + names[0]
        return "|".join(names)
示例#5
0
    def _get_type_name(types):
        """
        Return the display name of the provided type specification.

        ``types`` is iterated once; every element other than None contributes one piece to the
        result and the pieces are joined with "|". A None element marks the whole specification
        as optional: the first piece then gets a "?" prefix, or "None" is returned if there are
        no other pieces.

            >>> _get_type_name([int]) == "integer"
            >>> _get_type_name([str]) == "string"
            >>> _get_type_name([tuple]) == "tuple"
            >>> _get_type_name([Exception]) == "Exception"
            >>> _get_type_name((int, float, bool)) == "integer|float|bool"
            >>> _get_type_name((H2OFrame, None)) == "?H2OFrame"

        Raises RuntimeError when an element is of an unsupported kind.
        """
        from h2o.utils.typechecks import is_str, is_int, U, I, numeric

        # Specifiers matched by identity, with their fixed display names (checked in this order).
        fixed_names = ((str, "string"), (int, "integer"), (numeric, "numeric"))

        def nested(spec):
            return H2OTypeError._get_type_name(spec)

        saw_none = False
        pieces = []
        for tt in types:
            if tt is None:
                saw_none = True
                continue

            fixed = [label for exact, label in fixed_names if tt is exact]
            if fixed:
                text = fixed[0]
            elif is_str(tt):
                # Literal string: repr() without its first and last character, in double quotes.
                text = '"%s"' % repr(tt)[1:-1]
            elif is_int(tt):
                text = str(tt)
            elif isinstance(tt, U):
                text = nested(tt)
            elif isinstance(tt, I):
                text = "&".join(nested([member]) for member in tt)
            elif isinstance(tt, type):
                text = tt.__name__
            elif isinstance(tt, (list, set)):
                container = "list" if isinstance(tt, list) else "set"
                text = "%s(%s)" % (container, nested(tt))
            elif isinstance(tt, tuple):
                text = "(%s)" % ", ".join(nested([item]) for item in tt)
            elif isinstance(tt, dict):
                entries = []
                for key, value in tt.items():
                    entries.append("%s: %s" % (nested([key]), nested([value])))
                text = "dict(%s)" % ", ".join(entries)
            else:
                raise RuntimeError("Unexpected `tt`: %r" % tt)
            pieces.append(text)

        if saw_none and not pieces:
            return "None"
        if saw_none:
            pieces[0] = "?" + pieces[0]
        return "|".join(pieces)
示例#6
0
    def _model_build(self, x, y, tframe, vframe, kwargs):
        """
        Submit a model-building job to the H2O backend and, unless running in "future" mode,
        wait for it and load the resulting model.

        :param x: Predictor column(s): a single name/index, or a list/tuple of names or of indices into
            ``tframe.names`` (the type of the first element decides whether indices are translated).
        :param y: Response column name, or its index into ``tframe.names``; may be None.
        :param tframe: Training frame; stored under ``kwargs['training_frame']``.
        :param vframe: Validation frame, or None to omit ``validation_frame``.
        :param kwargs: Dict of builder parameters. It is updated in place and must already contain the keys
            ``offset_column``, ``fold_column`` and ``weights_column`` (a missing key raises KeyError).

        :returns: None. When ``self._future`` is set the job is stored in ``self._job`` and the method returns
            without polling; otherwise the finished model is handed to ``self._resolve_model``.
        """
        kwargs['training_frame'] = tframe
        if vframe is not None: kwargs["validation_frame"] = vframe
        if is_int(y): y = tframe.names[y]
        if y is not None: kwargs['response_column'] = y
        # Always work with a real list: a tuple of column names would otherwise fail below with
        # "can only concatenate tuple (not "list") to tuple" when the special columns are appended.
        x = list(x) if isinstance(x, (list, tuple)) else [x]
        if is_int(x[0]):
            x = [tframe.names[i] for i in x]
        offset = kwargs["offset_column"]
        folds = kwargs["fold_column"]
        weights = kwargs["weights_column"]
        # Every training-frame column that is neither a predictor nor one of the special columns is ignored.
        ignored_columns = list(
            set(tframe.names) - set(x + [y, offset, folds, weights]))
        kwargs["ignored_columns"] = [quoted(col) for col in ignored_columns] or None
        interactions = kwargs.get("interactions")
        kwargs["interactions"] = None if interactions is None else [
            quoted(col) for col in interactions
        ]
        # From here on `kwargs` is a fresh dict; the caller's dict is not touched any further.
        kwargs = {
            k: H2OEstimator._keyify_if_H2OFrame(v)
            for k, v in kwargs.items()
        }
        rest_ver = kwargs.pop("_rest_version", 3)
        algo = self._compute_algo()

        model = H2OJob(h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, algo),
                               data=kwargs),
                       job_type=(algo + " Model Build"))

        if self._future:
            # Asynchronous mode: keep the job handle and let the caller poll later.
            self._job = model
            return

        model.poll()
        model_json = h2o.api("GET /%d/Models/%s" %
                             (rest_ver, model.dest_key))["models"][0]
        self._resolve_model(model.dest_key, model_json)
示例#7
0
 def _add_agg(self, op, col, na):
     """
     Register the aggregation ``op`` for one or more columns and return ``self``.

     :param op: Name of the aggregation. For "nrow" the column is forced to index 0.
     :param col: Column name, column index, a list-like of those, or None to apply ``op`` to
         every column of ``self._fr`` whose index is not in ``self._by``.
     :param na: Stored alongside the operation and column index in ``self._aggs``.

     :returns: ``self``
     :raises ValueError: if ``col`` is none of the accepted kinds.
     """
     target = 0 if op == "nrow" else col

     if target is None:
         # No column given: recurse for every column index that is not listed in self._by.
         for position in range(self._fr.ncol):
             if position in self._by:
                 continue
             self._add_agg(op, position, na)
         return self

     if is_str(target):
         index = self._fr.names.index(target)
     elif is_int(target):
         index = target
     elif is_listlike(target):
         # Several columns: register each one separately.
         for member in target:
             self._add_agg(op, member, na)
         return self
     else:
         raise ValueError("col must be a column name or index.")

     label = "{}_{}".format(op, self._fr.names[index])
     self._aggs[label] = [op, index, na]
     return self
示例#8
0
文件: h2o.py 项目: Ansonparkour/h2o-3
def interaction(data, factors, pairwise, max_factors, min_occurrence, destination_frame=None):
    """
    Categorical Interaction Feature Creation in H2O.
    Creates a frame in H2O with n-th order interaction features between categorical columns, as
    specified by the user.

    Parameters
    ----------
      data : H2OFrame
        the H2OFrame that holds the target categorical columns.

      factors : list
        Factor columns (either indices or column names). Indices are looked up in ``data.names``.

      pairwise : bool
        Whether to create pairwise interactions between factors (otherwise create one
        higher-order interaction). Only applicable if there are 3 or more factors.

      max_factors : int
        Max. number of factor levels in pair-wise interaction terms (if enforced, one extra
        catch-all factor will be made)

      min_occurrence : int
        Min. occurrence threshold for factor levels in pair-wise interaction terms

      destination_frame : str
        A string indicating the destination key. If None, a temporary key is generated.

    Returns
    -------
      H2OFrame
    """
    def _as_name(ref):
        # Integer references are positions in data.names; anything else is taken as the name itself.
        if is_int(ref):
            return data.names[ref]
        return ref

    resolved = [_as_name(ref) for ref in factors]

    payload = {}
    payload["dest"] = destination_frame if destination_frame is not None \
        else py_tmp_key(append=h2oconn.session_id)
    payload["source_frame"] = data.frame_id
    payload["factor_columns"] = [quoted(name) for name in resolved]
    payload["pairwise"] = pairwise
    payload["max_factors"] = max_factors
    payload["min_occurrence"] = min_occurrence

    # Start the server-side job, block until it completes, then fetch the frame it produced.
    response = api("POST /3/Interaction", data=payload)
    H2OJob(response, "Interactions").poll()
    return get_frame(payload["dest"])
示例#9
0
    def start(jar_path=None, nthreads=-1, enable_assertions=True, max_mem_size=None, min_mem_size=None,
              ice_root=None, port="54321+", verbose=True):
        """
        Start new H2O server on the local machine.

        :param jar_path: Path to the h2o.jar executable. If not given, then we will search for h2o.jar in the
            locations returned by `._jar_paths()`.
        :param nthreads: Number of threads in the thread pool. This should be related to the number of CPUs used.
            -1 means use all CPUs on the host. A positive integer specifies the number of CPUs directly.
        :param enable_assertions: If True, pass `-ea` option to the JVM.
        :param max_mem_size: Maximum heap size (jvm option Xmx), in bytes.
        :param min_mem_size: Minimum heap size (jvm option Xms), in bytes.
        :param ice_root: A directory where H2O stores its temporary files. Default location is determined by
            tempfile.mkdtemp().
        :param port: Port where to start the new server. This could be either an integer, or a string of the form
            "DDDDD+", indicating that the server should start looking for an open port starting from DDDDD and up.
            None is treated as "54321+".
        :param verbose: If True, then connection info will be printed to the stdout.

        :returns: a new H2OLocalServer instance
        :raises AssertionError: if any of the arguments fails the validation below.
        """
        # NOTE: the argument checks are `assert` statements, so they are skipped when Python runs with -O.
        assert jar_path is None or is_str(jar_path), "`jar_path` should be string, got %s" % type(jar_path)
        assert jar_path is None or jar_path.endswith("h2o.jar"), \
            "`jar_path` should be a path to an h2o.jar executable, got %s" % jar_path
        assert is_int(nthreads), "`nthreads` should be integer, got %s" % type(nthreads)
        assert nthreads == -1 or 1 <= nthreads <= 4096, "`nthreads` is out of bounds: %d" % nthreads
        assert isinstance(enable_assertions, bool), \
            "`enable_assertions` should be bool, got %s" % type(enable_assertions)
        assert max_mem_size is None or is_int(max_mem_size), \
            "`max_mem_size` should be integer, got %s" % type(max_mem_size)
        assert max_mem_size is None or max_mem_size >= 1 << 25, "`max_mem_size` too small: %d" % max_mem_size
        assert min_mem_size is None or is_int(min_mem_size), \
            "`min_mem_size` should be integer, got %s" % type(min_mem_size)
        assert min_mem_size is None or max_mem_size is None or min_mem_size <= max_mem_size, \
            "`min_mem_size`=%d is larger than the `max_mem_size`=%d" % (min_mem_size, max_mem_size)
        if ice_root:
            assert is_str(ice_root), "`ice_root` should be string, got %r" % type(ice_root)
            assert os.path.isdir(ice_root), "`ice_root` is not a valid directory: %s" % ice_root
        if port is None: port = "54321+"
        # `baseport` stays None for a fixed port; for the "DDDD+" form it holds DDDD and `port` becomes 0.
        baseport = None
        if is_str(port):
            if port.isdigit():
                port = int(port)
            else:
                # `endswith` (instead of `port[-1]`) keeps an empty string on the assertion path rather than
                # failing with an IndexError.
                assert port.endswith("+") and port[:-1].isdigit(), \
                    "`port` should be of the form 'DDDD+', where D is a digit. Got: %s" % port
                baseport = int(port[:-1])
                port = 0
        assert is_int(port), "`port` should be integer (or string). Got: %s" % type(port)

        hs = H2OLocalServer()
        hs._verbose = bool(verbose)
        hs._jar_path = hs._find_jar(jar_path)
        hs._ice_root = ice_root
        if not ice_root:
            # No directory supplied: create a temporary one and remember it in `_tempdir` as well.
            hs._ice_root = tempfile.mkdtemp()
            hs._tempdir = hs._ice_root

        if verbose: print("Attempting to start a local H2O server...")
        hs._launch_server(port=port, baseport=baseport, nthreads=int(nthreads), ea=enable_assertions,
                          mmax=max_mem_size, mmin=min_mem_size)
        if verbose: print("Server is running at %s://%s:%d" % (hs.scheme, hs.ip, hs.port))
        # Shut the server down when the interpreter exits.
        atexit.register(lambda: hs.shutdown())
        return hs
示例#10
0
    def _model_build(self, x, y, tframe, vframe, kwargs):
        """
        Submit a grid-search job to the H2O backend and, unless running in "future" mode, wait for it
        and load the resulting models.

        :param x: Predictor column(s): a single name/index, or a list-like of names or of indices into
            ``tframe.names`` (the type of the first element decides whether indices are translated).
        :param y: Response column name, or its index into ``tframe.names``; may be None.
        :param tframe: Training frame; stored under ``kwargs['training_frame']``.
        :param vframe: Validation frame, or None to omit ``validation_frame``.
        :param kwargs: Dict of builder parameters. It is updated in place and must already contain the keys
            ``offset_column``, ``fold_column`` and ``weights_column`` (a missing key raises KeyError).
            Entries whose value is None are not sent to the server.

        :returns: None. When ``self._future`` is set the job is stored in ``self._job`` and the method returns
            without polling; otherwise ``self.models`` is populated and ``self._resolve_grid`` is called.
        :raises ValueError: if the finished grid contains no models.
        """
        kwargs['training_frame'] = tframe
        if vframe is not None: kwargs["validation_frame"] = vframe
        if is_int(y): y = tframe.names[y]
        if y is not None: kwargs['response_column'] = y
        # Always work with a real list: a list-like that is not a list (e.g. a tuple of column names)
        # cannot be indexed and/or concatenated with the list of special columns below.
        x = list(x) if is_listlike(x) else [x]
        if is_int(x[0]):
            x = [tframe.names[i] for i in x]
        offset = kwargs["offset_column"]
        folds = kwargs["fold_column"]
        weights = kwargs["weights_column"]
        # Every training-frame column that is neither a predictor nor one of the special columns is ignored.
        ignored_columns = list(
            set(tframe.names) - set(x + [y, offset, folds, weights]))
        kwargs["ignored_columns"] = [quoted(col) for col in ignored_columns] or None
        # From here on `kwargs` is a fresh dict: frames are replaced by their ids and None values are dropped.
        kwargs = {
            k: (v.frame_id if isinstance(v, H2OFrame) else v)
            for k, v in kwargs.items() if v is not None
        }
        algo = self.model._compute_algo()  # unique to grid search
        if self.grid_id is not None: kwargs["grid_id"] = self.grid_id
        rest_ver = kwargs.pop("_rest_version", None)

        grid = H2OJob(h2o.api("POST /99/Grid/%s" % algo, data=kwargs),
                      job_type=(algo + " Grid Build"))

        if self._future:
            # Asynchronous mode: keep the job handle and let the caller poll later.
            self._job = grid
            return

        grid.poll()
        grid_json = h2o.api("GET /99/Grids/%s" % grid.dest_key)

        # Failure details are only reported when an explicit `_rest_version` was supplied.
        if rest_ver is not None and grid_json["failure_details"]:
            print("Errors/Warnings building gridsearch model\n")
            failed_params = grid_json["failed_params"]
            stack_traces = grid_json["failure_stack_traces"]

            for error_index, error_message in enumerate(grid_json["failure_details"]):
                if isinstance(failed_params[error_index], dict):
                    for h_name in grid_json['hyper_names']:
                        print("Hyper-parameter: {0}, {1}".format(
                            h_name, failed_params[error_index][h_name]))

                # A failure is printed only if a stack trace exists at the same position.
                if len(stack_traces) > error_index:
                    print("failure_details: {0}\nfailure_stack_traces: "
                          "{1}\n".format(error_message,
                                         stack_traces[error_index]))

        model_ids = grid_json['model_ids']
        self.models = [h2o.get_model(key['name']) for key in model_ids]

        # sometimes no model is returned due to bad parameter values provided by the user.
        if not model_ids:
            raise ValueError(
                "Gridsearch returns no model due to bad parameter values or other reasons...."
            )

        # get first model returned in list of models from grid search to get model class (binomial,
        # multinomial, etc)
        first_model_json = h2o.api(
            "GET /%d/Models/%s" %
            (rest_ver or 3, model_ids[0]['name']))['models'][0]
        self._resolve_grid(grid.dest_key, grid_json, first_model_json)
示例#11
0
    def start(jar_path=None,
              nthreads=-1,
              enable_assertions=True,
              max_mem_size=None,
              min_mem_size=None,
              ice_root=None,
              port="54321+",
              verbose=True):
        """
        Start new H2O server on the local machine.

        :param jar_path: Path to the h2o.jar executable. If not given, then we will search for h2o.jar in the
            locations returned by `._jar_paths()`.
        :param nthreads: Number of threads in the thread pool. This should be related to the number of CPUs used.
            -1 means use all CPUs on the host. A positive integer specifies the number of CPUs directly.
        :param enable_assertions: If True, pass `-ea` option to the JVM.
        :param max_mem_size: Maximum heap size (jvm option Xmx), in bytes.
        :param min_mem_size: Minimum heap size (jvm option Xms), in bytes.
        :param ice_root: A directory where H2O stores its temporary files. Default location is determined by
            tempfile.mkdtemp().
        :param port: Port where to start the new server. This could be either an integer, or a string of the form
            "DDDDD+", indicating that the server should start looking for an open port starting from DDDDD and up.
            None is treated as "54321+".
        :param verbose: If True, then connection info will be printed to the stdout.

        :returns: a new H2OLocalServer instance
        :raises AssertionError: if any of the arguments fails the validation below.
        """
        # NOTE: the argument checks are `assert` statements, so they are skipped when Python runs with -O.
        assert jar_path is None or is_str(
            jar_path), "`jar_path` should be string, got %s" % type(jar_path)
        assert jar_path is None or jar_path.endswith("h2o.jar"), \
            "`jar_path` should be a path to an h2o.jar executable, got %s" % jar_path
        assert is_int(
            nthreads), "`nthreads` should be integer, got %s" % type(nthreads)
        assert nthreads == -1 or 1 <= nthreads <= 4096, "`nthreads` is out of bounds: %d" % nthreads
        assert isinstance(enable_assertions, bool), \
            "`enable_assertions` should be bool, got %s" % type(enable_assertions)
        assert max_mem_size is None or is_int(max_mem_size), \
            "`max_mem_size` should be integer, got %s" % type(max_mem_size)
        assert max_mem_size is None or max_mem_size >= 1 << 25, "`max_mem_size` too small: %d" % max_mem_size
        assert min_mem_size is None or is_int(min_mem_size), \
            "`min_mem_size` should be integer, got %s" % type(min_mem_size)
        assert min_mem_size is None or max_mem_size is None or min_mem_size <= max_mem_size, \
            "`min_mem_size`=%d is larger than the `max_mem_size`=%d" % (min_mem_size, max_mem_size)
        if ice_root:
            assert is_str(
                ice_root
            ), "`ice_root` should be string, got %r" % type(ice_root)
            assert os.path.isdir(
                ice_root), "`ice_root` is not a valid directory: %s" % ice_root
        if port is None: port = "54321+"
        # `baseport` stays None for a fixed port; for the "DDDD+" form it holds DDDD and `port` becomes 0.
        baseport = None
        if is_str(port):
            if port.isdigit():
                port = int(port)
            else:
                # `endswith` (instead of `port[-1]`) keeps an empty string on the assertion path rather than
                # failing with an IndexError.
                assert port.endswith("+") and port[:-1].isdigit(), \
                    "`port` should be of the form 'DDDD+', where D is a digit. Got: %s" % port
                baseport = int(port[:-1])
                port = 0
        assert is_int(
            port), "`port` should be integer (or string). Got: %s" % type(port)

        hs = H2OLocalServer()
        hs._verbose = bool(verbose)
        hs._jar_path = hs._find_jar(jar_path)
        hs._ice_root = ice_root
        if not ice_root:
            # No directory supplied: create a temporary one and remember it in `_tempdir` as well.
            hs._ice_root = tempfile.mkdtemp()
            hs._tempdir = hs._ice_root

        if verbose: print("Attempting to start a local H2O server...")
        hs._launch_server(port=port,
                          baseport=baseport,
                          nthreads=int(nthreads),
                          ea=enable_assertions,
                          mmax=max_mem_size,
                          mmin=min_mem_size)
        if verbose:
            print("  Server is running at %s://%s:%d" %
                  (hs.scheme, hs.ip, hs.port))
        # Shut the server down when the interpreter exits.
        atexit.register(lambda: hs.shutdown())
        return hs