Пример #1
0
  def _model_build(self, x, y, tframe, vframe, kwargs):
    """
    Assemble the builder parameters, launch the model-build job and resolve the model.

    :param x: predictor column(s): a single name/index, or a list/tuple of names or indices.
    :param y: response column name, or an index into ``tframe.names``; may be None.
    :param tframe: training frame; its ``names`` are used to resolve indices and ignored columns.
    :param vframe: validation frame, or None.
    :param kwargs: builder parameters. Must contain the keys "offset_column", "fold_column"
      and "weights_column" (KeyError otherwise). The frame, response, ignored-column and
      interaction entries are written into this dict.
    :returns: None. When ``self._future`` is set, the job is stored in ``self._job`` and not polled.
    """
    kwargs['training_frame'] = tframe
    if vframe is not None: kwargs["validation_frame"] = vframe
    if isinstance(y, int): y = tframe.names[y]
    if y is not None: kwargs['response_column'] = y
    if not isinstance(x, (list, tuple)): x = [x]
    if isinstance(x[0], int):
      x = [tframe.names[i] for i in x]
    offset = kwargs["offset_column"]
    folds = kwargs["fold_column"]
    weights = kwargs["weights_column"]
    # list(x): a tuple of column names cannot be concatenated with a list directly.
    used_columns = set(list(x) + [y, offset, folds, weights])
    ignored_columns = list(set(tframe.names) - used_columns)
    kwargs["ignored_columns"] = [h2o.h2o._quoted(col) for col in ignored_columns] if ignored_columns else None
    interactions = kwargs.get("interactions")
    kwargs["interactions"] = None if interactions is None else [h2o.h2o._quoted(col) for col in interactions]
    # Every value goes through _keyify_if_H2OFrame before the request is posted.
    kwargs = {k: H2OEstimator._keyify_if_H2OFrame(v) for k, v in kwargs.items()}
    algo = self._compute_algo()

    model = H2OJob(H2OConnection.post_json("ModelBuilders/" + algo, **kwargs), job_type=(algo + " Model Build"))

    if self._future:
      self._job = model
      return

    model.poll()
    # Forward the REST version override to the model lookup only when one was supplied.
    version_arg = {"_rest_version": kwargs["_rest_version"]} if "_rest_version" in kwargs else {}
    model_json = H2OConnection.get_json("Models/" + model.dest_key, **version_arg)["models"][0]
    self._resolve_model(model.dest_key, model_json)
Пример #2
0
    def _model_build(self, x, y, tframe, vframe, kwargs):
        """
        Assemble the grid-search parameters, launch the grid build and resolve the resulting grid.

        :param x: predictor column(s): a single name/index, or a list/tuple of names or indices.
        :param y: response column name, or an index into ``tframe.names``; may be None.
        :param tframe: training frame; its ``names`` are used to resolve indices and ignored columns.
        :param vframe: validation frame, or None.
        :param kwargs: builder parameters. Must contain the keys "offset_column", "fold_column"
            and "weights_column" (KeyError otherwise). Entries whose value is None are not sent.
        :returns: None. When ``self._future`` is set, the job is stored in ``self._job`` and not polled.
        :raises ValueError: if the finished grid contains no model.
        """
        kwargs['training_frame'] = tframe
        if vframe is not None: kwargs["validation_frame"] = vframe
        if is_type(y, int): y = tframe.names[y]
        if y is not None: kwargs['response_column'] = y
        if not is_type(x, list, tuple): x = [x]
        if is_type(x[0], int):
            x = [tframe.names[i] for i in x]
        offset = kwargs["offset_column"]
        folds = kwargs["fold_column"]
        weights = kwargs["weights_column"]
        # list(x): a tuple of column names cannot be concatenated with a list directly.
        ignored_columns = list(set(tframe.names) - set(list(x) + [y, offset, folds, weights]))
        kwargs["ignored_columns"] = None if not ignored_columns else [quoted(col) for col in ignored_columns]
        # Drop unset parameters and send frames by their id.
        kwargs = {k: (v.frame_id if isinstance(v, H2OFrame) else v) for k, v in kwargs.items() if v is not None}
        algo = self.model._compute_algo()  # unique to grid search
        if self.grid_id is not None: kwargs["grid_id"] = self.grid_id
        rest_ver = kwargs.pop("_rest_version", None)

        grid = H2OJob(h2o.api("POST /99/Grid/%s" % algo, data=kwargs), job_type=(algo + " Grid Build"))

        if self._future:
            self._job = grid
            return

        grid.poll()

        grid_json = h2o.api("GET /99/Grids/%s" % (grid.dest_key))
        # Collected failure messages; used as the error text if no grid model is returned.
        failure_messages_stacks = ""
        failures = grid_json["failure_details"]
        if len(failures) > 0:
            print("Errors/Warnings building gridsearch model\n")
            stack_traces = grid_json["failure_stack_traces"]

            for error_index, error_message in enumerate(failures):
                failed_params = grid_json["failed_params"][error_index]
                if isinstance(failed_params, dict):
                    for h_name in grid_json['hyper_names']:
                        print("Hyper-parameter: {0}, {1}".format(h_name, failed_params[h_name]))

                # A failure is reported only when a stack trace exists at the same index.
                if len(stack_traces) > error_index:
                    print("failure_details: {0}\nfailure_stack_traces: "
                          "{1}\n".format(error_message, stack_traces[error_index]))
                    failure_messages_stacks += error_message + '\n'

        self.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]

        # get first model returned in list of models from grid search to get model class (binomial, multinomial, etc)
        # sometimes no model is returned due to bad parameter values provided by the user.
        if len(grid_json['model_ids']) > 0:
            first_model_json = h2o.api("GET /%d/Models/%s" %
                                       (rest_ver or 3, grid_json['model_ids'][0]['name']))['models'][0]
            self._resolve_grid(grid.dest_key, grid_json, first_model_json)
        else:
            if len(failure_messages_stacks) > 0:
                raise ValueError(failure_messages_stacks)
            else:
                raise ValueError("Gridsearch returns no model due to bad parameter values or other reasons....")
Пример #3
0
  def deepfeatures(self, test_data, layer):
    """
    Run a "deepfeatures" job for one hidden layer and fetch its output frame.

    :param test_data: Data to create a feature space on
    :param layer: 0 index hidden layer
    :returns: the frame fetched with the destination key of the finished job
    :raises ValueError: if ``test_data`` is None
    """
    if test_data is None:
      raise ValueError("Must specify test data")
    endpoint = "Predictions/models/" + self._id + "/frames/" + test_data.frame_id
    response = h2o.H2OConnection.post_json(endpoint, deep_features_hidden_layer=layer, _rest_version=4)
    job = H2OJob(response, "deepfeatures")
    job.poll()
    return h2o.get_frame(job.dest_key)
Пример #4
0
    def deepfeatures(self, test_data, layer):
        """
        Run a "deepfeatures" job for one hidden layer and fetch its output frame.

        :param test_data: Data to create a feature space on
        :param layer: 0 index hidden layer
        :returns: the frame fetched with the destination key of the finished job
        :raises ValueError: if ``test_data`` is None
        """
        if test_data is None:
            raise ValueError("Must specify test data")
        endpoint = "POST /4/Predictions/models/%s/frames/%s" % (self._id, test_data.frame_id)
        response = h2o.api(endpoint, data={"deep_features_hidden_layer": layer})
        job = H2OJob(response, "deepfeatures")
        job.poll()
        return h2o.get_frame(job.dest_key)
Пример #5
0
    def predict(self, test_data):
        """
        Score a dataset with this model.

        :param H2OFrame test_data: Data on which to make predictions.

        :returns: A new H2OFrame of predictions.
        :raises ValueError: if ``test_data`` is not an H2OFrame.
        """
        if not isinstance(test_data, h2o.H2OFrame):
            raise ValueError("test_data must be an instance of H2OFrame")
        endpoint = "POST /4/Predictions/models/%s/frames/%s" % (self.model_id, test_data.frame_id)
        response = h2o.api(endpoint)
        # The job label is built after the request, as in the original call order.
        job = H2OJob(response, self._model_json['algo'] + " prediction")
        job.poll()
        return h2o.get_frame(job.dest_key)
Пример #6
0
    def _model_build(self, x, y, tframe, vframe, kwargs):
        """
        Assemble the grid-search parameters, launch the grid build and resolve the resulting grid.

        :param x: predictor column(s): a single name/index, or a list/tuple of names or indices.
        :param y: response column name, or an index into ``tframe.names``; may be None.
        :param tframe: training frame; its ``names`` are used to resolve indices and ignored columns.
        :param vframe: validation frame, or None.
        :param kwargs: builder parameters. Must contain the keys "offset_column", "fold_column"
            and "weights_column" (KeyError otherwise). Entries whose value is None are not sent.
        :returns: None. When ``self._future`` is set, the job is stored in ``self._job`` and not polled.
        :raises ValueError: if the finished grid contains no model.
        """
        kwargs['training_frame'] = tframe
        if vframe is not None: kwargs["validation_frame"] = vframe
        if isinstance(y, int): y = tframe.names[y]
        if y is not None: kwargs['response_column'] = y
        if not isinstance(x, (list, tuple)): x = [x]
        if isinstance(x[0], int):
            x = [tframe.names[i] for i in x]
        offset = kwargs["offset_column"]
        folds = kwargs["fold_column"]
        weights = kwargs["weights_column"]
        # list(x): a tuple of column names cannot be concatenated with a list directly.
        ignored_columns = list(
            set(tframe.names) - set(list(x) + [y, offset, folds, weights]))
        kwargs["ignored_columns"] = None if not ignored_columns else [
            h2o.h2o._quoted(col) for col in ignored_columns
        ]
        # Drop unset parameters and send frames by their id.
        kwargs = {
            k: (v.frame_id if isinstance(v, H2OFrame) else v)
            for k, v in kwargs.items() if v is not None
        }
        algo = self.model._compute_algo()  # unique to grid search
        rest_ver = 99  # unique to grid search
        kwargs["_rest_version"] = rest_ver
        if self.grid_id is not None: kwargs["grid_id"] = self.grid_id

        grid = H2OJob(H2OConnection.post_json("Grid/" + algo, **kwargs),
                      job_type=(algo + " Grid Build"))

        if self._future:
            self._job = grid
            return

        grid.poll()
        # "_rest_version" is always set above, so the grid is always fetched with it.
        grid_json = H2OConnection.get_json("Grids/" + grid.dest_key,
                                           _rest_version=rest_ver)
        for error_message in grid_json["failure_details"]:
            print(error_message)

        self.models = [
            h2o.get_model(key['name']) for key in grid_json['model_ids']
        ]
        # Sometimes no model is returned (e.g. bad parameter values); fail with a clear error
        # instead of an IndexError on the empty model list.
        if not grid_json['model_ids']:
            raise ValueError(
                "Gridsearch returns no model due to bad parameter values or other reasons...."
            )
        # get first model returned in list of models from grid search to get model class (binomial, multinomial, etc)
        first_model_json = H2OConnection.get_json(
            "Models/" + grid_json['model_ids'][0]['name'],
            _rest_version=rest_ver)['models'][0]

        self._resolve_grid(grid.dest_key, grid_json, first_model_json)
Пример #7
0
    def _train(self, parms, verbose=False):
        """
        Submit a model-build request for ``self.algo`` and resolve the finished model.

        :param parms: builder parameters, posted as the request data.
        :param bool verbose: when True, ``self._print_model_scoring_history`` is passed to the
            job as its poll-update callback.
        :returns: None. When ``self._future`` is set, the job and REST version are stored on
            ``self`` and the job is not polled.
        """
        assert_is_type(verbose, bool)

        rest_ver = self._get_rest_version(parms)
        endpoint = "POST /%d/ModelBuilders/%s" % (rest_ver, self.algo)
        response = h2o.api(endpoint, data=parms)
        job = H2OJob(response, job_type=(self.algo + " Model Build"))

        if self._future:
            self._job = job
            self._rest_version = rest_ver
            return

        on_update = self._print_model_scoring_history if verbose else None
        job.poll(poll_updates=on_update)
        models_response = h2o.api("GET /%d/Models/%s" % (rest_ver, job.dest_key))
        self._resolve_model(job.dest_key, models_response["models"][0])
Пример #8
0
 def predict(self, test_data):
   """
   Score a dataset with this model.

   Parameters
   ----------
   test_data: H2OFrame
     Data on which to make predictions.

   Returns
   -------
     A new H2OFrame of predictions.

   Raises
   ------
     ValueError if test_data is not an H2OFrame.
   """
   if not isinstance(test_data, h2o.H2OFrame):
     raise ValueError("test_data must be an instance of H2OFrame")
   endpoint = "Predictions/models/" + self.model_id + "/frames/" + test_data.frame_id
   response = h2o.H2OConnection.post_json(endpoint, _rest_version=4)
   # The job label is built after the request, as in the original call order.
   job = H2OJob(response, self._model_json['algo'] + " prediction")
   job.poll()
   return h2o.get_frame(job.dest_key)
Пример #9
0
    def _train(self, parms, verbose=False):
        """
        Submit a model-build request for ``self.algo``, surface warnings and resolve the model.

        Messages of type "WARN" in the builder response are re-emitted as ``RuntimeWarning``.

        :param parms: builder parameters, posted as the request data.
        :param bool verbose: when True, ``self._print_model_scoring_history`` is passed to the
            job as its poll-update callback.
        :returns: None. When ``self._future`` is set, the job and REST version are stored on
            ``self`` and the job is not polled.
        """
        assert_is_type(verbose, bool)

        rest_ver = self._get_rest_version(parms)
        endpoint = "POST /%d/ModelBuilders/%s" % (rest_ver, self.algo)
        model_builder_json = h2o.api(endpoint, data=parms)
        job = H2OJob(model_builder_json, job_type=(self.algo + " Model Build"))

        builder_messages = model_builder_json["messages"]
        if builder_messages is not None:
            for entry in builder_messages:
                if entry["message_type"] != "WARN":
                    continue
                warnings.warn(entry["message"], RuntimeWarning)

        if self._future:
            self._job = job
            self._rest_version = rest_ver
            return

        on_update = self._print_model_scoring_history if verbose else None
        job.poll(poll_updates=on_update)
        models_response = h2o.api("GET /%d/Models/%s" % (rest_ver, job.dest_key))
        self._resolve_model(job.dest_key, models_response["models"][0])
Пример #10
0
    def _model_build(self, x, y, tframe, vframe, kwargs):
        """
        Assemble the builder parameters, launch the model-build job and resolve the model.

        :param x: predictor column(s): a single name/index, or a list/tuple of names or indices.
        :param y: response column name, or an index into ``tframe.names``; may be None.
        :param tframe: training frame; its ``names`` are used to resolve indices and ignored columns.
        :param vframe: validation frame, or None.
        :param kwargs: builder parameters. Must contain the keys "offset_column", "fold_column"
            and "weights_column" (KeyError otherwise). An optional "_rest_version" entry selects
            the REST API version (default 3) and is removed before the request is sent.
        :returns: None. When ``self._future`` is set, the job is stored in ``self._job`` and not polled.
        """
        kwargs["training_frame"] = tframe
        if vframe is not None: kwargs["validation_frame"] = vframe
        if is_type(y, int): y = tframe.names[y]
        if y is not None: kwargs["response_column"] = y
        if not isinstance(x, (list, tuple)): x = [x]
        if is_type(x[0], int):
            x = [tframe.names[i] for i in x]
        offset = kwargs["offset_column"]
        folds = kwargs["fold_column"]
        weights = kwargs["weights_column"]
        # list(x): a tuple of column names cannot be concatenated with a list directly.
        ignored_columns = list(
            set(tframe.names) - set(list(x) + [y, offset, folds, weights]))
        kwargs["ignored_columns"] = None if not ignored_columns else [
            quoted(col) for col in ignored_columns
        ]
        interactions = kwargs.get("interactions")
        kwargs["interactions"] = (None if interactions is None else
                                  [quoted(col) for col in interactions])
        # Every value goes through _keyify_if_h2oframe before the request is posted.
        kwargs = {
            k: H2OEstimator._keyify_if_h2oframe(v)
            for k, v in kwargs.items()
        }
        rest_ver = kwargs.pop("_rest_version", 3)

        model = H2OJob(h2o.api("POST /%d/ModelBuilders/%s" %
                               (rest_ver, self.algo),
                               data=kwargs),
                       job_type=(self.algo + " Model Build"))

        if self._future:
            self._job = model
            return

        model.poll()
        model_json = h2o.api("GET /%d/Models/%s" %
                             (rest_ver, model.dest_key))["models"][0]
        self._resolve_model(model.dest_key, model_json)
Пример #11
0
def store_h2o_frame(data, directory, filename, force=False, parts=1):
    """
    Export a given H2OFrame to a path on the machine this python session is currently connected to.

    The arguments are type-checked before the target directory is created, so an invalid call
    does not leave a stray directory behind.

    :param data: the Frame to save to disk.
    :param directory: the directory to the save point on disk; created if it does not exist.
    :param filename: the name to save the frame to.
    :param force: if True, overwrite any preexisting file with the same path
    :param parts: enables export to multiple 'part' files instead of just a single file.
        Convenient for large datasets that take too long to store in a single file.
        Use parts=-1 to instruct H2O to determine the optimal number of part files or
        specify your desired maximum number of part files. Path needs to be a directory
        when exporting to multiple files, also that directory must be empty.
        Default is ``parts = 1``, which is to export to a single file.
    :return string filepath: the path to which the file was stored.
    """
    from h2o.job import H2OJob
    from h2o.utils.typechecks import assert_is_type
    from h2o.frame import H2OFrame
    from h2o import api

    # Fail fast on bad arguments, before touching the filesystem.
    assert_is_type(data, H2OFrame)
    assert_is_type(force, bool)
    assert_is_type(parts, int)

    if not os.path.isdir(directory):
        os.makedirs(directory)

    filepath = _make_local_path(os.path.join(directory, filename))
    assert_is_type(filepath, str)

    export_job = H2OJob(
        api("POST /3/Frames/%s/export" % (data.frame_id),
            data={
                "path": filepath,
                "num_parts": parts,
                "force": force
            }), "Export File")
    export_job.poll()

    return filepath
Пример #12
0
    def _train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
              weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
              model_id=None, verbose=False, extend_parms_fn=None):
        """
        Validate the training arguments, assemble the builder parameters and run the model build.

        :param x: predictor column(s), by name or index. When a training frame is passed, ``x`` and
            ``ignored_columns`` cannot both be given.
        :param y: response column name or index. Defaults to "response" for supervised algorithms and
            is reset to None for unsupervised ones.
        :param training_frame: frame to train on. When None, ``self.training_frame`` is used if it is set.
        :param offset_column: name or index of the offset column.
        :param fold_column: name or index of the fold column.
        :param weights_column: name or index of the weights column.
        :param validation_frame: optional validation frame.
        :param max_runtime_secs: when given, overrides the "max_runtime_secs" parameter.
        :param ignored_columns: columns (names or indices) to exclude; falls back to the
            "ignored_columns" entry of the stored parameters when a training frame is passed.
        :param model_id: when given, overrides the "model_id" parameter.
        :param verbose: if True, ``self._print_model_scoring_history`` is used as the poll callback;
            only accepted for drf, gbm, deeplearning and xgboost.
        :param extend_parms_fn: optional function called with the parameter dict just before the
            frames are keyified and the request is sent.
        :returns: None. When ``self._future`` is set, the job and REST version are stored on ``self``
            and the job is not polled.
        :raises H2OValueError: on an unknown column, on ``x`` together with ``ignored_columns``, on
            ``verbose`` with an unsupported algo, or on "r2" as stopping metric.
        :raises ValueError: if ``y`` is given for an autoencoder or the response is missing.
        """
        has_default_training_frame = hasattr(self, 'training_frame') and self.training_frame is not None
        training_frame = H2OFrame._validate(training_frame, 'training_frame',
                                            required=self._requires_training_frame() and not has_default_training_frame)
        validation_frame = H2OFrame._validate(validation_frame, 'validation_frame')
        assert_is_type(y, None, int, str)
        assert_is_type(x, None, int, str, [str, int], {str, int})
        assert_is_type(ignored_columns, None, [str, int], {str, int})
        assert_is_type(offset_column, None, int, str)
        assert_is_type(fold_column, None, int, str)
        assert_is_type(weights_column, None, int, str)
        assert_is_type(max_runtime_secs, None, numeric)
        assert_is_type(model_id, None, str)
        assert_is_type(verbose, bool)
        assert_is_type(extend_parms_fn, None, FunctionType)

        # An explicitly passed frame overrides the estimator's own training_frame attribute.
        override_default_training_frame = training_frame is not None
        if not override_default_training_frame:
            self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)
            training_frame = self.training_frame if has_default_training_frame else None

        algo = self.algo
        if verbose and algo not in ["drf", "gbm", "deeplearning", "xgboost"]:
            raise H2OValueError("Verbose should only be set to True for drf, gbm, deeplearning, and xgboost models")
        # Work on a copy so the estimator's stored parameters are not rebound by this call.
        parms = self._parms.copy()
        if algo=="pca" and "k" not in parms.keys():
            parms["k"] = 1
        if "__class__" in parms:  # FIXME: hack for PY3
            del parms["__class__"]
        is_auto_encoder = bool(parms.get("autoencoder"))
        is_supervised = not(is_auto_encoder or algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec", "isolationforest", "generic"})

        # Frame metadata; empty placeholders when there is no training frame at all.
        names = training_frame.names if training_frame is not None else []
        ncols = training_frame.ncols if training_frame is not None else 0
        types = training_frame.types if training_frame is not None else {}

        if is_supervised:
            if y is None: y = "response"
            if is_type(y, int):
                # Negative indices are accepted, as with ordinary list indexing.
                if not (-ncols <= y < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % y)
                y = names[y]
            else:
                if y not in names:
                    raise H2OValueError("Column %s does not exist in the training frame" % y)
            # An "enum" response column makes this a classifier; anything else a regressor.
            self._estimator_type = "classifier" if types[y] == "enum" else "regressor"
        else:
            # If `y` is provided for an unsupervised model we'll simply ignore
            # it. This way an unsupervised model can be used as a step in
            # sklearn's pipeline.
            y = None

        if override_default_training_frame:
            assert_is_type(y, str, None)
            ignored_columns_set = set()
            if ignored_columns is None and "ignored_columns" in parms:
                ignored_columns = parms['ignored_columns']
            if ignored_columns is not None:
                if x is not None:
                    raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
                # Resolve each ignored column (index or name) to a validated column name.
                for ic in ignored_columns:
                    if is_type(ic, int):
                        if not (-ncols <= ic < ncols):
                            raise H2OValueError("Column %d does not exist in the training frame" % ic)
                        ignored_columns_set.add(names[ic])
                    else:
                        if ic not in names:
                            raise H2OValueError("Column %s not in the training frame" % ic)
                        ignored_columns_set.add(ic)
            if x is None:
                # No explicit predictors: use every column except the response and the ignored ones.
                xset = set(names) - {y} - ignored_columns_set
            else:
                xset = set()
                if is_type(x, int, str): x = [x]
                for xi in x:
                    if is_type(xi, int):
                        if not (-ncols <= xi < ncols):
                            raise H2OValueError("Column %d does not exist in the training frame" % xi)
                        xset.add(names[xi])
                    else:
                        if xi not in names:
                            raise H2OValueError("Column %s not in the training frame" % xi)
                        xset.add(xi)
            x = list(xset)
            self._check_and_save_parm(parms, "offset_column", offset_column)
            self._check_and_save_parm(parms, "weights_column", weights_column)
            self._check_and_save_parm(parms, "fold_column", fold_column)

        if max_runtime_secs is not None: parms["max_runtime_secs"] = max_runtime_secs

        # Overwrites the model_id parameter only if model_id is passed
        if model_id is not None:
            parms["model_id"] = model_id

        # Step 2
        # NOTE(review): is_auto_encoder is recomputed here; unlike the set used for is_supervised
        # above, this algo set does not contain "generic", which is special-cased below.
        is_auto_encoder = "autoencoder" in parms and parms["autoencoder"]
        is_unsupervised = is_auto_encoder or self.algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec", "isolationforest"}
        if is_auto_encoder and y is not None:
            raise ValueError("y should not be specified for autoencoder.")
        if not is_unsupervised and y is None and self.algo not in ["generic"]:
            raise ValueError("Missing response")

        # Step 3
        if override_default_training_frame:
            parms["training_frame"] = training_frame
            # offset/folds/weights are only bound on this path; they are only read again under the
            # same condition further down.
            offset = parms["offset_column"]
            folds = parms["fold_column"]
            weights = parms["weights_column"]

        if validation_frame is not None:
            parms["validation_frame"] = validation_frame

        if is_type(y, int):
            y = names[y]
        if y is not None:
            parms["response_column"] = y
        if not isinstance(x, (list, tuple)):
            x = [x]
        if is_type(x[0], int):
            x = [names[i] for i in x]
        if override_default_training_frame:
            # Everything that is not a predictor, the response or a special column gets ignored.
            ignored_columns = list(set(names) - set(x + [y, offset, folds, weights] + self._additional_used_columns(parms)))
            parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
        parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None
                                 else [quoted(col) for col in parms["interactions"]])
        parms["interaction_pairs"] = (None if "interaction_pairs" not in parms or parms["interaction_pairs"] is None
                                      else [tuple(map(quoted, ip)) for ip in parms["interaction_pairs"]])
    
        # internal hook allowing subclasses to extend train parms 
        if extend_parms_fn is not None:
            extend_parms_fn(parms)
            
        parms = {k: H2OEstimator._keyify_if_h2oframe(parms[k]) for k in parms}
        if ("stopping_metric" in parms.keys()) and ("r2" in parms["stopping_metric"]):
            raise H2OValueError("r2 cannot be used as an early stopping_metric yet.  Check this JIRA https://0xdata.atlassian.net/browse/PUBDEV-5381 for progress.")
        # "_rest_version" is a client-side switch: it is removed from the payload and selects the
        # REST API version used in the URLs (default 3).
        rest_ver = parms.pop("_rest_version") if "_rest_version" in parms else 3

        model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms)
        model = H2OJob(model_builder_json, job_type=(self.algo + " Model Build"))

        if self._future:
            # Asynchronous mode: keep the job and REST version on self and return without polling.
            self._job = model
            self._rest_version = rest_ver
            return

        model.poll(poll_updates=self._print_model_scoring_history if verbose else None)
        model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
        self._resolve_model(model.dest_key, model_json)
Пример #13
0
    def _model_build(self, x, y, tframe, vframe, kwargs):
        """
        Assemble the grid-search parameters, launch the grid build and resolve the resulting grid.

        Failures reported by the grid are printed; a missing stack trace for a failure no longer
        aborts the report.

        :param x: predictor column(s): a single name/index, or a list/tuple of names or indices.
        :param y: response column name, or an index into ``tframe.names``; may be None.
        :param tframe: training frame; its ``names`` are used to resolve indices and ignored columns.
        :param vframe: validation frame, or None.
        :param kwargs: builder parameters. Must contain the keys "offset_column", "fold_column"
            and "weights_column" (KeyError otherwise). Entries whose value is None are not sent.
        :returns: None. When ``self._future`` is set, the job is stored in ``self._job`` and not polled.
        :raises ValueError: if the finished grid contains no model.
        """
        kwargs['training_frame'] = tframe
        if vframe is not None: kwargs["validation_frame"] = vframe
        if isinstance(y, int): y = tframe.names[y]
        if y is not None: kwargs['response_column'] = y
        if not isinstance(x, (list, tuple)): x = [x]
        if isinstance(x[0], int):
            x = [tframe.names[i] for i in x]
        offset = kwargs["offset_column"]
        folds = kwargs["fold_column"]
        weights = kwargs["weights_column"]
        # list(x): a tuple of column names cannot be concatenated with a list directly.
        ignored_columns = list(
            set(tframe.names) - set(list(x) + [y, offset, folds, weights]))
        kwargs["ignored_columns"] = None if not ignored_columns else [
            h2o.h2o._quoted(col) for col in ignored_columns
        ]
        # Drop unset parameters and send frames by their id.
        kwargs = {
            k: (v.frame_id if isinstance(v, H2OFrame) else v)
            for k, v in kwargs.items() if v is not None
        }
        algo = self.model._compute_algo()  # unique to grid search
        rest_ver = 99  # unique to grid search
        kwargs["_rest_version"] = rest_ver
        if self.grid_id is not None: kwargs["grid_id"] = self.grid_id

        grid = H2OJob(H2OConnection.post_json("Grid/" + algo, **kwargs),
                      job_type=(algo + " Grid Build"))

        if self._future:
            self._job = grid
            return

        grid.poll()
        # "_rest_version" is always set above, so the grid is always fetched with it.
        grid_json = H2OConnection.get_json("Grids/" + grid.dest_key,
                                           _rest_version=rest_ver)

        failures = grid_json["failure_details"]
        if len(failures) > 0:
            print("Errors/Warnings building gridsearch model\n")
            stack_traces = grid_json["failure_stack_traces"]

            for error_index, error_message in enumerate(failures):
                failed_params = grid_json["failed_params"][error_index]
                if isinstance(failed_params, dict):
                    for h_name in grid_json['hyper_names']:
                        print("Hyper-parameter: {0}, {1}".format(
                            h_name, failed_params[h_name]))

                # There may be fewer stack traces than failure messages; do not index past the end.
                if error_index < len(stack_traces):
                    print("failure_details: {0}\nfailure_stack_traces: "
                          "{1}\n".format(error_message,
                                         stack_traces[error_index]))
                else:
                    print("failure_details: {0}\n".format(error_message))

        self.models = [
            h2o.get_model(key['name']) for key in grid_json['model_ids']
        ]

        # get first model returned in list of models from grid search to get model class (binomial, multinomial, etc)
        # sometimes no model is returned due to bad parameter values provided by the user.
        if len(grid_json['model_ids']) > 0:
            first_model_json = H2OConnection.get_json(
                "Models/" + grid_json['model_ids'][0]['name'],
                _rest_version=rest_ver)['models'][0]
            self._resolve_grid(grid.dest_key, grid_json, first_model_json)
        else:
            raise ValueError(
                "Gridsearch returns no model due to bad parameter values or other reasons...."
            )
Пример #14
0
    def train(self, x=None, y=None, training_frame=None, fold_column=None,
              weights_column=None, validation_frame=None, leaderboard_frame=None, blending_frame=None):
        """
        Begin an AutoML task, a background task that automatically builds a number of models
        with various algorithms and tracks their performance in a leaderboard. At any point
        in the process you may use H2O's performance or prediction functions on the resulting
        models.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column. Required.
        :param fold_column: The name or index of the column in training_frame that holds per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds per-row weights.
        :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold_column or weights_column). Required.
        :param validation_frame: H2OFrame with validation data. This argument is ignored unless the user sets
            nfolds = 0. If cross-validation is turned off, then a validation frame can be specified and used
            for early stopping of individual models and early stopping of the grid searches.  By default and
            when nfolds > 1, cross-validation metrics will be used for early stopping and thus validation_frame will be ignored.
        :param leaderboard_frame: H2OFrame with test data for scoring the leaderboard.  This is optional and
            if this is set to None (the default), then cross-validation metrics will be used to generate the leaderboard
            rankings instead.
        :param blending_frame: H2OFrame used to train the metalearning algorithm in Stacked Ensembles (instead of relying on cross-validated predicted values).
            This is optional, but when provided, it is also recommended to disable cross validation
            by setting `nfolds=0` and to provide a leaderboard frame for scoring purposes.

        :returns: None. The job is stored in ``self._job`` and ``self._fetch()`` is called once polling
            has finished. If the back end response contains no job, the response is printed and the
            method returns early.
        :raises H2OValueError: if ``y`` is None, or if ``y`` or a column in ``x`` is not in the training frame.

        :examples:
        >>> # Set up an H2OAutoML object
        >>> aml = H2OAutoML(max_runtime_secs=30)
        >>> # Launch an AutoML run
        >>> aml.train(y=y, training_frame=train)
        """
        training_frame = H2OFrame._validate(training_frame, 'training_frame', required=True)
        ncols = training_frame.ncols
        names = training_frame.names

        # Minimal required arguments are training_frame and y (response)
        if y is None:
            raise H2OValueError('The response column (y) is not set; please set it to the name of the column that you are trying to predict in your data.')
        else:
            assert_is_type(y,int,str)
            if is_type(y, int):
                # Negative indices are accepted, as with ordinary list indexing.
                if not (-ncols <= y < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % y)
                y = names[y]
            else:
                if y not in names:
                    raise H2OValueError("Column %s does not exist in the training frame" % y)
            input_spec = {
                'response_column': y,
            }

        # Frames are sent to the back end by their frame id.
        input_spec['training_frame'] = training_frame.frame_id

        if fold_column is not None:
            assert_is_type(fold_column,int,str)
            input_spec['fold_column'] = fold_column

        if weights_column is not None:
            assert_is_type(weights_column,int,str)
            input_spec['weights_column'] = weights_column

        if validation_frame is not None:
            validation_frame = H2OFrame._validate(validation_frame, 'validation_frame')
            input_spec['validation_frame'] = validation_frame.frame_id

        if leaderboard_frame is not None:
            leaderboard_frame = H2OFrame._validate(leaderboard_frame, 'leaderboard_frame')
            input_spec['leaderboard_frame'] = leaderboard_frame.frame_id

        if blending_frame is not None:
            blending_frame = H2OFrame._validate(blending_frame, 'blending_frame')
            input_spec['blending_frame'] = blending_frame.frame_id

        if self.sort_metric is not None:
            assert_is_type(self.sort_metric, str)
            sort_metric = self.sort_metric.lower()
            # Changed the API to use "deviance" to be consistent with stopping_metric values
            # TO DO: let's change the backend to use "deviance" since we use the term "deviance"
            # After that we can take this `if` statement out
            if sort_metric == "deviance":
                sort_metric = "mean_residual_deviance"
            input_spec['sort_metric'] = sort_metric

        if x is not None:
            assert_is_type(x,list)
            xset = set()
            # NOTE(review): the assertion above already requires a list, so this wrapping of a
            # single int/str looks unreachable - confirm the intended contract for `x`.
            if is_type(x, int, str): x = [x]
            # Resolve each predictor (index or name) to a validated column name.
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError("Column %s not in the training frame" % xi)
                    xset.add(xi)
            x = list(xset)
            # Everything that is neither the response nor a predictor is ignored, except the
            # fold and weights columns, which are removed from the ignored set below.
            ignored_columns = set(names) - {y} - set(x)
            # NOTE(review): ignored_columns holds column names, so a fold/weights column given as
            # an integer index would not match these membership tests - confirm against callers.
            if fold_column is not None and fold_column in ignored_columns:
                ignored_columns.remove(fold_column)
            if weights_column is not None and weights_column in ignored_columns:
                ignored_columns.remove(weights_column)
            # ignored_columns is always a set at this point, so this check always passes.
            if ignored_columns is not None:
                input_spec['ignored_columns'] = list(ignored_columns)

        automl_build_params = dict(input_spec=input_spec)

        # NOTE: if the user hasn't specified some block of parameters don't send them!
        # This lets the back end use the defaults.
        automl_build_params['build_control'] = self.build_control
        automl_build_params['build_models'] = self.build_models

        resp = h2o.api('POST /99/AutoMLBuilder', json=automl_build_params)
        if 'job' not in resp:
            print("Exception from the back end: ")
            print(resp)
            return

        # Adopt the project name returned by the back end when none was set on this object.
        if not self.project_name:
            self.build_control['project_name'] = self.project_name = resp['build_control']['project_name']

        self._job = H2OJob(resp['job'], "AutoML")
        poll_updates = ft.partial(self._poll_training_updates, verbosity=self._verbosity, state={})
        try:
            self._job.poll(poll_updates=poll_updates)
        finally:
            # Invoke the update callback one last time, even if polling raised.
            poll_updates(self._job, 1)

        self._fetch()
Пример #15
0
    def partial_plot(self, data, cols, destination_key=None, nbins=20, plot=True, figsize=(7,10), server=False):
        """
        Create partial dependence plot which gives a graphical depiction of the marginal effect of a variable on the
        response. The effect of a variable is measured in change in the mean response.

        :param H2OFrame data: An H2OFrame object used for scoring and constructing the plot.
        :param cols: Feature(s) for which partial dependence will be calculated (list of column names).
        :param destination_key: A key reference to the created partial dependence tables in H2O.
        :param nbins: Number of bins used.
        :param plot: A boolean specifying whether to plot partial dependence table.
        :param figsize: Dimension/size of the returning plots, adjust to fit your output cells.
        :param server: Passed through to ``_get_matplotlib_pyplot`` when obtaining pyplot.
        :return: List of calculated mean response tables for each feature requested. Returns None when plotting
            was requested but pyplot could not be obtained.
        :raises ValueError: if ``data`` is not an H2OFrame.
        :raises H2OValueError: if one of ``cols`` is not a column of ``data``.
        """

        if not isinstance(data, h2o.H2OFrame): raise ValueError("data must be an instance of H2OFrame")
        assert_is_type(cols, [str])
        assert_is_type(destination_key, None, str)
        assert_is_type(nbins, int)
        assert_is_type(plot, bool)
        assert_is_type(figsize, (int,int))

        # Check that every requested column exists in the frame
        frame_names = data.names
        for xi in cols:
            if xi not in frame_names:
                raise H2OValueError("Column %s does not exist in the training frame" % xi)

        kwargs = {
            'cols': cols,
            'model_id': self.model_id,
            'frame_id': data.frame_id,
            'nbins': nbins,
            'destination_key': destination_key,
        }

        # Launch the computation, wait for it, then fetch the resulting tables
        job = H2OJob(h2o.api("POST /3/PartialDependence/", data=kwargs), job_type="PartialDependencePlot").poll()
        response = h2o.api("GET /3/PartialDependence/%s" % job.dest_key)

        # Extract partial dependence data from the response
        pps = response['partial_dependence_data']

        # Plot partial dependence plots using matplotlib
        if plot:
            plt = _get_matplotlib_pyplot(server)
            if not plt: return

            fig, axs = plt.subplots(len(cols), squeeze=False, figsize=figsize)
            for i, pp in enumerate(pps):
                col = cols[i]
                ax = axs[i, 0]
                # Categorical columns are drawn as points at integer positions labelled with the levels;
                # numeric columns are drawn as a line.
                if data[col].isfactor()[0]:
                    labels = pp[0]
                    positions = range(len(labels))
                    ax.plot(positions, pp[1], 'o')
                    ax.set_xticks(positions)
                    ax.set_xticklabels(labels)
                    ax.margins(0.2)
                else:
                    ax.plot(pp[0], pp[1])
                    ax.set_xlim(min(pp[0]), max(pp[0]))

                ax.set_title('Partial Dependence Plot For {}'.format(col))
                ax.set_xlabel(pp.col_header[0])
                ax.set_ylabel(pp.col_header[1])
                ax.xaxis.grid()
                ax.yaxis.grid()
            # Spacing only matters when several subplots are stacked. The count of requested
            # columns decides this, not the length of the last column's name.
            if len(cols) > 1:
                fig.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)

        return pps
Пример #16
0
    def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
              weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
              model_id=None):
        """
        Train the H2O model.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param H2OFrame training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).
        :param offset_column: The name or index of the column in training_frame that holds the offsets.
        :param fold_column: The name or index of the column in training_frame that holds the per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
        :param validation_frame: H2OFrame with validation data to be scored on while training.
        :param float max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
        :param ignored_columns: Column names or indices to exclude from the predictors. Cannot be given
            together with ``x``.
        :param model_id: When not None, overrides the ``model_id`` parameter sent with the build request.

        :raises H2OValueError: if a referenced column does not exist in the training frame, if ``y`` is given
            for an unsupervised model, or if both ``x`` and ``ignored_columns`` are given.
        """
        assert_is_type(training_frame, H2OFrame)
        assert_is_type(validation_frame, None, H2OFrame)
        assert_is_type(y, None, int, str)
        assert_is_type(x, None, int, str, [str, int], {str, int})
        assert_is_type(ignored_columns, None, [str, int], {str, int})
        assert_is_type(offset_column, None, int, str)
        assert_is_type(fold_column, None, int, str)
        assert_is_type(weights_column, None, int, str)
        assert_is_type(max_runtime_secs, None, numeric)
        assert_is_type(model_id, None, str)
        algo = self.algo
        parms = self._parms.copy()
        if "__class__" in parms:  # FIXME: hackt for PY3
            del parms["__class__"]
        is_auto_encoder = bool(parms.get("autoencoder"))
        is_supervised = not(is_auto_encoder or algo in {"pca", "svd", "kmeans", "glrm", "word2vec"})
        ncols = training_frame.ncols
        names = training_frame.names

        # Resolve the response column to a name and derive the estimator type from its column type
        if is_supervised:
            if y is None: y = "response"
            if is_type(y, int):
                if not (-ncols <= y < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % y)
                y = names[y]
            else:
                if y not in names:
                    raise H2OValueError("Column %s does not exist in the training frame" % y)
            self._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor"
        elif y is not None:
            raise H2OValueError("y should not be provided for an unsupervised model")
        assert_is_type(y, str, None)

        # Resolve explicitly ignored columns to names
        ignored_columns_set = set()
        if ignored_columns is not None:
            if x is not None:
                raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
            for ic in ignored_columns:
                if is_type(ic, int):
                    if not (-ncols <= ic < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % ic)
                    ignored_columns_set.add(names[ic])
                else:
                    if ic not in names:
                        raise H2OValueError("Column %s not in the training frame" % ic)
                    ignored_columns_set.add(ic)

        # Resolve the predictors to a list of column names
        if x is None:
            xset = set(names) - {y} - ignored_columns_set
        else:
            xset = set()
            if is_type(x, int, str): x = [x]
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError("Column %s not in the training frame" % xi)
                    xset.add(xi)
        x = list(xset)

        parms["offset_column"] = offset_column
        parms["fold_column"] = fold_column
        parms["weights_column"] = weights_column
        parms["max_runtime_secs"] = max_runtime_secs
        # Overwrites the model_id parameter only if model_id is passed
        if model_id is not None:
            parms["model_id"] = model_id

        # Step 2: sanity checks on the response
        if is_auto_encoder and y is not None: raise ValueError("y should not be specified for autoencoder.")
        if is_supervised and y is None: raise ValueError("Missing response")

        # Step 3: assemble the request. At this point `x` is a list of column names and `y` is a
        # name or None, so neither is re-normalised here. The previous `x[0]` probe raised
        # IndexError whenever no predictor columns remained.
        parms["training_frame"] = training_frame
        if validation_frame is not None: parms["validation_frame"] = validation_frame
        if y is not None: parms["response_column"] = y
        ignored_columns = list(set(names) - set(x + [y, offset_column, fold_column, weights_column]))
        parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
        parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None else
                                 [quoted(col) for col in parms["interactions"]])
        parms = {k: H2OEstimator._keyify_if_h2oframe(parms[k]) for k in parms}
        rest_ver = parms.pop("_rest_version") if "_rest_version" in parms else 3

        model = H2OJob(h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms),
                       job_type=(self.algo + " Model Build"))

        # In "future" mode only remember the job; the model is not polled or resolved here
        if self._future:
            self._job = model
            self._rest_version = rest_ver
            return

        model.poll()
        model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
        self._resolve_model(model.dest_key, model_json)
Пример #17
0
    def train(self,
              x=None,
              y=None,
              training_frame=None,
              fold_column=None,
              weights_column=None,
              validation_frame=None,
              leaderboard_frame=None,
              blending_frame=None):
        """
        Begins an AutoML task, a background task that automatically builds a number of models
        with various algorithms and tracks their performance in a leaderboard. At any point
        in the process you may use H2O's performance or prediction functions on the resulting
        models.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param fold_column: The name or index of the column in training_frame that holds per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds per-row weights.
        :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold_column or weights_column).
        :param validation_frame: H2OFrame with validation data. This argument is ignored unless the user sets
            nfolds = 0. If cross-validation is turned off, then a validation frame can be specified and used
            for early stopping of individual models and early stopping of the grid searches.  By default and
            when nfolds > 1, cross-validation metrics will be used for early stopping and thus validation_frame will be ignored.
        :param leaderboard_frame: H2OFrame with test data for scoring the leaderboard.  This is optional and
            if this is set to None (the default), then cross-validation metrics will be used to generate the leaderboard
            rankings instead.
        :param blending_frame: H2OFrame used to train the metalearning algorithm in Stacked Ensembles (instead of relying on cross-validated predicted values).
            This is optional, but when provided, it is also recommended to disable cross validation
            by setting `nfolds=0` and to provide a leaderboard frame for scoring purposes.

        :returns: The leader model (``self.leader``) once the AutoML job has finished.

        :raises H2OValueError: if no response column is available or a referenced column does not exist
            in the training frame.
        :raises H2OResponseError: if the backend response does not contain a job.

        :examples:

        >>> # Set up an H2OAutoML object
        >>> aml = H2OAutoML(max_runtime_secs=30)
        >>> # Launch an AutoML run
        >>> aml.train(y=y, training_frame=train)
        """
        # Minimal required arguments are training_frame and y (response)
        self.training_frame = training_frame

        ncols = self.training_frame.ncols
        names = self.training_frame.names

        if y is None and self.response_column is None:
            raise H2OValueError(
                'The response column (y) is not set; please set it to the name of the column that you are trying to predict in your data.'
            )
        elif y is not None:
            assert_is_type(y, int, str)
            if is_type(y, int):
                if not (-ncols <= y < ncols):
                    raise H2OValueError(
                        "Column %d does not exist in the training frame" % y)
                y = names[y]
            else:
                if y not in names:
                    raise H2OValueError(
                        "Column %s does not exist in the training frame" % y)
            self.response_column = y

        # `y` may legitimately be None here when the response column was configured earlier;
        # use the effective response so it is never sent as an ignored column.
        response = y if y is not None else self.response_column

        self.fold_column = fold_column
        self.weights_column = weights_column

        self.validation_frame = validation_frame
        self.leaderboard_frame = leaderboard_frame
        self.blending_frame = blending_frame

        if x is not None:
            assert_is_type(x, list)
            xset = set()
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError(
                            "Column %d does not exist in the training frame" %
                            xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError(
                            "Column %s not in the training frame" % xi)
                    xset.add(xi)
            # Everything that is neither a predictor nor a special column is ignored
            ignored_columns = set(names) - xset
            for col in (response, fold_column, weights_column):
                if col is not None:
                    ignored_columns.discard(col)
            self.input_spec['ignored_columns'] = list(ignored_columns)

        def clean_params(params):
            # Recursively drop None-valued entries from dicts; leaves go through H2OEstimator._keyify
            return ({
                k: clean_params(v)
                for k, v in params.items() if v is not None
            } if isinstance(params, dict) else H2OEstimator._keyify(params))

        automl_build_params = clean_params(
            dict(
                build_control=self.build_control,
                build_models=self.build_models,
                input_spec=self.input_spec,
            ))

        resp = self._build_resp = h2o.api('POST /99/AutoMLBuilder',
                                          json=automl_build_params)
        if 'job' not in resp:
            raise H2OResponseError(
                "Backend failed to build the AutoML job: {}".format(resp))

        # Adopt the project name reported by the backend when none was set locally
        if not self.project_name:
            self.project_name = resp['build_control']['project_name']
        self.__frozen = True

        self._job = H2OJob(resp['job'], "AutoML")
        poll_updates = ft.partial(self._poll_training_updates,
                                  verbosity=self._verbosity,
                                  state={})
        try:
            self._job.poll(poll_updates=poll_updates)
        finally:
            # Always invoke the update callback one last time, even if polling raised
            poll_updates(self._job, 1)

        self._fetch()
        return self.leader
Пример #18
0
    def train(self, x = None, y = None, training_frame = None, fold_column = None, 
              weights_column = None, validation_frame = None, leaderboard_frame=None):
        """
        Begins an AutoML task, a background task that automatically builds a number of models
        with various algorithms and tracks their performance in a leaderboard. At any point
        in the process you may use H2O's performance or prediction functions on the resulting
        models.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param fold_column: The name or index of the column in training_frame that holds per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds per-row weights.
        :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold_column or weights_column).
        :param validation_frame: H2OFrame with validation data to be scored on while training.
        :param leaderboard_frame: H2OFrame with test data to be scored on in the leaderboard.

        :returns: None. On success the job is polled to completion and ``self._fetch()`` is called; if the
            backend response contains no job, the response is printed instead.

        :raises ValueError: if ``training_frame`` or ``y`` is None.
        :raises H2OValueError: if a referenced column does not exist in the training frame.

        :examples:
        >>> # Set up an H2OAutoML object
        >>> aml = H2OAutoML(max_runtime_secs=30)
        >>> # Launch H2OAutoML
        >>> aml.train(y=y, training_frame=training_frame)
        """
        # Minimal required arguments are training_frame and y (response). Check them before
        # touching the frame so a missing frame yields the intended error message.
        if training_frame is None:
            raise ValueError('The training frame is not set!')
        if y is None:
            raise ValueError('The response column (y) is not set; please set it to the name of the column that you are trying to predict in your data.')

        assert_is_type(training_frame, H2OFrame)
        ncols = training_frame.ncols
        names = training_frame.names

        assert_is_type(y,int,str)
        if is_type(y, int):
            if not (-ncols <= y < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % y)
            y = names[y]
        else:
            if y not in names:
                raise H2OValueError("Column %s does not exist in the training frame" % y)

        input_spec = {
            'response_column': y,
            'training_frame': training_frame.frame_id,
        }

        if fold_column is not None:
            assert_is_type(fold_column,int,str)
            input_spec['fold_column'] = fold_column

        if weights_column is not None:
            assert_is_type(weights_column,int,str)
            input_spec['weights_column'] = weights_column

        if validation_frame is not None:
            assert_is_type(validation_frame, H2OFrame)
            input_spec['validation_frame'] = validation_frame.frame_id

        if leaderboard_frame is not None:
            assert_is_type(leaderboard_frame, H2OFrame)
            input_spec['leaderboard_frame'] = leaderboard_frame.frame_id

        if x is not None:
            assert_is_type(x,list)
            xset = set()
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError("Column %s not in the training frame" % xi)
                    xset.add(xi)
            ignored_columns = set(names) - {y} - xset
            # set.discard mutates in place and tolerates absent values (including None).
            # set.remove returns None and raises KeyError when the element is missing.
            ignored_columns.discard(fold_column)
            ignored_columns.discard(weights_column)
            input_spec['ignored_columns'] = list(ignored_columns)

        automl_build_params = dict(input_spec = input_spec)

        # NOTE: if the user hasn't specified some block of parameters don't send them!
        # This lets the back end use the defaults.
        automl_build_params['build_control'] = self.build_control

        resp = h2o.api('POST /99/AutoMLBuilder', json=automl_build_params)
        if 'job' not in resp:
            print("Exception from the back end: ")
            print(resp)
            return

        self._job = H2OJob(resp['job'], "AutoML")
        self._automl_key = self._job.dest_key
        self._job.poll()
        self._fetch()
Пример #19
0
    def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
              weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
              model_id=None, verbose=False):
        """
        Train the H2O model.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param H2OFrame training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).
        :param offset_column: The name or index of the column in training_frame that holds the offsets.
        :param fold_column: The name or index of the column in training_frame that holds the per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
        :param validation_frame: H2OFrame with validation data to be scored on while training.
        :param float max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
        :param ignored_columns: Column names or indices to exclude from the predictors. Cannot be given
            together with ``x``.
        :param model_id: When not None, overrides the ``model_id`` parameter sent with the build request.
        :param bool verbose: Print scoring history to stdout. Defaults to False.

        :raises H2OValueError: if a required training frame is missing, ``verbose`` is set for an unsupported
            algorithm, a referenced column does not exist, both ``x`` and ``ignored_columns`` are given, or
            ``r2`` is used as the stopping metric.
        """

        assert_is_type(training_frame, None, H2OFrame)
        assert_is_type(validation_frame, None, H2OFrame)
        assert_is_type(y, None, int, str)
        assert_is_type(x, None, int, str, [str, int], {str, int})
        assert_is_type(ignored_columns, None, [str, int], {str, int})
        assert_is_type(offset_column, None, int, str)
        assert_is_type(fold_column, None, int, str)
        assert_is_type(weights_column, None, int, str)
        assert_is_type(max_runtime_secs, None, numeric)
        assert_is_type(model_id, None, str)
        assert_is_type(verbose, bool)

        if self._requires_training_frame() and training_frame is None:
            raise H2OValueError("Training frame required for %s algorithm, but none was given." % self.algo)

        has_training_frame = training_frame is not None
        if not has_training_frame:
            # Without a frame, the frame-related arguments are handed to the verifier
            self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)

        algo = self.algo
        if verbose and algo not in ["drf", "gbm", "deeplearning", "xgboost"]:
            raise H2OValueError("Verbose should only be set to True for drf, gbm, deeplearning, and xgboost models")
        parms = self._parms.copy()
        if "__class__" in parms:  # FIXME: hackt for PY3
            del parms["__class__"]
        is_auto_encoder = bool(parms.get("autoencoder"))
        is_supervised = not(is_auto_encoder or algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec"})
        if has_training_frame:
            names = training_frame.names
            ncols = training_frame.ncols

        if is_supervised:
            # NOTE(review): this branch reads `names`/`ncols`/`training_frame`, which are only
            # available when a training frame was given -- confirm that supervised algorithms
            # always require one.
            if y is None: y = "response"
            if is_type(y, int):
                if not (-ncols <= y < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % y)
                y = names[y]
            else:
                if y not in names:
                    raise H2OValueError("Column %s does not exist in the training frame" % y)
            self._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor"
        else:
            # If `y` is provided for an unsupervised model we'll simply ignore
            # it. This way an unsupervised model can be used as a step in
            # sklearn's pipeline.
            y = None

        if has_training_frame:
            assert_is_type(y, str, None)
            # Resolve explicitly ignored columns to names
            ignored_columns_set = set()
            if ignored_columns is not None:
                if x is not None:
                    raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
                for ic in ignored_columns:
                    if is_type(ic, int):
                        if not (-ncols <= ic < ncols):
                            raise H2OValueError("Column %d does not exist in the training frame" % ic)
                        ignored_columns_set.add(names[ic])
                    else:
                        if ic not in names:
                            raise H2OValueError("Column %s not in the training frame" % ic)
                        ignored_columns_set.add(ic)
            # Resolve the predictors to a list of column names
            if x is None:
                xset = set(names) - {y} - ignored_columns_set
            else:
                xset = set()
                if is_type(x, int, str): x = [x]
                for xi in x:
                    if is_type(xi, int):
                        if not (-ncols <= xi < ncols):
                            raise H2OValueError("Column %d does not exist in the training frame" % xi)
                        xset.add(names[xi])
                    else:
                        if xi not in names:
                            raise H2OValueError("Column %s not in the training frame" % xi)
                        xset.add(xi)
            x = list(xset)

            parms["offset_column"] = offset_column
            parms["fold_column"] = fold_column
            parms["weights_column"] = weights_column

        if max_runtime_secs is not None: parms["max_runtime_secs"] = max_runtime_secs

        # Overwrites the model_id parameter only if model_id is passed
        if model_id is not None:
            parms["model_id"] = model_id

        # Step 2: sanity checks on the response
        if is_auto_encoder and y is not None: raise ValueError("y should not be specified for autoencoder.")
        if is_supervised and y is None: raise ValueError("Missing response")

        # Step 3: assemble the request. `x` is only consumed when a training frame is present, and
        # there it is already a list of column names, so it is not re-normalised. The previous
        # `x[0]` probe raised IndexError on an empty predictor list and dereferenced a missing frame.
        if has_training_frame:
            parms["training_frame"] = training_frame
        if validation_frame is not None: parms["validation_frame"] = validation_frame
        if y is not None: parms["response_column"] = y
        if has_training_frame:
            ignored_columns = list(set(names) - set(x + [y, offset_column, fold_column, weights_column]))
            parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
        parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None else
                                 [quoted(col) for col in parms["interactions"]])
        parms["interaction_pairs"] = (None if "interaction_pairs" not in parms or parms["interaction_pairs"] is None else
                                 [tuple(map(quoted, ip)) for ip in parms["interaction_pairs"]])

        parms = {k: H2OEstimator._keyify_if_h2oframe(parms[k]) for k in parms}
        # A stopping_metric of None must not trip the membership test below
        stopping_metric = parms.get("stopping_metric")
        if stopping_metric is not None and "r2" in stopping_metric:
            raise H2OValueError("r2 cannot be used as an early stopping_metric yet.  Check this JIRA https://0xdata.atlassian.net/browse/PUBDEV-5381 for progress.")
        rest_ver = parms.pop("_rest_version") if "_rest_version" in parms else 3

        model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms)
        model = H2OJob(model_builder_json, job_type=(self.algo + " Model Build"))

        # In "future" mode only remember the job; the model is not polled or resolved here
        if self._future:
            self._job = model
            self._rest_version = rest_ver
            return

        model.poll(verbose_model_scoring_history=verbose)
        model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
        self._resolve_model(model.dest_key, model_json)
Пример #20
0
    def train(self,
              x=None,
              y=None,
              training_frame=None,
              validation_frame=None,
              test_frame=None):
        """
        Begins the automl task, which is a background task that incrementally improves
        over time. At any point, the user may use the "predict"/"performance"
        to inspect the incremental progress.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param training_frame: The H2OFrame having the columns indicated by x and y.
        :param validation_frame: H2OFrame with validation data to be scored on while training.
        :param test_frame: H2OFrame with test data to be scored on in the leaderboard.

        :returns: None. On success the job is polled to completion and ``self._fetch()`` is called; if the
            backend response contains no job, the response is printed instead.

        :raises ValueError: if ``training_frame`` or ``y`` is None.
        :raises H2OValueError: if a referenced column does not exist in the training frame.

        :examples:
        >>> # Setting up an H2OAutoML object
        >>> build_control = {
        >>>              'stopping_criteria': {
        >>>              'stopping_rounds': 3,
        >>>              'stopping_tolerance': 0.001
        >>>            }
        >>>        }
        >>> aml = H2OAutoML(max_runtime_secs=30, build_control=build_control)
        >>> # Launch H2OAutoML
        >>> aml.train(y=y, training_frame=training_frame)
        """
        # Minimal required arguments are training_frame and y (response). Check them before
        # touching the frame so a missing frame yields the intended error message.
        if training_frame is None:
            raise ValueError('The training frame is not set!')
        if y is None:
            raise ValueError(
                'The response column (y) is not set; please set it to the name of the column that you are trying to predict in your data.'
            )

        assert_is_type(training_frame, H2OFrame)
        ncols = training_frame.ncols
        names = training_frame.names

        assert_is_type(y, int, str)
        if is_type(y, int):
            if not (-ncols <= y < ncols):
                raise H2OValueError(
                    "Column %d does not exist in the training frame" % y)
            y = names[y]
        else:
            if y not in names:
                raise H2OValueError(
                    "Column %s does not exist in the training frame" % y)

        input_spec = {
            'response_column': y,
            'training_frame': training_frame.frame_id,
        }

        if validation_frame is not None:
            assert_is_type(validation_frame, H2OFrame)
            input_spec['validation_frame'] = validation_frame.frame_id

        if test_frame is not None:
            assert_is_type(test_frame, H2OFrame)
            input_spec['test_frame'] = test_frame.frame_id

        if x is not None:
            assert_is_type(x, list)
            xset = set()
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError(
                            "Column %d does not exist in the training frame" %
                            xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError(
                            "Column %s not in the training frame" % xi)
                    xset.add(xi)
            # Everything that is neither the response nor a predictor is ignored
            input_spec['ignored_columns'] = list(set(names) - {y} - xset)

        automl_build_params = dict(input_spec=input_spec)

        # NOTE: if the user hasn't specified some block of parameters don't send them!
        # This lets the back end use the defaults.
        automl_build_params['build_control'] = self.build_control

        resp = h2o.api('POST /99/AutoMLBuilder', json=automl_build_params)
        if 'job' not in resp:
            print("Exception from the back end: ")
            print(resp)
            return

        self._job = H2OJob(resp['job'], "AutoML")
        self._automl_key = self._job.dest_key
        self._job.poll()
        self._fetch()
        # Fall back to a name derived from the training frame id when no project name is set
        if self.project_name is None:
            self.project_name = "automl_" + training_frame.frame_id
Пример #21
0
    def train_segments(self, x=None, y=None, training_frame=None, offset_column=None,
                       fold_column=None, weights_column=None, validation_frame=None,
                       max_runtime_secs=None, ignored_columns=None, segments=None,
                       segment_models_id=None, parallelism=1, verbose=False):
        """
        Build a separate H2O model for every segment (subpopulation) of the training dataset.

        The call validates its arguments, assembles the model parameters, submits a bulk
        segment-models build to the backend, waits for that job to finish and wraps the
        resulting key in an ``H2OSegmentModels`` object.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param H2OFrame training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).
        :param offset_column: The name or index of the column in training_frame that holds the offsets.
        :param fold_column: The name or index of the column in training_frame that holds the per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
        :param validation_frame: H2OFrame with validation data to be scored on while training.
        :param float max_runtime_secs: Maximum allowed runtime in seconds for each model training. Use 0 to disable.
            Regardless of how this parameter is set, a model will be built for each input segment; the limit
            only affects the training of each individual model.
        :param ignored_columns: Forwarded unchanged to the parameter builder together with the other training
            arguments (presumably the columns to leave out of training -- confirm against ``_make_parms``).
        :param segments: Either a list of column names to segment by, or an H2OFrame holding an explicit
            enumeration of the segments to build models for. With a list of columns, H2O groups the training
            (and validation) dataset by those columns and trains one model per group of rows. Required.
        :param segment_models_id: Identifier for the returned collection of Segment Models. If not specified
            (or empty) it is not sent and will be generated automatically.
        :param parallelism: Level of parallelism of the bulk segment models building, i.e. the maximum number
            of models each H2O node will be building in parallel.
        :param bool verbose: Enable to print additional information during model building. Defaults to False.

        :returns: an ``H2OSegmentModels`` instance created from the destination key of the finished build job.
        :raises H2OValueError: if ``segments`` is None.

        :examples:

        >>> response = "survived"
        >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
        >>> titanic[response] = titanic[response].asfactor()
        >>> predictors = ["survived","name","sex","age","sibsp","parch","ticket","fare","cabin"]
        >>> train, valid = titanic.split_frame(ratios=[.8], seed=1234)
        >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
        >>> titanic_gbm = H2OGradientBoostingEstimator(seed=1234)
        >>> titanic_models = titanic_gbm.train_segments(segments=["pclass"],
        ...                                             x=predictors,
        ...                                             y=response,
        ...                                             training_frame=train,
        ...                                             validation_frame=valid)
        >>> titanic_models.as_frame()
        """
        # Type checks come first (in this order), so a badly typed argument is reported
        # before the "segments missing" error below.
        assert_is_type(segments, None, H2OFrame, [str])
        assert_is_type(verbose, bool)
        assert_is_type(segment_models_id, None, str)
        assert_is_type(parallelism, int)

        if segments is None:
            raise H2OValueError(
                "Parameter segments was not specified. "
                "Please provide either a list of columns to segment-by or an explicit list of segments to "
                "build models for."
            )

        # Regular training arguments; model_id is always None here because the
        # collection is identified by segment_models_id instead.
        training_args = dict(
            x=x,
            y=y,
            training_frame=training_frame,
            offset_column=offset_column,
            fold_column=fold_column,
            weights_column=weights_column,
            validation_frame=validation_frame,
            max_runtime_secs=max_runtime_secs,
            ignored_columns=ignored_columns,
            model_id=None,
            verbose=verbose,
        )
        params = self._make_parms(**training_args)

        # After the type check above, anything that is not an H2OFrame is a list of
        # column names; a frame is an explicit enumeration of segments.
        if not isinstance(segments, H2OFrame):
            params["segment_columns"] = segments
        else:
            params["segments"] = H2OEstimator._keyify(segments)

        # Only send an id when the caller actually supplied a non-empty one.
        if segment_models_id:
            params["segment_models_id"] = segment_models_id

        params["parallelism"] = parallelism

        api_version = self._get_rest_version(params)
        endpoint = "POST /%d/SegmentModelsBuilders/%s" % (api_version, self.algo)
        response = h2o.api(endpoint, data=params)

        job_label = self.algo + " Segment Models Build"
        build_job = H2OJob(response, job_type=job_label)
        build_job.poll()  # wait for the build job before handing back the result key

        return H2OSegmentModels(build_job.dest_key)