示例#1
0
    def _process_response(response, save_to):
        """
        Given a response object, prepare it to be handed over to the external caller.

        Preparation steps include:
           * detect if the response has error status, and convert it to an appropriate exception;
           * detect Content-Type, and based on that either parse the response as JSON or return as plain text.
        """
        status_code = response.status_code
        if status_code == 200 and save_to:
            if save_to.startswith("~"): save_to = os.path.expanduser(save_to)
            if os.path.isdir(save_to) or save_to.endswith(os.path.sep):
                dirname = os.path.abspath(save_to)
                filename = H2OConnection._find_file_name(response)
            else:
                dirname, filename = os.path.split(os.path.abspath(save_to))
            fullname = os.path.join(dirname, filename)
            try:
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                with open(fullname, "wb") as f:
                    for chunk in response.iter_content(chunk_size=65536):
                        if chunk:  # Empty chunks may occasionally happen
                            f.write(chunk)
            except OSError as e:
                raise H2OValueError("Cannot write to file %s: %s" %
                                    (fullname, e))
            return fullname

        content_type = response.headers.get("Content-Type", "")
        if ";" in content_type:  # Remove a ";charset=..." part
            content_type = content_type[:content_type.index(";")]

        # Auto-detect response type by its content-type. Decode JSON, all other responses pass as-is.
        if content_type == "application/json":
            try:
                data = response.json(object_pairs_hook=H2OResponse)
            except (JSONDecodeError,
                    requests.exceptions.ContentDecodingError) as e:
                raise H2OServerError("Malformed JSON from server (%s):\n%s" %
                                     (str(e), response.text))
        else:
            data = response.text

        # Success (200 = "Ok", 201 = "Created", 202 = "Accepted", 204 = "No Content")
        if status_code in {200, 201, 202, 204}:
            return data

        # Client errors (400 = "Bad Request", 404 = "Not Found", 412 = "Precondition Failed")
        if status_code in {400, 404, 412} and isinstance(
                data, (H2OErrorV3, H2OModelBuilderErrorV3)):
            raise H2OResponseError(data)

        # Server errors (notably 500 = "Server Error")
        # Note that it is possible to receive valid H2OErrorV3 object in this case, however it merely means the server
        # did not provide the correct status code.
        raise H2OServerError("HTTP %d %s:\n%r" %
                             (status_code, response.reason, data))
示例#2
0
    def _process_response(response):
        """
        Given a response object, prepare it to be handed over to the external caller.

        Preparation steps include:
           * detect if the response has error status, and convert it to an appropriate exception;
           * detect Content-Type, and based on that either parse the response as JSON or return as plain text.
        """
        content_type = response.headers[
            "Content-Type"] if "Content-Type" in response.headers else ""
        if ";" in content_type:  # Remove a ";charset=..." part
            content_type = content_type[:content_type.index(";")]
        status_code = response.status_code

        # Auto-detect response type by its content-type. Decode JSON, all other responses pass as-is.
        if content_type == "application/json":
            try:
                data = response.json(object_pairs_hook=H2OResponse)
            except (JSONDecodeError,
                    requests.exceptions.ContentDecodingError) as e:
                raise H2OServerError("Malformed JSON from server (%s):\n%s" %
                                     (str(e), response.text))
        else:
            data = response.text

        # Success (200 = "Ok", 201 = "Created", 202 = "Accepted", 204 = "No Content")
        if status_code in {200, 201, 202, 204}:
            return data

        # Client errors (400 = "Bad Request", 404 = "Not Found", 412 = "Precondition Failed")
        if status_code in {400, 404, 412} and isinstance(
                data, (H2OErrorV3, H2OModelBuilderErrorV3)):
            raise H2OResponseError(data)

        # Server errors (notably 500 = "Server Error")
        # Note that it is possible to receive valid H2OErrorV3 object in this case, however it merely means the server
        # did not provide the correct status code.
        raise H2OServerError("HTTP %d %s:\n%r" %
                             (status_code, response.reason, data))
示例#3
0
    def train(self,
              x=None,
              y=None,
              training_frame=None,
              blending_frame=None,
              verbose=False,
              **kwargs):
        has_training_frame = training_frame is not None or self.training_frame is not None
        blending_frame = H2OFrame._validate(blending_frame,
                                            'blending_frame',
                                            required=not has_training_frame)

        if not has_training_frame:
            training_frame = blending_frame  # used to bypass default checks in super class and backend and to guarantee default metrics

        sup = super(self.__class__, self)

        def extend_parms(parms):
            if blending_frame is not None:
                parms['blending_frame'] = blending_frame
            if self.metalearner_fold_column is not None:
                parms['ignored_columns'].remove(
                    quoted(self.metalearner_fold_column))

        parms = sup._make_parms(x,
                                y,
                                training_frame,
                                extend_parms_fn=extend_parms,
                                **kwargs)

        sup._train(parms, verbose=verbose)
        if self.metalearner() is None:
            raise H2OResponseError(
                "Meta learner didn't get to be trained in time. "
                "Try increasing max_runtime_secs or setting it to 0 (unlimited)."
            )
示例#4
0
    def train(self,
              x=None,
              y=None,
              training_frame=None,
              fold_column=None,
              weights_column=None,
              validation_frame=None,
              leaderboard_frame=None,
              blending_frame=None):
        """
        Begins an AutoML task, a background task that automatically builds a number of models
        with various algorithms and tracks their performance in a leaderboard. At any point 
        in the process you may use H2O's performance or prediction functions on the resulting 
        models.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param fold_column: The name or index of the column in training_frame that holds per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds per-row weights.
        :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold_column or weights_column).
        :param validation_frame: H2OFrame with validation data. This argument is ignored unless the user sets 
            nfolds = 0. If cross-validation is turned off, then a validation frame can be specified and used 
            for early stopping of individual models and early stopping of the grid searches.  By default and 
            when nfolds > 1, cross-validation metrics will be used for early stopping and thus validation_frame will be ignored.
        :param leaderboard_frame: H2OFrame with test data for scoring the leaderboard.  This is optional and
            if this is set to None (the default), then cross-validation metrics will be used to generate the leaderboard 
            rankings instead.
        :param blending_frame: H2OFrame used to train the the metalearning algorithm in Stacked Ensembles (instead of relying on cross-validated predicted values).
            This is optional, but when provided, it is also recommended to disable cross validation 
            by setting `nfolds=0` and to provide a leaderboard frame for scoring purposes.

        :returns: An H2OAutoML object.

        :examples:
        
        >>> # Set up an H2OAutoML object
        >>> aml = H2OAutoML(max_runtime_secs=30)
        >>> # Launch an AutoML run
        >>> aml.train(y=y, training_frame=train)
        """
        # Minimal required arguments are training_frame and y (response)
        self.training_frame = training_frame

        ncols = self.training_frame.ncols
        names = self.training_frame.names

        if y is None and self.response_column is None:
            raise H2OValueError(
                'The response column (y) is not set; please set it to the name of the column that you are trying to predict in your data.'
            )
        elif y is not None:
            assert_is_type(y, int, str)
            if is_type(y, int):
                if not (-ncols <= y < ncols):
                    raise H2OValueError(
                        "Column %d does not exist in the training frame" % y)
                y = names[y]
            else:
                if y not in names:
                    raise H2OValueError(
                        "Column %s does not exist in the training frame" % y)
            self.response_column = y

        self.fold_column = fold_column
        self.weights_column = weights_column

        self.validation_frame = validation_frame
        self.leaderboard_frame = leaderboard_frame
        self.blending_frame = blending_frame

        if x is not None:
            assert_is_type(x, list)
            xset = set()
            if is_type(x, int, str): x = [x]
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError(
                            "Column %d does not exist in the training frame" %
                            xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError(
                            "Column %s not in the training frame" % xi)
                    xset.add(xi)
            ignored_columns = set(names) - xset
            for col in [y, fold_column, weights_column]:
                if col is not None and col in ignored_columns:
                    ignored_columns.remove(col)
            if ignored_columns is not None:
                self.input_spec['ignored_columns'] = list(ignored_columns)

        def clean_params(params):
            return ({
                k: clean_params(v)
                for k, v in params.items() if v is not None
            } if isinstance(params, dict) else H2OEstimator._keyify(params))

        automl_build_params = clean_params(
            dict(
                build_control=self.build_control,
                build_models=self.build_models,
                input_spec=self.input_spec,
            ))

        resp = self._build_resp = h2o.api('POST /99/AutoMLBuilder',
                                          json=automl_build_params)
        if 'job' not in resp:
            raise H2OResponseError(
                "Backend failed to build the AutoML job: {}".format(resp))

        if not self.project_name:
            self.project_name = resp['build_control']['project_name']
        self.__frozen = True

        self._job = H2OJob(resp['job'], "AutoML")
        poll_updates = ft.partial(self._poll_training_updates,
                                  verbosity=self._verbosity,
                                  state={})
        try:
            self._job.poll(poll_updates=poll_updates)
        finally:
            poll_updates(self._job, 1)

        self._fetch()
        return self.leader