示例#1
0
 def __init__(self, **kwargs):
     """
     Build the estimator and apply every keyword argument as a model parameter.

     ``model_id`` is written to ``self._id`` and ``self._parms``; every other
     recognised name is assigned with ``setattr``.

     :param kwargs: model parameters. A legacy ``Lambda`` key is re-filed under
         ``lambda_`` before the names are validated.
     :raises H2OValueError: at the first keyword that is not a known parameter name.
     """
     super(H2OGradientBoostingEstimator, self).__init__()
     self._parms = {}
     known_params = {
         "model_id", "training_frame", "validation_frame", "nfolds", "keep_cross_validation_models",
         "keep_cross_validation_predictions", "keep_cross_validation_fold_assignment",
         "score_each_iteration", "score_tree_interval", "fold_assignment", "fold_column",
         "response_column", "ignored_columns", "ignore_const_cols", "offset_column", "weights_column",
         "balance_classes", "class_sampling_factors", "max_after_balance_size",
         "max_confusion_matrix_size", "max_hit_ratio_k", "ntrees", "max_depth", "min_rows", "nbins",
         "nbins_top_level", "nbins_cats", "r2_stopping", "stopping_rounds", "stopping_metric",
         "stopping_tolerance", "max_runtime_secs", "seed", "build_tree_one_node", "learn_rate",
         "learn_rate_annealing", "distribution", "quantile_alpha", "tweedie_power", "huber_alpha",
         "checkpoint", "sample_rate", "sample_rate_per_class", "col_sample_rate",
         "col_sample_rate_change_per_level", "col_sample_rate_per_tree", "min_split_improvement",
         "histogram_type", "max_abs_leafnode_pred", "pred_noise_bandwidth", "categorical_encoding",
         "calibrate_model", "calibration_frame", "custom_metric_func", "export_checkpoints_dir",
         "monotone_constraints",
     }
     # Accept the legacy spelling by moving its value under the current name.
     if "Lambda" in kwargs:
         kwargs["lambda_"] = kwargs.pop("Lambda")
     for key, value in kwargs.items():
         if key not in known_params:
             raise H2OValueError("Unknown parameter %s = %r" % (key, value))
         if key == 'model_id':
             self._id = value
             self._parms["model_id"] = value
             continue
         # Using setattr(...) will invoke type-checking of the arguments
         setattr(self, key, value)
示例#2
0
 def __init__(self, **kwargs):
     """
     Build the estimator and apply every keyword argument as a model parameter.

     ``model_id`` is written to ``self._id`` and ``self._parms``; every other
     recognised name is assigned with ``setattr``.

     :param kwargs: model parameters. A legacy ``Lambda`` key is re-filed under
         ``lambda_`` before the names are validated.
     :raises H2OValueError: at the first keyword that is not a known parameter name.
     """
     super(H2ODeepWaterEstimator, self).__init__()
     self._parms = {}
     known_params = {
         "model_id", "checkpoint", "autoencoder", "training_frame", "validation_frame", "nfolds",
         "balance_classes", "max_after_balance_size", "class_sampling_factors",
         "keep_cross_validation_predictions", "keep_cross_validation_fold_assignment",
         "fold_assignment", "fold_column", "response_column", "offset_column", "weights_column",
         "ignored_columns", "score_each_iteration", "categorical_encoding",
         "overwrite_with_best_model", "epochs", "train_samples_per_iteration",
         "target_ratio_comm_to_comp", "seed", "standardize", "learning_rate",
         "learning_rate_annealing", "momentum_start", "momentum_ramp", "momentum_stable",
         "distribution", "score_interval", "score_training_samples", "score_validation_samples",
         "score_duty_cycle", "classification_stop", "regression_stop", "stopping_rounds",
         "stopping_metric", "stopping_tolerance", "max_runtime_secs", "ignore_const_cols",
         "shuffle_training_data", "mini_batch_size", "clip_gradient", "network", "backend",
         "image_shape", "channels", "sparse", "gpu", "device_id", "network_definition_file",
         "network_parameters_file", "mean_image_file", "export_native_parameters_prefix",
         "activation", "hidden", "input_dropout_ratio", "hidden_dropout_ratios", "problem_type",
     }
     # Accept the legacy spelling by moving its value under the current name.
     if "Lambda" in kwargs:
         kwargs["lambda_"] = kwargs.pop("Lambda")
     for key, value in kwargs.items():
         if key not in known_params:
             raise H2OValueError("Unknown parameter %s = %r" % (key, value))
         if key == 'model_id':
             self._id = value
             self._parms["model_id"] = value
             continue
         # Using setattr(...) will invoke type-checking of the arguments
         setattr(self, key, value)
示例#3
0
 def __init__(self, **kwargs):
     """
     Build the estimator and apply every keyword argument as a model parameter.

     ``model_id`` is written to ``self._id`` and ``self._parms``; every other
     recognised name is assigned with ``setattr``. After ``pre_trained`` is
     assigned, ``_determine_vec_size()`` is called and ``vec_size`` is re-assigned
     from its own current value.

     :param kwargs: model parameters. A legacy ``Lambda`` key is re-filed under
         ``lambda_`` before the names are validated.
     :raises H2OValueError: at the first keyword that is not a known parameter name.
     """
     super(H2OWord2vecEstimator, self).__init__()
     self._parms = {}
     known_params = {
         "model_id", "training_frame", "min_word_freq", "word_model", "norm_model", "vec_size",
         "window_size", "sent_sample_rate", "init_learning_rate", "epochs", "pre_trained",
         "max_runtime_secs", "export_checkpoints_dir",
     }
     # Accept the legacy spelling by moving its value under the current name.
     if "Lambda" in kwargs:
         kwargs["lambda_"] = kwargs.pop("Lambda")
     for key, value in kwargs.items():
         if key not in known_params:
             raise H2OValueError("Unknown parameter %s = %r" % (key, value))
         if key == 'model_id':
             self._id = value
             self._parms["model_id"] = value
             continue
         # Using setattr(...) will invoke type-checking of the arguments
         setattr(self, key, value)
         if key == 'pre_trained':
             # Recompute the vector size now that a pre-trained source is set,
             # then push the resulting value back through the attribute setter.
             self._determine_vec_size()
             setattr(self, 'vec_size', self.vec_size)
示例#4
0
 def __init__(self, **kwargs):
     """
     Build the estimator and apply every keyword argument as a model parameter.

     ``model_id`` is written to ``self._id`` and ``self._parms``; every other
     recognised name is assigned with ``setattr``.

     :param kwargs: model parameters. A legacy ``Lambda`` key is re-filed under
         ``lambda_`` before the names are validated.
     :raises H2OValueError: at the first keyword that is not a known parameter name.
     """
     super(H2OXGBoostEstimator, self).__init__()
     self._parms = {}
     known_params = {
         "model_id", "training_frame", "validation_frame", "nfolds",
         "keep_cross_validation_predictions", "keep_cross_validation_fold_assignment",
         "score_each_iteration", "fold_assignment", "fold_column", "response_column",
         "ignored_columns", "ignore_const_cols", "offset_column", "weights_column",
         "stopping_rounds", "stopping_metric", "stopping_tolerance", "max_runtime_secs", "seed",
         "distribution", "tweedie_power", "categorical_encoding", "quiet_mode", "ntrees",
         "max_depth", "min_rows", "min_child_weight", "learn_rate", "eta", "sample_rate",
         "subsample", "col_sample_rate", "colsample_bylevel", "col_sample_rate_per_tree",
         "colsample_bytree", "max_abs_leafnode_pred", "max_delta_step", "score_tree_interval",
         "min_split_improvement", "gamma", "nthread", "max_bins", "max_leaves",
         "min_sum_hessian_in_leaf", "min_data_in_leaf", "sample_type", "normalize_type",
         "rate_drop", "one_drop", "skip_drop", "tree_method", "grow_policy", "booster",
         "reg_lambda", "reg_alpha", "dmatrix_type", "backend", "gpu_id",
     }
     # Accept the legacy spelling by moving its value under the current name.
     if "Lambda" in kwargs:
         kwargs["lambda_"] = kwargs.pop("Lambda")
     for key, value in kwargs.items():
         if key not in known_params:
             raise H2OValueError("Unknown parameter %s = %r" % (key, value))
         if key == 'model_id':
             self._id = value
             self._parms["model_id"] = value
             continue
         # Using setattr(...) will invoke type-checking of the arguments
         setattr(self, key, value)
示例#5
0
文件: kmeans.py 项目: ysjyang/h2o-3
 def __init__(self, **kwargs):
     """
     Build the estimator and apply every keyword argument as a model parameter.

     ``model_id`` is written to ``self._id`` and ``self._parms``; every other
     recognised name is assigned with ``setattr``.

     :param kwargs: model parameters. A legacy ``Lambda`` key is re-filed under
         ``lambda_`` before the names are validated.
     :raises H2OValueError: at the first keyword that is not a known parameter name.
     """
     super(H2OKMeansEstimator, self).__init__()
     self._parms = {}
     known_params = {
         "model_id", "training_frame", "validation_frame", "nfolds", "keep_cross_validation_models",
         "keep_cross_validation_predictions", "keep_cross_validation_fold_assignment",
         "fold_assignment", "fold_column", "ignored_columns", "ignore_const_cols",
         "score_each_iteration", "k", "estimate_k", "user_points", "max_iterations", "standardize",
         "seed", "init", "max_runtime_secs", "categorical_encoding",
     }
     # Accept the legacy spelling by moving its value under the current name.
     if "Lambda" in kwargs:
         kwargs["lambda_"] = kwargs.pop("Lambda")
     for key, value in kwargs.items():
         if key not in known_params:
             raise H2OValueError("Unknown parameter %s = %r" % (key, value))
         if key == 'model_id':
             self._id = value
             self._parms["model_id"] = value
             continue
         # Using setattr(...) will invoke type-checking of the arguments
         setattr(self, key, value)
示例#6
0
 def __init__(self, **kwargs):
     """
     Build the estimator and apply every keyword argument as a model parameter.

     ``model_id`` is written to ``self._id`` and ``self._parms``; every other
     recognised name is assigned with ``setattr``.

     :param kwargs: model parameters. A legacy ``Lambda`` key is re-filed under
         ``lambda_`` (overwriting any ``lambda_`` already given) before the names
         are validated.
     :raises H2OValueError: at the first keyword that is not a known parameter name.
     """
     super(H2OGeneralizedLinearEstimator, self).__init__()
     self._parms = {}
     known_params = {
         "model_id", "training_frame", "validation_frame", "nfolds", "seed",
         "keep_cross_validation_models", "keep_cross_validation_predictions",
         "keep_cross_validation_fold_assignment", "fold_assignment", "fold_column",
         "response_column", "ignored_columns", "ignore_const_cols", "score_each_iteration",
         "offset_column", "weights_column", "family", "tweedie_variance_power",
         "tweedie_link_power", "theta", "solver", "alpha", "lambda_", "lambda_search",
         "early_stopping", "nlambdas", "standardize", "missing_values_handling",
         "compute_p_values", "remove_collinear_columns", "intercept", "non_negative",
         "max_iterations", "objective_epsilon", "beta_epsilon", "gradient_epsilon", "link",
         "prior", "lambda_min_ratio", "beta_constraints", "max_active_predictors",
         "interactions", "interaction_pairs", "obj_reg", "export_checkpoints_dir",
         "balance_classes", "class_sampling_factors", "max_after_balance_size",
         "max_confusion_matrix_size", "max_hit_ratio_k", "max_runtime_secs",
         "custom_metric_func",
     }
     # Accept the legacy spelling by moving its value under the current name.
     if "Lambda" in kwargs:
         kwargs["lambda_"] = kwargs.pop("Lambda")
     for key, value in kwargs.items():
         if key not in known_params:
             raise H2OValueError("Unknown parameter %s = %r" % (key, value))
         if key == 'model_id':
             self._id = value
             self._parms["model_id"] = value
             continue
         # Using setattr(...) will invoke type-checking of the arguments
         setattr(self, key, value)
示例#7
0
    def varimp_plot(self, num_of_features=None, server=False):
        """
        Draw a horizontal bar chart of the variable importances of a trained model.

        For a GLM model a notice is printed and ``std_coef_plot()`` is called instead.
        Nothing is drawn when ``_get_matplotlib_pyplot`` yields no pyplot module.

        :param num_of_features: how many features to show; ``None`` shows all of them.
        :param server: forwarded to ``_get_matplotlib_pyplot``; when True, ``plt.show()`` is not called.

        :returns: None.
        :raises H2OValueError: if the model's algo is not gbm, drf or deeplearning (glm is redirected).
        """
        assert_is_type(num_of_features, None, int)
        assert_is_type(server, bool)

        plt = _get_matplotlib_pyplot(server)
        if not plt: return

        algo = self._model_json["algo"]
        if algo == "glm":
            # GLM has no variable importances here; delegate to the coefficient plot.
            print("Variable importance does not apply to GLM. Will use std_coef_plot() instead.")
            self.std_coef_plot(num_of_features)
            return

        # Importances come back as a list of tuples (no pandas): element 0 is used as the
        # feature label and element 2 as the bar length (named "scaled importance" by the
        # original code).
        rows = self.varimp(use_pandas=False)
        labels = [row[0] for row in rows]
        bar_lengths = [row[2] for row in rows]
        # Bar centres on the y axis, reversed so the first (largest) bar is drawn at the top.
        centres = range(len(labels))[::-1]

        if num_of_features is None:
            num_of_features = len(bar_lengths)
        shown = slice(0, num_of_features)

        fig, ax = plt.subplots(1, 1, figsize=(14, 10))
        plt.barh(centres[shown], bar_lengths[shown], align="center",
                 height=0.8, color="#1F77B4", edgecolor="none")
        # Hide the right and top spines, colour the remaining two grey.
        for side in ("right", "top"):
            ax.spines[side].set_visible(False)
        for side in ("bottom", "left"):
            ax.spines[side].set_color("#7B7B7B")
        # Only show ticks on the left and bottom spines.
        ax.yaxis.set_ticks_position("left")
        ax.xaxis.set_ticks_position("bottom")
        plt.yticks(centres[shown], labels[shown])
        ax.margins(y=0.5)

        # The title depends on the algorithm; anything not listed here is unsupported.
        titles = {
            "gbm": "Variable Importance: H2O GBM",
            "drf": "Variable Importance: H2O DRF",
            "deeplearning": "Variable Importance: H2O Deep Learning",
        }
        if algo not in titles:
            raise H2OValueError("A variable importances plot is not implemented for this type of model")
        plt.title(titles[algo], fontsize=20)
        if not server: plt.show()
示例#8
0
    def partial_plot(self, data, cols, destination_key=None, nbins=20, plot=True, figsize=(7,10), server=False):
        """
        Create partial dependence plot which gives a graphical depiction of the marginal effect of a variable on the
        response. The effect of a variable is measured in change in the mean response.

        :param H2OFrame data: An H2OFrame object used for scoring and constructing the plot.
        :param cols: Feature(s) for which partial dependence will be calculated (list of column names).
        :param destination_key: A key reference to the created partial dependence tables in H2O.
        :param nbins: Number of bins used.
        :param plot: A boolean specifying whether to plot partial dependence table.
        :param figsize: Dimension/size of the returning plots, adjust to fit your output cells.
        :param server: Forwarded to ``_get_matplotlib_pyplot`` when plotting.
        :return: List of calculated mean response tables for each feature requested. Note that when
            ``plot`` is True and no pyplot module is available, the method returns None.
        :raises ValueError: if ``data`` is not an H2OFrame.
        :raises H2OValueError: if one of ``cols`` is not a column of ``data``.
        """

        if not isinstance(data, h2o.H2OFrame): raise ValueError("data must be an instance of H2OFrame")
        assert_is_type(cols, [str])
        assert_is_type(destination_key, None, str)
        assert_is_type(nbins, int)
        assert_is_type(plot, bool)
        assert_is_type(figsize, (int,int))

        ## Check cols specified exist in frame data
        for xi in cols:
            if xi not in data.names:
                raise H2OValueError("Column %s does not exist in the training frame" % xi)

        kwargs = {
            'cols': cols,
            'model_id': self.model_id,
            'frame_id': data.frame_id,
            'nbins': nbins,
            'destination_key': destination_key,
        }

        # Launch the computation as a job, wait for it, then fetch the finished result by its key.
        job_result = H2OJob(h2o.api("POST /3/PartialDependence/", data=kwargs),  job_type="PartialDependencePlot").poll()
        response = h2o.api("GET /3/PartialDependence/%s" % job_result.dest_key)

        # Extract partial dependence data from json response
        pps = response['partial_dependence_data']

        ## Plot partial dependence plots using matplotlib
        if plot:
            plt = _get_matplotlib_pyplot(server)
            if not plt: return

            # One row of axes per requested column; squeeze=False keeps `axs` two-dimensional
            # even for a single column, so `axs[i,0]` always works.
            fig, axs = plt.subplots(len(cols), squeeze=False, figsize=figsize)
            for i, pp in enumerate(pps):
                ## Check whether column was categorical or numeric
                col=cols[i]
                cat=data[col].isfactor()[0]
                if cat:
                    # Categorical: plot points at integer positions and label them with the levels.
                    labels = pp[0]
                    x = range(len(labels))
                    y = pp[1]
                    axs[i,0].plot(x, y, 'o')
                    axs[i,0].set_xticks(x)
                    axs[i,0].set_xticklabels(labels)
                    axs[i,0].margins(0.2)
                else:
                    axs[i,0].plot(pp[0], pp[1])
                    axs[i,0].set_xlim(min(pp[0]), max(pp[0]))

                axs[i,0].set_title('Partial Dependence Plot For {}'.format(col))
                axs[i,0].set_xlabel(pp.col_header[0])
                axs[i,0].set_ylabel(pp.col_header[1])
                axs[i,0].xaxis.grid()
                axs[i,0].yaxis.grid()
            # Spacing between subplots only matters when several columns are plotted. The count
            # of requested columns is what decides this (the original tested the length of the
            # last column's *name*, and failed with NameError when there was nothing to plot).
            if len(cols) >1:
                fig.tight_layout(pad = 0.4,w_pad=0.5, h_pad=1.0)

        return pps
示例#9
0
    def _plot(self, timestep, metric, server=False):
        """
        Plot the model's scoring history for ``metric`` against ``timestep``.

        GLM always uses ``iteration`` as the timestep and accepts only the metrics
        ``log_likelihood`` / ``objective`` (``AUTO`` maps to ``log_likelihood``).
        For gbm/drf ``AUTO`` maps to ``number_of_trees``; for deeplearning/deepwater
        it maps to ``epochs``. When a ``validation_<metric>`` column is present both
        the training and validation curves are drawn, otherwise only the training curve.

        :param timestep: x-axis column, or "AUTO".
        :param metric: metric name (without the training_/validation_ prefix), or "AUTO" for GLM.
        :param server: forwarded to ``_get_matplotlib_pyplot``; when True, ``plt.show()`` is not called.
        :raises H2OValueError: for an unsupported algo, or an unsupported GLM metric.
        """
        plt = _get_matplotlib_pyplot(server)
        if not plt: return

        history = self.scoring_history()
        algo = self._model_json["algo"]
        if algo not in ("glm", "deeplearning", "deepwater", "drf", "gbm"):
            raise H2OValueError("Plotting not implemented for this type of model")

        if algo == "glm":
            # GLM output differs from the other algos: its only timestep is `iteration`.
            timestep = "iteration"
            if metric == "AUTO":
                metric = "log_likelihood"
            if metric not in ("log_likelihood", "objective"):
                raise H2OValueError("for GLM, metric must be one of: log_likelihood, objective")
            plt.xlabel(timestep)
            plt.ylabel(metric)
            plt.title("Validation Scoring History")
            plt.plot(history[timestep], history[metric])
        else:
            # Resolve the timestep column.
            if algo in ("gbm", "drf"):
                assert_is_type(timestep, "AUTO", "duration", "number_of_trees")
                if timestep == "AUTO":
                    timestep = "number_of_trees"
            else:
                # Drop the first row of the DL scoring history when it has zero samples
                # (per the original note, that row contains NAs & NaNs).
                if history["samples"][0] == 0:
                    history = history[1:]
                assert_is_type(timestep, "AUTO", "epochs",  "samples", "duration")
                if timestep == "AUTO":
                    timestep = "epochs"

            train_col = "training_{}".format(metric)
            valid_col = "validation_{}".format(metric)
            if timestep == "duration":
                # Durations are strings of the form "<value> <unit>": take the unit from the
                # second row for the column name and keep only the value part of each entry.
                unit = history["duration"][1].split()[1]
                duration_col = "duration_{}".format(unit)
                history[duration_col] = [str(entry).split()[0] for entry in history["duration"]]
                timestep = duration_col

            # Compute the y-range over whichever curves will be drawn.
            if can_use_pandas():
                valid = valid_col in list(history)
                if valid:
                    both = history[[train_col, valid_col]]
                    ylim = (both.min().min(), both.max().max())
                else:
                    ylim = (history[train_col].min(), history[train_col].max())
            else:
                valid = valid_col in history.col_header
                if valid:
                    both = history[[train_col, valid_col]]
                    ylim = (min(min(both)), max(max(both)))
                else:
                    ylim = (min(history[train_col]), max(history[train_col]))
            # A flat curve would give a zero-height range; fall back to (0, 1).
            if ylim[0] == ylim[1]:
                ylim = (0, 1)

            plt.xlabel(timestep)
            if valid:  # Training and validation scoring history
                plt.ylabel(metric)
                plt.title("Scoring History")
                plt.ylim(ylim)
                plt.plot(history[timestep], history[train_col], label="Training")
                plt.plot(history[timestep], history[valid_col], color="orange",
                         label="Validation")
                plt.legend()
            else:  # Training scoring history only
                plt.ylabel(train_col)
                plt.title("Training Scoring History")
                plt.ylim(ylim)
                plt.plot(history[timestep], history[train_col])

        if not server: plt.show()
示例#10
0
    def train(self,
              x=None,
              y=None,
              training_frame=None,
              fold_column=None,
              weights_column=None,
              validation_frame=None,
              leaderboard_frame=None,
              blending_frame=None):
        """
        Begins an AutoML task, a background task that automatically builds a number of models
        with various algorithms and tracks their performance in a leaderboard. At any point
        in the process you may use H2O's performance or prediction functions on the resulting
        models.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column. May be omitted only if a
            response column has already been set on this object.
        :param fold_column: The name or index of the column in training_frame that holds per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds per-row weights.
        :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold_column or weights_column).
        :param validation_frame: H2OFrame with validation data. This argument is ignored unless the user sets
            nfolds = 0. If cross-validation is turned off, then a validation frame can be specified and used
            for early stopping of individual models and early stopping of the grid searches.  By default and
            when nfolds > 1, cross-validation metrics will be used for early stopping and thus validation_frame
            will be ignored.
        :param leaderboard_frame: H2OFrame with test data for scoring the leaderboard.  This is optional and
            if this is set to None (the default), then cross-validation metrics will be used to generate the
            leaderboard rankings instead.
        :param blending_frame: H2OFrame used to train the metalearning algorithm in Stacked Ensembles (instead
            of relying on cross-validated predicted values). This is optional, but when provided, it is also
            recommended to disable cross validation by setting `nfolds=0` and to provide a leaderboard frame
            for scoring purposes.

        :returns: The value of ``self.leader`` once the AutoML job has finished.
        :raises H2OValueError: if no response column is set, or a referenced column is not in the training frame.
        :raises H2OResponseError: if the backend response does not contain a job.

        :examples:

        >>> # Set up an H2OAutoML object
        >>> aml = H2OAutoML(max_runtime_secs=30)
        >>> # Launch an AutoML run
        >>> aml.train(y=y, training_frame=train)
        """
        # Minimal required arguments are training_frame and y (response)
        self.training_frame = training_frame

        ncols = self.training_frame.ncols
        names = self.training_frame.names

        if y is None and self.response_column is None:
            raise H2OValueError(
                'The response column (y) is not set; please set it to the name of the column that you are trying to predict in your data.'
            )
        elif y is not None:
            assert_is_type(y, int, str)
            if is_type(y, int):
                if not (-ncols <= y < ncols):
                    raise H2OValueError(
                        "Column %d does not exist in the training frame" % y)
                y = names[y]
            else:
                if y not in names:
                    raise H2OValueError(
                        "Column %s does not exist in the training frame" % y)
            self.response_column = y

        self.fold_column = fold_column
        self.weights_column = weights_column

        self.validation_frame = validation_frame
        self.leaderboard_frame = leaderboard_frame
        self.blending_frame = blending_frame

        if x is not None:
            # `x` must be a list; a bare name or index is rejected here rather than wrapped.
            assert_is_type(x, list)
            xset = set()
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError(
                            "Column %d does not exist in the training frame" %
                            xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError(
                            "Column %s not in the training frame" % xi)
                    xset.add(xi)
            # Everything that is not a predictor is ignored, except the special columns.
            # When `y` was omitted, fall back to the response column already set on this
            # object so that it is not ignored by accident.
            response = y if y is not None else self.response_column
            ignored_columns = set(names) - xset
            for col in (response, fold_column, weights_column):
                if col is not None:
                    ignored_columns.discard(col)
            self.input_spec['ignored_columns'] = list(ignored_columns)

        def clean_params(params):
            # Recursively drop None-valued entries from dicts; leaves go through _keyify.
            return ({
                k: clean_params(v)
                for k, v in params.items() if v is not None
            } if isinstance(params, dict) else H2OEstimator._keyify(params))

        automl_build_params = clean_params(
            dict(
                build_control=self.build_control,
                build_models=self.build_models,
                input_spec=self.input_spec,
            ))

        resp = self._build_resp = h2o.api('POST /99/AutoMLBuilder',
                                          json=automl_build_params)
        if 'job' not in resp:
            raise H2OResponseError(
                "Backend failed to build the AutoML job: {}".format(resp))

        if not self.project_name:
            self.project_name = resp['build_control']['project_name']
        self.__frozen = True

        self._job = H2OJob(resp['job'], "AutoML")
        poll_updates = ft.partial(self._poll_training_updates,
                                  verbosity=self._verbosity,
                                  state={})
        try:
            self._job.poll(poll_updates=poll_updates)
        finally:
            # Run the update callback one last time, even if polling was interrupted.
            poll_updates(self._job, 1)

        self._fetch()
        return self.leader
示例#11
0
    def __init__(self,
                 algorithm,
                 min_rule_len=1,
                 max_rule_len=10,
                 max_num_rules=None,
                 nfolds=5,
                 seed=-1,
                 tree_params=None,
                 glm_params=None):
        """
        Store the configuration and sanitise the parameter dictionaries for the tree and GLM stages.

        Some entries of ``tree_params`` / ``glm_params`` are controlled by this object and are
        therefore removed (with a warning) or translated:

        - ``model_id`` is dropped silently from both dictionaries;
        - ``tree_params["max_depth"]`` replaces both ``min_rule_len`` and ``max_rule_len``;
        - ``glm_params["max_active_predictors"]`` minus one replaces ``max_num_rules``;
        - ``nfolds`` and ``seed`` are dropped from both; ``alpha`` and ``lambda_`` from ``glm_params``.

        The dictionaries passed in are copied; the caller's objects are never modified.

        :param algorithm: one of "DRF", "XGBoost", "GBM".
        :param min_rule_len: stored as ``self.min_rule_len`` unless overridden by ``max_depth``.
        :param max_rule_len: stored as ``self.max_rule_len`` unless overridden by ``max_depth``.
        :param max_num_rules: stored as ``self.max_num_rules`` unless overridden by ``max_active_predictors``.
        :param nfolds: stored as ``self.nfolds``.
        :param seed: stored as ``self.seed``.
        :param tree_params: optional dict of extra tree-model parameters.
        :param glm_params: optional dict of extra GLM parameters.
        :raises H2OValueError: if ``algorithm`` is not supported.
        """
        if algorithm not in ("DRF", "XGBoost", "GBM"):
            raise H2OValueError(
                "{} is not a supported algorithm".format(algorithm))
        self.algorithm = algorithm
        self.min_rule_len = min_rule_len
        self.max_rule_len = max_rule_len
        self.max_num_rules = max_num_rules
        self.nfolds = nfolds
        self.seed = seed

        # Private copies: avoids a default dict shared between instances and keeps the
        # caller's dictionaries intact while entries are removed below.
        tree_params = dict(tree_params) if tree_params else {}
        glm_params = dict(glm_params) if glm_params else {}

        tree_params.pop("model_id", None)
        if 'max_depth' in tree_params:
            depth = tree_params.pop("max_depth")
            self.min_rule_len = depth
            self.max_rule_len = depth
            warnings.warn(
                'max_depth provided in tree_params - min_rule_len and max_rule_len will be ignored'
            )
        for key in ('nfolds', 'seed'):
            if key in tree_params:
                del tree_params[key]
                warnings.warn(
                    '%s provided in tree_params but will be ignored' % key)

        glm_params.pop("model_id", None)
        if 'max_active_predictors' in glm_params:
            # One active predictor is not counted as a rule, hence the -1.
            # NOTE(review): presumably this accounts for the intercept - confirm.
            self.max_num_rules = glm_params.pop("max_active_predictors") - 1
            warnings.warn(
                'max_active_predictors provided in glm_params - max_num_rules will be ignored'
            )
        for key in ('nfolds', 'seed'):
            if key in glm_params:
                del glm_params[key]
                warnings.warn(
                    '%s provided in glm_params but will be ignored' % key)
        if 'alpha' in glm_params:
            del glm_params['alpha']
            warnings.warn('alpha ignored - set to 1 by rulefit')
        if 'lambda_' in glm_params:
            del glm_params['lambda_']
            warnings.warn('lambda_ ignored by rulefit')

        self.tree_params = tree_params
        self.glm_params = glm_params
示例#12
0
    def _train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
              weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
              model_id=None, verbose=False, extend_parms_fn=None):
        has_default_training_frame = hasattr(self, 'training_frame') and self.training_frame is not None
        training_frame = H2OFrame._validate(training_frame, 'training_frame',
                                            required=self._requires_training_frame() and not has_default_training_frame)
        validation_frame = H2OFrame._validate(validation_frame, 'validation_frame')
        assert_is_type(y, None, int, str)
        assert_is_type(x, None, int, str, [str, int], {str, int})
        assert_is_type(ignored_columns, None, [str, int], {str, int})
        assert_is_type(offset_column, None, int, str)
        assert_is_type(fold_column, None, int, str)
        assert_is_type(weights_column, None, int, str)
        assert_is_type(max_runtime_secs, None, numeric)
        assert_is_type(model_id, None, str)
        assert_is_type(verbose, bool)
        assert_is_type(extend_parms_fn, None, FunctionType)

        override_default_training_frame = training_frame is not None
        if not override_default_training_frame:
            self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)
            training_frame = self.training_frame if has_default_training_frame else None

        algo = self.algo
        if verbose and algo not in ["drf", "gbm", "deeplearning", "xgboost"]:
            raise H2OValueError("Verbose should only be set to True for drf, gbm, deeplearning, and xgboost models")
        parms = self._parms.copy()
        if algo=="pca" and "k" not in parms.keys():
            parms["k"] = 1
        if "__class__" in parms:  # FIXME: hackt for PY3
            del parms["__class__"]
        is_auto_encoder = bool(parms.get("autoencoder"))
        is_supervised = not(is_auto_encoder or algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec", "isolationforest", "generic"})

        names = training_frame.names if training_frame is not None else []
        ncols = training_frame.ncols if training_frame is not None else 0
        types = training_frame.types if training_frame is not None else {}

        if is_supervised:
            if y is None: y = "response"
            if is_type(y, int):
                if not (-ncols <= y < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % y)
                y = names[y]
            else:
                if y not in names:
                    raise H2OValueError("Column %s does not exist in the training frame" % y)
            self._estimator_type = "classifier" if types[y] == "enum" else "regressor"
        else:
            # If `y` is provided for an unsupervised model we'll simply ignore
            # it. This way an unsupervised model can be used as a step in
            # sklearn's pipeline.
            y = None

        if override_default_training_frame:
            assert_is_type(y, str, None)
            ignored_columns_set = set()
            if ignored_columns is None and "ignored_columns" in parms:
                ignored_columns = parms['ignored_columns']
            if ignored_columns is not None:
                if x is not None:
                    raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
                for ic in ignored_columns:
                    if is_type(ic, int):
                        if not (-ncols <= ic < ncols):
                            raise H2OValueError("Column %d does not exist in the training frame" % ic)
                        ignored_columns_set.add(names[ic])
                    else:
                        if ic not in names:
                            raise H2OValueError("Column %s not in the training frame" % ic)
                        ignored_columns_set.add(ic)
            if x is None:
                xset = set(names) - {y} - ignored_columns_set
            else:
                xset = set()
                if is_type(x, int, str): x = [x]
                for xi in x:
                    if is_type(xi, int):
                        if not (-ncols <= xi < ncols):
                            raise H2OValueError("Column %d does not exist in the training frame" % xi)
                        xset.add(names[xi])
                    else:
                        if xi not in names:
                            raise H2OValueError("Column %s not in the training frame" % xi)
                        xset.add(xi)
            x = list(xset)
            self._check_and_save_parm(parms, "offset_column", offset_column)
            self._check_and_save_parm(parms, "weights_column", weights_column)
            self._check_and_save_parm(parms, "fold_column", fold_column)

        if max_runtime_secs is not None: parms["max_runtime_secs"] = max_runtime_secs

        # Overwrites the model_id parameter only if model_id is passed
        if model_id is not None:
            parms["model_id"] = model_id

        # Step 2
        is_auto_encoder = "autoencoder" in parms and parms["autoencoder"]
        is_unsupervised = is_auto_encoder or self.algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec", "isolationforest"}
        if is_auto_encoder and y is not None:
            raise ValueError("y should not be specified for autoencoder.")
        if not is_unsupervised and y is None and self.algo not in ["generic"]:
            raise ValueError("Missing response")

        # Step 3
        if override_default_training_frame:
            parms["training_frame"] = training_frame
            offset = parms["offset_column"]
            folds = parms["fold_column"]
            weights = parms["weights_column"]

        if validation_frame is not None:
            parms["validation_frame"] = validation_frame

        if is_type(y, int):
            y = names[y]
        if y is not None:
            parms["response_column"] = y
        if not isinstance(x, (list, tuple)):
            x = [x]
        if is_type(x[0], int):
            x = [names[i] for i in x]
        if override_default_training_frame:
            ignored_columns = list(set(names) - set(x + [y, offset, folds, weights] + self._additional_used_columns(parms)))
            parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
        parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None
                                 else [quoted(col) for col in parms["interactions"]])
        parms["interaction_pairs"] = (None if "interaction_pairs" not in parms or parms["interaction_pairs"] is None
                                      else [tuple(map(quoted, ip)) for ip in parms["interaction_pairs"]])
    
        # internal hook allowing subclasses to extend train parms 
        if extend_parms_fn is not None:
            extend_parms_fn(parms)
            
        parms = {k: H2OEstimator._keyify_if_h2oframe(parms[k]) for k in parms}
        if ("stopping_metric" in parms.keys()) and ("r2" in parms["stopping_metric"]):
            raise H2OValueError("r2 cannot be used as an early stopping_metric yet.  Check this JIRA https://0xdata.atlassian.net/browse/PUBDEV-5381 for progress.")
        rest_ver = parms.pop("_rest_version") if "_rest_version" in parms else 3

        model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms)
        model = H2OJob(model_builder_json, job_type=(self.algo + " Model Build"))

        if self._future:
            self._job = model
            self._rest_version = rest_ver
            return

        model.poll(poll_updates=self._print_model_scoring_history if verbose else None)
        model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
        self._resolve_model(model.dest_key, model_json)
示例#13
0
    def train(self, x=None, y=None, training_frame=None, fold_column=None,
              weights_column=None, validation_frame=None, leaderboard_frame=None, blending_frame=None):
        """
        Begins an AutoML task, a background task that automatically builds a number of models
        with various algorithms and tracks their performance in a leaderboard. At any point
        in the process you may use H2O's performance or prediction functions on the resulting
        models.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param fold_column: The name or index of the column in training_frame that holds per-row fold
            assignments. An index is translated to the corresponding column name before it is sent
            to the back end.
        :param weights_column: The name or index of the column in training_frame that holds per-row weights.
            An index is translated to the corresponding column name before it is sent to the back end.
        :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold_column or weights_column).
        :param validation_frame: H2OFrame with validation data. This argument is ignored unless the user sets
            nfolds = 0. If cross-validation is turned off, then a validation frame can be specified and used
            for early stopping of individual models and early stopping of the grid searches.  By default and
            when nfolds > 1, cross-validation metrics will be used for early stopping and thus validation_frame will be ignored.
        :param leaderboard_frame: H2OFrame with test data for scoring the leaderboard.  This is optional and
            if this is set to None (the default), then cross-validation metrics will be used to generate the leaderboard
            rankings instead.
        :param blending_frame: H2OFrame used to train the metalearning algorithm in Stacked Ensembles (instead of relying on cross-validated predicted values).
            This is optional, but when provided, it is also recommended to disable cross validation
            by setting `nfolds=0` and to provide a leaderboard frame for scoring purposes.

        :returns: None. On success the job is kept in ``self._job`` and ``self._fetch()`` is called once
            polling finishes; if the back end response contains no ``job`` entry, the response is printed
            and the method returns early.

        :raises H2OValueError: if ``y`` is missing, or if a column referenced by ``x``, ``y`` or an
            integer ``fold_column`` / ``weights_column`` does not exist in the training frame.

        :examples:
        >>> # Set up an H2OAutoML object
        >>> aml = H2OAutoML(max_runtime_secs=30)
        >>> # Launch an AutoML run
        >>> aml.train(y=y, training_frame=train)
        """
        training_frame = H2OFrame._validate(training_frame, 'training_frame', required=True)
        ncols = training_frame.ncols
        names = training_frame.names

        def _name_at(index):
            # Translate a (possibly negative) column index into the matching column name.
            if not (-ncols <= index < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % index)
            return names[index]

        # Minimal required arguments are training_frame and y (response)
        if y is None:
            raise H2OValueError('The response column (y) is not set; please set it to the name of the column that you are trying to predict in your data.')
        assert_is_type(y, int, str)
        if is_type(y, int):
            y = _name_at(y)
        elif y not in names:
            raise H2OValueError("Column %s does not exist in the training frame" % y)

        input_spec = {
            'response_column': y,
            'training_frame': training_frame.frame_id,
        }

        # Integer references are resolved to names here so that (a) the request carries a
        # column name and (b) the ignored-columns bookkeeping below, which works on names,
        # recognises these columns.
        if fold_column is not None:
            assert_is_type(fold_column, int, str)
            if is_type(fold_column, int):
                fold_column = _name_at(fold_column)
            input_spec['fold_column'] = fold_column

        if weights_column is not None:
            assert_is_type(weights_column, int, str)
            if is_type(weights_column, int):
                weights_column = _name_at(weights_column)
            input_spec['weights_column'] = weights_column

        if validation_frame is not None:
            validation_frame = H2OFrame._validate(validation_frame, 'validation_frame')
            input_spec['validation_frame'] = validation_frame.frame_id

        if leaderboard_frame is not None:
            leaderboard_frame = H2OFrame._validate(leaderboard_frame, 'leaderboard_frame')
            input_spec['leaderboard_frame'] = leaderboard_frame.frame_id

        if blending_frame is not None:
            blending_frame = H2OFrame._validate(blending_frame, 'blending_frame')
            input_spec['blending_frame'] = blending_frame.frame_id

        if self.sort_metric is not None:
            assert_is_type(self.sort_metric, str)
            sort_metric = self.sort_metric.lower()
            # Changed the API to use "deviance" to be consistent with stopping_metric values
            # TO DO: let's change the backend to use "deviance" since we use the term "deviance"
            # After that we can take this `if` statement out
            if sort_metric == "deviance":
                sort_metric = "mean_residual_deviance"
            input_spec['sort_metric'] = sort_metric

        if x is not None:
            assert_is_type(x, list)
            xset = set()
            for xi in x:
                if is_type(xi, int):
                    xset.add(_name_at(xi))
                elif xi not in names:
                    raise H2OValueError("Column %s not in the training frame" % xi)
                else:
                    xset.add(xi)
            # Everything that is neither a predictor, the response, the fold column nor the
            # weights column is reported to the back end as ignored.
            ignored_columns = set(names) - {y} - xset
            ignored_columns.discard(fold_column)
            ignored_columns.discard(weights_column)
            input_spec['ignored_columns'] = list(ignored_columns)

        automl_build_params = dict(input_spec=input_spec)

        # NOTE: if the user hasn't specified some block of parameters don't send them!
        # This lets the back end use the defaults.
        automl_build_params['build_control'] = self.build_control
        automl_build_params['build_models'] = self.build_models

        resp = h2o.api('POST /99/AutoMLBuilder', json=automl_build_params)
        if 'job' not in resp:
            print("Exception from the back end: ")
            print(resp)
            return

        if not self.project_name:
            self.build_control['project_name'] = self.project_name = resp['build_control']['project_name']

        self._job = H2OJob(resp['job'], "AutoML")
        poll_updates = ft.partial(self._poll_training_updates, verbosity=self._verbosity, state={})
        try:
            self._job.poll(poll_updates=poll_updates)
        finally:
            # Issue one last update call even when polling was interrupted or failed.
            poll_updates(self._job, 1)

        self._fetch()
示例#14
0
    def train(self,
              x=None,
              y=None,
              training_frame=None,
              offset_column=None,
              fold_column=None,
              weights_column=None,
              validation_frame=None,
              max_runtime_secs=None,
              ignored_columns=None,
              **ignored):
        """
        Train the H2O model.

        Parameters
        ----------
        x : list, None
            Predictor columns, given as names or indices. When omitted, every
            column except the response and ``ignored_columns`` is used.

        y :
            Name or index of the response column. For supervised algorithms it
            defaults to "response"; for unsupervised ones it must not be given.

        training_frame : H2OFrame
            Frame containing the columns referenced by x and y (and by the
            fold, offset and weights columns, if any).

        offset_column : str, optional
            Name or index of the column in training_frame holding the offsets.

        fold_column : str, optional
            Name or index of the column in training_frame holding the per-row
            fold assignments.

        weights_column : str, optional
            Name or index of the column in training_frame holding the per-row
            weights.

        validation_frame : H2OFrame, optional
            Frame with validation data to be scored on while training.

        max_runtime_secs : float
            Maximum allowed runtime in seconds for model training. Use 0 to disable.

        ignored_columns : list, optional
            Columns (names or indices) to leave out of the predictors. Cannot
            be combined with ``x``.

        **ignored
            Any further keyword arguments are accepted and not used.

        Raises
        ------
        H2OValueError
            If a referenced column is missing from the training frame, if ``y``
            is passed for an unsupervised model, or if both ``x`` and
            ``ignored_columns`` are given.
        """
        assert_is_type(training_frame, H2OFrame)
        assert_is_type(validation_frame, None, H2OFrame)
        assert_is_type(y, None, int, str)
        assert_is_type(x, None, int, str, [str, int], {str, int})
        assert_is_type(ignored_columns, None, [str, int], {str, int})
        assert_is_type(offset_column, None, int, str)
        assert_is_type(fold_column, None, int, str)
        assert_is_type(weights_column, None, int, str)
        assert_is_type(max_runtime_secs, None, numeric)

        algo = self.algo
        parms = self._parms.copy()
        parms.pop("__class__", None)  # FIXME: hackt for PY3
        is_auto_encoder = bool(parms.get("autoencoder"))
        is_supervised = not is_auto_encoder and algo not in {"pca", "svd", "kmeans", "glrm"}
        ncols = training_frame.ncols
        names = training_frame.names

        def _as_name(col, unknown_fmt):
            # Map an index or a name onto a column name of the training frame;
            # `unknown_fmt` is the message template used for an unknown name.
            if is_type(col, int):
                if -ncols <= col < ncols:
                    return names[col]
                raise H2OValueError("Column %d does not exist in the training frame" % col)
            if col in names:
                return col
            raise H2OValueError(unknown_fmt % col)

        if is_supervised:
            y = _as_name("response" if y is None else y,
                         "Column %s does not exist in the training frame")
            response_type = training_frame.types[y]
            self._estimator_type = "classifier" if response_type == "enum" else "regressor"
        elif y is not None:
            raise H2OValueError("y should not be provided for an unsupervised model")
        assert_is_type(y, str, None)

        excluded = set()
        if ignored_columns is not None:
            if x is not None:
                raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
            excluded = {_as_name(ic, "Column %s not in the training frame") for ic in ignored_columns}

        if x is None:
            predictors = set(names) - {y} - excluded
        else:
            if is_type(x, int, str):
                x = [x]
            predictors = {_as_name(xi, "Column %s not in the training frame") for xi in x}

        parms.update(
            x=list(predictors),
            y=y,
            training_frame=training_frame,
            validation_frame=validation_frame,
            offset_column=offset_column,
            fold_column=fold_column,
            weights_column=weights_column,
            max_runtime_secs=max_runtime_secs,
        )
        self._build_model(parms)
示例#15
0
    def train(self,
              x=None,
              y=None,
              training_frame=None,
              offset_column=None,
              fold_column=None,
              weights_column=None,
              validation_frame=None,
              max_runtime_secs=None,
              **params):
        """
        Train the H2O model.

        Parameters
        ----------
        x : list, None
            A list of column names or indices indicating the predictor columns.
            When omitted, all columns of the training frame except the response
            are used.

        y : str, int
            An index or a column name indicating the response column. When
            omitted for a supervised algorithm, "response" is used if the
            training frame has a column of that name.

        training_frame : H2OFrame
            The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).

        offset_column : str, optional
            The name or index of the column in training_frame that holds the offsets.

        fold_column : str, optional
            The name or index of the column in training_frame that holds the per-row fold
            assignments.

        weights_column : str, optional
            The name or index of the column in training_frame that holds the per-row weights.

        validation_frame : H2OFrame, optional
            H2OFrame with validation data to be scored on while training.

        max_runtime_secs : float
            Maximum allowed runtime in seconds for model training. Use 0 to disable.

        **params
            Additional keyword arguments; accepted but not used by this method.

        Raises
        ------
        H2OValueError
            If ``y`` is provided for an autoencoder model.
        """
        assert_is_type(training_frame, H2OFrame)
        assert_is_type(y, None, int, str)
        assert_is_type(x, None, int, str, [str, int], {str, int})
        algo = self._compute_algo()
        parms = self._parms.copy()
        if "__class__" in parms:  # FIXME: hackt for PY3
            del parms["__class__"]
        is_auto_encoder = bool(parms.get("autoencoder"))
        is_supervised = not (is_auto_encoder
                             or algo in {"pca", "svd", "kmeans", "glrm"})
        if y is None:
            if is_supervised and "response" in training_frame.names:
                y = "response"
        else:
            if is_auto_encoder:
                raise H2OValueError(
                    "y should not be provided for an autoencoder model")
            # y is known to be an int or str here (checked above), so no
            # list/tuple unwrapping is needed.
            # NOTE(review): if isfactor() returns a per-column list rather than
            # a plain bool, this test is always truthy - confirm.
            self._estimator_type = "classifier" if training_frame[y].isfactor(
            ) else "regressor"
        if x is None:
            # Default predictors: every column except the response.
            x = set(training_frame.names)
            if is_type(y, int): x -= {training_frame.names[y]}
            if is_type(y, str): x -= {y}
            x = list(x)
        parms["x"] = x
        parms["y"] = y
        parms["training_frame"] = training_frame
        parms["validation_frame"] = validation_frame
        parms["offset_column"] = offset_column
        parms["fold_column"] = fold_column
        parms["weights_column"] = weights_column
        parms["max_runtime_secs"] = max_runtime_secs
        self.build_model(parms)
示例#16
0
    def start(jar_path=None,
              nthreads=-1,
              enable_assertions=True,
              max_mem_size=None,
              min_mem_size=None,
              ice_root=None,
              port="54321+",
              extra_classpath=None,
              verbose=True):
        """
        Start new H2O server on the local machine.

        :param jar_path: Path to the h2o.jar executable. If not given, then we will search for h2o.jar in the
            locations returned by `._jar_paths()`.
        :param nthreads: Number of threads in the thread pool. This should be related to the number of CPUs used.
            -1 means use all CPUs on the host. A positive integer specifies the number of CPUs directly.
        :param enable_assertions: If True, pass `-ea` option to the JVM.
        :param max_mem_size: Maximum heap size (jvm option Xmx), in bytes.
        :param min_mem_size: Minimum heap size (jvm option Xms), in bytes.
        :param ice_root: A directory where H2O stores its temporary files. Default location is determined by
            tempfile.mkdtemp().
        :param port: Port where to start the new server. This could be either an integer, or a string of the form
            "DDDDD+", indicating that the server should start looking for an open port starting from DDDDD and up.
            ``None`` is treated as "54321+".
        :param extra_classpath: List of paths to libraries that should be included on the Java classpath.
        :param verbose: If True, then connection info will be printed to the stdout.

        :returns: a new H2OLocalServer instance
        :raises H2OValueError: if `min_mem_size` exceeds `max_mem_size`, or if `port` is a string that is
            neither all digits nor digits followed by "+" (this includes the empty string).
        """
        assert_is_type(jar_path, None, str)
        assert_is_type(port, None, int, str)
        assert_is_type(nthreads, -1, BoundInt(1, 4096))
        assert_is_type(enable_assertions, bool)
        assert_is_type(min_mem_size, None, int)
        assert_is_type(max_mem_size, None, BoundInt(1 << 25))
        assert_is_type(ice_root, None, I(str, os.path.isdir))
        assert_is_type(extra_classpath, None, [str])
        if jar_path:
            assert_satisfies(jar_path, jar_path.endswith("h2o.jar"))

        if min_mem_size is not None and max_mem_size is not None and min_mem_size > max_mem_size:
            raise H2OValueError(
                "`min_mem_size`=%d is larger than the `max_mem_size`=%d" %
                (min_mem_size, max_mem_size))
        if port is None: port = "54321+"
        baseport = None
        # TODO: get rid of this port gimmick and have 2 separate parameters.
        if is_type(port, str):
            if port.isdigit():
                port = int(port)
            else:
                # endswith() rather than port[-1] so that an empty string is reported
                # as a bad value instead of failing with an IndexError.
                if not (port.endswith("+") and port[:-1].isdigit()):
                    raise H2OValueError(
                        "`port` should be of the form 'DDDD+', where D is a digit. Got: %s"
                        % port)
                # "DDDD+" form: pass the starting point as `baseport` and 0 as `port`.
                baseport = int(port[:-1])
                port = 0

        hs = H2OLocalServer()
        hs._verbose = bool(verbose)
        hs._jar_path = hs._find_jar(jar_path)
        hs._extra_classpath = extra_classpath
        hs._ice_root = ice_root
        if not ice_root:
            # No directory supplied: create a temporary one and record it on the server.
            hs._ice_root = tempfile.mkdtemp()
            hs._tempdir = hs._ice_root

        if verbose: print("Attempting to start a local H2O server...")
        hs._launch_server(port=port,
                          baseport=baseport,
                          nthreads=int(nthreads),
                          ea=enable_assertions,
                          mmax=max_mem_size,
                          mmin=min_mem_size)
        if verbose:
            print("  Server is running at %s://%s:%d" %
                  (hs.scheme, hs.ip, hs.port))
        # Shut the server down when the interpreter exits.
        atexit.register(lambda: hs.shutdown())
        return hs
示例#17
0
    def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None, weights_column=None,
              validation_frame=None, **params):
        """
        Train the rulefit model.

        One random forest is trained for every depth from ``self.min_depth`` to ``self.max_depth``
        (inclusive); the leaf-node assignments of all forests are bound to the response column and a
        GLM (``alpha = 1``, lambda search) is fitted on them. Every GLM variable is then converted
        into a rule by walking the corresponding tree.

        The results are stored on the instance: ``self.intercept`` and ``self.rule_importance``, a
        pandas DataFrame with the columns ``variable``, ``coefficient`` and ``rule``, sorted by
        decreasing absolute coefficient. When several variables map to the same rule, only the one
        with the largest absolute coefficient is kept.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).
        :param offset_column: Accepted but not used by this method.
        :param fold_column: Accepted but not used by this method.
        :param weights_column: Accepted but not used by this method.
        :param validation_frame: Accepted but not used by this method.
        :param params: Additional keyword arguments; accepted but not used by this method.

        :raises H2OValueError: if the response is an enum column with more than two unique values.

        :examples:
        >>> rulefit = H2ORuleFit()
        >>> training_data = h2o.import_file("smalldata/gbm_test/titanic.csv",
        ...                                  col_types = {'pclass': "enum", 'survived': "enum"})
        >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
        >>> rulefit.train(x=x,y="survived",training_frame=training_data)
        >>> rulefit
        """
        family = "gaussian"
        if training_frame.type(y) == "enum":
            if training_frame[y].unique().nrow > 2:
                raise H2OValueError("Multinomial not supported")
            family = "binomial"

        # Get paths from random forest models
        paths_frame = training_frame[y]
        rf_models = []
        for model_idx, depth in enumerate(range(self.min_depth, self.max_depth + 1)):
            # Train random forest models
            # NOTE(review): every forest is created with the same model_id "rf.hex"; if the
            # back end keys models by id, later forests may replace earlier ones - confirm.
            rf_model = H2ORandomForestEstimator(seed=self.seed,
                                                model_id="rf.hex",
                                                max_depth=depth)
            rf_model.train(y=y, x=x, training_frame=training_frame)
            rf_models.append(rf_model)

            paths = rf_model.predict_leaf_node_assignment(training_frame)
            # The loop variable is deliberately not called `x`: under Python 2 a list
            # comprehension variable leaks into the enclosing scope and would overwrite the
            # predictor list used to train the next forest.
            paths.col_names = ["rf_" + str(model_idx) + "." + col for col in paths.col_names]
            paths_frame = paths_frame.cbind(paths)

        # Extract important paths
        glm = H2OGeneralizedLinearEstimator(model_id="glm.hex",
                                            nfolds=self.nfolds,
                                            seed=self.seed,
                                            family=family,
                                            alpha=1,
                                            remove_collinear_columns=True,
                                            lambda_search=True)
        glm.train(y=y, training_frame=paths_frame)

        intercept, rule_importance = _get_glm_coeffs(glm)
        rule_importance = pd.DataFrame.from_dict(rule_importance, orient="index").reset_index()
        rule_importance.columns = ["variable", "coefficient"]

        # Convert paths to rules
        rules = []
        for variable in rule_importance.variable:
            # Variable names are parsed as "<model>.<tree>.<path>" once the "rf_" prefix, the "T"
            # marker and (for binomial models) the "C1." part have been stripped.
            stripped = variable.replace("rf_", "").replace("T", "")
            if family == "binomial":
                stripped = stripped.replace("C1.", "")
            model_num, tree_num, path = stripped.split(".")
            tree = H2OTree(rf_models[int(model_num)], int(tree_num) - 1)
            rules.append(_tree_traverser(tree.root_node, path))

        # Add rules and order by absolute coefficient
        rule_importance["rule"] = rules
        rule_importance["abs_coefficient"] = rule_importance["coefficient"].abs()
        # For duplicated rules keep only the row with the largest absolute coefficient.
        rule_importance = rule_importance.loc[rule_importance.groupby(["rule"])["abs_coefficient"].idxmax()]
        rule_importance = rule_importance.sort_values(by="abs_coefficient", ascending=False)
        rule_importance = rule_importance.drop("abs_coefficient", axis=1)

        self.intercept = intercept
        self.rule_importance = rule_importance
示例#18
0
def assert_true(cond, message):
    """
    Check a condition the way ``assert`` would, but signal failure with H2OValueError.

    :param cond: value whose truthiness is checked.
    :param message: text passed to the H2OValueError raised when ``cond`` is falsy.
    """
    if cond:
        return
    raise H2OValueError(message)
示例#19
0
    def std_coef_plot(self, num_of_features=None, server=False):
        """
        Plot a GLM model's standardized coefficient magnitudes.

        Coefficients (the intercept excluded) are drawn as horizontal bars ordered by absolute value,
        largest at the top. Non-negative coefficients are blue, negative ones orange.

        :param num_of_features: the number of features shown in the plot (a positive int); ``None`` shows all.
        :param server: forwarded to the pyplot loader; when truthy the figure is not displayed with ``plt.show()``.

        :returns: None.
        :raises H2OValueError: if the model is not a GLM.
        """
        assert_is_type(num_of_features, None, I(int, lambda x: x > 0))

        # check that model is a glm
        if self._model_json["algo"] != "glm":
            raise H2OValueError("This function is available for GLM models only")

        plt = _get_matplotlib_pyplot(server)
        if not plt: return

        # use same colors as Flow
        positive_color = "#1F77B4"  # blue
        negative_color = "#FF7F0E"  # dark orange

        # drop the intercept, then sort by the coefficient's absolute value (largest first)
        norm_coef = sorted((tup for tup in self.coef_norm().items() if tup[0] != "Intercept"),
                           key=lambda x: abs(x[1]), reverse=True)

        # default is to show all the features
        if num_of_features is None:
            num_of_features = len(norm_coef)
        shown = norm_coef[:num_of_features]

        feature_labels = [tup[0] for tup in shown]
        val = [abs(tup[1]) for tup in shown]
        # zero counts as positive
        signage = [positive_color if tup[1] >= 0 else negative_color for tup in shown]
        # bar centers on the y axis, flipped so that the largest bar appears at the top
        pos = list(range(len(norm_coef)))[::-1][:num_of_features]

        _, ax = plt.subplots(1, 1, figsize=(14, 10))
        plt.barh(pos, val, align="center", height=0.8, color=signage, edgecolor="none")

        # Hide the right and top spines, color others grey
        ax.spines["right"].set_visible(False)
        ax.spines["top"].set_visible(False)
        ax.spines["bottom"].set_color("#7B7B7B")
        ax.spines["left"].set_color("#7B7B7B")
        # Only show ticks on the left and bottom spines
        ax.yaxis.set_ticks_position("left")
        ax.xaxis.set_ticks_position("bottom")
        plt.yticks(pos, feature_labels)
        # a single bar needs a wider margin, otherwise it fills the whole axis
        ax.margins(y=0.5 if num_of_features == 1 else 0.05)

        # Legend entries are proxy artists built in a fixed (label, color) order so that a label can never be
        # paired with the other sign's color; only the signs that actually occur in the plot are listed.
        legend_entries = [(label, color)
                          for label, color in (("Positive", positive_color), ("Negative", negative_color))
                          if color in signage]
        if legend_entries:
            # the marker size is set on the proxy artist itself rather than through private legend attributes
            markers = [plt.Line2D([0, 0], [0, 0], color=color, marker="s", linestyle="", markersize=10)
                       for _, color in legend_entries]
            labels = [label for label, _ in legend_entries]
            plt.legend(markers, labels, numpoints=1, loc="best", frameon=False, fontsize=13)

        # booleans (not "on"/"off" strings) are what tick_params expects for visibility switches
        plt.tick_params(axis="x", which="minor", bottom=False, top=False, labelbottom=False)
        plt.title("Standardized Coef. Magnitudes: H2O GLM", fontsize=20)
        # show plot
        if not server: plt.show()
示例#20
0
    def get_best_model(self, algorithm=None, criterion=None):
        """
        Return the best model of an AutoML run, optionally restricted to one algorithm and ranked by one criterion.

        :param algorithm: One of "basemodel", "deeplearning", "drf", "gbm", "glm", "stackedensemble", "xgboost"
                          (case-insensitive). "basemodel" selects every model that is not a stacked ensemble.
                          If None, models of all algorithms are considered.
        :param criterion: Name of a leaderboard column to rank by (case-insensitive). If None, the leaderboard's
                          own ordering is used. "deviance" is accepted as an alias of "mean_residual_deviance".
                          Available criteria:
                            - Regression metrics: deviance, rmse, mse, mae, rmsle
                            - Binomial metrics: auc, logloss, aucpr, mean_per_class_error, rmse, mse
                            - Multinomial metrics: mean_per_class_error, logloss, rmse, mse
                          Additional leaderboard columns usable as a criterion:
                            - 'training_time_ms': training time of each model in milliseconds (doesn't include
                              the training of cross validation models).
                            - 'predict_time_per_row_ms': average prediction time of the model for a single row.
        :return: An H2OModel, or None if the leaderboard holds no model of the requested algorithm.
        :raises H2OValueError: if the algorithm is not supported or the criterion is not a leaderboard column.

        :examples:

        >>> # Set up an H2OAutoML object
        >>> aml = H2OAutoML(max_runtime_secs=30)
        >>> # Launch an AutoML run
        >>> aml.train(y=y, training_frame=train)
        >>> gbm = aml.get_best_model("gbm")
        """
        from h2o.exceptions import H2OValueError

        supported_algos = ("basemodel", "deeplearning", "drf", "gbm", "glm", "stackedensemble", "xgboost")
        timing_columns = ("training_time_ms", "predict_time_per_row_ms")
        # metrics for which a larger value is the better one; everything else is sorted ascending
        maximized_metrics = ["auc", "aucpr"]

        def _model_ids(frame):
            rows = frame["model_id"].as_data_frame(use_pandas=False, header=False)
            return [row[0] for row in rows]

        assert_is_type(algorithm, None, str)
        assert_is_type(criterion, None, str)

        if criterion is not None:
            criterion = criterion.lower()
            if criterion == "deviance":
                criterion = "mean_residual_deviance"

        if algorithm is not None:
            if algorithm.lower() not in supported_algos:
                raise H2OValueError("Algorithm \"{}\" is not supported!".format(algorithm))
            algorithm = algorithm.lower()

        # the timing columns are not part of the default leaderboard and must be requested explicitly
        extra_cols = ["algo"] + ([criterion] if criterion in timing_columns else [])
        leaderboard = h2o.automl.get_leaderboard(self, extra_columns=extra_cols)

        if algorithm == "basemodel":
            leaderboard = leaderboard[leaderboard["algo"].tolower() != "stackedensemble", :]
        elif algorithm is not None:
            leaderboard = leaderboard[leaderboard["algo"].tolower() == algorithm, :]

        if leaderboard.nrow == 0:
            return None

        if criterion is None:
            return h2o.get_model(leaderboard[0, "model_id"])

        if criterion not in leaderboard.columns:
            raise H2OValueError("Criterion \"{}\" is not present in the leaderboard!".format(criterion))

        ids_in_leaderboard_order = _model_ids(leaderboard)
        sort_ascending = criterion not in maximized_metrics
        ranked = leaderboard.sort(by=criterion, ascending=sort_ascending)
        # all models tied on the best criterion value
        tied_for_best = _model_ids(ranked[ranked[criterion] == ranked[0, criterion]])
        # among the tied models, prefer the one ranked highest by the leaderboard's default ordering
        winner = [model_id for model_id in ids_in_leaderboard_order if model_id in tied_for_best][0]

        return h2o.get_model(winner)
示例#21
0
    def train(self,
              x=None,
              y=None,
              training_frame=None,
              offset_column=None,
              fold_column=None,
              weights_column=None,
              validation_frame=None,
              **params):
        """
        Train the model synchronously (i.e. do not return until the model finishes training).

        To train asynchronously call :meth:`start`.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column. A list/tuple holding exactly one
            such reference is also accepted.
        :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).
        :param offset_column: The name or index of the column in training_frame that holds the offsets.
        :param fold_column: The name or index of the column in training_frame that holds the per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
        :param validation_frame: H2OFrame with validation data to be scored on while training.
        :param params: Additional parameters; they are merged last and override the values collected above.

        :raises ValueError: if training_frame is missing or y references more than one column.
        :raises H2OValueError: if a column given in x does not exist in the training frame.
        """
        parms = self._parms.copy()
        parms.update({
            "x": x,
            "y": y,
            "training_frame": training_frame,
            "offset_column": offset_column,
            "fold_column": fold_column,
            "weights_column": weights_column,
            "validation_frame": validation_frame,
        })
        # dictionaries have special handling in grid search, avoid the implicit conversion
        parms["search_criteria"] = None if self.search_criteria is None else str(self.search_criteria)
        parms["export_checkpoints_dir"] = self.export_checkpoints_dir
        parms["parallelism"] = self._parallelism
        # unique to grid search
        parms["hyper_parameters"] = None if self.hyper_params is None else str(self.hyper_params)
        # unique to grid search: explicitly set parameters of the wrapped model take precedence
        parms.update({k: v for k, v in self.model._parms.items() if v is not None})
        parms.update(params)
        if '__class__' in parms:  # FIXME: hackt for PY3
            del parms['__class__']

        if training_frame is None: raise ValueError("Missing training_frame")

        if is_type(y, list, tuple):
            if len(y) != 1:
                raise ValueError('y must be a single column reference')
            # Unwrap the single reference so that it can be excluded from the predictors below: a list is
            # unhashable inside ``{y}`` and a tuple would never match a column name.
            y = y[0]
            parms["y"] = y

        names = training_frame.names
        ncols = training_frame.ncols
        if x is None:
            # default predictors: every column except the response
            if isinstance(y, int):
                xset = set(range(ncols)) - {y}
            else:
                xset = set(names) - {y}
        else:
            xset = set()
            if is_type(x, int, str): x = [x]
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError("Column %s not in the training frame" % xi)
                    xset.add(xi)
        parms["x"] = list(xset)
        self.build_model(parms)
示例#22
0
    def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
              weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
              model_id=None):
        """
        Train the H2O model.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param H2OFrame training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).
        :param offset_column: The name or index of the column in training_frame that holds the offsets.
        :param fold_column: The name or index of the column in training_frame that holds the per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
        :param validation_frame: H2OFrame with validation data to be scored on while training.
        :param float max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
        :param ignored_columns: Names or indices of columns to exclude from the predictors; cannot be combined
            with ``x``.
        :param str model_id: If given, overrides the ``model_id`` stored in the estimator's parameters.

        :raises H2OValueError: if a referenced column does not exist, if ``y`` is given for an unsupervised
            model, or if both ``x`` and ``ignored_columns`` are specified.
        """
        assert_is_type(training_frame, H2OFrame)
        assert_is_type(validation_frame, None, H2OFrame)
        assert_is_type(y, None, int, str)
        assert_is_type(x, None, int, str, [str, int], {str, int})
        assert_is_type(ignored_columns, None, [str, int], {str, int})
        assert_is_type(offset_column, None, int, str)
        assert_is_type(fold_column, None, int, str)
        assert_is_type(weights_column, None, int, str)
        assert_is_type(max_runtime_secs, None, numeric)
        assert_is_type(model_id, None, str)
        algo = self.algo
        parms = self._parms.copy()
        if "__class__" in parms:  # FIXME: hackt for PY3
            del parms["__class__"]
        is_auto_encoder = bool(parms.get("autoencoder"))
        is_supervised = not(is_auto_encoder or algo in {"pca", "svd", "kmeans", "glrm", "word2vec"})
        ncols = training_frame.ncols
        names = training_frame.names

        def _resolve_columns(columns):
            """Turn a mix of column names and indices into a set of validated column names."""
            resolved = set()
            for col in columns:
                if is_type(col, int):
                    if not (-ncols <= col < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % col)
                    resolved.add(names[col])
                else:
                    if col not in names:
                        raise H2OValueError("Column %s not in the training frame" % col)
                    resolved.add(col)
            return resolved

        if is_supervised:
            if y is None: y = "response"
            if is_type(y, int):
                if not (-ncols <= y < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % y)
                y = names[y]
            else:
                if y not in names:
                    raise H2OValueError("Column %s does not exist in the training frame" % y)
            self._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor"
        elif y is not None:
            raise H2OValueError("y should not be provided for an unsupervised model")
        # from here on y is either a validated column name (supervised) or None (unsupervised)
        assert_is_type(y, str, None)

        ignored_columns_set = set()
        if ignored_columns is not None:
            if x is not None:
                raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
            ignored_columns_set = _resolve_columns(ignored_columns)
        if x is None:
            xset = set(names) - {y} - ignored_columns_set
        else:
            if is_type(x, int, str): x = [x]
            xset = _resolve_columns(x)
        # x is now a (possibly empty) list of column names; no index needs resolving past this point
        x = list(xset)

        parms["offset_column"] = offset_column
        parms["fold_column"] = fold_column
        parms["weights_column"] = weights_column
        parms["max_runtime_secs"] = max_runtime_secs
        # Overwrites the model_id parameter only if model_id is passed
        if model_id is not None:
            parms["model_id"] = model_id

        parms["training_frame"] = training_frame
        if validation_frame is not None: parms["validation_frame"] = validation_frame
        if y is not None: parms["response_column"] = y
        offset = parms["offset_column"]
        folds = parms["fold_column"]
        weights = parms["weights_column"]
        # every column that is neither a predictor, the response, nor a special column gets ignored
        ignored_columns = list(set(names) - set(x + [y, offset, folds, weights]))
        parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
        parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None else
                                 [quoted(col) for col in parms["interactions"]])
        parms = {k: H2OEstimator._keyify_if_h2oframe(parms[k]) for k in parms}
        rest_ver = parms.pop("_rest_version") if "_rest_version" in parms else 3

        model = H2OJob(h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms),
                       job_type=(self.algo + " Model Build"))

        if self._future:
            # asynchronous mode: remember the job and return without waiting for it
            self._job = model
            self._rest_version = rest_ver
            return

        model.poll()
        model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
        self._resolve_model(model.dest_key, model_json)
示例#23
0
    def train_segments(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
                       weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
                       segments=None, segment_models_id=None, parallelism=1, verbose=False):
        """
        Build one H2O model per segment (subpopulation) of the training dataset.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param H2OFrame training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).
        :param offset_column: The name or index of the column in training_frame that holds the offsets.
        :param fold_column: The name or index of the column in training_frame that holds the per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
        :param validation_frame: H2OFrame with validation data to be scored on while training.
        :param float max_runtime_secs: Maximum allowed runtime in seconds for each model training. Use 0 to disable.
            Please note that regardless of how this parameter is set, a model will be built for each input segment.
            This parameter only affects individual model training.
        :param ignored_columns: Names or indices of columns to ignore; passed through to the parameter builder.
        :param segments: A list of columns to segment-by. H2O will group the training (and validation) dataset
            by the segment-by columns and train a separate model for each segment (group of rows).
            As an alternative to providing a list of columns, users can also supply an explicit enumeration of
            segments to build the models for. This enumeration needs to be represented as H2OFrame.
        :param segment_models_id: Identifier for the returned collection of Segment Models. If not specified
            it will be automatically generated.
        :param parallelism: Level of parallelism of the bulk segment models building, it is the maximum number
            of models each H2O node will be building in parallel.
        :param bool verbose: Enable to print additional information during model building. Defaults to False.

        :returns: an H2OSegmentModels object referring to the finished build job's destination key.
        :raises H2OValueError: if ``segments`` is not specified.

        :examples:

        >>> response = "survived"
        >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
        >>> titanic[response] = titanic[response].asfactor()
        >>> predictors = ["survived","name","sex","age","sibsp","parch","ticket","fare","cabin"]
        >>> train, valid = titanic.split_frame(ratios=[.8], seed=1234)
        >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
        >>> titanic_gbm = H2OGradientBoostingEstimator(seed=1234)
        >>> titanic_models = titanic_gbm.train_segments(segments=["pclass"],
        ...                                             x=predictors,
        ...                                             y=response,
        ...                                             training_frame=train,
        ...                                             validation_frame=valid)
        >>> titanic_models.as_frame()
        """
        assert_is_type(segments, None, H2OFrame, [str])
        assert_is_type(verbose, bool)
        assert_is_type(segment_models_id, None, str)
        assert_is_type(parallelism, int)

        if segments is None:
            raise H2OValueError("Parameter segments was not specified. Please provide either a list of columns to "
                                "segment-by or an explicit list of segments to build models for.")

        # the per-segment models get no explicit id, hence model_id=None
        builder_args = dict(x=x, y=y, training_frame=training_frame, offset_column=offset_column,
                            fold_column=fold_column, weights_column=weights_column,
                            validation_frame=validation_frame, max_runtime_secs=max_runtime_secs,
                            ignored_columns=ignored_columns, model_id=None, verbose=verbose)
        parms = self._make_parms(**builder_args)

        # an H2OFrame is an explicit enumeration of segments; anything else is a list of segment-by columns
        explicit_segments = isinstance(segments, H2OFrame)
        if explicit_segments:
            parms["segments"] = H2OEstimator._keyify(segments)
        if not explicit_segments:
            parms["segment_columns"] = segments
        if segment_models_id:
            parms["segment_models_id"] = segment_models_id
        parms["parallelism"] = parallelism

        rest_ver = self._get_rest_version(parms)
        endpoint = "POST /%d/SegmentModelsBuilders/%s" % (rest_ver, self.algo)
        build_job = H2OJob(h2o.api(endpoint, data=parms), job_type=(self.algo + " Segment Models Build"))
        build_job.poll()
        return H2OSegmentModels(build_job.dest_key)
示例#24
0
 def _make_parms(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
                 weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
                 model_id=None, verbose=False, extend_parms_fn=None):
     """
     Validate the training arguments and assemble the parameter dict for a model build request.

     Starts from a copy of ``self._parms`` and overlays the values derived from the arguments.

     :param x: Predictor column(s), given by name or index.
     :param y: Response column, given by name or index. Ignored for unsupervised models.
     :param training_frame: Training frame; when None, ``self.training_frame`` is used if the estimator has one.
     :param offset_column: Name or index of the offset column.
     :param fold_column: Name or index of the fold assignment column.
     :param weights_column: Name or index of the weights column.
     :param validation_frame: Optional validation frame.
     :param max_runtime_secs: Optional runtime limit; stored only when not None.
     :param ignored_columns: Columns to exclude from the predictors; cannot be combined with ``x``.
     :param model_id: Overrides the stored ``model_id`` only when not None.
     :param verbose: Requesting verbose mode raises unless the estimator's ``_options_`` enable it.
     :param extend_parms_fn: Optional function called with the parameter dict so that it can be extended.

     :returns: dict of parameters, each value passed through ``H2OEstimator._keyify``.
     :raises H2OValueError: for unknown columns, ``x`` combined with ``ignored_columns``, unsupported verbose
         mode, or an "r2" stopping metric.
     """
     has_default_training_frame = hasattr(self, 'training_frame') and self.training_frame is not None
     # an explicit training frame is required only if the estimator demands one and carries no default
     training_frame = H2OFrame._validate(training_frame, 'training_frame',
                                         required=self._options_.get('requires_training_frame', True) and not has_default_training_frame)
     validation_frame = H2OFrame._validate(validation_frame, 'validation_frame')
     assert_is_type(y, None, int, str)
     assert_is_type(x, None, int, str, [str, int], {str, int})
     assert_is_type(ignored_columns, None, [str, int], {str, int})
     assert_is_type(offset_column, None, int, str)
     assert_is_type(fold_column, None, int, str)
     assert_is_type(weights_column, None, int, str)
     assert_is_type(max_runtime_secs, None, numeric)
     assert_is_type(model_id, None, str)
     assert_is_type(verbose, bool)
     assert_is_type(extend_parms_fn, None, FunctionType)
 
     # True when the caller passed a frame explicitly; the column bookkeeping below only runs in that case
     override_default_training_frame = training_frame is not None
     if not override_default_training_frame:
         # NOTE(review): presumably rejects frame-dependent arguments given without a frame - confirm in helper
         self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)
         training_frame = self.training_frame if has_default_training_frame else None
 
     if verbose and not self._options_.get('verbose', False):
         raise H2OValueError("Verbose mode is not available for %s" % self.__class__.__name__)
     parms = self._parms.copy()
     # with no frame at all, fall back to empty metadata so that the lookups below stay well-defined
     names = training_frame.names if training_frame is not None else []
     ncols = training_frame.ncols if training_frame is not None else 0
     types = training_frame.types if training_frame is not None else {}
 
     if self.supervised_learning:
         if y is None: y = "response"
         if is_type(y, int):
             # negative indices are accepted, as in regular Python indexing
             if not (-ncols <= y < ncols):
                 raise H2OValueError("Column %d does not exist in the training frame" % y)
             y = names[y]
         else:
             if y not in names:
                 raise H2OValueError("Column %s does not exist in the training frame" % y)
         # a categorical ("enum") response makes this a classifier, anything else a regressor
         self._estimator_type = "classifier" if types[y] == "enum" else "regressor"
     else:
         # If `y` is provided for an unsupervised model we'll simply ignore
         # it. This way an unsupervised model can be used as a step in
         # sklearn's pipeline.
         y = None
         self._estimator_type = "unsupervised"
 
     if override_default_training_frame:
         assert_is_type(y, str, None)
         ignored_columns_set = set()
         # fall back to the ignored columns stored on the estimator
         if ignored_columns is None and "ignored_columns" in parms:
             ignored_columns = parms['ignored_columns']
         if ignored_columns is not None:
             if x is not None:
                 raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
             # resolve every ignored column (index or name) to a validated column name
             for ic in ignored_columns:
                 if is_type(ic, int):
                     if not (-ncols <= ic < ncols):
                         raise H2OValueError("Column %d does not exist in the training frame" % ic)
                     ignored_columns_set.add(names[ic])
                 else:
                     if ic not in names:
                         raise H2OValueError("Column %s not in the training frame" % ic)
                     ignored_columns_set.add(ic)
         if x is None:
             # default predictors: all columns except the response and the ignored ones
             xset = set(names) - {y} - ignored_columns_set
         else:
             xset = set()
             if is_type(x, int, str): x = [x]
             for xi in x:
                 if is_type(xi, int):
                     if not (-ncols <= xi < ncols):
                         raise H2OValueError("Column %d does not exist in the training frame" % xi)
                     xset.add(names[xi])
                 else:
                     if xi not in names:
                         raise H2OValueError("Column %s not in the training frame" % xi)
                     xset.add(xi)
         x = list(xset)
         # NOTE(review): the reads of parms["offset_column"] etc. below assume this helper always stores
         # the key - confirm against _check_and_save_parm
         self._check_and_save_parm(parms, "offset_column", offset_column)
         self._check_and_save_parm(parms, "weights_column", weights_column)
         self._check_and_save_parm(parms, "fold_column", fold_column)
 
     if max_runtime_secs is not None: parms["max_runtime_secs"] = max_runtime_secs
 
     # Overwrites the model_id parameter only if model_id is passed
     if model_id is not None:
         parms["model_id"] = model_id
     if override_default_training_frame:
         parms["training_frame"] = training_frame
         # offset/folds/weights are defined only on this path; their single use below is guarded the same way
         offset = parms["offset_column"]
         folds = parms["fold_column"]
         weights = parms["weights_column"]
 
     if validation_frame is not None:
         parms["validation_frame"] = validation_frame
 
     if is_type(y, int):
         y = names[y]
     if y is not None:
         parms["response_column"] = y
     # without an explicit training frame x was not normalised above and may still be None or a scalar
     if not isinstance(x, (list, tuple)):
         x = [x]
     # the length check avoids indexing into an empty predictor list
     if len(x) > 0 and is_type(x[0], int):
         x = [names[i] for i in x]
     if override_default_training_frame:
         # every column that is neither a predictor, the response, nor a special column gets ignored
         ignored_columns = list(set(names) - set(x + [y, offset, folds, weights]))
         parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
     parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None
                              else [quoted(col) for col in parms["interactions"]])
     parms["interaction_pairs"] = (None if "interaction_pairs" not in parms or parms["interaction_pairs"] is None
                                   else [tuple(map(quoted, ip)) for ip in parms["interaction_pairs"]])
 
     # internal hook allowing subclasses to extend train parms 
     if extend_parms_fn is not None:
         extend_parms_fn(parms)
 
     parms = {k: H2OEstimator._keyify(v) for k, v in parms.items()}
     # `or []` keeps the membership test valid when no stopping_metric is set
     if "r2" in (parms.get('stopping_metric') or []):
         raise H2OValueError("r2 cannot be used as an early stopping_metric yet.  Check this JIRA https://0xdata.atlassian.net/browse/PUBDEV-5381 for progress.")
     return parms
示例#25
0
    def start(jar_path=None,
              nthreads=-1,
              enable_assertions=True,
              max_mem_size=None,
              min_mem_size=None,
              ice_root=None,
              log_dir=None,
              log_level=None,
              max_log_file_size=None,
              port="54321+",
              name=None,
              extra_classpath=None,
              verbose=True,
              jvm_custom_args=None,
              bind_to_localhost=True):
        """
        Launch a fresh H2O server process on this machine.

        :param jar_path: Location of the h2o.jar executable. When omitted, the jar is looked up in the
            locations returned by `._jar_paths()`.
        :param nthreads: Size of the thread pool; should relate to the number of CPUs used. -1 means
            "all CPUs on the host", a positive integer gives the number of CPUs explicitly.
        :param enable_assertions: When True, the `-ea` option is passed to the JVM.
        :param max_mem_size: Upper heap limit (jvm option Xmx), in bytes.
        :param min_mem_size: Lower heap limit (jvm option Xms), in bytes.
        :param ice_root: Directory in which H2O keeps its temporary files. When not given, a directory
            created with tempfile.mkdtemp() is used.
        :param log_dir: Directory that receives H2O logs when a new instance is started. H2O picks its
            own default when this is not given.
        :param log_level: Logger level of the newly started H2O instance.
        :param max_log_file_size: Maximum size of INFO and DEBUG log files; the file is rolled over once
            this size has been reached. (The default is 3MB. Minimum is 1MB and maximum is 99999MB)
        :param port: Port for the new server: either an integer, or a string of the form "DDDDD+",
            meaning that the server should search for a free port starting from DDDDD and going up.
        :param name: Name of the h2o cluster to be started.
        :param extra_classpath: List of library paths to be added to the Java classpath.
        :param verbose: When True, connection info is printed to the stdout.
        :param jvm_custom_args: Custom, user-defined arguments for the JVM that hosts H2O.
        :param bind_to_localhost: Whether access to the H2O instance is restricted to the local machine
            (default) or the instance can be reached from other computers on the network.
            Only applicable when H2O is started from the Python client.

        :returns: a new H2OLocalServer instance
        """
        # ---- argument validation (order kept so that the first failing check is unchanged) ----
        assert_is_type(jar_path, None, str)
        assert_is_type(port, None, int, str)
        assert_is_type(name, None, str)
        assert_is_type(nthreads, -1, BoundInt(1, 4096))
        assert_is_type(enable_assertions, bool)
        assert_is_type(min_mem_size, None, int)
        assert_is_type(max_mem_size, None, BoundInt(1 << 25))
        assert_is_type(log_dir, str, None)
        assert_is_type(log_level, str, None)
        allowed_log_levels = [None, "TRACE", "DEBUG", "INFO", "WARN", "ERRR", "FATA"]
        assert_satisfies(log_level, log_level in allowed_log_levels)
        assert_is_type(max_log_file_size, str, None)
        assert_is_type(ice_root, None, I(str, os.path.isdir))
        assert_is_type(extra_classpath, None, [str])
        assert_is_type(jvm_custom_args, list, None)
        assert_is_type(bind_to_localhost, bool)
        if jar_path:
            assert_satisfies(jar_path, jar_path.endswith("h2o.jar"))

        both_limits_given = min_mem_size is not None and max_mem_size is not None
        if both_limits_given and min_mem_size > max_mem_size:
            raise H2OValueError(
                "`min_mem_size`=%d is larger than the `max_mem_size`=%d" %
                (min_mem_size, max_mem_size))

        # ---- port parsing ----
        # TODO: get rid of this port gimmick and have 2 separate parameters.
        if port is None:
            port = "54321+"
        baseport = None
        if is_type(port, str):
            if port.isdigit():
                # Plain numeric string: use exactly this port.
                port = int(port)
            elif port[-1] == "+" and port[:-1].isdigit():
                # "DDDD+": scan upwards starting from DDDD; port 0 is passed alongside the base.
                baseport = int(port[:-1])
                port = 0
            else:
                raise H2OValueError(
                    "`port` should be of the form 'DDDD+', where D is a digit. Got: %s"
                    % port)

        # ---- server object set-up ----
        server = H2OLocalServer()
        server._verbose = bool(verbose)
        server._jar_path = server._find_jar(jar_path)
        server._extra_classpath = extra_classpath
        server._name = name
        if ice_root:
            server._ice_root = ice_root
        else:
            # No ice root supplied: use a fresh temp dir and remember it in `_tempdir` as well.
            scratch_dir = tempfile.mkdtemp()
            server._ice_root = scratch_dir
            server._tempdir = scratch_dir

        if verbose:
            print("Attempting to start a local H2O server...")
        launch_options = dict(port=port,
                              baseport=baseport,
                              nthreads=int(nthreads),
                              ea=enable_assertions,
                              mmax=max_mem_size,
                              mmin=min_mem_size,
                              jvm_custom_args=jvm_custom_args,
                              bind_to_localhost=bind_to_localhost,
                              log_dir=log_dir,
                              log_level=log_level,
                              max_log_file_size=max_log_file_size)
        server._launch_server(**launch_options)
        if verbose:
            print("  Server is running at %s://%s:%d" %
                  (server.scheme, server.ip, server.port))
        # Make sure the server is shut down when the interpreter exits.
        atexit.register(lambda: server.shutdown())
        return server
示例#26
0
    def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
              weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
              model_id=None, verbose=False):
        """
        Train the H2O model.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param H2OFrame training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).
        :param offset_column: The name or index of the column in training_frame that holds the offsets.
        :param fold_column: The name or index of the column in training_frame that holds the per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
        :param validation_frame: H2OFrame with validation data to be scored on while training.
        :param float max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
        :param ignored_columns: Column names or indices to exclude from the predictors. Cannot be given
            together with ``x``.
        :param str model_id: When given, overrides the ``model_id`` stored in the estimator's parameters.
        :param bool verbose: Print scoring history to stdout. Defaults to False.

        :raises H2OValueError: if a required training frame is missing, a referenced column does not exist,
            ``x`` and ``ignored_columns`` are both given, ``verbose`` is used with an unsupported algorithm,
            or ``r2`` is requested as the stopping metric.
        :raises ValueError: if ``y`` is missing for a supervised model.
        """

        assert_is_type(training_frame, None, H2OFrame)
        assert_is_type(validation_frame, None, H2OFrame)
        assert_is_type(y, None, int, str)
        assert_is_type(x, None, int, str, [str, int], {str, int})
        assert_is_type(ignored_columns, None, [str, int], {str, int})
        assert_is_type(offset_column, None, int, str)
        assert_is_type(fold_column, None, int, str)
        assert_is_type(weights_column, None, int, str)
        assert_is_type(max_runtime_secs, None, numeric)
        assert_is_type(model_id, None, str)
        assert_is_type(verbose, bool)

        if self._requires_training_frame() and training_frame is None:
            # The algo name has to be interpolated into the message here; passing it as a second
            # constructor argument would leave the "%s" placeholder in the text.
            raise H2OValueError("Training frame required for %s algorithm, but none was given." % self.algo)

        no_training_frame = training_frame is None
        if no_training_frame:
            # Without a frame, none of the frame-dependent arguments may be set.
            self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)

        algo = self.algo
        if verbose and algo not in ["drf", "gbm", "deeplearning", "xgboost"]:
            raise H2OValueError("Verbose should only be set to True for drf, gbm, deeplearning, and xgboost models")
        parms = self._parms.copy()
        if "__class__" in parms:  # FIXME: hackt for PY3
            del parms["__class__"]
        is_auto_encoder = bool(parms.get("autoencoder"))
        is_supervised = not(is_auto_encoder or algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec"})
        if not no_training_frame:
            names = training_frame.names
            ncols = training_frame.ncols

        # NOTE(review): for a supervised algo trained without a frame, `names`/`ncols` are not bound
        # at this point -- confirm that this combination cannot happen.
        if is_supervised:
            if y is None: y = "response"
            if is_type(y, int):
                if not (-ncols <= y < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % y)
                y = names[y]
            else:
                if y not in names:
                    raise H2OValueError("Column %s does not exist in the training frame" % y)
            self._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor"
        else:
            # If `y` is provided for an unsupervised model we'll simply ignore
            # it. This way an unsupervised model can be used as a step in
            # sklearn's pipeline.
            y = None

        if not no_training_frame:
            assert_is_type(y, str, None)
            ignored_columns_set = set()
            if ignored_columns is not None:
                if x is not None:
                    raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
                for ic in ignored_columns:
                    if is_type(ic, int):
                        if not (-ncols <= ic < ncols):
                            raise H2OValueError("Column %d does not exist in the training frame" % ic)
                        ignored_columns_set.add(names[ic])
                    else:
                        if ic not in names:
                            raise H2OValueError("Column %s not in the training frame" % ic)
                        ignored_columns_set.add(ic)
            if x is None:
                # Default predictors: every column except the response and the ignored ones.
                xset = set(names) - {y} - ignored_columns_set
            else:
                xset = set()
                if is_type(x, int, str): x = [x]
                for xi in x:
                    if is_type(xi, int):
                        if not (-ncols <= xi < ncols):
                            raise H2OValueError("Column %d does not exist in the training frame" % xi)
                        xset.add(names[xi])
                    else:
                        if xi not in names:
                            raise H2OValueError("Column %s not in the training frame" % xi)
                        xset.add(xi)
            x = list(xset)

            parms["offset_column"] = offset_column
            parms["fold_column"] = fold_column
            parms["weights_column"] = weights_column

        if max_runtime_secs is not None: parms["max_runtime_secs"] = max_runtime_secs

        # Overwrites the model_id parameter only if model_id is passed
        if model_id is not None:
            parms["model_id"] = model_id

        # Step 2
        is_auto_encoder = "autoencoder" in parms and parms["autoencoder"]
        is_unsupervised = is_auto_encoder or self.algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec"}
        if is_auto_encoder and y is not None: raise ValueError("y should not be specified for autoencoder.")
        if not is_unsupervised and y is None: raise ValueError("Missing response")

        # Step 3
        if not no_training_frame:
            parms["training_frame"] = training_frame
            offset = parms["offset_column"]
            folds = parms["fold_column"]
            weights = parms["weights_column"]

        if validation_frame is not None: parms["validation_frame"] = validation_frame
        if is_type(y, int): y = training_frame.names[y]
        if y is not None: parms["response_column"] = y
        if not isinstance(x, (list, tuple)): x = [x]
        # `x` may legitimately be empty (e.g. x=[]), so guard the peek at its first element.
        if x and is_type(x[0], int):
            x = [training_frame.names[i] for i in x]
        if not no_training_frame:
            # Everything that is neither a predictor nor a special column gets ignored by the backend.
            ignored_columns = list(set(training_frame.names) - set(x + [y, offset, folds, weights]))
            parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
        parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None else
                                 [quoted(col) for col in parms["interactions"]])
        parms["interaction_pairs"] = (None if "interaction_pairs" not in parms or parms["interaction_pairs"] is None else
                                 [tuple(map(quoted, ip)) for ip in parms["interaction_pairs"]])

        parms = {k: H2OEstimator._keyify_if_h2oframe(parms[k]) for k in parms}
        # `stopping_metric` may be present but set to None; treat that the same as "not set".
        if "r2" in (parms.get("stopping_metric") or []):
            raise H2OValueError("r2 cannot be used as an early stopping_metric yet.  Check this JIRA https://0xdata.atlassian.net/browse/PUBDEV-5381 for progress.")
        rest_ver = parms.pop("_rest_version", 3)

        model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms)
        model = H2OJob(model_builder_json, job_type=(self.algo + " Model Build"))

        if self._future:
            # Asynchronous mode: keep the job handle and return without polling.
            self._job = model
            self._rest_version = rest_ver
            return

        model.poll(verbose_model_scoring_history=verbose)
        model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
        self._resolve_model(model.dest_key, model_json)
示例#27
0
    def _process_response(response, save_to):
        """
        Turn a raw response object into the value that is handed back to the external caller.

        The processing consists of:
           * when ``save_to`` is given and the status is 200, streaming the body into a file and
             returning that file's full path;
           * otherwise decoding the body according to its Content-Type (JSON is parsed, anything else
             is passed through as plain text);
           * converting error statuses into the appropriate exception.
        """
        code = response.status_code

        # ---- download-to-file mode ----
        if save_to and code == 200:
            target = os.path.expanduser(save_to) if save_to.startswith("~") else save_to
            if target.endswith(os.path.sep) or os.path.isdir(target):
                # Target is a directory: the file name is derived from the response.
                folder = os.path.join(os.path.abspath(target), '')
                basename = H2OConnection._find_file_name(response)
            else:
                folder, basename = os.path.split(os.path.abspath(target))
            destination = os.path.join(folder, basename)
            try:
                if not os.path.exists(folder):
                    os.makedirs(folder)
                with open(destination, "wb") as out:
                    for piece in response.iter_content(chunk_size=65536):
                        # Empty chunks may occasionally happen
                        if piece:
                            out.write(piece)
            except OSError as err:
                raise H2OValueError("Cannot write to file %s: %s" %
                                    (destination, err))
            return destination

        # ---- body decoding ----
        # Keep only the media type, dropping a possible ";charset=..." suffix.
        media_type = response.headers.get("Content-Type", "").partition(";")[0]

        # this is needed so that response.text() works correctly
        response.encoding = response.headers.get("Character-Encoding",
                                                 response.encoding)

        # Response type is auto-detected from the content-type: JSON gets decoded, the rest passes as-is.
        if media_type != "application/json":
            payload = response.text
        else:
            try:
                payload = response.json(object_pairs_hook=H2OResponse)
            except (JSONDecodeError,
                    requests.exceptions.ContentDecodingError) as err:
                raise H2OServerError("Malformed JSON from server (%s):\n%s" %
                                     (str(err), response.text))

        # ---- status handling ----
        # Success (200 = "Ok", 201 = "Created", 202 = "Accepted", 204 = "No Content")
        if code in (200, 201, 202, 204):
            return payload

        # Client errors (400 = "Bad Request", 404 = "Not Found", 412 = "Precondition Failed")
        is_client_error = code in (400, 404, 412)
        if is_client_error and isinstance(payload, H2OErrorV3):
            payload.show_stacktrace = False
            raise H2OResponseError(payload)

        # Server errors (notably 500 = "Server Error")
        # A valid H2OErrorV3 object may still arrive here; that only means the server did not
        # provide the correct status code.
        raise H2OServerError("HTTP %d %s:\n%s" %
                             (code, response.reason, payload))
示例#28
0
    def train(self, x=None, y=None, training_frame=None, fold_column=None,
              weights_column=None, validation_frame=None, leaderboard_frame=None):
        """
        Begins an AutoML task, a background task that automatically builds a number of models
        with various algorithms and tracks their performance in a leaderboard. At any point
        in the process you may use H2O's performance or prediction functions on the resulting
        models.

        :param x: A column name or index, or a list of them, indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param fold_column: The name or index of the column in training_frame that holds per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds per-row weights.
        :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold_column or weights_column).
        :param validation_frame: H2OFrame with validation data to be scored on while training. Optional.
            This frame is used for early stopping of individual models and early stopping of the grid searches
            (unless max_models or max_runtime_secs overrides metric-based early stopping).
        :param leaderboard_frame: H2OFrame with test data for scoring the leaderboard.  This is optional and
            if this is set to None (the default), then cross-validation metrics will be used to generate the
            leaderboard rankings instead.

        :returns: None. The job is polled to completion and the results are then loaded onto this
            object via ``self._fetch()``.

        :raises ValueError: if ``training_frame`` or ``y`` is not given.
        :raises H2OValueError: if ``y`` or any entry of ``x`` does not refer to a column of the training frame.

        :examples:
        >>> # Set up an H2OAutoML object
        >>> aml = H2OAutoML(max_runtime_secs=30)
        >>> # Launch an AutoML run
        >>> aml.train(y=y, training_frame=train)
        """
        # Minimal required arguments are training_frame and y (response).
        # The frame is validated first because everything below reads its columns.
        if training_frame is None:
            raise ValueError('The training frame is not set!')
        assert_is_type(training_frame, H2OFrame)
        ncols = training_frame.ncols
        names = training_frame.names

        # Set project name if None
        if self.project_name is None:
            self.project_name = "automl_" + training_frame.frame_id
            self.build_control["project_name"] = self.project_name

        if y is None:
            raise ValueError('The response column (y) is not set; please set it to the name of the column that you are trying to predict in your data.')
        assert_is_type(y, int, str)
        if is_type(y, int):
            if not (-ncols <= y < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % y)
            y = names[y]
        elif y not in names:
            raise H2OValueError("Column %s does not exist in the training frame" % y)

        input_spec = {
            'response_column': y,
            'training_frame': training_frame.frame_id,
        }

        if fold_column is not None:
            assert_is_type(fold_column, int, str)
            input_spec['fold_column'] = fold_column

        if weights_column is not None:
            assert_is_type(weights_column, int, str)
            input_spec['weights_column'] = weights_column

        if validation_frame is not None:
            # Check the frame that is actually being used, not the training frame again.
            assert_is_type(validation_frame, H2OFrame)
            input_spec['validation_frame'] = validation_frame.frame_id

        if leaderboard_frame is not None:
            assert_is_type(leaderboard_frame, H2OFrame)
            input_spec['leaderboard_frame'] = leaderboard_frame.frame_id

        if x is not None:
            # A single column reference is accepted and wrapped into a list.
            assert_is_type(x, int, str, list)
            if is_type(x, int, str): x = [x]
            xset = set()
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError("Column %s not in the training frame" % xi)
                    xset.add(xi)
            x = list(xset)
            ignored_columns = set(names) - {y} - xset
            # The fold and weights columns must not be ignored. They may be given by index, and they
            # may already be absent from the set (e.g. when also listed in x), hence discard().
            for special in (fold_column, weights_column):
                if special is None:
                    continue
                if is_type(special, int) and -ncols <= special < ncols:
                    special = names[special]
                ignored_columns.discard(special)
            input_spec['ignored_columns'] = list(ignored_columns)

        # NOTE: if the user hasn't specified some block of parameters don't send them!
        # This lets the back end use the defaults.
        automl_build_params = {
            'input_spec': input_spec,
            'build_control': self.build_control,
            'build_models': self.build_models,
        }

        resp = h2o.api('POST /99/AutoMLBuilder', json=automl_build_params)
        if 'job' not in resp:
            print("Exception from the back end: ")
            print(resp)
            return

        self._job = H2OJob(resp['job'], "AutoML")
        self._job.poll()
        self._fetch()
示例#29
0
 def _verify_training_frame_params(self, *args):
     """
     Ensure that no frame-dependent parameter was supplied when there is no training frame.

     :param args: Parameter values to check; each of them is expected to be None.
     :raises H2OValueError: on the first value in ``args`` that is not None.
     """
     for param in args:
         if param is not None:
             # The value may be a column name, an index or a frame, so it is formatted with %s and
             # interpolated here instead of being passed as a second constructor argument.
             raise H2OValueError("No training frame defined, yet the parameter %s has been specified." % param)
示例#30
0
    def train(self, x=None, y=None, training_frame=None):
        """
        Train the rulefit model.

        One tree model is trained for every depth between ``self.min_rule_len`` and ``self.max_rule_len``
        (inclusive); the leaf-node assignments of those models are bound to the response column and a GLM
        is fitted on the resulting frame. The intercept, rule importance, GLM and tree models are stored
        on the estimator.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).
        :raises H2OValueError: if the response is categorical with more than two levels.
        :examples:
        >>> rulefit = H2ORuleFit()
        >>> training_data = h2o.import_file("smalldata/gbm_test/titanic.csv", 
        ...                                  col_types = {'pclass': "enum", 'survived': "enum"})
        >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
        >>> rulefit.train(x=x,y="survived",training_frame=training_data)
        >>> rulefit
        """
        # Work on a private copy of the GLM settings: removing "family" from self.glm_params itself
        # would make a second call to train() silently fall back to the default family.
        glm_params = dict(self.glm_params)

        if training_frame.type(y) == "enum":
            if training_frame[y].unique().nrow > 2:
                raise H2OValueError("multinomial use cases not yet supported")
            family = "binomial"
            # NOTE(review): a "family" entry left in glm_params collides with the explicit `family`
            # keyword passed to the GLM below -- confirm how that case should be handled.
        else:
            # A user-supplied family wins; it is taken out of the extra GLM parameters so that it is
            # not passed twice.
            family = glm_params.pop("family", None)
            if family is None:
                family = "gaussian"

        # Get paths from random forest models
        paths_frame = training_frame[y]
        depths = range(self.min_rule_len, self.max_rule_len + 1)
        tree_models = dict()
        for model_idx, depth in enumerate(depths):

            # Train tree models
            tree_model = _tree_model(self.algorithm, depth,
                                     self.seed, model_idx, self.tree_params)
            tree_model.train(y=y, x=x, training_frame=training_frame)
            tree_models[model_idx] = tree_model

            paths = tree_model.predict_leaf_node_assignment(training_frame)
            # Prefix the columns with the model index to keep the names of different models apart.
            paths.col_names = [
                "tree_{0}.{1}".format(str(model_idx), col_name)
                for col_name in paths.col_names
            ]
            paths_frame = paths_frame.cbind(paths)

        if self.max_num_rules:
            # Train GLM with a cap on the number of active predictors
            glm = H2OGeneralizedLinearEstimator(
                model_id="glm.hex",
                seed=self.seed,
                family=family,
                alpha=1,
                max_active_predictors=self.max_num_rules + 1,
                **glm_params)
            glm.train(y=y, training_frame=paths_frame)

        else:
            # Get optimal lambda
            glm = H2OGeneralizedLinearEstimator(model_id="glm.hex",
                                                nfolds=self.nfolds,
                                                seed=self.seed,
                                                family=family,
                                                alpha=1,
                                                lambda_search=True,
                                                **glm_params)
            glm.train(y=y, training_frame=paths_frame)

            lambda_ = _get_glm_lambda(glm)

            # Train GLM with chosen lambda
            glm = H2OGeneralizedLinearEstimator(model_id="glm.hex",
                                                seed=self.seed,
                                                family=family,
                                                alpha=1,
                                                lambda_=lambda_,
                                                solver="COORDINATE_DESCENT",
                                                **glm_params)
            glm.train(y=y, training_frame=paths_frame)

        # Get Intercept
        intercept = _get_intercept(glm)

        # Get Rules
        rule_importance = _get_rules(glm, tree_models, self.algorithm)

        self.intercept = intercept
        self.rule_importance = rule_importance
        self.glm = glm
        self.tree_models = tree_models